The code below will include the following:
import warnings
import itertools
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import numpy as np
import pandas as pd
import scipy
from scipy import stats
import statsmodels.api as sm
from sklearn.feature_selection import f_regression, mutual_info_regression
from minepy import MINE
# Global plot styling: applies seaborn's "whitegrid" theme to the figures drawn below.
sns.set(style="whitegrid", color_codes=True)
# NOTE(review): this suppresses every warning category for the rest of the
# session, including deprecation and convergence warnings raised by the
# statistics libraries used below -- consider narrowing the filter.
warnings.filterwarnings('ignore')
# Set random number generator seed for reproducibility.
# NOTE: every np.random call below draws from this one global stream, so the
# statements must stay in this order for the generated data to stay the same.
np.random.seed(220)

# --- Linear relationship: y = 2 + 0.7*x, plus standard-normal noise ---
x_l = np.linspace(0, 10, 100)
y_l0 = 2.0 + 0.7 * x_l
noise_l = np.random.randn(100)
y_l1 = y_l0 + noise_l

# --- "Exponential linear" relationship: y = exp(sqrt(x + 2)) * x ---
# Built from its own grid x_e (the original multiplied by x_l, which only
# worked because both grids happen to be identical).
x_e = np.linspace(0, 10, 100)
y_e0 = np.exp((x_e + 2) ** 0.5) * x_e
noise_e = np.random.uniform(0.01, 250, 100)
y_e1 = y_e0 + noise_e

# --- Quadratic relationship: y = 2 + 0.7*x**2 + 0.5*x ---
x_q = np.linspace(-10, 10, 100)
y_q0 = 2.0 + 0.7 * x_q ** 2 + 0.5 * x_q
noise_q = np.random.uniform(0.5, 15, 100)
y_q1 = y_q0 + noise_q

# --- "Sinusoidal" relationship: a Gaussian bump plus a chirped cosine ---
x_s = np.linspace(-3, 2, 100)
y_s0 = np.exp(-(x_s + 2) ** 2) + np.cos((x_s - 2) ** 2)
noise_s = np.random.uniform(0.5, 5, 100)
y_s1 = y_s0 + noise_s

# --- Circular relationship ---
# Noisy circle: radius 50 with N(0, 8) jitter, sampled at angles 0..10 rad.
angle = np.linspace(0, 10, 100)
r = 50 + np.random.normal(0, 8, angle.shape)
x_cn = r * np.cos(angle)
y_cn = r * np.sin(angle)
# Shuffle, then re-order the points by polar angle so the line plot traces the
# ring once.  np.arctan2 is called as (x, y) rather than (y, x), so the angle
# is measured from the +y axis; it is only used as a sort key and to project
# the points onto the unit circle, so the curve is still a closed ring.
idx = np.random.permutation(angle.shape[0])
x_cn = x_cn[idx]
y_cn = y_cn[idx]
angle = np.arctan2(x_cn, y_cn)
order = np.argsort(angle)
x_cn = x_cn[order]
y_cn = y_cn[order]

# Noise-free circle: the same angles projected onto the unit circle, then
# shuffled and sorted by angle in the same way.
x_c = np.cos(angle)
y_c = np.sin(angle)
idx = np.random.permutation(angle.shape[0])
x_c = x_c[idx]
y_c = y_c[idx]
angle = np.arctan2(x_c, y_c)
order = np.argsort(angle)
x_c = x_c[order]
y_c = y_c[order]
# Draw every relationship twice: the clean curve in the left column and its
# noisy counterpart in the right column, one relationship per row.
fig, ax = plt.subplots(nrows=5, ncols=2, figsize=(13, 13))
panels = [
    # (title, colour, (x, y) of the clean curve, (x, y) of the noisy curve)
    ('Sinusoidal', 'r', (x_s, y_s0), (x_s, y_s1)),
    ('Quadratic', 'b', (x_q, y_q0), (x_q, y_q1)),
    ('Linear', 'g', (x_l, y_l0), (x_l, y_l1)),
    ('Exp Linear', 'purple', (x_e, y_e0), (x_e, y_e1)),
    ('Circular', 'y', (x_c, y_c), (x_cn, y_cn)),
]
for row, (title, colour, clean, noisy) in enumerate(panels):
    ax[row, 0].plot(*clean, color=colour)
    ax[row, 0].set_title(title)
    ax[row, 1].plot(*noisy, color=colour)
    ax[row, 1].set_title(f'{title} + Noise')
plt.tight_layout()
# Registry of the generated series, keyed by a short label; a trailing "_n"
# marks the noisy variant.  Each entry pairs a curve with the x grid it was
# generated on -- the sinusoidal curves were built from x_s, so they must be
# paired with x_s (not x_l), otherwise shift-sensitive measures such as the
# cosine similarity are computed against the wrong abscissa.
ts_dict = {
    'lin': {'x': x_l, 'y': y_l0},
    'lin_n': {'x': x_l, 'y': y_l1},
    'exp lin': {'x': x_e, 'y': y_e0},
    'exp lin_n': {'x': x_e, 'y': y_e1},
    'quadr': {'x': x_q, 'y': y_q0},
    'quadr_n': {'x': x_q, 'y': y_q1},
    'sin': {'x': x_s, 'y': y_s0},
    'sin_n': {'x': x_s, 'y': y_s1},
    'cir': {'x': x_c, 'y': y_c},
    'cir_n': {'x': x_cn, 'y': y_cn},
}
def compare_corrs(Dict=ts_dict):
    """Score every x/y pair in *Dict* with several association measures.

    Parameters
    ----------
    Dict : mapping
        Maps a series label to a mapping holding two equal-length 1-D numeric
        sequences under the keys ``'x'`` and ``'y'``.  Defaults to the
        module-level ``ts_dict``.

    Returns
    -------
    pandas.DataFrame
        One row per label of *Dict* (in iteration order) and the columns

        * ``Dist_corr``   -- SciPy's correlation distance, i.e. ``1 - r`` with
          ``r`` the Pearson coefficient (low value = strong linear relation;
          this is not Szekely's "distance correlation"),
        * ``cos_sim``     -- cosine similarity (``1 -`` cosine distance),
        * ``mutual_info`` -- estimated mutual information, rescaled so that
          the largest value in the table is 1,
        * ``spear_corr``  -- Spearman rank correlation,
        * ``pear_corr``   -- Pearson correlation,
        * ``mic``         -- maximal information coefficient (minepy).

        An empty *Dict* yields an empty frame with these columns.
    """
    # Imported here because a bare ``import scipy`` does not guarantee that
    # the ``scipy.spatial`` sub-package has been loaded.
    from scipy.spatial import distance

    columns = ['Dist_corr', 'cos_sim', 'mutual_info',
               'spear_corr', 'pear_corr', 'mic']
    mine = MINE(alpha=0.6, c=15, est='mic_approx')

    labels, rows = [], []
    # Iterate over the mapping that was passed in -- not the global ts_dict --
    # so that callers can score their own collection of series.
    for label, series in Dict.items():
        X = np.asarray(series['x'])
        y = np.asarray(series['y'])

        dist_corr = distance.correlation(X, y)
        cos_sim = 1 - distance.cosine(X, y)
        spearman_corr, _ = stats.spearmanr(X, y)
        pearson_corr, _ = stats.pearsonr(X, y)
        # scikit-learn expects a 2-D feature matrix and a 1-D target.
        mi = mutual_info_regression(X.reshape(-1, 1), y.ravel())
        mine.compute_score(X, y)

        labels.append(label)
        rows.append((dist_corr, cos_sim, mi[0],
                     spearman_corr, pearson_corr, mine.mic()))

    corrs_df = pd.DataFrame(rows, index=labels, columns=columns)

    # Rescale mutual information to [0, 1]; skipped when there is nothing to
    # scale by (empty table or all-zero estimates) to avoid a 0/0 division.
    mi_max = corrs_df['mutual_info'].max()
    if mi_max > 0:
        corrs_df['mutual_info'] = corrs_df['mutual_info'] / mi_max
    return corrs_df
I subtracted the cosine distance from $1$ because it is a distance metric: the larger the distance, the more dissimilar the two variables are; conversely, the smaller the distance, the more strongly related they are.
High correlation = strong relationship; low correlation distance = strong relationship.
In a binary case:
where:
# Score every registered series and show the resulting table as an annotated
# heat map (rows: series, columns: association measures).
corrs_df = compare_corrs(Dict=ts_dict)
plt.figure(figsize=(12, 5))
heat_ax = sns.heatmap(corrs_df, annot=True)
# Move the lower y-limit by +0.5 and the upper one by -0.5.
# NOTE(review): presumably the usual workaround for matplotlib releases that
# clipped the first and last heat-map rows -- confirm it is still needed.
lower, upper = heat_ax.get_ylim()
heat_ax.set_ylim(lower + 0.5, upper - 0.5)
plt.yticks(rotation=0)
plt.show()
# One horizontal bar chart per series, comparing the scores of all measures.
# After the transpose, columns are the series labels and the index holds the
# measure names.
tmp_fts = corrs_df.T

n_cols = 2
# Ceiling division: just enough rows to hold one panel per series.
n_rows = -(-len(tmp_fts.columns) // n_cols)

# A plain figure is created here; the panels are added one by one below
# (a subplots grid with zero rows is not a valid layout).
fig = plt.figure(figsize=(25, 35))
for i, feature in enumerate(tmp_fts.columns, 1):
    plt.subplot(n_rows, n_cols, i)
    # Keyword arguments: recent seaborn releases no longer accept the data
    # vectors positionally.
    sns.barplot(x=tmp_fts[feature], y=tmp_fts[feature].index)
    plt.title(f'{feature}', fontsize=28, fontweight='bold')
    plt.xlabel('')
    plt.tick_params(axis='x', labelsize=25)
    plt.tick_params(axis='y', labelsize=25)
plt.tight_layout()
plt.show()
Linear:
Exponential Linear:
Quadratic:
Sinusoidal:
Circular: