Feature selection aims instead to rank the importance of the existing features in the dataset and discard the less important ones.
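As a quick, minimal sketch of that idea (not part of the original workflow; the synthetic dataset and the mean-importance threshold below are assumptions for the demo), scikit-learn's SelectFromModel can rank features by a fitted model's importances and discard the ones that fall below a threshold:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
# Synthetic data for the demo: 10 features, only 3 of them informative
X_demo, Y_demo = make_classification(n_samples=500, n_features=10, n_informative=3, random_state=0)
forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_demo, Y_demo)
print(np.argsort(forest.feature_importances_)[::-1])  # features ranked from most to least important
# Keep only the features whose importance is above the mean importance
selector = SelectFromModel(forest, threshold='mean', prefit=True)
X_demo_reduced = selector.transform(X_demo)
print(X_demo.shape, '->', X_demo_reduced.shape)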
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.patches as mpatches
from pylab import rcParams
import seaborn as sns
from matplotlib.pyplot import figure
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from cycler import cycler
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ROCAUC
from yellowbrick.classifier import PrecisionRecallCurve
from yellowbrick.classifier import ClassificationReport
warnings.filterwarnings('ignore')
plt.rcParams['axes.prop_cycle'] = cycler(color='brgy')
Load and summarize data
df = pd.read_csv('mushrooms.csv')
pd.options.display.max_columns = None
df.info()
View data
df.head()
Plot target classes
total = len(df)
plt.figure(figsize=(13,5))
plt.subplot(121)
g = sns.countplot(x='class', data=df)
g.set_title("Mushroom class Count \np: Poisonous | e: Edible", fontsize=14)
g.set_ylabel('Count', fontsize=14)
for p in g.patches:
    height = p.get_height()
    g.text(p.get_x() + p.get_width()/2.,
           height + 5,
           '{:1.2f}%'.format(height/total*100),
           ha="center", fontsize=14, fontweight='bold')
plt.margins(y=0.1)
plt.show()
df['class'].value_counts()
Split features and target
X = df.drop(['class'], axis = 1)
Y = df['class']
One-hot encode categorical variables
X = pd.get_dummies(X, prefix_sep='_')
X.head()
print(f'Original column count = {len(df.columns)}, one-hot encoded column count = {len(X.columns)}')
Label encode target variable
Y = LabelEncoder().fit_transform(Y)
Y
Scale features
X = StandardScaler().fit_transform(X)
Function to train, test and score a RandomForestClassifier
def forest_test(X, Y):
    X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.30, random_state=0)
    # Classifier
    trainedforest = RandomForestClassifier(n_estimators=700).fit(X_Train, Y_Train)
    preds = trainedforest.predict(X_Test)
    f, axes = plt.subplots(1, 3, figsize=(15, 5))
    # LabelEncoder sorts the classes alphabetically, so 'e' (edible) -> 0 and 'p' (poisonous) -> 1
    cm = ConfusionMatrix(
        trainedforest, classes=['Edible', 'Poisonous'], ax=axes[0],
        label_encoder={0: 'Edible', 1: 'Poisonous'})
    cm.fit(X_Train, Y_Train)
    cm.score(X_Test, Y_Test)
    axes[0].set_title('Confusion Matrix')
    axes[0].set_xlabel('Predicted Class')
    axes[0].set_ylabel('True Class')
    roc = ROCAUC(trainedforest, classes=['Edible', 'Poisonous'], ax=axes[1])
    roc.fit(X_Train, Y_Train)
    roc.score(X_Test, Y_Test)
    axes[1].set_title('ROC AUC')
    axes[1].grid(False)
    axes[1].legend()
    prc = PrecisionRecallCurve(trainedforest, ax=axes[2])
    prc.fit(X_Train, Y_Train)
    prc.score(X_Test, Y_Test)
    axes[2].set_title('Precision Recall Curve')
    axes[2].grid(False)
    axes[2].legend()
    plt.tight_layout()
    plt.show()
    print('\n', classification_report(Y_Test, preds))
Functions to test and plot 2D and 3D representations of the features vs the target
def complete_test_2D(X, Y, plot_name=''):
    Small_df = pd.DataFrame(data=X, columns=['C1', 'C2'])
    Small_df = pd.concat([Small_df, df['class']], axis=1)
    Small_df['class'] = LabelEncoder().fit_transform(Small_df['class'])
    forest_test(X, Y)
    plt.figure(figsize=(10, 8))
    classes = [1, 0]
    colors = ['r', 'b']
    for clas, color in zip(classes, colors):
        plt.scatter(Small_df.loc[Small_df['class'] == clas, 'C1'],
                    Small_df.loc[Small_df['class'] == clas, 'C2'],
                    c=color, alpha=0.5)
    plt.xlabel('Component 1', fontsize=12)
    plt.ylabel('Component 2', fontsize=12)
    plt.title(f'{plot_name}', fontsize=15)
    plt.legend(['Poisonous', 'Edible'])
    plt.grid(False)
    plt.show()
def complete_test_3D(X, Y, plot_name=''):
    Small_df = pd.DataFrame(data=X, columns=['C1', 'C2', 'C3'])
    Small_df = pd.concat([Small_df, df['class']], axis=1)
    Small_df['class'] = LabelEncoder().fit_transform(Small_df['class'])
    forest_test(X, Y)
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(1, 1, 1, projection="3d")
    # With cmap='coolwarm', class 0 (edible) plots in blue and class 1 (poisonous) in red
    pnt3d = ax.scatter(Small_df['C1'], Small_df['C2'], Small_df['C3'],
                       c=Small_df['class'], alpha=.5, s=75, cmap='coolwarm',
                       label=list(Small_df.columns))
    one = mpatches.Patch(facecolor='b', label='0', linewidth=0.5, edgecolor='black')
    two = mpatches.Patch(facecolor='r', label='1', linewidth=0.5, edgecolor='black')
    ax.set_title(f'{plot_name}', fontsize=15)
    ax.set(xlabel=f'\n{Small_df.columns[0]}', ylabel=f'\n{Small_df.columns[1]}', zlabel=f'\n{Small_df.columns[2]}')
    ax.legend(handles=[one, two], title="class", fontsize='medium', fancybox=True)
    plt.show()
Initial test
forest_test(X, Y)
Testing first 2 principal components
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
PCA_df = pd.DataFrame(data = X_pca, columns = ['PC1', 'PC2'])
PCA_df = pd.concat([PCA_df, df['class']], axis = 1)
PCA_df['class'] = LabelEncoder().fit_transform(PCA_df['class'])
PCA_df.head()
complete_test_2D(X_pca, Y, 'PCA')
Explained variance of the first 2 principal components
var_ratio = pca.explained_variance_ratio_
cum_var_ratio = np.cumsum(var_ratio)
# create column names
col_num = X_pca.shape[1]
feat_names = ['PC'+str(num) for num in list(range(1,col_num+1,1))]
sns.barplot(y=var_ratio, x=feat_names)
sns.pointplot(y=cum_var_ratio, x=feat_names, color='black', label='cumulative')
plt.grid(False)
plt.title("Explained variance ratio of each principal component", fontsize=14)
plt.ylabel("Explained variance ratio")
plt.legend(['cumulative'])
plt.show()
Testing first 3 principal components
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)
complete_test_3D(X_pca, Y, 'PCA')
var_ratio = pca.explained_variance_ratio_
cum_var_ratio = np.cumsum(var_ratio)
# create column names
col_num = X_pca.shape[1]
feat_names = ['PC'+str(num) for num in list(range(1,col_num+1,1))]
sns.barplot(y=var_ratio, x=feat_names)
sns.pointplot(y=cum_var_ratio, x=feat_names, color='black', label='cumulative')
plt.grid(False)
plt.title("Explained variance ratio of each principal component", fontsize=14)
plt.ylabel("Explained variance ratio")
plt.legend(['cumulative'])
plt.show()
Gradient descent is an optimization algorithm used to minimize some function by iteratively moving in the direction of steepest descent, as defined by the negative of the gradient. If X is the parameter to optimize and y is the loss (or cost) function, gradient descent searches for the X that produces the lowest value of y. t-SNE relies on gradient descent to minimize the divergence between the distribution of the data in the original space and in the low-dimensional embedding.
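As a minimal numeric sketch of this idea (illustrative only, not part of the original notebook), the loop below minimizes y = (x - 3)**2, whose gradient is dy/dx = 2*(x - 3), by repeatedly stepping against the gradient:
# Gradient descent on y = (x - 3)**2: the minimum is at x = 3
x = 0.0                # arbitrary starting point
learning_rate = 0.1    # step size
for step in range(100):
    grad = 2 * (x - 3)         # gradient of the loss at the current x
    x -= learning_rate * grad  # move in the direction of steepest descent
print(x)  # converges to ~3.0, the x that produces the lowest value of y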
Testing first 2 t-SNE components
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
X_tsne = tsne.fit_transform(X)
complete_test_2D(X_tsne, Y, 't-SNE')
Testing first 3 t-SNE components
tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=300)
X_tsne = tsne.fit_transform(X)
complete_test_3D(X_tsne, Y, 't-SNE')
Testing first 3 independent components (ICA)
from sklearn.decomposition import FastICA
ica = FastICA(n_components=3)
X_ica = ica.fit_transform(X)
complete_test_3D(X_ica, Y, 'ICA')
Testing Linear Discriminant Analysis (LDA)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=1)
# run an LDA and use it to transform the features
X_lda = lda.fit(X, Y).transform(X)
forest_test(X_lda, Y)
Testing Locally Linear Embedding (LLE)
from sklearn.manifold import LocallyLinearEmbedding
embedding = LocallyLinearEmbedding(n_components=3)
X_lle = embedding.fit_transform(X[:1500])  # use only the first 1500 rows, presumably to keep LLE tractable
complete_test_3D(X_lle, Y[:1500], 'LLE')
In this example, we will start by building a basic Autoencoder: an encoder that compresses the one-hot encoded features down to a 3-neuron bottleneck, and a decoder that reconstructs the original input from it.
from keras.layers import Input, Dense
from keras.models import Model
input_layer = Input(shape=(X.shape[1],))
encoded = Dense(3, activation='relu')(input_layer)
decoded = Dense(X.shape[1], activation='softmax')(encoded)
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
# The autoencoder is trained to reconstruct its own input, so X serves as both features and target
X1, X2, Y1, Y2 = train_test_split(X, X, test_size=0.3, random_state=101)
autoencoder.fit(X1, Y1,
                epochs=100,
                batch_size=300,
                shuffle=True,
                verbose=0,
                validation_data=(X2, Y2))
# Keep only the encoder half to extract the 3-dimensional compressed representation
encoder = Model(input_layer, encoded)
X_ae = encoder.predict(X)
complete_test_3D(X_ae, Y, 'Autoencoder')