import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
url = 'https://raw.githubusercontent.com/krishnaik06/Feature-Engineering-Live-sessions/master/titanic.csv'
df = pd.read_csv(url)
df.info()
df.head()
df.isnull().sum()
Are the missing values in Age and Cabin MNAR? Yes. The data were collected after the crash, so a passenger who died could not report their Age, and a deceased passenger likewise cannot give any Cabin information. The Age and Cabin variables are therefore MNAR (Missing Not At Random).
import missingno as msno
msno.heatmap(df, cmap='viridis')
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.show();
msno.dendrogram(df);
df[df['Embarked'].isnull()]
The Embarked variable records the port where the passenger boarded. This variable looks MCAR (Missing Completely At Random): in the rows where Embarked is missing, both Age and Cabin are present, so the missingness does not appear to depend on the other values.
import missingno as msno
msno.matrix(df);
# Percentage of missing values
df.isnull().mean()
# Binary transform if cabin is present or not
df['cabin_null'] = np.where(df['Cabin'].isnull(),1,0)
# Percentage of missing Cabin values for survivors and non-survivors
df.groupby(['Survived'])['cabin_null'].mean()
df = df[['Age','Fare','Survived']]
df.head()
# Percentage of missing values
df.isnull().mean()
# Function to impute with fillna, creating a new column named after the method
def impute_nan(df=df, variable='Age', strat=None, method=None):
    df[variable + "_" + method] = df[variable].fillna(strat)

methods = [df.Age.mean(), df.Age.median(), df.Age.mode()[0]]
names = ['mean', 'median', 'mode']
for m, name in zip(methods, names):
    impute_nan(df, 'Age', int(m), method=name)
df.head()
The median is more robust to outliers; if the dataset has outliers, imputing with the mean could skew the analysis.
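To see the effect concretely, here is a tiny hypothetical illustration (the values are made up, not taken from the Titanic data): a single extreme value pulls the mean noticeably while the median barely moves.
ages = pd.Series([22, 25, 28, 30, 34, 36, 38])
ages_out = pd.concat([ages, pd.Series([95])], ignore_index=True)
# Without the outlier: mean ~30.4, median 30.0
print(ages.mean(), ages.median())
# With one outlier (95): mean jumps to ~38.5, median only moves to 32.0
print(ages_out.mean(), ages_out.median())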
age_cols = [x for x in list(df.columns) if 'Age' in x]
for col in age_cols:
    df[col].plot(kind='kde', label=f'{col}, std = {round(df[col].std(), 3)}')
plt.legend()
plt.show()
Advantages
Disadvantages
# Generate a random sample from Age without missing values of the same length as the missing values
sample = df['Age'].dropna().sample(df['Age'].isnull().sum(), random_state=0)
print('# of samples', len(sample))
sample.head()
def impute_random(df, variable):
    df[variable + "_random"] = df[variable]
    # Create the random sample to fill the NaNs
    random_sample = df[variable].dropna().sample(df[variable].isnull().sum(), random_state=0)
    # Pandas needs the same index in order to find and fill the NaN locations
    random_sample.index = df[df[variable].isnull()].index
    # New column without NaNs, filled by the random sample
    df.loc[df[variable].isnull(), variable + '_random'] = random_sample
impute_random(df,"Age")
df.info()
df.head()
age_cols = ['Age', 'Age_mode', 'Age_random']
for col in age_cols:
    df[col].plot(kind='kde', label=f'{col}, std = {round(df[col].std(), 3)}')
plt.legend()
plt.show()
Random sample imputation matches the original distribution the best, with the least distortion of the variance.
Advantages
Disadvantages
It works well if the data are not missing completely at random but show characteristics of Missing Not At Random (MNAR) data. It also works for categorical data.
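As a minimal sketch of the categorical case (assuming a DataFrame, here hypothetically called df_full, that still contains a categorical column such as Embarked; the Titanic frame above was reduced to Age/Fare/Survived):
def impute_random_cat(data, variable):
    # Copy the original column, then fill its NaNs from a random sample of the observed values
    data[variable + '_random'] = data[variable]
    random_sample = data[variable].dropna().sample(data[variable].isnull().sum(), random_state=0)
    # Align the sample's index with the NaN locations so pandas can fill them
    random_sample.index = data[data[variable].isnull()].index
    data.loc[data[variable].isnull(), variable + '_random'] = random_sample

# impute_random_cat(df_full, 'Embarked')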
# Create a binary feature flagging whether Age is missing
df['Age_NAN'] = np.where(df['Age'].isnull(), 1, 0)
df.head()
Advantages
Disadvantages
If there is suspicion that the missing value is not MAR or MCAR then capturing that information is important. In this case, one would want to replace missing data with values that are at the tails of the distribution of the variable.
df.Age.hist(bins=50);
sns.boxplot(x='Age', data=df);
Outliers exist only at the upper extreme of the Age distribution.
# Get extreme values from distribution
extreme = df.Age.mean() + (3 * df.Age.std())
print(f'Extreme = {extreme}')
df.loc[df['Age'] >= extreme]
def impute_extreme(df, variable, extreme):
    df[variable + "_end_dist"] = df[variable].fillna(extreme)

impute_extreme(df, 'Age', extreme)
df.head()
age_cols = ['Age', 'Age_mode', 'Age_random', 'Age_end_dist']
for col in age_cols:
    df[col].plot(kind='kde', label=f'{col}, std = {round(df[col].std(), 3)}')
plt.legend()
plt.show()
plt.figure(figsize=(12,5))
plt.subplot(121)
sns.boxplot(x='Age', data=df)
plt.title('Age')
plt.subplot(122)
sns.boxplot(x='Age_end_dist', data=df)
plt.title('Age_end_dist')
plt.show()
Advantages
Disadvantages
This works best when there are a very small number of missing values.
# create missing values df
data = [ ('D',1,10,6,np.NaN),
('D',2,12,10,12),
('X',1,28,15,np.NaN),
('D',3,np.NaN,4,np.NaN),
('X',2,np.NaN,20,25),
('X',3,32,31,25),
('T',1,220,250,np.NaN),
('X',4,30,22,np.NaN),
('T',2,240,170,np.NaN),
('X',2,38,27,np.NaN),
('T',3,np.NaN,44,np.NaN),
('D',1,20,18,80),
('D',4,200,120,150)]
labels = ['item1', 'month','normal_price','item2','final_price']
df = pd.DataFrame.from_records(data, columns=labels)
df.info()
df
# Remove all rows that contain any NaN values
df.dropna(axis=0)
# Remove all columns that contain any NaN values
df.dropna(axis=1)
Advantages
Disadvantages
Filling in missing data with its next or previous value. This works specifically when we know a missing value is simply unchanged from the neighbouring record. Here we fill the normal_price and final_price columns for each item with the value of its preceding month (or, if that is not available, of its succeeding month).
# Chain ffill and bfill together to fill the remaining NaN values:
df.ffill().bfill()
Advantages
Disadvantages
With this method you impute missing data with the most frequently occurring value. This method would be best suited for categorical data, as missing values have the highest probability of being the most frequently occurring value.
url_train = 'https://raw.githubusercontent.com/liyenhsu/Kaggle-House-Prices/master/data/train.csv'
df = pd.read_csv(url_train, usecols=['BsmtQual','FireplaceQu','GarageType','SalePrice'])
df.info()
df.isnull().sum()
df.isnull().mean().sort_values(ascending=True)
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12,5))
df['BsmtQual'].value_counts().plot.bar(ax=ax[0])
df['GarageType'].value_counts().plot.bar(ax=ax[1])
df['FireplaceQu'].value_counts().plot.bar(ax=ax[2])
ax[0].set_title('BsmtQual')
ax[1].set_title('GarageType')
ax[2].set_title('FireplaceQu')
plt.tight_layout()
plt.show()
def impute_mode_cat(df, variable):
    # Most frequent category among the non-missing values
    most_frequent_category = df[variable].dropna().mode()[0]
    # New column with NaNs replaced by the most frequent category
    df[variable + '_mode'] = df[variable].fillna(most_frequent_category)

for feature in ['BsmtQual', 'FireplaceQu', 'GarageType']:
    impute_mode_cat(df, feature)
df.head()
df.isnull().mean()
Advantages
Disadvantages
Here, the purpose is to flag missing values in the data set by imputing them with a fixed, arbitrary value chosen in advance. It is mostly used for categorical variables, but can also be used for numeric variables with arbitrary values such as $0$, $999$, or similar numbers.
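Before the categorical example below, a minimal numeric sketch (hypothetical values): impute with an arbitrary number far outside the observed range, so the imputed rows stay easy to identify.
s = pd.Series([21, np.nan, 34, 47, np.nan])
# Fill NaNs with the arbitrary value 999
print(s.fillna(999))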
def impute_new_cat(df, variable):
    # Replace NaNs with an explicit "Missing" category
    df[variable + "_newvar"] = np.where(df[variable].isnull(), "Missing", df[variable])

for feature in ['BsmtQual', 'FireplaceQu', 'GarageType']:
    impute_new_cat(df, feature)
df.head()
Advantages
Disadvantages
For this we can use scikit-learn's IterativeImputer. It models each feature with missing values as a function of the other features and uses that estimate for imputation. It does so in an iterated round-robin fashion: at each step, one feature column is designated as the output $y$ and the other feature columns are treated as inputs $X$. A regressor (e.g. BayesianRidge() or ExtraTreesRegressor()) is fit on ($X$, $y$) for the rows where $y$ is known, and is then used to predict the missing values of $y$. This is done for each feature in turn, and the whole cycle is repeated for max_iter imputation rounds.
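As a minimal sketch of the API on toy numeric data (the values here are made up; the full example on this notebook's own DataFrame follows further below):
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
X = [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]
# Default estimator is BayesianRidge; each feature is modelled from the others
imp = IterativeImputer(max_iter=10, random_state=0)
print(imp.fit_transform(X))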
# create missing values df
data = [ (np.NaN, 'A', 1,10,6,np.NaN),
('D', 'D', 2,12,10,12),
('X', np.NaN, 1,28,15,np.NaN),
('D', 'D', 3,np.NaN,4,np.NaN),
('X', 'B', 2,np.NaN,20,25),
(np.NaN, 'B', 3,32,31,25),
('T', 'F', 1,220,250,np.NaN),
('X', np.NaN, 4,30,22,np.NaN),
('T', np.NaN, 2,240,170,np.NaN),
('X', 'C', 2,38,27,np.NaN),
('T', 'C', 3,np.NaN,44,np.NaN),
(np.NaN, 'A', 4,20,18,80),
('D', np.NaN, 4,200,120,150)]
labels = ['item1', 'grade', 'month','normal_price','item2','final_price']
df = pd.DataFrame.from_records(data, columns=labels)
df.info()
df_ = df.copy()
df
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')
encoder = preprocessing.OrdinalEncoder()
cat_cols = ['item1','grade']
# Function to encode non-null data and replace it in the original column
def encode(data):
    # Retain only the non-null values
    nonulls = np.array(data.dropna())
    # Reshape the data for encoding
    impute_reshape = nonulls.reshape(-1, 1)
    # Encode the data
    impute_ordinal = encoder.fit_transform(impute_reshape)
    # Assign the encoded values back to the non-null positions
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data

for column in cat_cols:
    encode(df[column])
print('\nOriginal:', '\n', df_)
print('\nEncoded:', '\n',df)
This method works very well with categorical and non-numerical features. For this we use the Keras scikit-learn wrapper, which lets a deep learning model be used as a scikit-learn classifier or regressor.
Pros:
Cons:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
def reg_model():
    # Create the model
    model = Sequential()
    # Add an input + dense layer with 12 neurons
    model.add(Dense(12, input_dim=df.shape[1] - 1, activation='relu'))
    # Add the output layer
    model.add(Dense(1, activation='relu'))
    # Compile the model
    model.compile(loss=tf.keras.losses.mape, optimizer='adam')
    return model

def classif_model():
    # Create the model
    model = Sequential()
    # Add an input + dense layer with 12 neurons
    model.add(Dense(12, input_dim=df.shape[1] - 1, activation='sigmoid'))
    # Add the output layer
    model.add(Dense(1, activation='sigmoid'))
    # Compile the model
    model.compile(loss=tf.keras.losses.kld, optimizer='adam')
    return model
The $k$-nearest neighbors algorithm is a simple classification algorithm that uses 'feature similarity' to predict the value of a new data point: the new point is assigned a value based on how closely it resembles the points in the training set. This is useful for imputation, since we can find the $k$ closest neighbors of an observation with missing data and impute those values from the non-missing values in the neighborhood.
It creates a basic mean imputation, then uses the resulting complete data to construct a KDTree. That KDTree is used to find the nearest neighbours (NN), and once the $k$ nearest neighbours are found, the missing value is imputed with their weighted average.
Pros:
Cons:
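Note that scikit-learn also provides a direct KNNImputer, which fills each missing value from the nearest complete cases; a minimal sketch on made-up data (this is an alternative to the IterativeImputer-with-KNeighborsClassifier approach used below):
from sklearn.impute import KNNImputer
X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
# Each missing value is replaced by the mean of that feature over the 2 nearest neighbours
knn_imp = KNNImputer(n_neighbors=2)
print(knn_imp.fit_transform(X))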
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.neighbors import KNeighborsClassifier
reg_nn = KerasRegressor(build_fn=reg_model, epochs=20, batch_size=5, verbose=0)
classif_nn = KerasRegressor(build_fn=classif_model, epochs=20, batch_size=5, verbose=0)
reg = BayesianRidge()
knn = KNeighborsClassifier(n_neighbors=2)
z0 = np.round(IterativeImputer(estimator=reg,random_state=0).fit_transform(df.values))
z1 = IterativeImputer(estimator=knn, random_state=0).fit_transform(df.values)
z2 = np.round(IterativeImputer(estimator=reg_nn,random_state=0).fit_transform(df.values))
z3 = np.round(IterativeImputer(estimator=classif_nn,random_state=0).fit_transform(df.values))
df_imp0 = pd.DataFrame(z0, columns=df.columns)
df_imp1 = pd.DataFrame(z1, columns=df.columns)
df_imp2 = pd.DataFrame(z2, columns=df.columns)
df_imp3 = pd.DataFrame(z3, columns=df.columns)
print('\nMissing:')
print('\n',df)
print('\nBayesianRidgeRegressor:')
print('\n',df_imp0)
print('\nKNeighborsClassifier:')
print('\n',df_imp1)
print('\nNNRegressor:')
print('\n',df_imp2)
print('\nNNClassifier:')
print('\n',df_imp3)
Advantages
Disadvantages
This type of imputation works by filling in the missing data multiple times. Multiple Imputations (MIs) are much better than a single imputation because they capture the uncertainty of the missing values better. The chained equations approach is also very flexible and can handle variables of different data types (i.e., continuous or binary) as well as complexities such as bounds or survey skip patterns.
from impyute.imputation.cs import mice
imputed_training = np.round(mice(df.astype(float).values))
print('\nMissing:')
print('\n',df)
print('\nNNRegressor:')
print('\n',df_imp2)
print('\nNNClassifier:')
print('\n',df_imp3)
print('\nMICE:')
print('\n',pd.DataFrame(imputed_training, columns=df.columns))