In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
# The 'seaborn-*' style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6
# and the old names were deprecated, so pick whichever name this installation
# ships and fall back to the default style rather than failing the first cell.
_colorblind_style = next(
    (
        style_name
        for style_name in ('seaborn-v0_8-colorblind', 'seaborn-colorblind')
        if style_name in plt.style.available
    ),
    'default',
)
plt.style.use(_colorblind_style)
%matplotlib inline
from feature_cleaning import missing_data as ms


## Load dataset

In [2]:
# Columns to read from the Titanic CSV; every other column in the file is skipped.
use_cols = ['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Survived']

# Read only the selected columns from the CSV.
data = pd.read_csv(
    './data/titanic.csv',
    usecols=use_cols,
)

# Show the (rows, columns) shape, then preview the first 8 rows.
print(data.shape)
data.head(8)

(891, 6)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
0,0,3,male,22.0,1,7.25
1,1,1,female,38.0,1,71.2833
2,1,3,female,26.0,0,7.925
3,1,1,female,35.0,1,53.1
4,0,3,male,35.0,0,8.05
5,0,3,male,,0,8.4583
6,0,1,male,54.0,0,51.8625
7,0,3,male,2.0,3,21.075


## Missing value checking
Check the total number and proportion of missing values
for each variable of a pandas DataFrame.

In [3]:
# Per the output below, Age is the only variable with missing values:
# 177 rows, a proportion of about 0.199.
# The result table is also saved as missing.csv under output_path (if given).

# Create the output directory first so the cell also runs on a fresh checkout
# (Restart Kernel -> Run All) where ./output/ does not exist yet.
os.makedirs('./output/', exist_ok=True)
ms.check_missing(data=data,output_path=r'./output/')

result saved at ./output/ missing.csv


Unnamed: 0,total missing,proportion
Survived,0,0.0
Pclass,0,0.0
Sex,0,0.0
Age,177,0.198653
SibSp,0,0.0
Fare,0,0.0


## Listwise deletion  
excluding all cases (listwise) that have missing values

In [4]:
# Listwise deletion: the 177 rows that contain an NA are dropped, so the
# shape shown below goes from 891 rows to 714 (the 6 columns are unchanged).
data2 = ms.drop_missing(data=data)
data2.shape

(714, 6)

## Add a variable to denote NA
creating an additional variable indicating whether the data was missing for that observation

In [5]:
# Adds an indicator column Age_is_NA: 0 = Age present, 1 = Age missing for that row.
# The Age column itself is left as-is (row 5 still shows a missing Age in the output).
data3 = ms.add_var_denote_NA(data=data,NA_col=['Age'])
# Count how many rows fall in each indicator class (714 present / 177 missing).
print(data3.Age_is_NA.value_counts())
data3.head(8)

0    714
1    177
Name: Age_is_NA, dtype: int64


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_is_NA
0,0,3,male,22.0,1,7.25,0
1,1,1,female,38.0,1,71.2833,0
2,1,3,female,26.0,0,7.925,0
3,1,1,female,35.0,1,53.1,0
4,0,3,male,35.0,0,8.05,0
5,0,3,male,,0,8.4583,1
6,0,1,male,54.0,0,51.8625,0
7,0,3,male,2.0,3,21.075,0


## Arbitrary Value Imputation
Replacing the NA by arbitrary values

In [6]:
# Fill missing Age values with the sentinel -999.
# Per the output below, the filled values go into a new column Age_-999,
# while the original Age column keeps its missing value (see row 5).
data4 = ms.impute_NA_with_arbitrary(data=data,impute_value=-999,NA_col=['Age'])
data4.head(8)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_-999
0,0,3,male,22.0,1,7.25,22.0
1,1,1,female,38.0,1,71.2833,38.0
2,1,3,female,26.0,0,7.925,26.0
3,1,1,female,35.0,1,53.1,35.0
4,0,3,male,35.0,0,8.05,35.0
5,0,3,male,,0,8.4583,-999.0
6,0,1,male,54.0,0,51.8625,54.0
7,0,3,male,2.0,3,21.075,2.0


## Mean/Median/Mode Imputation
Replacing the NA by mean/median/mode of that variable

In [7]:
# Print the median of the observed ages (28.0) for reference.
print(data.Age.median())
# Fill missing Age values using the 'median' strategy. The output shows the result
# in a new column Age_impute_median (row 5 becomes 28.0); Age itself is unchanged.
data5 = ms.impute_NA_with_avg(data=data,strategy='median',NA_col=['Age'])
data5.head(8)

28.0


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_impute_median
0,0,3,male,22.0,1,7.25,22.0
1,1,1,female,38.0,1,71.2833,38.0
2,1,3,female,26.0,0,7.925,26.0
3,1,1,female,35.0,1,53.1,35.0
4,0,3,male,35.0,0,8.05,35.0
5,0,3,male,,0,8.4583,28.0
6,0,1,male,54.0,0,51.8625,54.0
7,0,3,male,2.0,3,21.075,2.0


##  End of distribution Imputation
Replacing the NA with a value at the far end of the distribution of that variable,
calculated as $\text{mean} + 3 \times \text{std}$ of the observed values.

In [8]:
# Fill missing Age values with a single far-end-of-distribution value.
# The output shows the result in a new column Age_impute_end_of_distri
# (row 5 becomes 73.27861); the original Age column is unchanged.
data6 = ms.impute_NA_with_end_of_distribution(data=data,NA_col=['Age'])
data6.head(8)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_impute_end_of_distri
0,0,3,male,22.0,1,7.25,22.0
1,1,1,female,38.0,1,71.2833,38.0
2,1,3,female,26.0,0,7.925,26.0
3,1,1,female,35.0,1,53.1,35.0
4,0,3,male,35.0,0,8.05,35.0
5,0,3,male,,0,8.4583,73.27861
6,0,1,male,54.0,0,51.8625,54.0
7,0,3,male,2.0,3,21.075,2.0


##  Random Imputation
replacing the NA with random sampling from the pool of available observations of the variable


In [9]:
# Fill missing Age values by random imputation; the output shows the result in a
# new column Age_random, with the original Age column unchanged.
# NOTE(review): no seed is passed here. Confirm whether impute_NA_with_random
# accepts or defaults a random state, otherwise the imputed values (e.g. 28.0 in
# row 5 below) may differ on every re-run.
data7 = ms.impute_NA_with_random(data=data,NA_col=['Age'])
data7.head(8)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare,Age_random
0,0,3,male,22.0,1,7.25,22.0
1,1,1,female,38.0,1,71.2833,38.0
2,1,3,female,26.0,0,7.925,26.0
3,1,1,female,35.0,1,53.1,35.0
4,0,3,male,35.0,0,8.05,35.0
5,0,3,male,,0,8.4583,28.0
6,0,1,male,54.0,0,51.8625,54.0
7,0,3,male,2.0,3,21.075,2.0
