|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from warnings import warn |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def check_missing(data, output_path=None):
    """
    Check the total number & proportion of missing values
    per variable of a pandas DataFrame.

    Parameters
    ----------
    data: pandas DataFrame to inspect
    output_path: optional string prefix of the report file, default None.
        The literal 'missing.csv' is appended to it as-is (no separator is
        inserted), so a directory must end with a path separator,
        e.g. './output/'. When None, nothing is written.

    Returns
    -------
    Pandas dataframe indexed by variable name (converted to str) with the
    columns 'total missing' (count of nulls) and 'proportion' (fraction of nulls)
    """

    # Build the null mask once and reuse it for both statistics.
    is_null = data.isnull()
    result = pd.concat([is_null.sum(), is_null.mean()], axis=1)
    result = result.rename(index=str, columns={0: 'total missing', 1: 'proportion'})
    if output_path is not None:
        csv_path = output_path + 'missing.csv'
        result.to_csv(csv_path)
        # Report the exact file that was written (previously the message
        # contained a stray space between the prefix and the file name).
        print('result saved at', csv_path)
    return result
|
|
|
|
|
|
|
|
def drop_missing(data, axis=0):
    """
    Listwise deletion:
    excluding all cases (listwise) that have missing values

    Parameters
    ----------
    data: pandas DataFrame to clean; it is not modified
    axis: drop cases(0)/columns(1),default 0

    Returns
    -------
    Pandas dataframe with missing cases/columns dropped
    """

    # dropna is not in place by default and returns a new object, so the
    # extra deep copy of the input that used to precede it is unnecessary.
    return data.dropna(axis=axis)
|
|
|
|
|
|
|
|
def add_var_denote_NA(data, NA_col=None):
    """
    creating an additional variable indicating whether the data
    was missing for that observation (1) or not (0).

    Parameters
    ----------
    data: pandas DataFrame; it is not modified
    NA_col: iterable of column labels to process, default None (no columns).
        Labels do not have to be strings; the new column name is built
        from str(label).

    Returns
    -------
    Deep copy of data with one extra column '<label>_is_NA' per listed
    column that contains missing values. Columns without missing values
    only trigger a warning.
    """

    if NA_col is None:
        NA_col = []

    data_copy = data.copy(deep=True)
    for col in NA_col:
        is_null = data_copy[col].isnull()
        if is_null.any():
            # str(col) keeps integer (or other non-string) labels from
            # raising TypeError on concatenation.
            data_copy[str(col) + '_is_NA'] = np.where(is_null, 1, 0)
        else:
            # (col,) so that tuple labels are formatted instead of unpacked
            warn("Column %s has no missing cases" % (col,))

    return data_copy
|
|
|
|
|
|
|
|
def impute_NA_with_arbitrary(data, impute_value, NA_col=None):
    """
    replacing NA with arbitrary values.

    Parameters
    ----------
    data: pandas DataFrame; it is not modified
    impute_value: value used to fill the missing entries
    NA_col: iterable of column labels to process, default None (no columns).
        Labels do not have to be strings; the new column name is built
        from str(label).

    Returns
    -------
    Deep copy of data with one extra column '<label>_<impute_value>' per
    listed column that contains missing values. Columns without missing
    values only trigger a warning.
    """

    if NA_col is None:
        NA_col = []

    data_copy = data.copy(deep=True)
    for col in NA_col:
        if data_copy[col].isnull().any():
            # str(col) keeps non-string labels from raising TypeError
            new_name = str(col) + '_' + str(impute_value)
            data_copy[new_name] = data_copy[col].fillna(impute_value)
        else:
            # (col,) so that tuple labels are formatted instead of unpacked
            warn("Column %s has no missing cases" % (col,))
    return data_copy
|
|
|
|
|
|
|
|
def impute_NA_with_avg(data, strategy='mean', NA_col=None):
    """
    replacing the NA with mean/median/most frequent values of that variable.
    Note it should only be performed over training set and then propagated to test set.

    Parameters
    ----------
    data: pandas DataFrame; it is not modified
    strategy: 'mean', 'median' or 'mode', default 'mean'
    NA_col: iterable of column labels to process, default None (no columns).
        Labels do not have to be strings; the new column name is built
        from str(label).

    Returns
    -------
    Deep copy of data with one extra column '<label>_impute_<strategy>' per
    listed column that contains missing values. Columns without missing
    values only trigger a warning.

    Raises
    ------
    ValueError if strategy is not one of 'mean', 'median', 'mode'
    """

    # An unknown strategy used to fall through every branch and silently
    # return an unchanged copy; fail loudly instead.
    if strategy not in ('mean', 'median', 'mode'):
        raise ValueError(
            "strategy must be 'mean', 'median' or 'mode', got %r" % (strategy,)
        )
    if NA_col is None:
        NA_col = []

    data_copy = data.copy(deep=True)
    for col in NA_col:
        if data_copy[col].isnull().any():
            if strategy == 'mean':
                fill_value = data[col].mean()
            elif strategy == 'median':
                fill_value = data[col].median()
            else:
                # mode() may return several values; the first one is used
                fill_value = data[col].mode()[0]
            # str(col) keeps non-string labels from raising TypeError
            data_copy[str(col) + '_impute_' + strategy] = data_copy[col].fillna(fill_value)
        else:
            # (col,) so that tuple labels are formatted instead of unpacked
            warn("Column %s has no missing" % (col,))
    return data_copy
|
|
|
|
|
|
|
|
def impute_NA_with_end_of_distribution(data, NA_col=None, n_std=3):
    """
    replacing the NA by values that are at the far end of the distribution of that variable
    calculated by mean + n_std*std (mean + 3*std by default)

    Parameters
    ----------
    data: pandas DataFrame; it is not modified
    NA_col: iterable of column labels to process, default None (no columns).
        Labels do not have to be strings; the new column name is built
        from str(label).
    n_std: number of standard deviations added to the mean, default 3

    Returns
    -------
    Deep copy of data with one extra column '<label>_impute_end_of_distri'
    per listed column that contains missing values. Columns without missing
    values only trigger a warning.
    """

    if NA_col is None:
        NA_col = []

    data_copy = data.copy(deep=True)
    for col in NA_col:
        if data_copy[col].isnull().any():
            far_end = data[col].mean() + n_std * data[col].std()
            # str(col) keeps non-string labels from raising TypeError
            data_copy[str(col) + '_impute_end_of_distri'] = data_copy[col].fillna(far_end)
        else:
            # (col,) so that tuple labels are formatted instead of unpacked
            warn("Column %s has no missing" % (col,))
    return data_copy
|
|
|
|
|
|
|
|
def impute_NA_with_random(data, NA_col=None, random_state=0):
    """
    replacing the NA with random sampling from the pool of available observations of the variable

    Parameters
    ----------
    data: pandas DataFrame; it is not modified
    NA_col: iterable of column labels to process, default None (no columns).
        Labels do not have to be strings; the new column name is built
        from str(label).
    random_state: seed passed to pandas' sample, default 0

    Returns
    -------
    Deep copy of data with one extra column '<label>_random' per listed
    column that contains missing values. Columns without missing values,
    or without any observed value to sample from, only trigger a warning.
    """

    if NA_col is None:
        NA_col = []

    data_copy = data.copy(deep=True)
    for col in NA_col:
        missing_mask = data_copy[col].isnull()
        n_missing = int(missing_mask.sum())
        if n_missing > 0:
            observed = data_copy[col].dropna()
            if observed.empty:
                # Nothing to draw from: sampling would raise, so skip.
                warn("Column %s has no observed values to sample from" % (col,))
                continue

            new_name = str(col) + '_random'
            data_copy[new_name] = data_copy[col]

            # Sample without replacement whenever the pool is large enough
            # (same draws as before); fall back to replacement only when
            # there are more gaps than observations, which used to raise.
            random_sample = observed.sample(
                n_missing,
                replace=n_missing > len(observed),
                random_state=random_state,
            )
            # Positional assignment: the k-th draw fills the k-th missing
            # row, and it also works when the index has duplicate labels.
            data_copy.loc[missing_mask, new_name] = random_sample.values
        else:
            # (col,) so that tuple labels are formatted instead of unpacked
            warn("Column %s has no missing" % (col,))
    return data_copy
|
|
|