File size: 3,892 Bytes
0ab7b0c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import pandas as pd
import numpy as np
from warnings import warn
# 2018.11.07 Created by Eamon.Zhang
def check_missing(data,output_path=None):
    """
    Summarise the missing values of every variable in a pandas Dataframe.

    Parameters
    ----------
    data: pandas Dataframe to inspect
    output_path: optional string prefix; when given, the summary is also
        written to output_path + 'missing.csv'

    Returns
    -------
    Pandas dataframe indexed by variable name (cast to str) with the
    columns 'total missing' (count) and 'proportion' (fraction of rows)
    """
    # Compute the null mask once and derive both statistics from it.
    null_mask = data.isnull()
    summary = pd.concat(
        [null_mask.sum(), null_mask.mean()],
        axis=1,
        keys=['total missing', 'proportion'],
    )
    summary = summary.rename(index=str)

    if output_path is None:
        return summary

    summary.to_csv(output_path+'missing.csv')
    print('result saved at', output_path, 'missing.csv')
    return summary
def drop_missing(data,axis=0):
    """
    Listwise deletion: remove every case (or variable) that contains
    at least one missing value. The input is left untouched.

    Parameters
    ----------
    data: pandas dataframe to clean
    axis: drop cases(0)/columns(1),default 0

    Returns
    -------
    Pandas dataframe with missing cases/columns dropped
    """
    # Work on a deep copy so the caller's frame is never modified.
    return data.copy(deep=True).dropna(axis=axis, inplace=False)
def add_var_denote_NA(data, NA_col=None):
    """
    Create an additional indicator variable for each requested column,
    flagging whether the value was missing for that observation (1)
    or not (0).

    Parameters
    ----------
    data: pandas dataframe; it is not modified
    NA_col: iterable of column labels to flag, default None (no columns)

    Returns
    -------
    Deep copy of the dataframe with one extra column '<label>_is_NA' per
    requested column that contains missing values. Columns without any
    missing value only trigger a warning.
    """
    data_copy = data.copy(deep=True)
    for i in ([] if NA_col is None else NA_col):
        na_mask = data_copy[i].isnull()
        if na_mask.sum() > 0:
            # str(i) keeps non-string column labels (e.g. integers) working.
            data_copy[str(i) + '_is_NA'] = np.where(na_mask, 1, 0)
        else:
            warn("Column %s has no missing cases" % (i,))
    return data_copy
def impute_NA_with_arbitrary(data, impute_value, NA_col=None):
    """
    Replace NA with an arbitrary value.

    Parameters
    ----------
    data: pandas dataframe; it is not modified
    impute_value: value used to fill the missing entries
    NA_col: iterable of column labels to impute, default None (no columns)

    Returns
    -------
    Deep copy of the dataframe with one extra column
    '<label>_<impute_value>' per requested column that contains missing
    values. Columns without any missing value only trigger a warning.
    """
    data_copy = data.copy(deep=True)
    for i in ([] if NA_col is None else NA_col):
        if data_copy[i].isnull().sum() > 0:
            # str(i) keeps non-string column labels (e.g. integers) working.
            new_col = str(i) + '_' + str(impute_value)
            data_copy[new_col] = data_copy[i].fillna(impute_value)
        else:
            warn("Column %s has no missing cases" % (i,))
    return data_copy
def impute_NA_with_avg(data, strategy='mean', NA_col=None):
    """
    Replace the NA with the mean/median/most frequent value of that variable.
    Note it should only be performed over training set and then propagated
    to test set.

    Parameters
    ----------
    data: pandas dataframe; it is not modified
    strategy: 'mean', 'median' or 'mode', default 'mean'
    NA_col: iterable of column labels to impute, default None (no columns)

    Returns
    -------
    Deep copy of the dataframe with one extra column
    '<label>_impute_<strategy>' per requested column that contains missing
    values. Columns without any missing value only trigger a warning.

    Raises
    ------
    ValueError if strategy is not one of the supported names.
    """
    # Reject unknown strategies up front; otherwise the function would
    # silently return the data without imputing anything.
    if strategy not in ('mean', 'median', 'mode'):
        raise ValueError(
            "strategy must be 'mean', 'median' or 'mode', got %r" % (strategy,)
        )

    data_copy = data.copy(deep=True)
    for i in ([] if NA_col is None else NA_col):
        if data_copy[i].isnull().sum() > 0:
            if strategy == 'mean':
                fill_value = data[i].mean()
            elif strategy == 'median':
                fill_value = data[i].median()
            else:
                # mode() may return several values; take the first one.
                fill_value = data[i].mode()[0]
            # str(i) keeps non-string column labels (e.g. integers) working.
            new_col = str(i) + '_impute_' + strategy
            data_copy[new_col] = data_copy[i].fillna(fill_value)
        else:
            warn("Column %s has no missing" % (i,))
    return data_copy
def impute_NA_with_end_of_distribution(data, NA_col=None, n_std=3):
    """
    Replace the NA by a value at the far end of the distribution of that
    variable, calculated by mean + n_std*std.

    Parameters
    ----------
    data: pandas dataframe; it is not modified
    NA_col: iterable of column labels to impute, default None (no columns)
    n_std: number of standard deviations added to the mean, default 3

    Returns
    -------
    Deep copy of the dataframe with one extra column
    '<label>_impute_end_of_distri' per requested column that contains
    missing values. Columns without any missing value only trigger a warning.
    """
    data_copy = data.copy(deep=True)
    for i in ([] if NA_col is None else NA_col):
        if data_copy[i].isnull().sum() > 0:
            fill_value = data[i].mean() + n_std * data[i].std()
            # str(i) keeps non-string column labels (e.g. integers) working.
            new_col = str(i) + '_impute_end_of_distri'
            data_copy[new_col] = data_copy[i].fillna(fill_value)
        else:
            warn("Column %s has no missing" % (i,))
    return data_copy
def impute_NA_with_random(data, NA_col=None, random_state=0):
    """
    Replace the NA with values randomly sampled from the pool of available
    observations of the variable.

    Parameters
    ----------
    data: pandas dataframe; it is not modified
    NA_col: iterable of column labels to impute, default None (no columns)
    random_state: seed passed to the pandas sampler, default 0

    Returns
    -------
    Deep copy of the dataframe with one extra column '<label>_random' per
    requested column that contains missing values. Columns without any
    missing value only trigger a warning.

    Raises
    ------
    ValueError if a requested column has missing values but no observed
    value to sample from.
    """
    data_copy = data.copy(deep=True)
    for i in ([] if NA_col is None else NA_col):
        na_mask = data_copy[i].isnull()
        n_missing = int(na_mask.sum())
        if n_missing == 0:
            warn("Column %s has no missing" % (i,))
            continue

        observed = data_copy[i].dropna()
        if observed.empty:
            raise ValueError(
                "Column %s has no observed values to sample from" % (i,)
            )

        # str(i) keeps non-string column labels (e.g. integers) working.
        new_col = str(i) + '_random'
        data_copy[new_col] = data_copy[i]

        # Sample without replacement when enough observations exist and
        # fall back to replacement only when the missing values outnumber
        # the observed ones.
        random_sample = observed.sample(
            n_missing,
            replace=n_missing > len(observed),
            random_state=random_state,
        )
        # Assign positionally so duplicate index labels cannot break the
        # alignment between the sample and the missing rows.
        data_copy.loc[na_mask, new_col] = random_sample.values
    return data_copy
|