feature-engineering-guide / feature_cleaning /missing_data.py

Upload 52 files

0ab7b0c verified over 1 year ago

3.89 kB

	import pandas as pd
	import numpy as np
	from warnings import warn

	# 2018.11.07 Created by Eamon.Zhang


	def check_missing(data,output_path=None):
	    """
	    Summarise the missing values of every column of a pandas DataFrame.

	    Parameters
	    ----------
	    data: pandas DataFrame to inspect
	    output_path: optional string prefix, default None. When given, the
	        summary is also written to output_path + 'missing.csv'. The prefix
	        is used verbatim, so a directory must end with a path separator.

	    Returns
	    -------
	    Pandas dataframe indexed by column label (converted to str) with the
	    columns 'total missing' (number of nulls) and 'proportion' (share of nulls)
	    """

	    # build the null mask once; both statistics are derived from it
	    null_mask = data.isnull()
	    result = pd.concat(
	        [null_mask.sum(), null_mask.mean()],
	        axis=1,
	        keys=['total missing', 'proportion'],
	    )
	    result = result.rename(index=str)
	    if output_path is not None:
	        target = output_path + 'missing.csv'
	        result.to_csv(target)
	        # report the exact file that was written
	        print('result saved at', target)
	    return result


	def drop_missing(data,axis=0):
	    """
	    Listwise deletion:
	    excluding all cases (listwise) that have missing values

	    Parameters
	    ----------
	    data: pandas DataFrame to clean; it is not modified
	    axis: drop cases(0)/columns(1),default 0

	    Returns
	    -------
	    Pandas dataframe with missing cases/columns dropped
	    """

	    # dropna(inplace=False) already returns a new object and leaves `data`
	    # untouched, so a defensive deep copy beforehand only wastes memory
	    return data.dropna(axis=axis, inplace=False)


	def add_var_denote_NA(data,NA_col=None):
	    """
	    creating an additional variable indicating whether the data
	    was missing for that observation (1) or not (0).

	    Parameters
	    ----------
	    data: pandas DataFrame; it is not modified
	    NA_col: iterable of column labels to flag, default None (no columns)

	    Returns
	    -------
	    Deep copy of data with one extra column '<label>_is_NA' for every
	    listed column that contains missing values. Columns without missing
	    values only trigger a warning.
	    """

	    data_copy = data.copy(deep=True)
	    # None stands in for "no columns" so the signature has no mutable default
	    for col in (NA_col if NA_col is not None else []):
	        is_missing = data_copy[col].isnull()
	        if is_missing.any():
	            # str() keeps the suffix working for non-string column labels
	            data_copy[str(col)+'_is_NA'] = np.where(is_missing,1,0)
	        else:
	            warn("Column %s has no missing cases" % str(col))

	    return data_copy


	def impute_NA_with_arbitrary(data,impute_value,NA_col=None):
	    """
	    replacing NA with arbitrary values.

	    Parameters
	    ----------
	    data: pandas DataFrame; it is not modified
	    impute_value: value written in place of every missing entry
	    NA_col: iterable of column labels to impute, default None (no columns)

	    Returns
	    -------
	    Deep copy of data with one extra column '<label>_<impute_value>' for
	    every listed column that contains missing values. Columns without
	    missing values only trigger a warning.
	    """

	    data_copy = data.copy(deep=True)
	    # None stands in for "no columns" so the signature has no mutable default
	    for col in (NA_col if NA_col is not None else []):
	        if data_copy[col].isnull().any():
	            # str() keeps the suffix working for non-string column labels
	            new_col = str(col)+'_'+str(impute_value)
	            data_copy[new_col] = data_copy[col].fillna(impute_value)
	        else:
	            warn("Column %s has no missing cases" % str(col))
	    return data_copy


	def impute_NA_with_avg(data,strategy='mean',NA_col=None):
	    """
	    replacing the NA with mean/median/most frequent values of that variable.
	    Note it should only be performed over training set and then propagated to test set.

	    Parameters
	    ----------
	    data: pandas DataFrame; it is not modified
	    strategy: 'mean', 'median' or 'mode', default 'mean'
	    NA_col: iterable of column labels to impute, default None (no columns)

	    Returns
	    -------
	    Deep copy of data with one extra column '<label>_impute_<strategy>' for
	    every listed column that contains missing values. Columns without
	    missing values only trigger a warning.

	    Raises
	    ------
	    ValueError if strategy is not one of the supported names
	    """

	    fill_value_of = {
	        'mean': lambda column: column.mean(),
	        'median': lambda column: column.median(),
	        # mode() returns a Series (ties possible); take its first entry
	        'mode': lambda column: column.mode()[0],
	    }
	    # an unknown strategy used to fall through silently and leave the NAs
	    # in place; fail loudly so a typo cannot pass unnoticed
	    if strategy not in fill_value_of:
	        raise ValueError(
	            "strategy must be one of 'mean', 'median' or 'mode', got %r" % (strategy,)
	        )

	    data_copy = data.copy(deep=True)
	    # None stands in for "no columns" so the signature has no mutable default
	    for col in (NA_col if NA_col is not None else []):
	        column = data_copy[col]
	        if column.isnull().any():
	            # str() keeps the suffix working for non-string column labels
	            new_col = str(col)+'_impute_'+strategy
	            data_copy[new_col] = column.fillna(fill_value_of[strategy](column))
	        else:
	            warn("Column %s has no missing" % str(col))
	    return data_copy


	def impute_NA_with_end_of_distribution(data,NA_col=None,n_std=3):
	    """
	    replacing the NA by values that are at the far end of the distribution of that variable
	    calculated by mean + n_std*std

	    Parameters
	    ----------
	    data: pandas DataFrame; it is not modified
	    NA_col: iterable of column labels to impute, default None (no columns)
	    n_std: number of standard deviations added to the mean, default 3

	    Returns
	    -------
	    Deep copy of data with one extra column '<label>_impute_end_of_distri'
	    for every listed column that contains missing values. Columns without
	    missing values only trigger a warning.
	    """

	    data_copy = data.copy(deep=True)
	    # None stands in for "no columns" so the signature has no mutable default
	    for col in (NA_col if NA_col is not None else []):
	        column = data_copy[col]
	        if column.isnull().any():
	            far_end = column.mean()+n_std*column.std()
	            # str() keeps the suffix working for non-string column labels
	            data_copy[str(col)+'_impute_end_of_distri'] = column.fillna(far_end)
	        else:
	            warn("Column %s has no missing" % str(col))
	    return data_copy


	def impute_NA_with_random(data,NA_col=None,random_state=0):
	    """
	    replacing the NA with random sampling from the pool of available observations of the variable

	    Parameters
	    ----------
	    data: pandas DataFrame; it is not modified
	    NA_col: iterable of column labels to impute, default None (no columns)
	    random_state: seed passed to the sampling call, default 0

	    Returns
	    -------
	    Deep copy of data with one extra column '<label>_random' for every
	    listed column that contains missing values. Columns without missing
	    values only trigger a warning.

	    Raises
	    ------
	    ValueError if a listed column has missing values but no observed
	    value to sample from
	    """

	    data_copy = data.copy(deep=True)
	    # None stands in for "no columns" so the signature has no mutable default
	    for col in (NA_col if NA_col is not None else []):
	        is_missing = data_copy[col].isnull()
	        n_missing = int(is_missing.sum())
	        if n_missing == 0:
	            warn("Column %s has no missing" % str(col))
	            continue

	        observed = data_copy[col].dropna()
	        if observed.empty:
	            raise ValueError(
	                "Column %s has no observed values to sample from" % str(col)
	            )

	        # sample without replacement while the pool is large enough, which
	        # keeps results for a given random_state as before; fall back to
	        # replacement only when there are more gaps than observed values
	        random_sample = observed.sample(
	            n_missing,
	            replace=n_missing > len(observed),
	            random_state=random_state,
	        )

	        # str() keeps the suffix working for non-string column labels
	        new_col = str(col)+'_random'
	        data_copy[new_col] = data_copy[col]
	        # fill the gaps by position (k-th sampled value -> k-th missing row)
	        # so duplicate index labels cannot break label alignment
	        data_copy.loc[is_missing.values, new_col] = random_sample.values
	    return data_copy