import pandas as pd
import numpy as np
# from warnings import warn

# 2018.11.07 Created by Eamon.Zhang


def outlier_detect_arbitrary(data, col, upper_fence, lower_fence):
    """
    Identify outliers based on arbitrary boundaries passed to the function.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to inspect. It is not modified.
    col : column label
        Column whose values are compared against the fences.
    upper_fence : scalar
        Values strictly greater than this are flagged.
    lower_fence : scalar
        Values strictly less than this are flagged.

    Returns
    -------
    tuple
        ``(outlier_index, para)`` where ``outlier_index`` is a boolean Series
        aligned with ``data`` (True marks an outlier) and ``para`` is
        ``(upper_fence, lower_fence)``.
    """
    para = (upper_fence, lower_fence)

    # NOTE(review): the recovered text breaks off right after the first
    # comparison (``data[col]>upper_fence,data[col]``). The lower-fence test,
    # the two report lines and the return value are reconstructed from the
    # parameters, from the report lines that survive elsewhere in this module
    # and from how ``windsorization`` indexes ``para`` -- confirm against
    # version control.
    values = data[col]
    outlier_index = (values > upper_fence) | (values < lower_fence)

    # Summing the boolean mask counts the True entries and, unlike
    # ``value_counts()[1]``, also works when nothing is flagged.
    n_outliers = int(outlier_index.sum())
    n_rows = len(outlier_index)
    print('Num of outlier detected:', n_outliers)
    print('Proportion of outlier detected',
          n_outliers / n_rows if n_rows else 0.0)
    return outlier_index, para


# NOTE(review): in the text this module was recovered from, everything between
# the ``pd.concat([...`` call of ``outlier_detect_arbitrary`` and the
# "outlier treatment" section below is damaged: whatever sat between a ``<``
# and the following ``>`` appears to have been stripped. The fragments point
# to further detection helpers (fences named ``Upper_fence`` and a score
# compared against ``threshold``), but their names, signatures and bodies are
# not recoverable here and must be restored from version control.
# Surviving fragments, verbatim:
#
#   Upper_fence,data[col]Upper_fence,data[col] threshold
#   print('Num of outlier detected:',outlier_index.value_counts()[1])
#   print('Proportion of outlier detected',outlier_index.value_counts()[1]/len(outlier_index))
#   return outlier_index


# 2018.11.10 outlier treatment
def impute_outlier_with_arbitrary(data, outlier_index, value, col=None):
    """
    Impute outliers with arbitrary value.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame. It is deep-copied and left untouched.
    outlier_index : boolean mask / row indexer
        Rows to overwrite, as accepted by ``DataFrame.loc``.
    value : scalar
        Replacement written into the selected rows.
    col : iterable of column labels, a single string label, or None
        Columns to overwrite. ``None`` (the default) or an empty iterable
        leaves the copy unchanged.

    Returns
    -------
    pandas.DataFrame
        The modified deep copy of ``data``.
    """
    # ``None`` replaces the former mutable ``[]`` default; a bare string is
    # treated as one column label instead of being iterated per character.
    if col is None:
        col = []
    elif isinstance(col, str):
        col = [col]

    data_copy = data.copy(deep=True)
    for i in col:
        data_copy.loc[outlier_index, i] = value
    return data_copy


def windsorization(data, col, para, strategy='both'):
    """
    Top-coding & bottom coding (capping the maximum of a distribution at an
    arbitrarily set value, vice versa).

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame. It is deep-copied and left untouched.
    col : column label
        Column to cap.
    para : sequence
        ``para[0]`` is the upper cap and ``para[1]`` the lower cap.
    strategy : str
        ``'both'`` caps both tails, ``'top'`` only values above ``para[0]``,
        ``'bottom'`` only values below ``para[1]``. Any other value returns
        the copy unchanged.

    Returns
    -------
    pandas.DataFrame
        The capped deep copy of ``data``.
    """
    # NOTE(review): the recovered text lost every ``<`` comparison, the name
    # of the middle branch and the end of the function. The lower-tail
    # assignments, the ``'top'`` label and the return are reconstructed from
    # the docstring and the surviving ``'both'`` / ``'bottom'`` branches --
    # confirm against version control.
    data_copy = data.copy(deep=True)
    if strategy in ('both', 'top'):
        data_copy.loc[data_copy[col] > para[0], col] = para[0]
    if strategy in ('both', 'bottom'):
        data_copy.loc[data_copy[col] < para[1], col] = para[1]
    return data_copy


# NOTE(review): the recovered text ends inside ``windsorization``; any
# definitions that followed it are not visible here and are not reproduced.