In [1]:
import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
import os
# plt.style.use('seaborn-colorblind')
# %matplotlib inline
from feature_cleaning import outlier as ot

## Load dataset

In [2]:
# Load only the six columns this walkthrough uses.
use_cols = ['Pclass', 'Sex', 'Age', 'Fare', 'SibSp', 'Survived']

data = pd.read_csv('./data/titanic.csv', usecols=use_cols)
# NOTE: this preview is not the cell's last expression, so the notebook
# does not render it; only the printed shape below appears as output.
data.head(3)
print(data.shape)

(891, 6)


In [3]:
# Distinct fares in ascending order, to eyeball both extremes of the range.
(
    pd.Series(data['Fare'].unique())
    .sort_values()
)

104      0.0000
163      4.0125
245      5.0000
152      6.2375
240      6.4375
237      6.4500
110      6.4958
85       6.7500
171      6.8583
238      6.9500
78       6.9750
185      7.0458
79       7.0500
218      7.0542
123      7.1250
76       7.1417
18       7.2250
32       7.2292
0        7.2500
91       7.3125
196      7.4958
186      7.5208
120      7.5500
192      7.6292
55       7.6500
182      7.7250
166      7.7292
93       7.7333
229      7.7375
227      7.7417
         ...   
46      80.0000
177     81.8583
30      82.1708
147     83.1583
47      83.4750
126     86.5000
180     89.1042
117     90.0000
136     91.0792
198     93.5000
200    106.4250
144    108.9000
143    110.8833
114    113.2750
168    120.0000
155    133.6500
151    134.5000
130    135.6333
27     146.5208
139    151.5500
129    153.4625
150    164.8667
224    211.3375
162    211.5000
199    221.7792
164    227.5250
75     247.5208
148    262.3750
23     263.0000
127    512.3292
Length: 248, dtype: float64

## Detect by arbitrary boundary
Identify outliers using arbitrary, user-chosen upper and lower boundaries (fences).

In [4]:
# Hand-picked fences: fares above 300 or below 5 are flagged as outliers.
index, para = ot.outlier_detect_arbitrary(
    data=data,
    col='Fare',
    upper_fence=300,
    lower_fence=5,
)
# para carries the bounds that were applied: upper first, then lower.
print('Upper bound:', para[0], '\nLower bound:', para[1])

Num of outlier detected: 19
Proportion of outlier detected 0.02132435465768799
Upper bound: 300 
Lower bound: 5


In [5]:
# Inspect the 19 flagged fares, smallest first.
(
    data['Fare']
    .loc[index]
    .sort_values()
)

179      0.0000
806      0.0000
732      0.0000
674      0.0000
633      0.0000
597      0.0000
815      0.0000
466      0.0000
481      0.0000
302      0.0000
277      0.0000
271      0.0000
263      0.0000
413      0.0000
822      0.0000
378      4.0125
679    512.3292
737    512.3292
258    512.3292
Name: Fare, dtype: float64

## IQR method
Outlier detection by the interquartile range (IQR) rule.

In [6]:
# IQR-based detection on Fare.
# NOTE(review): threshold=5 is presumably the IQR multiplier used to place
# the fences -- confirm against feature_cleaning.outlier.
index, para = ot.outlier_detect_IQR(
    data=data,
    col='Fare',
    threshold=5,
)
print('Upper bound:', para[0], '\nLower bound:', para[1])

Num of outlier detected: 31
Proportion of outlier detected 0.03479236812570146
Upper bound: 146.448 
Lower bound: -107.53760000000001


In [7]:
# Inspect the 31 fares flagged by the IQR rule, smallest first.
(
    data['Fare']
    .loc[index]
    .sort_values()
)

31     146.5208
195    146.5208
305    151.5500
708    151.5500
297    151.5500
498    151.5500
609    153.4625
332    153.4625
268    153.4625
318    164.8667
856    164.8667
730    211.3375
779    211.3375
689    211.3375
377    211.5000
527    221.7792
700    227.5250
716    227.5250
557    227.5250
380    227.5250
299    247.5208
118    247.5208
311    262.3750
742    262.3750
341    263.0000
88     263.0000
438    263.0000
27     263.0000
679    512.3292
258    512.3292
737    512.3292
Name: Fare, dtype: float64

## Mean and Standard Deviation Method
Outlier detection by the mean and standard deviation method.

In [8]:
# Mean / standard-deviation detection on Fare.
# NOTE(review): threshold=3 is presumably the number of standard deviations
# from the mean that defines each bound -- confirm against feature_cleaning.outlier.
index, para = ot.outlier_detect_mean_std(
    data=data,
    col='Fare',
    threshold=3,
)
print('Upper bound:', para[0], '\nLower bound:', para[1])

Num of outlier detected: 20
Proportion of outlier detected 0.02244668911335578
Upper bound: 181.2844937601173 
Lower bound: -116.87607782296811


In [9]:
# Inspect the 20 fares flagged by the mean/std rule, smallest first.
(
    data['Fare']
    .loc[index]
    .sort_values()
)

779    211.3375
730    211.3375
689    211.3375
377    211.5000
527    221.7792
716    227.5250
700    227.5250
380    227.5250
557    227.5250
118    247.5208
299    247.5208
311    262.3750
742    262.3750
27     263.0000
341    263.0000
88     263.0000
438    263.0000
258    512.3292
737    512.3292
679    512.3292
Name: Fare, dtype: float64

## MAD method
Outlier detection by the median and median absolute deviation (MAD) method.

In [10]:
# Too aggressive for this dataset: roughly 18% of the rows get flagged.
# Only a single return value (the outlier index) is captured from this call.
index = ot.outlier_detect_MAD(
    data=data,
    col='Fare',
    threshold=3.5,
)

Num of outlier detected: 160
Proportion of outlier detected 0.17957351290684623


## Imputation with an arbitrary value
Replace outliers with an arbitrary, user-chosen value.

In [11]:
# Any of the detection methods above would work here; the arbitrary fences
# are reused so that `index` again holds the 19 fares outside [5, 300].
index, para = ot.outlier_detect_arbitrary(
    data=data,
    col='Fare',
    upper_fence=300,
    lower_fence=5,
)
print('Upper bound:', para[0], '\nLower bound:', para[1])

Num of outlier detected: 19
Proportion of outlier detected 0.02132435465768799
Upper bound: 300 
Lower bound: 5


In [12]:
# Rows at positions 255-274 of the untouched frame, for before/after comparison.
data.iloc[255:275]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
255,1,3,female,29.0,0,15.2458
256,1,1,female,,0,79.2
257,1,1,female,30.0,0,86.5
258,1,1,female,35.0,0,512.3292
259,1,2,female,50.0,0,26.0
260,0,3,male,,0,7.75
261,1,3,male,3.0,4,31.3875
262,0,1,male,52.0,1,79.65
263,0,1,male,40.0,0,0.0
264,0,3,female,,0,7.75


In [13]:
# Overwrite the flagged fares with the sentinel -999; rows 258, 263 and 271
# are among the outliers, so they show the replacement.
data2 = ot.impute_outlier_with_arbitrary(
    data=data,
    outlier_index=index,
    value=-999,
    col=['Fare'],
)
data2.iloc[255:275]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
255,1,3,female,29.0,0,15.2458
256,1,1,female,,0,79.2
257,1,1,female,30.0,0,86.5
258,1,1,female,35.0,0,-999.0
259,1,2,female,50.0,0,26.0
260,0,3,male,,0,7.75
261,1,3,male,3.0,4,31.3875
262,0,1,male,52.0,1,79.65
263,0,1,male,40.0,0,-999.0
264,0,3,female,,0,7.75


## Winsorization
Top-coding and bottom-coding: cap the maximum of a distribution at an arbitrarily set value, and likewise cap the minimum.


In [14]:
# Any of the detection methods above would work here; the arbitrary fences
# (upper 300, lower 5) are reused and their bounds kept in `para`.
index, para = ot.outlier_detect_arbitrary(
    data=data,
    col='Fare',
    upper_fence=300,
    lower_fence=5,
)
print('Upper bound:', para[0], '\nLower bound:', para[1])

Num of outlier detected: 19
Proportion of outlier detected 0.02132435465768799
Upper bound: 300 
Lower bound: 5


In [15]:
# Cap Fare at both bounds held in `para`; rows 258, 263 and 271 are among
# the outliers, so they now carry the top- or bottom-coded value.
data3 = ot.windsorization(
    data=data,
    col='Fare',
    para=para,
    strategy='both',
)
data3.iloc[255:275]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
255,1,3,female,29.0,0,15.2458
256,1,1,female,,0,79.2
257,1,1,female,30.0,0,86.5
258,1,1,female,35.0,0,300.0
259,1,2,female,50.0,0,26.0
260,0,3,male,,0,7.75
261,1,3,male,3.0,4,31.3875
262,0,1,male,52.0,1,79.65
263,0,1,male,40.0,0,5.0
264,0,3,female,,0,7.75


## Discard outliers
Drop the cases that are outliers

In [16]:
# Any of the detection methods above would work here; the arbitrary fences
# (upper 300, lower 5) are reused to build the index of rows to drop.
index, para = ot.outlier_detect_arbitrary(
    data=data,
    col='Fare',
    upper_fence=300,
    lower_fence=5,
)
print('Upper bound:', para[0], '\nLower bound:', para[1])

Num of outlier detected: 19
Proportion of outlier detected 0.02132435465768799
Upper bound: 300 
Lower bound: 5


In [17]:
# Drop the flagged rows, then print the remaining maximum and minimum fare
# to confirm that nothing above 300 or below 5 is left.
data4 = ot.drop_outlier(data=data, outlier_index=index)
print(data4['Fare'].max(), data4['Fare'].min(), sep='\n')

263.0
5.0


## Mean/Median/Mode Imputation
Replace each outlier with the mean, median, or most frequent value of that variable.

In [18]:
# Any of the detection methods above would work here; the arbitrary fences
# (upper 300, lower 5) are reused to build the index of rows to impute.
index, para = ot.outlier_detect_arbitrary(
    data=data,
    col='Fare',
    upper_fence=300,
    lower_fence=5,
)
print('Upper bound:', para[0], '\nLower bound:', para[1])

Num of outlier detected: 19
Proportion of outlier detected 0.02132435465768799
Upper bound: 300 
Lower bound: 5


In [19]:
# Replace the flagged fares using the 'mean' strategy; rows 258, 263 and 271
# are among the outliers, so they show the imputed value.
data5 = ot.impute_outlier_with_avg(
    data=data,
    col='Fare',
    outlier_index=index,
    strategy='mean',
)
data5.iloc[255:275]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
255,1,3,female,29.0,0,15.2458
256,1,1,female,,0,79.2
257,1,1,female,30.0,0,86.5
258,1,1,female,35.0,0,32.204208
259,1,2,female,50.0,0,26.0
260,0,3,male,,0,7.75
261,1,3,male,3.0,4,31.3875
262,0,1,male,52.0,1,79.65
263,0,1,male,40.0,0,32.204208
264,0,3,female,,0,7.75
