File size: 1,811 Bytes
0ab7b0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import pylab
# from warnings import warn

# 2018.11.26 Created by Eamon.Zhang
def diagnostic_plots(df, variable):
    """
    Show a histogram and a normal Q-Q plot of one column, side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to inspect.
    variable : column label
        Name of the column in ``df`` to plot.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Drop missing values once so both panels describe the same sample.
    # Series.hist already ignores NaN, but stats.probplot does not: NaNs
    # would be counted when computing the theoretical quantiles and would
    # turn the fitted regression line into NaN.
    values = df[variable].dropna()

    # Explicit axes instead of relying on the implicit "current subplot".
    _, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(15, 6))
    values.hist(ax=hist_ax)
    stats.probplot(values, dist="norm", plot=qq_ax)

    plt.show()
    
    
def log_transform(data, cols=None):
    """
    Logarithmic transformation.

    For every column listed in ``cols`` a new column named ``<col>_log``
    holding ``log(1 + x)`` is added to a deep copy of ``data``, and the
    diagnostic plots of that new column are shown.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied, not modified.
    cols : iterable of column labels, optional
        Columns to transform. ``None`` (default) means no columns.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``data`` with the additional ``*_log`` columns.
    """
    data_copy = data.copy(deep=True)
    if cols is None:
        cols = []
    for col in cols:
        # Convert the label before concatenating so that non-string
        # labels (e.g. integer column names) do not raise TypeError.
        label = str(col)
        new_col = label + '_log'
        # log1p(x) == log(1 + x) but stays accurate for x close to zero.
        data_copy[new_col] = np.log1p(data_copy[col])
        print('Variable ' + label + ' Q-Q plot')
        diagnostic_plots(data_copy, new_col)
    return data_copy


def reciprocal_transform(data, cols=None):
    """
    Reciprocal transformation.

    For every column listed in ``cols`` a new column named
    ``<col>_reciprocal`` holding ``1 / x`` is added to a deep copy of
    ``data``, and the diagnostic plots of that new column are shown.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied, not modified.
    cols : iterable of column labels, optional
        Columns to transform. ``None`` (default) means no columns.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``data`` with the additional ``*_reciprocal`` columns.
    """
    # NOTE(review): zeros in a source column give non-finite reciprocals,
    # which the plotting step may not accept -- confirm inputs are non-zero.
    data_copy = data.copy(deep=True)
    if cols is None:
        cols = []
    for col in cols:
        # Convert the label before concatenating so that non-string
        # labels (e.g. integer column names) do not raise TypeError.
        label = str(col)
        new_col = label + '_reciprocal'
        data_copy[new_col] = 1 / data_copy[col]
        print('Variable ' + label + ' Q-Q plot')
        diagnostic_plots(data_copy, new_col)
    return data_copy


def square_root_transform(data, cols=None):
    """
    Square root transformation.

    For every column listed in ``cols`` a new column named
    ``<col>_square_root`` holding ``x ** 0.5`` is added to a deep copy of
    ``data``, and the diagnostic plots of that new column are shown.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied, not modified.
    cols : iterable of column labels, optional
        Columns to transform. ``None`` (default) means no columns.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``data`` with the additional ``*_square_root`` columns.
    """
    data_copy = data.copy(deep=True)
    if cols is None:
        cols = []
    for col in cols:
        # Convert the label before concatenating so that non-string
        # labels (e.g. integer column names) do not raise TypeError.
        label = str(col)
        new_col = label + '_square_root'
        data_copy[new_col] = data_copy[col] ** 0.5
        print('Variable ' + label + ' Q-Q plot')
        diagnostic_plots(data_copy, new_col)
    return data_copy


def exp_transform(data, coef, cols=None):
    """
    Exponent (power) transformation.

    Despite the name, this raises each value to the power ``coef``
    (``x ** coef``); it does not compute ``e ** x``. For every column
    listed in ``cols`` a new column named ``<col>_exp`` is added to a deep
    copy of ``data``, and the diagnostic plots of that new column are shown.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied, not modified.
    coef : number
        Exponent applied to every value of the selected columns.
    cols : iterable of column labels, optional
        Columns to transform. ``None`` (default) means no columns.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``data`` with the additional ``*_exp`` columns.
    """
    data_copy = data.copy(deep=True)
    if cols is None:
        cols = []
    for col in cols:
        # Convert the label before concatenating so that non-string
        # labels (e.g. integer column names) do not raise TypeError.
        label = str(col)
        new_col = label + '_exp'
        data_copy[new_col] = data_copy[col] ** coef
        print('Variable ' + label + ' Q-Q plot')
        diagnostic_plots(data_copy, new_col)
    return data_copy