lauren-cw committed on
Commit
1913aa0
·
verified ·
1 Parent(s): 9ebc029

Upload AutoPreprocess.py

Browse files
Files changed (1) hide show
  1. AutoPreprocess.py +147 -0
AutoPreprocess.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import pickle
4
+ from sklearn.base import BaseEstimator, TransformerMixin
5
+ from sklearn.preprocessing import MinMaxScaler
6
+ from sklearn.preprocessing import LabelEncoder
7
+ from sklearn.preprocessing import RobustScaler
8
+
9
class AutoPreprocess(BaseEstimator, TransformerMixin):
    """Automatic per-column preprocessing for tabular data.

    During ``fit`` every selected column is classified as text, boolean or
    numeric, and three things are learned for it:

    * a fill value for missing entries (mode for text/boolean columns,
      median for numeric columns);
    * an optional scaler (``MinMaxScaler`` for integer columns with at most
      10 distinct values, ``RobustScaler`` for other numeric columns; columns
      containing only 0 and 1 are left unscaled);
    * the categories used to one-hot encode text columns.

    ``transform`` applies those steps and returns only the columns listed in
    ``final_field_names``.
    """

    def __init__(self):
        self.scaler = {}              # column name -> fitted scaler
        self.fillna_value = {}        # column name -> value used for missing entries
        self.onehotencode_value = {}  # column name -> categories seen during fit
        self.field_names = []         # input columns handled by this transformer
        self.final_field_names = []   # output columns, in output order
        self.field_dtype = {}         # column name -> dtype observed during fit

    @staticmethod
    def _is_text(series):
        """Return True when the non-missing values of *series* are strings."""
        # Classify on the non-null values only: recent pandas versions report
        # an object column as "string" only when every element is a string,
        # so a single NaN would otherwise push a text column into the numeric
        # branch (and make ``median()`` fail on it).
        return pd.api.types.is_string_dtype(series.dropna())

    @staticmethod
    def _most_frequent(series):
        """Return the most frequent non-missing value, or NaN if there is none."""
        modes = series.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    def fit(self, X, y=None, field_names=None):
        """Learn fill values, scalers and one-hot categories from *X*.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data.
        y : ignored
            Present for scikit-learn API compatibility.
        field_names : list of str, optional
            Columns to process. Defaults to every column of *X*.

        Returns
        -------
        self
        """
        # Reset all learned state so that repeated fits do not accumulate.
        self.__init__()
        if field_names is None:
            self.field_names = X.columns.tolist()
        else:
            self.field_names = list(field_names)

        for fname in self.field_names:
            column = X[fname]
            self.field_dtype[fname] = column.dtype

            is_text = self._is_text(column)
            is_bool = not is_text and pd.api.types.is_bool_dtype(column)
            is_numeric = not (is_text or is_bool)

            # Missing-value imputation: mode for text/boolean, median for numeric.
            if is_numeric:
                self.fillna_value[fname] = column.median()
            else:
                self.fillna_value[fname] = self._most_frequent(column)

            # Scaling: numeric columns only; pure 0/1 columns are left as they are.
            if is_numeric and not column.isin([0, 1]).all():
                if pd.api.types.is_integer_dtype(column) and column.nunique() <= 10:
                    # Small set of integer codes.
                    scaler = MinMaxScaler()
                else:
                    scaler = RobustScaler()
                scaler.fit(X[[fname]])
                self.scaler[fname] = scaler

            # Encoding: text columns expand to one indicator column per category.
            if is_text:
                categories = column.value_counts().index
                self.onehotencode_value[fname] = categories
                for value in categories:
                    # f-string so that non-string categories do not raise TypeError.
                    self.final_field_names.append(f"{fname}_{value}")
            else:
                self.final_field_names.append(fname)

        return self

    def transform(self, X):
        """Apply the fitted preprocessing to *X*.

        Parameters
        ----------
        X : pandas.DataFrame or dict
            A DataFrame, or a single record given as ``{column: value}``.
            Fields missing from a dict record are filled with the value
            learned during ``fit``. The input object is never modified.

        Returns
        -------
        pandas.DataFrame
            The columns listed in ``final_field_names``.
        """
        if isinstance(X, dict):
            # Build a fresh one-row frame instead of mutating the caller's dict;
            # wrapping every value in a list also works when no field is present.
            data = pd.DataFrame(
                {
                    fname: [X.get(fname, self.fillna_value[fname])]
                    for fname in self.field_names
                }
            )
        else:
            # Work on a copy so the caller's DataFrame is left untouched.
            data = X.copy()

        for fname in self.field_names:
            # Missing-value imputation.
            if data[fname].isnull().any():
                data[fname] = data[fname].fillna(self.fillna_value[fname])

            # Scaling.
            if fname in self.scaler:
                data[fname] = self.scaler[fname].transform(data[[fname]])

            # Encoding. The decision recorded during fit is used rather than the
            # dtype seen now, so the output columns always match final_field_names.
            if fname in self.onehotencode_value:
                for value in self.onehotencode_value[fname]:
                    data[f"{fname}_{value}"] = (data[fname] == value).astype('int8')
            elif pd.api.types.is_bool_dtype(data[fname]):
                # Booleans become 0/1.
                data[fname] = data[fname].astype(int)

        return data[self.final_field_names]

    def save(self, file_name):
        """Pickle this fitted transformer to *file_name*."""
        with open(file_name, "wb") as f:
            pickle.dump(self, f)

    @staticmethod
    def load(file_name):
        """Load a transformer previously written by ``save``.

        SECURITY: unpickling can execute arbitrary code. Only load files that
        come from a trusted source.
        """
        with open(file_name, "rb") as f:
            return pickle.load(f)
130
+
131
+
132
+ # import pandas as pd
133
+ # mydata = pd.read_csv('C:/DATA/class/2025-07 AI數據應用人才養成班三期/data/Automobile_Train.csv')
134
+ # ap = AutoPreprocess()
135
+ # # ap.fit(mydata, field_names=['symboling', 'Normalized-losses', 'make', 'Fuel-type', 'aspiration',
136
+ # # 'Num-of-doors', 'Body-style', 'Drive-wheels', 'Engine-location',
137
+ # # 'Wheel-base', 'length', 'width', 'height', 'Curb-weight', 'Engine-type',
138
+ # # 'Num-of-cylinders', 'Engine-size', 'Fuel-system', 'bore', 'stroke',
139
+ # # 'Compression-ratio', 'horsepower', 'Peak-rpm', 'City-mpg',
140
+ # # 'Highway-mpg'])
141
+ # ap.fit(mydata)
142
+
143
+ # # Transform a pandas DataFrame
144
+ # t = ap.transform(mydata)
145
+ # print(t.head())
146
+
147
+