lauren-cw committed on
Commit
9ebc029
·
verified ·
1 Parent(s): 0659b60

Delete AutoPreprocess.py

Browse files
Files changed (1) hide show
  1. AutoPreprocess.py +0 -147
AutoPreprocess.py DELETED
@@ -1,147 +0,0 @@
1
- import pandas as pd
2
- import numpy as np
3
- import pickle
4
- from sklearn.base import BaseEstimator, TransformerMixin
5
- from sklearn.preprocessing import MinMaxScaler
6
- from sklearn.preprocessing import LabelEncoder
7
- from sklearn.preprocessing import RobustScaler
8
-
9
class AutoPreprocess(BaseEstimator, TransformerMixin):
    """Learn and apply per-column imputation, scaling and one-hot encoding.

    ``fit`` inspects each selected column of a DataFrame and records:

    * a fill value for missing data (mode for string/bool columns, median
      for everything else),
    * an optional scaler for numeric columns (none for 0/1-only columns,
      ``MinMaxScaler`` for integer columns with at most 10 distinct values,
      ``RobustScaler`` otherwise),
    * the category values of string columns, used for one-hot encoding.

    ``transform`` replays those decisions on a DataFrame or on a single
    record given as a ``dict`` and returns only the final feature columns.
    """

    def __init__(self):
        # fname -> fitted scaler (only for numeric columns that need scaling)
        self.scaler = {}
        # fname -> value used to replace missing entries
        self.fillna_value = {}
        # fname -> Index of category values seen during fit (string columns)
        self.onehotencode_value = {}
        # input columns handled by this preprocessor, in order
        self.field_names = []
        # output columns produced by transform(), in order
        self.final_field_names = []
        # fname -> dtype of the column as seen during fit
        self.field_dtype = {}

    @staticmethod
    def _most_frequent(column):
        """Return the first mode of *column*, or NaN when it has no mode."""
        modes = column.mode()
        # mode() is empty for an all-missing column; indexing it would raise.
        return modes.iloc[0] if len(modes) else np.nan

    def fit(self, X, y=None, field_names=None):
        """Learn fill values, scalers and one-hot categories from *X*.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data.
        y : ignored
            Accepted and unused.
        field_names : sequence of str, optional
            Columns to process. Defaults to every column of *X*.

        Returns
        -------
        self
        """
        # Drop any state left over from a previous fit.
        self.__init__()
        if field_names is None:
            self.field_names = X.columns.tolist()
        else:
            self.field_names = list(field_names)

        for fname in self.field_names:
            column = X[fname]
            # Record the dtype per field (keyed by name, not overwritten).
            self.field_dtype[fname] = column.dtype

            is_string = pd.api.types.is_string_dtype(column)
            is_bool = pd.api.types.is_bool_dtype(column)

            # Imputation: mode for string/bool columns, median for numbers.
            if is_string or is_bool:
                self.fillna_value[fname] = self._most_frequent(column)
            else:
                self.fillna_value[fname] = column.median()

            # Encoding: a string column expands into one output per category.
            if is_string:
                categories = column.value_counts().index
                self.onehotencode_value[fname] = categories
                # f-string so non-str category values do not raise TypeError.
                self.final_field_names.extend(
                    f"{fname}_{value}" for value in categories
                )
                continue

            # Bool and numeric columns keep their own name in the output.
            self.final_field_names.append(fname)
            if is_bool:
                continue  # booleans are not scaled

            # Scaling (numeric columns only).
            if column.isin([0, 1]).all():
                continue  # values are only 0 and 1: leave as-is
            if pd.api.types.is_integer_dtype(column) and column.nunique() <= 10:
                # Integer column with at most 10 distinct values.
                scaler = MinMaxScaler()
            else:
                scaler = RobustScaler()
            scaler.fit(X[[fname]])
            self.scaler[fname] = scaler

        return self

    def transform(self, X):
        """Apply the fitted preprocessing to *X*.

        Parameters
        ----------
        X : pandas.DataFrame or dict
            A DataFrame containing the fitted fields, or a single record as
            ``{field_name: value}``. Fields absent from a dict record are
            filled with the value learned during ``fit``. Neither input is
            modified.

        Returns
        -------
        pandas.DataFrame
            The columns listed in ``final_field_names``, in that order.
        """
        if isinstance(X, dict):
            # Build a one-row frame from a fresh dict so the caller's dict is
            # untouched; every value is wrapped in a list so the frame can be
            # built even when no fitted field is present in the record.
            record = {
                fname: [X[fname]] if fname in X else [self.fillna_value[fname]]
                for fname in self.field_names
            }
            data = pd.DataFrame(record)
        else:
            # Work on a copy so the caller's DataFrame is not modified.
            data = X.copy()

        for fname in self.field_names:
            # Imputation.
            if data[fname].isnull().any():
                data[fname] = data[fname].fillna(self.fillna_value[fname])

            # Scaling.
            if fname in self.scaler:
                scaled = self.scaler[fname].transform(data[[fname]])
                # The scaler yields a single-column 2-D result; flatten it.
                data[fname] = np.asarray(scaled).ravel()

            # Encoding: decided by what fit() learned for this field, so the
            # expected one-hot columns always exist in the output.
            if fname in self.onehotencode_value:
                for value in self.onehotencode_value[fname]:
                    data[f"{fname}_{value}"] = (data[fname] == value).astype("int8")
            elif pd.api.types.is_bool_dtype(data[fname]):
                # Booleans become 0/1.
                data[fname] = data[fname].astype(int)

        return data[self.final_field_names]

    def save(self, file_name):
        """Pickle this preprocessor to *file_name*."""
        with open(file_name, "wb") as f:
            pickle.dump(self, f)

    @staticmethod
    def load(file_name):
        """Return the object unpickled from *file_name*.

        SECURITY: unpickling can execute arbitrary code. Only load files that
        come from a trusted source.
        """
        with open(file_name, "rb") as f:
            return pickle.load(f)
130
-
131
-
132
- # import pandas as pd
133
- # mydata = pd.read_csv('C:/DATA/class/2025-07 AI數據應用人才養成班三期/data/Automobile_Train.csv')
134
- # ap = AutoPreprocess()
135
- # # ap.fit(mydata, field_names=['symboling', 'Normalized-losses', 'make', 'Fuel-type', 'aspiration',
136
- # # 'Num-of-doors', 'Body-style', 'Drive-wheels', 'Engine-location',
137
- # # 'Wheel-base', 'length', 'width', 'height', 'Curb-weight', 'Engine-type',
138
- # # 'Num-of-cylinders', 'Engine-size', 'Fuel-system', 'bore', 'stroke',
139
- # # 'Compression-ratio', 'horsepower', 'Peak-rpm', 'City-mpg',
140
- # # 'Highway-mpg'])
141
- # ap.fit(mydata)
142
-
143
- # # 轉換 pandas dataframe
144
- # t = ap.transform(mydata)
145
- # print(t.head())
146
-
147
-