ready2drop committed on
Commit
ffe03d5
·
verified ·
1 Parent(s): 57cae00

data load

Browse files
Files changed (1) hide show
  1. util.py +87 -84
util.py CHANGED
@@ -1,85 +1,88 @@
1
- import pandas as pd
2
- from sklearn.preprocessing import MinMaxScaler
3
- from sklearn.model_selection import train_test_split
4
- import os
5
- from imblearn.over_sampling import SMOTE
6
- import warnings
7
- warnings.filterwarnings("ignore")
8
-
9
-
10
- def load_data(data_dir : str,
11
- excel_file : str,
12
- mode : str = "train",
13
- scale = bool,
14
- smote = bool,
15
- ):
16
-
17
-
18
- print("--------------Load RawData--------------")
19
- df = pd.read_csv(os.path.join(data_dir, excel_file))
20
-
21
- #Inclusion
22
- print("--------------Inclusion--------------")
23
- print('Total : ', len(df))
24
-
25
- print("--------------fillNA--------------")
26
- # data = data.dropna()
27
- df.fillna(0.0,inplace=True)
28
- print(df['REAL_STONE'].value_counts())
29
-
30
- #Column rename
31
- df.rename(columns={'ID': 'patient_id', 'REAL_STONE':'target'}, inplace=True)
32
-
33
- # df_all = ['SEX', 'FIRST_SBP', 'FIRST_DBP', 'FIRST_HR', 'FIRST_RR', 'FIRST_BT',
34
- # 'AGE', 'VISIBLE_STONE_CT', 'PANCREATITIS', 'DUCT_DILIATATION_10MM',
35
- # 'DUCT_DILIATATION_8MM', 'Hb', 'PLT', 'WBC', 'ALP', 'ALT', 'AST', 'CRP',
36
- # 'BILIRUBIN', 'HR_100', 'GGT', 'BUN', 'CREATININE', 'BT_38', 'target']
37
-
38
- # backward (n=13)
39
- columns = ['patient_id','SEX', 'AGE', 'DUCT_DILIATATION_10MM', 'DUCT_DILIATATION_8MM', 'Hb', 'PLT', 'WBC', 'ALP', 'ALT', 'AST', 'GGT', 'BUN', 'CREATININE', 'target']
40
-
41
-
42
- data = df[columns]
43
-
44
- if scale:
45
- print("--------------Scaling--------------")
46
- columns_to_scale = ['SEX', 'AGE', 'DUCT_DILIATATION_10MM', 'DUCT_DILIATATION_8MM', 'Hb', 'PLT', 'WBC', 'ALP', 'ALT', 'AST', 'GGT', 'BUN', 'CREATININE']
47
-
48
- columns_to_scale_existing = [col for col in columns_to_scale if col in data.columns]
49
-
50
- if columns_to_scale_existing:
51
- scaler = MinMaxScaler()
52
- data[columns_to_scale_existing] = scaler.fit_transform(data[columns_to_scale_existing])
53
- else:
54
- print("No columns to scale.")
55
-
56
- if mode == 'train' or mode == 'test':
57
- if smote: # Apply SMOTE if the flag is set
58
- print(data['target'].value_counts())
59
- print("Applying SMOTE...")
60
- smote = SMOTE(sampling_strategy='all', random_state=42)
61
- X_data = data.drop(columns=['target'])
62
- y_data = data['target']
63
- X_data_res, y_data_res = smote.fit_resample(X_data, y_data)
64
- data_resampled = pd.DataFrame(X_data_res, columns=X_data.columns)
65
- data_resampled['target'] = y_data_res
66
- data = data_resampled # Update train_data with resampled data
67
- print(data['target'].value_counts())
68
-
69
- train_data, test_data = train_test_split(data, test_size=0.3, stratify=data['target'], random_state=123)
70
- valid_data, test_data = train_test_split(test_data, test_size=0.4, stratify=test_data['target'], random_state=123)
71
-
72
- if mode == 'train':
73
- print("Train set shape:", train_data.shape)
74
- print("Validation set shape:", valid_data.shape)
75
- return train_data, valid_data
76
-
77
- elif mode == 'test':
78
- print("Test set shape:", test_data.shape)
79
- return test_data
80
-
81
- else:
82
- raise ValueError("Choose mode!")
83
-
84
-
 
 
 
85
 
 
1
+ import pandas as pd
2
+ from sklearn.preprocessing import MinMaxScaler
3
+ from sklearn.model_selection import train_test_split
4
+ import os
5
+ from imblearn.over_sampling import SMOTE
6
+ import warnings
7
+ warnings.filterwarnings("ignore")
8
+
9
+
10
def load_data(data_dir: str,
              excel_file: str,
              mode: str = "train",
              scale: bool = True,
              smote: bool = True,
              ):
    """Load the CSV dataset, preprocess it and return the split for ``mode``.

    Pipeline: read the CSV, fill missing values with 0.0, rename
    ``ID`` -> ``patient_id`` and ``REAL_STONE`` -> ``target``, keep the
    selected feature subset, optionally min-max scale the features,
    optionally oversample with SMOTE, then make a stratified split into
    70% train and a 30% hold-out, which is split again 60/40 into
    validation and test.

    Args:
        data_dir: Directory containing the data file.
        excel_file: File name inside ``data_dir``. Despite the name, it is
            read with ``pd.read_csv``.
        mode: ``"train"`` or ``"test"``; selects which split is returned.
        scale: Min-max scale every selected feature column when truthy.
            The default was previously the ``bool`` type object (always
            truthy), so ``True`` keeps the effective default behaviour.
        smote: Oversample with SMOTE before splitting when truthy. Same
            note on the default as for ``scale``.

    Returns:
        ``(train_data, valid_data)`` when ``mode == "train"``;
        ``test_data`` when ``mode == "test"``.

    Raises:
        ValueError: If ``mode`` is neither ``"train"`` nor ``"test"``.
    """
    # Fail fast on a bad mode instead of after loading/resampling/splitting.
    if mode not in ("train", "test"):
        raise ValueError("Choose mode!")

    print("--------------Load RawData--------------")
    df = pd.read_csv(os.path.join(data_dir, excel_file))

    print("--------------Inclusion--------------")
    print('Total : ', len(df))

    print("--------------fillNA--------------")
    df = df.fillna(0.0)
    print(df['REAL_STONE'].value_counts())

    df = df.rename(columns={'ID': 'patient_id', 'REAL_STONE': 'target'})

    # Feature subset labelled "Forward (n=13)" by the author -- presumably
    # the result of forward feature selection; confirm against the
    # selection notebook/script.
    feature_columns = ['FIRST_HR', 'FIRST_RR', 'FIRST_BT', 'AGE',
                       'PANCREATITIS', 'DUCT_DILIATATION_10MM', 'WBC',
                       'ALP', 'ALT', 'AST', 'CRP', 'BILIRUBIN', 'GGT']

    # Explicit copy: the frame is modified below, and assigning into a
    # slice of ``df`` is a chained assignment.
    data = df[['patient_id', *feature_columns, 'target']].copy()

    if scale:
        print("--------------Scaling--------------")
        # Scale exactly the selected features. The previous hard-coded list
        # still named the old feature subset, so most of the current
        # features were silently left unscaled.
        # NOTE(review): the scaler is fitted on the full dataset before the
        # split, so validation/test statistics leak into training.
        data[feature_columns] = MinMaxScaler().fit_transform(data[feature_columns])

    if smote:
        print(data['target'].value_counts())
        print("Applying SMOTE...")
        # Named ``sampler`` so the ``smote`` flag is not shadowed.
        sampler = SMOTE(sampling_strategy='all', random_state=42)
        # NOTE(review): ``patient_id`` is passed to SMOTE together with the
        # features, and resampling happens before the split, so synthetic
        # rows can end up in the validation/test sets. Kept as-is so the
        # returned data is unchanged; consider resampling the train split only.
        features = data.drop(columns=['target'])
        features_res, target_res = sampler.fit_resample(features, data['target'])
        data = pd.DataFrame(features_res, columns=features.columns)
        data['target'] = target_res
        print(data['target'].value_counts())

    # 70% train / 30% hold-out, then hold-out -> 60% validation / 40% test.
    train_data, holdout = train_test_split(
        data, test_size=0.3, stratify=data['target'], random_state=123)
    valid_data, test_data = train_test_split(
        holdout, test_size=0.4, stratify=holdout['target'], random_state=123)

    if mode == 'train':
        print("Train set shape:", train_data.shape)
        print("Validation set shape:", valid_data.shape)
        return train_data, valid_data

    print("Test set shape:", test_data.shape)
    return test_data
86
+
87
+
88