File size: 3,708 Bytes
ffe03d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee31440
ffe03d5
 
 
 
 
cbf321e
ffe03d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de11d67
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import os
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")


def load_data(data_dir: str,
              excel_file: str,
              mode: str = "train",
              scale: bool = True,
              smote: bool = True,
              ):
    """Load the stone-detection CSV, clean it, and return data splits.

    Parameters
    ----------
    data_dir : directory containing the data file.
    excel_file : name of the data file (read with ``pd.read_csv`` despite
        the parameter name — the file is expected to be CSV).
    mode : ``"train"`` returns ``(train_df, valid_df)``;
        ``"test"`` returns ``test_df``.
    scale : min-max scale the numeric feature columns that are present.
        Bug fix: the original signature was ``scale = bool`` — the *type*
        ``bool`` as default value, which is always truthy. ``True``
        preserves the effective behavior with a correct declaration.
    smote : apply SMOTE oversampling to balance the target classes
        (same ``= bool`` typo fixed the same way).

    Raises
    ------
    ValueError
        If ``mode`` is neither ``"train"`` nor ``"test"``.
    """
    print("--------------Load RawData--------------")
    df = pd.read_csv(os.path.join(data_dir, excel_file))

    # Inclusion
    print("--------------Inclusion--------------")
    print('Total : ', len(df))

    print("--------------fillNA--------------")
    # Missing values are imputed with 0.0 rather than dropped.
    df.fillna(0.0, inplace=True)
    print(df['REAL_STONE'].value_counts())

    # Column rename
    df.rename(columns={'ID': 'patient_id', 'REAL_STONE': 'target'}, inplace=True)

    # Forward-selected feature subset (n=13)
    columns = ['patient_id', 'HR', 'BT', 'AGE', 'DUCT_DILIATATION_10MM',
               'Hb', 'PLT', 'WBC', 'ALP', 'ALT', 'AST', 'TOTAL_BILIRUBIN',
               'target']

    # .copy() so the scaling assignment below writes to an independent
    # frame instead of a view of df (avoids SettingWithCopyWarning).
    data = df[columns].copy()

    if scale:
        print("--------------Scaling--------------")
        columns_to_scale = ['SEX', 'AGE', 'DUCT_DILIATATION_10MM',
                            'DUCT_DILIATATION_8MM', 'Hb', 'PLT', 'WBC',
                            'ALP', 'ALT', 'AST', 'GGT', 'BUN', 'CREATININE']

        # Only scale the candidates that survived the feature selection.
        columns_to_scale_existing = [col for col in columns_to_scale if col in data.columns]

        if columns_to_scale_existing:
            scaler = MinMaxScaler()
            data[columns_to_scale_existing] = scaler.fit_transform(data[columns_to_scale_existing])
        else:
            print("No columns to scale.")

    if mode == 'train' or mode == 'test':
        if smote:  # Apply SMOTE if the flag is set
            print(data['target'].value_counts())
            print("Applying SMOTE...")
            # NOTE(review): SMOTE runs *before* the train/test split, so
            # synthetic samples can leak into validation/test sets — kept
            # as-is to preserve existing behavior; confirm it is intended.
            # Renamed to `sampler` so it no longer shadows the `smote` flag.
            sampler = SMOTE(sampling_strategy='all', random_state=42)
            X_data = data.drop(columns=['target'])
            y_data = data['target']
            X_data_res, y_data_res = sampler.fit_resample(X_data, y_data)
            data_resampled = pd.DataFrame(X_data_res, columns=X_data.columns)
            data_resampled['target'] = y_data_res
            data = data_resampled  # replace with the resampled data
            print(data['target'].value_counts())

        # Stratified 70/18/12 split: 30% held out, then 60/40 of the
        # holdout into validation/test. Fixed seeds keep splits reproducible.
        train_data, test_data = train_test_split(data, test_size=0.3, stratify=data['target'], random_state=123)
        valid_data, test_data = train_test_split(test_data, test_size=0.4, stratify=test_data['target'], random_state=123)

        if mode == 'train':
            print("Train set shape:", train_data.shape)
            print("Validation set shape:", valid_data.shape)
            return train_data, valid_data

        elif mode == 'test':
            print("Test set shape:", test_data.shape)
            return test_data

    else:
        raise ValueError("Choose mode!")
    
    
def load_data_and_prepare(data_dir, excel_file, mode, scale, smote):
    """Load train and validation splits and return their combined features.

    Drops the ``patient_id`` and ``target`` columns and stacks the two
    splits row-wise into a single feature-only DataFrame.

    NOTE(review): this only works with ``mode="train"`` — for
    ``mode="test"`` ``load_data`` returns a single DataFrame and the
    tuple unpacking below would raise; confirm callers never pass "test".
    """
    # Load train and validation data
    train_df, val_df = load_data(data_dir, excel_file, mode, scale, smote)

    # Non-inplace drop: the original used inplace=True, which mutated the
    # frames returned by load_data (caller-visible side effect) and could
    # trigger SettingWithCopyWarning on pandas slices.
    train_df = train_df.drop(columns=['patient_id', 'target'])
    val_df = val_df.drop(columns=['patient_id', 'target'])

    train = pd.concat([train_df, val_df], axis=0)

    return train