Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from sklearn.preprocessing import MinMaxScaler | |
| from sklearn.model_selection import train_test_split | |
| import os | |
| from imblearn.over_sampling import SMOTE | |
| import warnings | |
| warnings.filterwarnings("ignore") | |
def load_data(data_dir: str,
              excel_file: str,
              mode: str = "train",
              scale: bool = True,
              smote: bool = True,
              ):
    """Load the raw table, preprocess it and return the split chosen by ``mode``.

    The file is read with ``pandas.read_csv`` (despite the ``excel_file``
    parameter name), missing values are replaced by ``0.0``, the ``ID`` and
    ``REAL_STONE`` columns are renamed to ``patient_id`` and ``target``, and a
    fixed subset of columns is kept. Optionally the data is min-max scaled and
    oversampled with SMOTE, then split with stratification on ``target``:
    70% train, and the remaining 30% split 60/40 into validation and test.

    Args:
        data_dir: Directory containing the data file.
        excel_file: File name of the CSV file inside ``data_dir``.
        mode: ``"train"`` or ``"test"``; selects which split(s) are returned.
        scale: When truthy, min-max scale the scalable columns that are
            present in the selected subset.
        smote: When truthy, oversample with SMOTE before splitting.

    Returns:
        ``(train_data, valid_data)`` when ``mode == "train"``;
        ``test_data`` when ``mode == "test"``.

    Raises:
        ValueError: If ``mode`` is neither ``"train"`` nor ``"test"``.
    """
    # Fail fast: reject an unknown mode before doing any I/O or resampling.
    if mode not in ("train", "test"):
        raise ValueError("Choose mode!")

    print("--------------Load RawData--------------")
    df = pd.read_csv(os.path.join(data_dir, excel_file))

    print("--------------Inclusion--------------")
    print('Total : ', len(df))

    print("--------------fillNA--------------")
    df = df.fillna(0.0)
    print(df['REAL_STONE'].value_counts())

    df = df.rename(columns={'ID': 'patient_id', 'REAL_STONE': 'target'})

    # Selected subset: identifier, 11 feature columns and the target.
    columns = ['patient_id', 'HR', 'BT', 'AGE', 'DUCT_DILIATATION_10MM',
               'Hb', 'PLT', 'WBC', 'ALP', 'ALT', 'AST', 'TOTAL_BILIRUBIN',
               'target']
    # Explicit copy so the column assignments below write to an independent
    # frame instead of relying on chained assignment against ``df``.
    data = df[columns].copy()

    if scale:
        print("--------------Scaling--------------")
        scale_candidates = ['SEX', 'AGE', 'DUCT_DILIATATION_10MM',
                            'DUCT_DILIATATION_8MM', 'Hb', 'PLT', 'WBC', 'ALP',
                            'ALT', 'AST', 'GGT', 'BUN', 'CREATININE']
        present = [col for col in scale_candidates if col in data.columns]
        if present:
            # NOTE(review): the scaler is fitted on the full data set before
            # the split, so held-out rows influence the scaling ranges.
            data[present] = MinMaxScaler().fit_transform(data[present])
        else:
            print("No columns to scale.")

    if smote:  # Apply SMOTE if the flag is set
        print(data['target'].value_counts())
        print("Applying SMOTE...")
        # NOTE(review): oversampling happens before the split and
        # ``patient_id`` is part of the feature matrix handed to SMOTE, so
        # synthetic rows can end up in the validation/test splits -- confirm
        # this is intended.
        sampler = SMOTE(sampling_strategy='all', random_state=42)
        features = data.drop(columns=['target'])
        labels = data['target']
        features_res, labels_res = sampler.fit_resample(features, labels)
        data = pd.DataFrame(features_res, columns=features.columns)
        data['target'] = labels_res
        print(data['target'].value_counts())

    # 70% train; the held-out 30% is split again into validation and test.
    train_data, holdout = train_test_split(
        data, test_size=0.3, stratify=data['target'], random_state=123)
    valid_data, test_data = train_test_split(
        holdout, test_size=0.4, stratify=holdout['target'], random_state=123)

    if mode == 'train':
        print("Train set shape:", train_data.shape)
        print("Validation set shape:", valid_data.shape)
        return train_data, valid_data

    print("Test set shape:", test_data.shape)
    return test_data
def load_data_and_prepare(data_dir, excel_file, mode, scale, smote):
    """Load the split(s) for ``mode`` and return them as one feature frame.

    Calls ``load_data`` with the given arguments, removes the ``patient_id``
    and ``target`` columns from every returned frame and concatenates the
    frames row-wise.

    Args:
        data_dir: Directory containing the data file.
        excel_file: File name of the data file inside ``data_dir``.
        mode: Passed through to ``load_data``.
        scale: Passed through to ``load_data``.
        smote: Passed through to ``load_data``.

    Returns:
        A DataFrame of feature columns only. For ``mode == "train"`` it is
        the train rows followed by the validation rows; for a mode where
        ``load_data`` returns a single frame, it is that frame's features.
    """
    loaded = load_data(data_dir, excel_file, mode, scale, smote)
    # ``load_data`` returns a (train, valid) tuple in train mode but a single
    # DataFrame in test mode; normalise both shapes to a tuple of frames.
    frames = loaded if isinstance(loaded, tuple) else (loaded,)
    # Non-mutating drop: the frames handed back by ``load_data`` are left
    # untouched.
    features = [frame.drop(columns=['patient_id', 'target']) for frame in frames]
    return pd.concat(features, axis=0)