import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import os
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")
def load_data(data_dir: str,
              excel_file: str,
              mode: str = "train",
              scale: bool = True,
              smote: bool = True,
              ):
    """Load the stone-detection CSV, clean it, and return stratified splits.

    Parameters
    ----------
    data_dir : str
        Directory containing the data file.
    excel_file : str
        File name inside ``data_dir`` (read with ``pd.read_csv`` despite the name).
    mode : str
        ``"train"`` returns ``(train_data, valid_data)``;
        ``"test"`` returns ``test_data`` only.
    scale : bool
        Min-max scale the numeric feature columns when True.
        (Original code used ``scale = bool`` — the type object, always truthy —
        so True preserves the effective default.)
    smote : bool
        Apply SMOTE oversampling to balance the target when True.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``"train"`` nor ``"test"``.
    """
    print("--------------Load RawData--------------")
    df = pd.read_csv(os.path.join(data_dir, excel_file))
    # Inclusion
    print("--------------Inclusion--------------")
    print('Total : ', len(df))
    print("--------------fillNA--------------")
    # Rows are kept: missing values are imputed with 0.0 rather than dropped.
    df.fillna(0.0, inplace=True)
    print(df['REAL_STONE'].value_counts())
    # Column rename: uniform id/label naming for downstream code.
    df.rename(columns={'ID': 'patient_id', 'REAL_STONE': 'target'}, inplace=True)
    # Forward-selected feature set (n=13 incl. patient_id and target).
    columns = ['patient_id', 'HR', 'BT', 'AGE', 'DUCT_DILIATATION_10MM', 'Hb', 'PLT', 'WBC', 'ALP', 'ALT', 'AST', 'TOTAL_BILIRUBIN', 'target']
    # .copy() so the scaling assignment below writes to an owned frame
    # instead of a slice of df (avoids SettingWithCopyWarning).
    data = df[columns].copy()
    if scale:
        print("--------------Scaling--------------")
        # Superset of candidate columns; only those present in `data` are scaled.
        columns_to_scale = ['SEX', 'AGE', 'DUCT_DILIATATION_10MM', 'DUCT_DILIATATION_8MM', 'Hb', 'PLT', 'WBC', 'ALP', 'ALT', 'AST', 'GGT', 'BUN', 'CREATININE']
        columns_to_scale_existing = [col for col in columns_to_scale if col in data.columns]
        if columns_to_scale_existing:
            scaler = MinMaxScaler()
            data[columns_to_scale_existing] = scaler.fit_transform(data[columns_to_scale_existing])
        else:
            print("No columns to scale.")
    if mode == 'train' or mode == 'test':
        if smote:  # Apply SMOTE if the flag is set
            # NOTE(review): SMOTE runs BEFORE the train/valid/test split, so
            # synthetic samples leak across splits. Kept as-is to preserve
            # behavior; consider resampling the training split only.
            print(data['target'].value_counts())
            print("Applying SMOTE...")
            # Named `sampler` so the `smote` flag parameter is not shadowed.
            sampler = SMOTE(sampling_strategy='all', random_state=42)
            X_data = data.drop(columns=['target'])
            y_data = data['target']
            X_data_res, y_data_res = sampler.fit_resample(X_data, y_data)
            data_resampled = pd.DataFrame(X_data_res, columns=X_data.columns)
            data_resampled['target'] = y_data_res
            data = data_resampled  # continue with the balanced frame
            print(data['target'].value_counts())
        # Stratified 70 / 18 / 12 split (30% held out, then 60/40 valid/test).
        train_data, test_data = train_test_split(data, test_size=0.3, stratify=data['target'], random_state=123)
        valid_data, test_data = train_test_split(test_data, test_size=0.4, stratify=test_data['target'], random_state=123)
        if mode == 'train':
            print("Train set shape:", train_data.shape)
            print("Validation set shape:", valid_data.shape)
            return train_data, valid_data
        elif mode == 'test':
            print("Test set shape:", test_data.shape)
            return test_data
    else:
        raise ValueError("Choose mode!")
def load_data_and_prepare(data_dir, excel_file, mode, scale, smote):
    """Load the train and validation splits and merge them into one
    unlabeled feature frame (patient_id and target removed).

    NOTE(review): only valid for mode == 'train' — in 'test' mode
    ``load_data`` returns a single DataFrame and the tuple unpacking
    below would raise. Confirm callers always pass 'train'.
    """
    # Load train and validation data.
    train_df, val_df = load_data(data_dir, excel_file, mode, scale, smote)
    # Reassign instead of inplace drop: avoids mutating frames that may be
    # views/slices (SettingWithCopyWarning) and leaves load_data's output intact.
    train_df = train_df.drop(columns=['patient_id', 'target'])
    val_df = val_df.drop(columns=['patient_id', 'target'])
    train = pd.concat([train_df, val_df], axis=0)
    return train