Spaces:

ready2drop
/

CalculatorCBD

Sleeping

App Files Community

ready2drop committed on Jan 16, 2025

Commit

d3c46ef

verified ·

1 Parent(s): fbcd943

test

Browse files

Files changed (1) hide show

app.py +9 -238

app.py CHANGED Viewed

@@ -12,150 +12,12 @@ from lime.lime_tabular import LimeTabularExplainer
 from pycaret.classification import *
 import warnings
 warnings.filterwarnings("ignore", category=FutureWarning, module="torch.storage")
-from sklearn.preprocessing import MinMaxScaler
-from sklearn.model_selection import train_test_split
-from sklearn.utils import resample
-from glob import glob
-from imblearn.over_sampling import SMOTE
-def load_data(data_dir : str,
-              excel_file : str,
-                mode : str = "train",
-                modality : str = 'mm',
-                phase : str = 'portal',  # 'portal', 'pre-enhance', 'combine'
-                smote = bool,
-                ):
-    print("--------------Load RawData--------------")
-    df = pd.read_csv(os.path.join(data_dir, excel_file))
-    #Inclusion
-    print("--------------Inclusion--------------")
-    print('Total : ', len(df))
-    print("--------------fillNA--------------")
-    # data = data.dropna()
-    df.fillna(0.0,inplace=True)
-    print(df['REAL_STONE'].value_counts())
-    #Column rename
-    df.rename(columns={'ID': 'patient_id', 'REAL_STONE':'target'}, inplace=True)
-    # Final(n=11)
-    columns = ['patient_id','DUCT_DILIATATION_8MM', 'DUCT_DILIATATION_10MM','PANCREATITIS','FIRST_SBP','FIRST_RR','Hb', 'PLT', 'WBC', 'ALP', 'AST', 'CRP', 'BILIRUBIN', 'AGE','target']
-    data = df[columns]
-    data['patient_id'] = data['patient_id'].astype(str)
-    image_list = sorted(glob(os.path.join(data_dir,"*.nii.gz")))
-    def get_patient_data(image_number):
-        row = data[data['patient_id'].astype(str).str.startswith(image_number)]
-        return row.iloc[0, 1:].tolist() if not row.empty else None
-    # Final(n=11)
-    data_dict = {key: [] for key in ['image_path','DUCT_DILIATATION_8MM', 'DUCT_DILIATATION_10MM','PANCREATITIS','FIRST_SBP','FIRST_RR','Hb', 'PLT', 'WBC', 'ALP', 'AST', 'CRP', 'BILIRUBIN', 'AGE','target']}
-    # Filter images based on the phase
-    if phase == 'portal':
-        # Filter the images for the 'portal' phase by checking for 'Portal' in the filename
-        image_list = [img for img in image_list if 'Portal' in os.path.basename(img)]
-    elif phase == 'pre-enhance':
-        # Filter the images for the 'pre-enhance' phase by checking for 'Pre_enhance' in the filename
-        image_list = [img for img in image_list if 'Pre_enhance' in os.path.basename(img)]
-    elif phase == 'combine':
-        # Include both 'portal' and 'pre-enhance' images for the 'combine' phase
-        portal_images = [img for img in image_list if 'Portal' in os.path.basename(img)]
-        pre_enhance_images = [img for img in image_list if 'Pre_enhance' in os.path.basename(img)]
-        image_list = portal_images + pre_enhance_images
-    else:
-        raise ValueError("Invalid phase. Choose from ['portal', 'pre-enhance', 'combine']")
-    for image_path in image_list:
-        image_number = os.path.basename(image_path).split('_')[0]
-        patient_data = get_patient_data(image_number)
-        if patient_data:
-            data_dict['image_path'].append(image_path)
-            keys_list = list(data_dict.keys())[1:]
-            for key, value in zip(keys_list, patient_data):
-                if key == 'image_path':
-                    continue
-                data_dict[key].append(value)
-    if modality == 'image':
-            data_dict = {k: data_dict[k] for k in ['image_path', 'target']}
-    elif modality not in ['mm', 'tabular']:
-        raise AssertionError("Select Modality for Feature engineering!")
-    #Create a DataFrame from the dictionary
-    train_df = pd.DataFrame(data_dict)
-    #if only  tabular use
-    if modality == 'tabular':
-        train_df = data
-    if mode == 'train' or mode == 'test':
-        print("--------------Class balance--------------")
-        # undersampling
-        majority_class = train_df[train_df['target'] == 1.0]
-        minority_class = train_df[train_df['target'] == 0.0]
-        # Undersample the majority class to match the number of '1's in the minority class
-        undersampled_majority_class = resample(majority_class,
-                                            replace=False,
-                                            n_samples=len(minority_class),
-                                            random_state=42)
-        # Concatenate minority class and undersampled majority class
-        data = pd.concat([undersampled_majority_class, minority_class])
-        # print("--------------Class imbalance--------------")
-        if smote:  # Apply SMOTE if the flag is set
-            data = train_df
-            print(data['target'].value_counts())
-            print("Applying SMOTE...")
-            smote = SMOTE(sampling_strategy='all', random_state=42)
-            X_data = data.drop(columns=['target'])
-            y_data = data['target']
-            X_data_res, y_data_res = smote.fit_resample(X_data, y_data)
-            data_resampled = pd.DataFrame(X_data_res, columns=X_data.columns)
-            data_resampled['target'] = y_data_res
-            data = data_resampled  # Update train_data with resampled data
-            print(data['target'].value_counts())
-        train_data, test_data = train_test_split(data, test_size=0.3, stratify=data['target'], random_state=123)
-        valid_data, test_data = train_test_split(test_data, test_size=0.4, stratify=test_data['target'], random_state=123)
-        if mode == 'train':
-            print("Train set shape:", train_data.shape)
-            print("Validation set shape:", valid_data.shape)
-            return train_data, valid_data
-        elif mode == 'test':
-            print("Test set shape:", test_data.shape)
-            return test_data
-    elif mode == 'pretrain' or mode == 'eval':
-        pretrain_data, eval_data = train_test_split(train_df, test_size=0.1, random_state=123)
-        if mode == 'pretrain':
-            print("Pretrain set shape:", pretrain_data.shape)
-            return pretrain_data
-        elif mode == 'eval':
-            print("Validation set shape:", eval_data.shape)
-            return eval_data
-    else:
-        raise ValueError("Choose mode!")
 def parse_args(args):
     parser = argparse.ArgumentParser(description="CBD Classification")
     parser.add_argument('--data_dir', type=str, default="./")
@@ -231,107 +93,16 @@ def classify(tabular_data):
 if __name__ == '__main__':
     args = parse_args(sys.argv[1:])
-    train = load_data_and_prepare(args.data_dir, args.excel_file, args.modality, args.phase, args.smote)
     model = load_model(args.model_name_or_path)
     device = torch.device(args.device)
-    # Gradio
-    examples = [
-        [
-            [['1', '0', '0', '104', '24', '10.6', '171', '14.54', '236', '182', '12.33', '3.2', '72']],
-            "PT_NO = 10001862, VISIBLE_STONE_CT = True, REAL_STONE = True",
-        ],
-        [
-            [['0', '1','0','106','18','13.6', '388', '21.13', '196', '118', '1.87', '2.7', '58']],
-            "PT_NO = 10007376, VISIBLE_STONE_CT = True, REAL_STONE = True",
-        ],
-        [
-            [['1', '0','1','205','18','9.3', '103', '8.45', '440', '100', '4.21', '4.5', '63']],
-            "PT_NO = 10040285, VISIBLE_STONE_CT = False, REAL_STONE = True",
-        ],
-        [
-            [['0', '1','1','130','20','12.1', '192', '8.63', '47', '59', '0.02', '0.4', '57']],
-            "PT_NO = 10005545, VISIBLE_STONE_CT = False, REAL_STONE = False",
-        ],
-    ]
-    tabular_header = ['DUCT_DILIATATION_8MM', 'DUCT_DILIATATION_10MM','PANCREATITIS','FIRST_SBP','FIRST_RR','Hb', 'PLT', 'WBC', 'ALP', 'AST', 'CRP', 'BILIRUBIN', 'AGE']
-    description = """
-    GPU 리소스 제약으로 인해, 온라인 데모에서는 NVIDIA RTX 3090 24GB를 사용하고 있습니다. \n
-    **Note**: 현재 저희 모델은 **총담관결석증**의 분석 및 진단을 중심으로 최적화되어 있으며, 정확하고 신뢰할 수 있는 결과를 제공합니다. \n
-    모델은 다음과 같은 입력 데이터를 처리하며, 아래와 같이 각각 **이산형(discrete)** **연속형(continuous)** 데이터로 처리됩니다. \n
-    - 이산형 변수:
-    - DUCT_DILIATATION_8MM
-    - DUCT_DILIATATION_10MM
-    - PANCREATITIS
-    - 연속형 변수:
-    - FIRST_SBP (Systolic blood pressure)
-    - FIRST_RR (Respiratory rate)
-    - Hb (Hemoglobin)
-    - PLT (Platelet)
-    - WBC (White Blood Cell)
-    - ALP (Alkaline Phosphatase)
-    - ALT (Alanine Aminotransferase)
-    - AST (Aspartate Aminotransferase)
-    - CRP (C-Reactive Protein)
-    - BILIRUBIN
-    - AGE
-    **중요**: 입력 데이터의 컬럼이 변경(추가, 삭제)될 경우, 모델의 예측 결과가 달라질 수 있습니다. \n
-    따라서 입력 데이터의 구조를 변경하기 전에 모델의 재학습 또는 재검증이 필요합니다. \n
-    """
-    title_markdown = ("""
-    # 임상 데이터 기반 머신러닝을 이용한 총담관석 예측 모델
-    ## Development of a Common Bile Duct Stone Prediction Model Using Machine Learning Based on Clinical Data
-    [📖[Learn more about Common Bile Duct Stones (총담관결석증)](https://namu.wiki/w/%EC%B4%9D%EB%8B%B4%EA%B4%80%EA%B2%B0%EC%84%9D%EC%A6%9D)]
-    ### Copyright © 2024 Dongguk University (DGU) and Dongguk University Medical Center (DUMC). All rights reserved.
-    """)
-    # def explain_with_lime(tabular_data):
-    #     """
-    #     Apply LIME to explain predictions.
-    #     Args:
-    #         tabular_data (list): List of input data points (e.g., rows in a dataframe)
-    #     Returns:
-    #         str: HTML or image showing LIME explanation
-    #     """
-    #     input_data = np.array(tabular_data, dtype=float)
-    #     explainer = LimeTabularExplainer(
-    #         training_data=x_train.values,  # Replace with your training data
-    #         feature_names=tabular_header,
-    #         class_names=['intermediate', 'High'],  # Replace with actual class names
-    #         mode='classification'
-    #     )
-    #     explanation = explainer.explain_instance(
-    #         input_data[0],  # Single instance to explain
-    #         model.predict_proba,  # Probability prediction function
-    #         num_features=len(tabular_header)
-    #     )
-    #     # Plot LIME explanation
-    #     fig = explanation.as_pyplot_figure()
-    #     fig.set_size_inches(25, 8)
-    #     buf = io.BytesIO()
-    #     fig.savefig(buf, format='png')
-    #     buf.seek(0)
-    #     encoded_image = base64.b64encode(buf.read()).decode('utf-8')
-    #     buf.close()
-    #     plt.close(fig)
-    #     return f"<img src='data:image/png;base64,{encoded_image}'/>"
-    tabular_header = ['DUCT_DILIATATION_8MM', 'DUCT_DILIATATION_10MM','PANCREATITIS','FIRST_SBP','FIRST_RR','Hb', 'PLT', 'WBC', 'ALP', 'AST', 'CRP', 'BILIRUBIN', 'AGE']
     tabular_dtype = ['number'] * len(tabular_header)
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown(title_markdown)
         gr.Markdown(description)

 from pycaret.classification import *
 import warnings
 warnings.filterwarnings("ignore", category=FutureWarning, module="torch.storage")
+from util import load_data
+import view
 def parse_args(args):
     parser = argparse.ArgumentParser(description="CBD Classification")
     parser.add_argument('--data_dir', type=str, default="./")
 if __name__ == '__main__':
     args = parse_args(sys.argv[1:])
+    train = load_data_and_prepare(args.data_dir, args.excel_file, args.mode, args.scale, args.smote)
     model = load_model(args.model_name_or_path)
     device = torch.device(args.device)
+    examples = view.examples
+    description = view.description
+    title_markdown = view.title_markdown
+    tabular_header = view.tabular_header
     tabular_dtype = ['number'] * len(tabular_header)
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
         gr.Markdown(title_markdown)
         gr.Markdown(description)