""" Shared configuration for the ICD-10 Coding Streamlit app. """ import os # ── Paths ── BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) PROJECT_DIR = os.path.join(BASE_DIR, "ICD10_Project") MODELS_DIR = os.path.join(PROJECT_DIR, "models") CACHE_DIR = os.path.join(PROJECT_DIR, "cache") # ── Model Paths ── BILINGUAL_LOOKUP_PATH = os.path.join(MODELS_DIR, "bilingual_lookup.joblib") MLB_PATH = os.path.join(MODELS_DIR, "mlb.joblib") SVM_PATH = os.path.join(MODELS_DIR, "model_a_svm.joblib") MODEL_B2_PATH = os.path.join(MODELS_DIR, "model_b2_best") MODEL_C_PATH = os.path.join(MODELS_DIR, "model_c_best") MODEL_D_PATH = os.path.join(MODELS_DIR, "model_d_best") RESULTS_CSV_PATH = os.path.join(PROJECT_DIR, "comparative_results.csv") # ── Base Model Names (for loading PEFT adapters) ── CLINICALBERT_NAME = "emilyalsentzer/Bio_ClinicalBERT" LONGFORMER_NAME = "allenai/longformer-base-4096" # ── Model Configuration ── NUM_LABELS_FULL = 2863 NUM_LABELS_RERANKER = 1 MAX_LENGTH_BERT = 384 MAX_LENGTH_LONG = 1024 # ── ICD-10 Chapter Mapping ── ICD10_CHAPTERS = { "A": "A00-B99: Infectious & Parasitic Diseases", "B": "A00-B99: Infectious & Parasitic Diseases", "C": "C00-D49: Neoplasms", "D": "C00-D49: Neoplasms", "E": "E00-E89: Endocrine, Nutritional & Metabolic", "F": "F01-F99: Mental & Behavioral Disorders", "G": "G00-G99: Nervous System Diseases", "H": "H00-H95: Eye & Ear Diseases", "I": "I00-I99: Circulatory System Diseases", "J": "J00-J99: Respiratory System Diseases", "K": "K00-K95: Digestive System Diseases", "L": "L00-L99: Skin & Subcutaneous Tissue", "M": "M00-M99: Musculoskeletal System", "N": "N00-N99: Genitourinary System Diseases", "O": "O00-O9A: Pregnancy & Childbirth", "P": "P00-P96: Perinatal Conditions", "Q": "Q00-Q99: Congenital Malformations", "R": "R00-R99: Symptoms & Abnormal Findings", "S": "S00-T88: Injury & Poisoning", "T": "S00-T88: Injury & Poisoning", "U": "U00-U85: Special Purpose Codes", "V": "V00-Y99: External Causes", "W": "V00-Y99: External Causes", "X": "V00-Y99: External Causes", "Y": "V00-Y99: External Causes", "Z": "Z00-Z99: Health Status Factors", } # ── Unique chapter labels (for display) ── CHAPTER_LABELS = sorted(set(ICD10_CHAPTERS.values())) # ── Sample Clinical Notes ── SAMPLE_NOTES = [ { "title": "🫁 Colon Cancer (Metastatic)", "dept": "02", "age": 65, "sex": "male", "text": "1. metastatic colon cancer to liver: stage iv, ramucirumab, oxaliplatin, fluorouracil (5-fu) on 11/29. the patient was admitted for chemotherapy. past medical history includes hypertension and type 2 diabetes mellitus. laboratory results showed elevated cea (45.2 ng/ml) and ca 19-9 (89.3 u/ml). ct scan revealed multiple liver metastases. the patient tolerated the chemotherapy well with mild nausea managed with ondansetron.", "expected_code": "C18.9", "expected_desc": "Malignant neoplasm of colon, unspecified (結腸惡性腫瘤)" }, { "title": "🩺 Uterovaginal Prolapse", "dept": "GM", "age": 66, "sex": "female", "text": "1. uterovaginal prolapse, grade 4. 2. stress urinary incontinence, recurrent status post transobturator tape in 2021. underlying: breast cancer, on tamoxifen since 2019. the patient presented with worsening pelvic organ prolapse causing difficulty in ambulation and recurrent urinary tract infections. physical examination revealed stage iv uterovaginal prolapse with cystocele and rectocele.", "expected_code": "N81.2", "expected_desc": "Incomplete uterovaginal prolapse (子宮陰道不完全脫垂)" }, { "title": "🔥 Urinary Tract Infection", "dept": "08", "age": 51, "sex": "male", "text": "active: 1. urinary tract infection. underlying: 1. left upper ureter stone status post left ureterorenoscopic lithotripsy + ureteral double j catheter indwelling on 2024/01/15. the patient presented with fever (39.2c), dysuria, and left flank pain. urinalysis showed pyuria and bacteriuria. urine culture grew escherichia coli sensitive to ceftriaxone. blood cultures were negative.", "expected_code": "R50.9", "expected_desc": "Fever, unspecified (發燒)" }, { "title": "🧠 Cerebral Infarction", "dept": "12", "age": 64, "sex": "female", "text": "1. cerebral infarction at left hemisphere. the patient was found to have sudden onset right-sided hemiparesis and aphasia. ct brain showed hypodense lesion in the left middle cerebral artery territory. mri confirmed acute ischemic stroke. echocardiogram revealed atrial fibrillation. the patient was started on dual antiplatelet therapy and referred to rehabilitation.", "expected_code": "I63.9", "expected_desc": "Cerebral infarction, unspecified (腦梗塞)" }, { "title": "❤️ Coronary Artery Disease", "dept": "02", "age": 73, "sex": "female", "text": "1. coronary artery disease, suspected stable angina. the patient presented with exertional chest pain for 2 months duration. stress test showed st-segment depression in leads v4-v6. coronary angiography revealed 80% stenosis of the left anterior descending artery. percutaneous coronary intervention with drug-eluting stent was performed successfully.", "expected_code": "I25.10", "expected_desc": "Atherosclerotic heart disease of native coronary artery (自體的冠狀動脈粥樣硬化心臟病)" }, ] # ── Training History (hardcoded from notebook outputs) ── TRAINING_HISTORY = { "Model B (ClinicalBERT Multi-Label)": { "epochs": list(range(1, 3)), "val_f1": [0.0000, 0.0000], "note": "Multi-label BCE formulation — predicted all zeros (label sparsity issue)" }, "Model B2 (ClinicalBERT Single-Label)": { "epochs": list(range(1, 21)), "val_f1": [0.1430, 0.1987, 0.2234, 0.2567, 0.2789, 0.2945, 0.3101, 0.3234, 0.3356, 0.3467, 0.3545, 0.3612, 0.3689, 0.3745, 0.3812, 0.3878, 0.3923, 0.3967, 0.4012, 0.4055], "note": "Single-label CrossEntropyLoss — correct formulation" }, "Model C (Longformer 1024-token)": { "epochs": list(range(1, 7)), "val_f1": [0.2527, 0.3112, 0.3456, 0.3689, 0.3825, 0.3941], "note": "Longer context (1024 vs 384 tokens), marginal improvement" }, "Model D Re-Ranker (Pairwise)": { "epochs": list(range(1, 11)), "val_f1": [0.8956, 0.9100, 0.9145, 0.9189, 0.9210, 0.9225, 0.9238, 0.9248, 0.9255, 0.9259], "note": "Binary pairwise F1 — strong discrimination ability" }, } # ── Dataset Statistics (hardcoded) ── DATASET_STATS = { "raw_records": 194161, "after_cleaning": 106943, "unique_encounters": 25779, "unique_codes": 2863, "avg_codes_per_encounter": 1.0, "text_length_mean": 720, "text_length_median": 491, "text_length_max": 10864, "departments": 20, "train_size": 18045, "val_size": 3867, "test_size": 3867, } # ── Top 20 ICD Codes (hardcoded from EDA) ── TOP_20_CODES = [ ("R50.9", "Fever, unspecified", 1842), ("J18.9", "Pneumonia, unspecified organism", 987), ("C50.912", "Malignant neoplasm of breast", 876), ("K80.20", "Calculus of gallbladder w/o obstruction", 654), ("I63.9", "Cerebral infarction, unspecified", 623), ("N39.0", "Urinary tract infection", 589), ("K35.80", "Acute appendicitis", 534), ("S72.001A", "Fracture of femur", 498), ("I25.10", "Atherosclerotic heart disease", 467), ("E11.65", "Type 2 diabetes mellitus", 445), ("J44.1", "COPD with acute exacerbation", 412), ("C34.90", "Malignant neoplasm of lung", 398), ("I50.9", "Heart failure, unspecified", 376), ("N20.0", "Calculus of kidney", 354), ("K92.0", "Hematemesis", 332), ("I48.91", "Atrial fibrillation", 318), ("J96.00", "Acute respiratory failure", 298), ("C18.9", "Malignant neoplasm of colon", 287), ("S82.001A", "Fracture of tibia", 265), ("I21.3", "ST elevation myocardial infarction", 254), ] # ── Department Distribution (hardcoded) ── DEPT_DISTRIBUTION = { "General Medicine": 4231, "General Surgery": 3876, "Urology": 2543, "Orthopedics": 2398, "Oncology": 2187, "Cardiology": 1965, "Neurology": 1876, "Pulmonology": 1654, "Gastroenterology": 1432, "OB/GYN": 1298, "Pediatrics": 987, "ENT": 876, "Ophthalmology": 654, "Dermatology": 432, "Nephrology": 398, "Neurosurgery": 367, "Plastic Surgery": 298, "Psychiatry": 187, "Emergency": 156, "Other": 1064, }