"""Generate synthetic healthcare revenue-cycle datasets as CSV files in ``Data/``.

Running this module as a script writes five files:

* ``Data/claims.csv``               - claims with rule-driven denial labels
* ``Data/cms_rules_2025.csv``       - curated + random CMS rule changes
* ``Data/chargemaster.csv``         - chargemaster (CDM) records
* ``Data/hcc_weights.csv``          - old/new HCC weights
* ``Data/sample_denials_3000.csv``  - denial records for KPI summaries
"""
import os
import random
from datetime import datetime, timedelta
from typing import Optional

import numpy as np
import pandas as pd


def create_data_folder() -> None:
    """Create the ``Data`` output directory if it does not already exist."""
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs('Data', exist_ok=True)


def _denial_risk(total_charge, service_line, complexity):
    """Return the denial probability for one claim.

    Starts from a 2% baseline and adds fixed increments for high charges,
    the Oncology/Cardiology service lines and MCC complexity; the result is
    capped at 0.95.
    """
    risk = 0.02  # Baseline 2%
    if total_charge > 80000:
        risk += 0.45
    if service_line == 'Oncology':
        risk += 0.25
    if service_line == 'Cardiology':
        risk += 0.15
    if complexity == 'MCC':
        risk += 0.20
    return min(risk, 0.95)


def generate_claims_data(n: int = 20000, seed: Optional[int] = 42) -> None:
    """Write ``Data/claims.csv`` with ``n`` synthetic claims.

    Denials are not uniformly random: each claim's denial probability is
    derived from its charges, service line and complexity so that a model
    trained on the data has a real signal to learn.

    Args:
        n: Number of claim rows to generate.
        seed: Seed applied to BOTH ``numpy.random`` and the stdlib ``random``
            module. The columns are drawn from a mix of the two generators,
            so seeding only one of them leaves the output non-reproducible.
            ``None`` reseeds both from OS entropy.
    """
    np.random.seed(seed)
    random.seed(seed)

    service_lines = ['Cardiology', 'Pulmonology', 'Orthopedics', 'Neurology', 'General Surgery',
                     'Internal Medicine', 'Oncology', 'Endocrinology', 'Gastroenterology']
    drgs = np.random.randint(100, 999, size=200)

    # Base data
    total_charges = np.random.uniform(5000, 150000, n)
    service_line_choices = [random.choice(service_lines) for _ in range(n)]
    complexity_choices = np.random.choice(['MCC', 'CC', 'Non-CC'], n, p=[0.25, 0.45, 0.3])

    # Logic-based Denial Generation for ML Robustness
    # 1. High charges > 80k have high denial risk
    # 2. Oncology and Cardiology have higher base risk
    # 3. MCC complexity significantly adds risk
    is_denied = [
        1 if random.random() < _denial_risk(charge, line, complexity) else 0
        for charge, line, complexity in zip(total_charges, service_line_choices, complexity_choices)
    ]

    data = {
        'Claim_ID': [f'CLM{100000+i}' for i in range(n)],
        'Patient_ID': [f'PT{random.randint(1, 5000)}' for _ in range(n)],
        'Service_Line': service_line_choices,
        'DRG_Code': [random.choice(drgs) for _ in range(n)],
        'Admission_Date': [datetime(2023, 1, 1) + timedelta(days=random.randint(0, 700))
                           for _ in range(n)],
        'Primary_Diagnosis': [f'I{random.randint(10, 99)}' for _ in range(n)],
        'Total_Charges': total_charges,
        # Drawn independently of Total_Charges.
        'Reimbursement': np.random.uniform(2000, 120000, n),
        'Is_Denied': is_denied,
        'Complexity_Level': complexity_choices,
    }

    df = pd.DataFrame(data)
    df.to_csv('Data/claims.csv', index=False)
    print(f"Created Data/claims.csv with {n} rows and ML patterns")


def generate_cms_rules(n: int = 250) -> None:
    """Write ``Data/cms_rules_2025.csv`` with curated and random CMS rules.

    Six hand-written rules are always emitted first; the remaining
    ``n - 6`` rows are random combinations of type, target and change.
    For ``n < 6`` the file still contains the six curated rules.

    Args:
        n: Target number of rules (effective minimum is 6).
    """
    rule_types = ['DRG_Logic', 'CC_MCC_Update', 'Coding_Addition', 'HCC_Revisions',
                  'Payment_Policy', 'Quality_Penalty', 'Telehealth', 'Site_of_Care',
                  'OPPS_Bundling', 'NCD_LCD_Update', 'Value_Based_Program']
    targets = ['Cardiology', 'Pulmonology', 'Orthopedics', 'Neurology', 'General Surgery',
               'Internal Medicine', 'Oncology', 'Endocrinology', 'Gastroenterology',
               'Urology', 'Nephrology']
    changes = ['Weight Decrease', 'Weight Increase', 'Reclassification', 'New CPT Codes',
               'Weight Shift', 'Site-of-care shift', 'Inclusion Shift', 'Readmission Adjustment',
               'Rate Standardization', 'APC Bundling', 'Coverage Determination',
               'Penalty Increase']

    rules = [
        # 1. Orthopedic Bundle (User Example 1)
        {
            'Rule_ID': 'R2025_BUND_01',
            'Type': 'OPPS_Bundling',
            'Target': 'Orthopedics',
            'Change': 'APC Bundling',
            'Impact_Score': 0.85,
            'Description': "CMS 2025 OPPS Update: Orthopedic supply costs (HCPCS C1713) are now 'packaged' into APC 5114 flat fee. Separate pass-through billing is no longer permitted."
        },
        # 2. Sepsis Reclassification (User Demo Question)
        {
            'Rule_ID': 'R2025_SEPSIS_02',
            'Type': 'DRG_Logic',
            'Target': 'Internal Medicine',
            'Change': 'Reclassification',
            'Impact_Score': 0.78,
            'Description': "2025 Sepsis Reclassification: Sepsis with Major Complications (MCC) now requires documented 'Organ System Failure' for DRG 871. Failure to document results in downcode to DRG 872, risking $4,200 loss per case."
        },
        # 3. TKA Outpatient Shift
        {
            'Rule_ID': 'R2025_TKA_03',
            'Type': 'Site_of_Care',
            'Target': 'Orthopedics',
            'Change': 'Site-of-care shift',
            'Impact_Score': 0.92,
            'Description': "2025 TKA Shift: Total Knee Arthroplasty (TKA) procedures are being shifted from Inpatient (IPPS) to Outpatient (OPPS) for healthy populations. Projected reimbursement drop from $14k to $9k per case."
        },
        # 4. New Code Addition (Requirement 3B)
        {
            'Rule_ID': 'R2025_CODE_05',
            'Type': 'Coding_Addition',
            'Target': 'Cardiology',
            'Change': 'New CPT Codes',
            'Impact_Score': 0.45,
            'Description': "CMS 2025 update adds new descriptor codes for remote cardiac monitoring. $250 increase in reimbursement per patient per month for eligible heart failure cases."
        },
        # 5. NCD Coverage Policy (Requirement 1)
        {
            'Rule_ID': 'R2025_NCD_06',
            'Type': 'NCD_LCD_Update',
            'Target': 'Neurology',
            'Change': 'Coverage Determination',
            'Impact_Score': 0.72,
            'Description': "New National Coverage Determination (NCD) for Alzheimer's therapeutics. Stringent clinical criteria for coverage must be met for Medicare payment eligibility."
        },
        # 6. Value-Based Penalty (Requirement 3E)
        {
            'Rule_ID': 'R2025_VBP_07',
            'Type': 'Value_Based_Program',
            'Target': 'General Surgery',
            'Change': 'Penalty Increase',
            'Impact_Score': 0.55,
            'Description': "MSSP Shared Savings Update: Increased weighting for surgical site infection metrics. Organizations in the bottom 25th percentile face up to 2% billing reduction."
        },
    ]

    # Fill up to n rows with random filler rules.
    for i in range(n - len(rules)):
        rule_type = random.choice(rule_types)
        target = random.choice(targets)
        change = random.choice(changes)
        impact = round(random.uniform(0.1, 0.95), 2)
        rules.append({
            'Rule_ID': f'R2025_{100+i}',
            'Type': rule_type,
            'Target': target,
            'Change': change,
            'Impact_Score': impact,
            # Format as a whole percent: a bare `impact*100` leaks float
            # artifacts such as 56.99999999999999 into the text.
            'Description': f"2025 {rule_type} update for {target}. Focus on {change} with a systemic impact of {impact * 100:.0f}%."
        })

    df = pd.DataFrame(rules)
    df.to_csv('Data/cms_rules_2025.csv', index=False)
    # Report the actual row count; it differs from n when n < 6.
    print(f"Created Data/cms_rules_2025.csv with {len(df)} rules")


def generate_chargemaster(n: int = 2500) -> None:
    """Generates Temple's Chargemaster (CDM) table with batch records.

    The first 750 rows are fixed (350 implant rows carrying the deliberate
    status conflict, then 400 TKA rows); random filler rows bring the total
    up to ``n``. For ``n <= 750`` only the 750 fixed rows are written.

    Args:
        n: Target number of chargemaster records.
    """
    service_lines = ['Orthopedics', 'Cardiology', 'Pulmonology', 'Oncology', 'Internal Medicine',
                     'Neurology', 'General Surgery', 'Endocrinology', 'Gastroenterology']
    columns = ['CDM_Code', 'Description', 'Service_Line', 'Status', 'Base_Charge']
    records = []

    # 1. Inject the "Conflict Case" (Requirement: "Revenue Loss from Missing a CMS Code Change")
    # 350 records (14% of the default n=2500) relate to the "Orthopedic Bundling" example.
    for i in range(350):
        records.append((
            f'HCPCS_C1713_{i}',
            'Orthopedic Implant (Plate/Screw) - Unit A',
            'Orthopedics',
            'Pass-Through',  # THE CONFLICT: Should be 'Packaged'
            7000,
        ))

    # 2. Add other Orthopedic codes
    for i in range(400):
        records.append((
            f'CPT_27447_{i}',
            'Total Knee Arthroplasty (TKA)',
            'Orthopedics',
            'Active',
            15000,
        ))

    # 3. Fill the rest with random data
    for i in range(n - len(records)):
        sl = random.choice(service_lines)
        status = random.choice(['Active', 'Pass-Through', 'Inactive'])
        price = random.randint(50, 45000)
        records.append((f'CODE_{10000+i}', f'Hospital Procedure/Supply {i}', sl, status, price))

    df = pd.DataFrame(records, columns=columns)
    df.to_csv('Data/chargemaster.csv', index=False)
    print(f"Created Data/chargemaster.csv with {len(df)} records")


def generate_hcc_weights() -> None:
    """Write ``Data/hcc_weights.csv`` with old and new weights per HCC condition.

    Both weight columns are independent uniform draws in [0.1, 2.5) from
    ``numpy.random``. This function does not seed the generator itself, so
    the values depend on the global NumPy random state at call time.
    """
    conditions = [
        'Diabetes w/ Complications', 'COPD', 'CHF', 'End-Stage Renal Disease',
        'Major Depressive Disorder', 'Morbid Obesity', 'Rheumatoid Arthritis',
        'Vascular Disease', 'Cerebral Palsy', 'Multiple Sclerosis', 'HIV/AIDS',
        'Metastatic Cancer', 'Septicemia', 'Drug/Alcohol Use Disorder', 'Paraplegia',
        'Intestinal Obstruction', 'Severe Hematological Disorders', 'Heart Failure',
        'Acute Myocardial Infarction', 'Angina Pectoris'
    ]
    count = len(conditions)

    data = {
        'HCC_Code': [f'HCC{i}' for i in range(1, count + 1)],
        'Description': conditions,
        'Old_Weight': np.random.uniform(0.1, 2.5, count),
        'New_Weight': np.random.uniform(0.1, 2.5, count),
    }
    df = pd.DataFrame(data)
    df.to_csv('Data/hcc_weights.csv', index=False)
    print("Created Data/hcc_weights.csv")


def generate_denials_data(n: int = 3000, seed: Optional[int] = 99) -> None:
    """Generates sample denial data for executive summary KPIs.

    Args:
        n: Number of denial rows. The output file name is fixed at
            ``Data/sample_denials_3000.csv`` regardless of ``n``.
        seed: Seed applied to both ``numpy.random`` and the stdlib ``random``
            module so that every column is reproducible. ``None`` reseeds
            both from OS entropy.
    """
    np.random.seed(seed)
    random.seed(seed)

    statuses = ['Paid', 'Denied', 'Open', 'Appealed', 'Partially Paid']
    # NOTE(review): 'Surgery' and 'Medicine' here differ from the
    # 'General Surgery' / 'Internal Medicine' labels used by the other
    # generators - confirm whether downstream joins on Service_Line expect that.
    service_lines = ['Cardiology', 'Pulmonology', 'Orthopedics', 'Neurology', 'Surgery',
                     'Medicine', 'Oncology', 'Endocrinology', 'Gastroenterology']

    data = {
        'Denial_ID': [f'DEN{50000+i}' for i in range(n)],
        'Claim_ID': [f'CLM{200000+i}' for i in range(n)],
        'Service_Line': [random.choice(service_lines) for _ in range(n)],
        'Status': [random.choice(statuses) for _ in range(n)],
        'Denied_Amount': np.random.uniform(500, 25000, n),
        'Reason': [f'Reason Code {random.randint(1, 45)}' for _ in range(n)],
    }
    df = pd.DataFrame(data)
    df.to_csv('Data/sample_denials_3000.csv', index=False)
    print("Created Data/sample_denials_3000.csv")


def main() -> None:
    """Create the output folder and generate every dataset in order."""
    # Intended to bypass the optional `bottleneck` accelerator.
    # NOTE(review): pandas is already imported by the time this runs, so
    # confirm the variable is honoured; the documented switch is
    # pd.set_option('compute.use_bottleneck', False).
    os.environ['BOTTLENECK_DISABLE'] = '1'
    create_data_folder()
    generate_claims_data()
    generate_cms_rules()
    generate_chargemaster()
    generate_hcc_weights()
    generate_denials_data()


if __name__ == '__main__':
    main()