# TemHealth / data_generator.py
# vbzvibin's picture
# Upload 32 files
# 1b8d0f1 verified
import pandas as pd
import numpy as np
import os
import random
from datetime import datetime, timedelta
def create_data_folder():
    """Create the ``Data`` output directory under the current working directory.

    Calling this repeatedly is safe: an existing directory is left as-is.

    Raises:
        FileExistsError: if ``Data`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() probe, which left a
    # window between the check and the creation.
    os.makedirs('Data', exist_ok=True)
def generate_claims_data(n=20000):
    """Write ``Data/claims.csv`` with ``n`` synthetic claim rows.

    Each claim gets a denial flag sampled from a rule-based risk:
    a 2% baseline, +0.45 when charges exceed 80,000, +0.25 for Oncology,
    +0.15 for Cardiology and +0.20 for ``MCC`` complexity (capped at 0.95).

    Both random sources are seeded, so two runs with the same ``n`` produce
    the same file. NumPy's global generator is seeded with 42 (as before);
    stdlib draws come from a private ``random.Random(42)`` so the module-level
    ``random`` state is not touched.

    Args:
        n: Number of claim rows to generate.
    """
    np.random.seed(42)
    # Private seeded generator: seeding only NumPy left every stdlib-driven
    # column (and the denial draw itself) different on each run.
    rng = random.Random(42)

    service_lines = ['Cardiology', 'Pulmonology', 'Orthopedics', 'Neurology', 'General Surgery', 'Internal Medicine', 'Oncology', 'Endocrinology', 'Gastroenterology']
    # Plain list of Python ints, so the stdlib choice() below never has to
    # operate on a NumPy array.
    drgs = np.random.randint(100, 999, size=200).tolist()

    # Base data (NumPy draw order is kept: DRG pool, charges, complexity,
    # then reimbursement further down).
    total_charges = np.random.uniform(5000, 150000, n)
    service_line_choices = [rng.choice(service_lines) for _ in range(n)]
    complexity_choices = np.random.choice(['MCC', 'CC', 'Non-CC'], n, p=[0.25, 0.45, 0.3])

    # Logic-based denial generation so downstream models have signal to learn:
    # 1. High charges > 80k have high denial risk
    # 2. Oncology and Cardiology have higher base risk
    # 3. MCC complexity significantly adds risk
    is_denied = []
    for charge, line, complexity in zip(total_charges, service_line_choices, complexity_choices):
        risk = 0.02  # Baseline 2%
        if charge > 80000:
            risk += 0.45
        if line == 'Oncology':
            risk += 0.25
        if line == 'Cardiology':
            risk += 0.15
        if complexity == 'MCC':
            risk += 0.20
        # Clip risk and sample
        is_denied.append(1 if rng.random() < min(risk, 0.95) else 0)

    data = {
        'Claim_ID': [f'CLM{100000+i}' for i in range(n)],
        'Patient_ID': [f'PT{rng.randint(1, 5000)}' for _ in range(n)],
        'Service_Line': service_line_choices,
        'DRG_Code': [rng.choice(drgs) for _ in range(n)],
        'Admission_Date': [datetime(2023, 1, 1) + timedelta(days=rng.randint(0, 700)) for _ in range(n)],
        'Primary_Diagnosis': [f'I{rng.randint(10, 99)}' for _ in range(n)],
        'Total_Charges': total_charges,
        'Reimbursement': np.random.uniform(2000, 120000, n),
        'Is_Denied': is_denied,
        'Complexity_Level': complexity_choices,
    }
    df = pd.DataFrame(data)

    # Make the function usable on its own, without a prior folder-setup call.
    os.makedirs('Data', exist_ok=True)
    df.to_csv('Data/claims.csv', index=False)
    print(f"Created Data/claims.csv with {n} rows and ML patterns")
def generate_cms_rules(n=250):
    """Write ``Data/cms_rules_2025.csv`` with curated plus random CMS rules.

    The file always starts with six hand-written demo rules. The remaining
    ``n - 6`` rows are filled with randomly combined rule types, targets and
    change kinds. When ``n`` is smaller than the number of curated rules, only
    the curated rules are written.

    Args:
        n: Target number of rules in the output file.
    """
    rule_types = ['DRG_Logic', 'CC_MCC_Update', 'Coding_Addition', 'HCC_Revisions', 'Payment_Policy', 'Quality_Penalty', 'Telehealth', 'Site_of_Care', 'OPPS_Bundling', 'NCD_LCD_Update', 'Value_Based_Program']
    targets = ['Cardiology', 'Pulmonology', 'Orthopedics', 'Neurology', 'General Surgery', 'Internal Medicine', 'Oncology', 'Endocrinology', 'Gastroenterology', 'Urology', 'Nephrology']
    changes = ['Weight Decrease', 'Weight Increase', 'Reclassification', 'New CPT Codes', 'Weight Shift', 'Site-of-care shift', 'Inclusion Shift', 'Readmission Adjustment', 'Rate Standardization', 'APC Bundling', 'Coverage Determination', 'Penalty Increase']

    rules = [
        # 1. Orthopedic Bundle (User Example 1)
        {
            'Rule_ID': 'R2025_BUND_01',
            'Type': 'OPPS_Bundling',
            'Target': 'Orthopedics',
            'Change': 'APC Bundling',
            'Impact_Score': 0.85,
            'Description': "CMS 2025 OPPS Update: Orthopedic supply costs (HCPCS C1713) are now 'packaged' into APC 5114 flat fee. Separate pass-through billing is no longer permitted."
        },
        # 2. Sepsis Reclassification (User Demo Question)
        {
            'Rule_ID': 'R2025_SEPSIS_02',
            'Type': 'DRG_Logic',
            'Target': 'Internal Medicine',
            'Change': 'Reclassification',
            'Impact_Score': 0.78,
            'Description': "2025 Sepsis Reclassification: Sepsis with Major Complications (MCC) now requires documented 'Organ System Failure' for DRG 871. Failure to document results in downcode to DRG 872, risking $4,200 loss per case."
        },
        # 3. TKA Outpatient Shift
        {
            'Rule_ID': 'R2025_TKA_03',
            'Type': 'Site_of_Care',
            'Target': 'Orthopedics',
            'Change': 'Site-of-care shift',
            'Impact_Score': 0.92,
            'Description': "2025 TKA Shift: Total Knee Arthroplasty (TKA) procedures are being shifted from Inpatient (IPPS) to Outpatient (OPPS) for healthy populations. Projected reimbursement drop from $14k to $9k per case."
        },
        # 4. New Code Addition (Requirement 3B)
        {
            'Rule_ID': 'R2025_CODE_05',
            'Type': 'Coding_Addition',
            'Target': 'Cardiology',
            'Change': 'New CPT Codes',
            'Impact_Score': 0.45,
            'Description': "CMS 2025 update adds new descriptor codes for remote cardiac monitoring. $250 increase in reimbursement per patient per month for eligible heart failure cases."
        },
        # 5. NCD Coverage Policy (Requirement 1)
        {
            'Rule_ID': 'R2025_NCD_06',
            'Type': 'NCD_LCD_Update',
            'Target': 'Neurology',
            'Change': 'Coverage Determination',
            'Impact_Score': 0.72,
            'Description': "New National Coverage Determination (NCD) for Alzheimer's therapeutics. Stringent clinical criteria for coverage must be met for Medicare payment eligibility."
        },
        # 6. Value-Based Penalty (Requirement 3E)
        {
            'Rule_ID': 'R2025_VBP_07',
            'Type': 'Value_Based_Program',
            'Target': 'General Surgery',
            'Change': 'Penalty Increase',
            'Impact_Score': 0.55,
            'Description': "MSSP Shared Savings Update: Increased weighting for surgical site infection metrics. Organizations in the bottom 25th percentile face up to 2% billing reduction."
        },
    ]

    # Derive the filler count from the curated list instead of a magic number.
    filler_count = max(n - len(rules), 0)
    for i in range(filler_count):
        rule_type = random.choice(rule_types)
        target = random.choice(targets)
        change = random.choice(changes)
        impact = round(random.uniform(0.1, 0.95), 2)
        rules.append({
            'Rule_ID': f'R2025_{100+i}',
            'Type': rule_type,
            'Target': target,
            'Change': change,
            'Impact_Score': impact,
            # One fixed decimal keeps the usual "57.0%" look while preventing
            # binary-float noise such as "56.99999999999999%" in the text.
            'Description': f"2025 {rule_type} update for {target}. Focus on {change} with a systemic impact of {impact * 100:.1f}%."
        })

    df = pd.DataFrame(rules)

    # Make the function usable on its own, without a prior folder-setup call.
    os.makedirs('Data', exist_ok=True)
    df.to_csv('Data/cms_rules_2025.csv', index=False)
    # Report the rows actually written; this differs from n when n < 6.
    print(f"Created Data/cms_rules_2025.csv with {len(df)} rules")
def generate_chargemaster(n=2500):
    """Generate Temple's Chargemaster (CDM) table and write ``Data/chargemaster.csv``.

    Row layout, in order:

    1. Up to 350 ``HCPCS_C1713_*`` orthopedic implant rows whose status is
       deliberately left as ``Pass-Through`` (the seeded "conflict case").
    2. Up to 400 ``CPT_27447_*`` Total Knee Arthroplasty rows.
    3. Random filler rows until the table holds ``n`` records.

    The seeded groups are capped so the table never exceeds ``n`` rows; for
    ``n >= 750`` (including the default) both groups are written in full.

    Args:
        n: Total number of chargemaster records to write.
    """
    service_lines = ['Orthopedics', 'Cardiology', 'Pulmonology', 'Oncology', 'Internal Medicine', 'Neurology', 'General Surgery', 'Endocrinology', 'Gastroenterology']

    # Split the row budget so a small n is honoured instead of always
    # emitting the 750 seeded rows.
    total = max(n, 0)
    conflict_count = min(350, total)
    tka_count = min(400, total - conflict_count)
    filler_count = total - conflict_count - tka_count

    # 1. Inject the "Conflict Case" (Requirement: "Revenue Loss from Missing a CMS Code Change").
    #    With the default n=2500 these 350 rows are 14% of the table.
    rows = [
        {
            'CDM_Code': f'HCPCS_C1713_{i}',
            'Description': 'Orthopedic Implant (Plate/Screw) - Unit A',
            'Service_Line': 'Orthopedics',
            'Status': 'Pass-Through',  # THE CONFLICT: Should be 'Packaged'
            'Base_Charge': 7000,
        }
        for i in range(conflict_count)
    ]

    # 2. Add other Orthopedic codes
    rows.extend(
        {
            'CDM_Code': f'CPT_27447_{i}',
            'Description': 'Total Knee Arthroplasty (TKA)',
            'Service_Line': 'Orthopedics',
            'Status': 'Active',
            'Base_Charge': 15000,
        }
        for i in range(tka_count)
    )

    # 3. Fill the rest with random data
    for i in range(filler_count):
        service_line = random.choice(service_lines)
        status = random.choice(['Active', 'Pass-Through', 'Inactive'])
        base_charge = random.randint(50, 45000)
        rows.append({
            'CDM_Code': f'CODE_{10000+i}',
            'Description': f'Hospital Procedure/Supply {i}',
            'Service_Line': service_line,
            'Status': status,
            'Base_Charge': base_charge,
        })

    # Explicit columns keep the header stable even when no rows are produced.
    df = pd.DataFrame(rows, columns=['CDM_Code', 'Description', 'Service_Line', 'Status', 'Base_Charge'])

    # Make the function usable on its own, without a prior folder-setup call.
    os.makedirs('Data', exist_ok=True)
    df.to_csv('Data/chargemaster.csv', index=False)
    print(f"Created Data/chargemaster.csv with {len(df)} records")
def generate_hcc_weights():
    """Write ``Data/hcc_weights.csv``: one row per listed condition.

    Every condition gets a sequential ``HCC<k>`` code (starting at 1) and two
    weights drawn uniformly from [0.1, 2.5) using NumPy's global generator:
    all old weights are drawn first, then all new weights.
    """
    hcc_descriptions = [
        'Diabetes w/ Complications',
        'COPD',
        'CHF',
        'End-Stage Renal Disease',
        'Major Depressive Disorder',
        'Morbid Obesity',
        'Rheumatoid Arthritis',
        'Vascular Disease',
        'Cerebral Palsy',
        'Multiple Sclerosis',
        'HIV/AIDS',
        'Metastatic Cancer',
        'Septicemia',
        'Drug/Alcohol Use Disorder',
        'Paraplegia',
        'Intestinal Obstruction',
        'Severe Hematological Disorders',
        'Heart Failure',
        'Acute Myocardial Infarction',
        'Angina Pectoris',
    ]
    count = len(hcc_descriptions)

    hcc_codes = [f'HCC{position}' for position, _ in enumerate(hcc_descriptions, start=1)]
    # Draw order matters for the shared NumPy stream: old weights, then new.
    old_weights = np.random.uniform(0.1, 2.5, count)
    new_weights = np.random.uniform(0.1, 2.5, count)

    table = pd.DataFrame({
        'HCC_Code': hcc_codes,
        'Description': hcc_descriptions,
        'Old_Weight': old_weights,
        'New_Weight': new_weights,
    })
    table.to_csv('Data/hcc_weights.csv', index=False)
    print("Created Data/hcc_weights.csv")
def generate_denials_data(n=3000):
    """Generate sample denial data for executive summary KPIs.

    Writes ``Data/sample_denials_3000.csv`` (the file name is fixed and does
    not follow ``n``) with ``n`` rows: sequential denial and claim IDs, a
    random service line, status and reason code, and a denied amount drawn
    uniformly from [500, 25000).

    Both random sources are seeded, so two runs with the same ``n`` produce
    the same file. NumPy's global generator is seeded with 99 (as before);
    stdlib draws come from a private ``random.Random(99)`` so the module-level
    ``random`` state is not touched.

    Args:
        n: Number of denial rows to generate.
    """
    np.random.seed(99)
    # Private seeded generator: seeding only NumPy left the service line,
    # status and reason columns different on each run.
    rng = random.Random(99)

    statuses = ['Paid', 'Denied', 'Open', 'Appealed', 'Partially Paid']
    service_lines = ['Cardiology', 'Pulmonology', 'Orthopedics', 'Neurology', 'Surgery', 'Medicine', 'Oncology', 'Endocrinology', 'Gastroenterology']

    data = {
        'Denial_ID': [f'DEN{50000+i}' for i in range(n)],
        'Claim_ID': [f'CLM{200000+i}' for i in range(n)],
        'Service_Line': [rng.choice(service_lines) for _ in range(n)],
        'Status': [rng.choice(statuses) for _ in range(n)],
        'Denied_Amount': np.random.uniform(500, 25000, n),
        'Reason': [f'Reason Code {rng.randint(1, 45)}' for _ in range(n)],
    }
    df = pd.DataFrame(data)

    # Make the function usable on its own, without a prior folder-setup call.
    os.makedirs('Data', exist_ok=True)
    df.to_csv('Data/sample_denials_3000.csv', index=False)
    print("Created Data/sample_denials_3000.csv")
if __name__ == '__main__':
    # Request that the optional `bottleneck` accelerator be bypassed.
    # NOTE(review): pandas is already imported when this line runs; confirm
    # that this variable is actually consulted after import time.
    os.environ['BOTTLENECK_DISABLE'] = '1'

    # The output folder must come first; the generators then run in this
    # fixed order and each writes one CSV into it.
    pipeline = (
        create_data_folder,
        generate_claims_data,
        generate_cms_rules,
        generate_chargemaster,
        generate_hcc_weights,
        generate_denials_data,
    )
    for step in pipeline:
        step()