ClaimSense / data_generator.py
Kshamaa S
Initial deployment: ClaimSense agentic clinical claims intelligence
f4403f6
Raw
History Blame Contribute Delete
6.97 kB
"""
ClaimSense — Synthetic Claims Data Generator
Generates realistic medical claims with injected anomalies for demonstration.
"""
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
# Seed both RNGs at import time so repeated runs draw the same random sequence.
random.seed(42)
np.random.seed(42)

# --- Reference Tables ---

# ICD-10 style diagnosis code -> human-readable description.
DIAGNOSIS_CODES = {
    "Z00.00": "Routine general medical examination",
    "I10": "Essential (primary) hypertension",
    "E11.9": "Type 2 diabetes mellitus without complications",
    "J06.9": "Acute upper respiratory infection, unspecified",
    "M54.5": "Low back pain",
    "F32.9": "Major depressive disorder, single episode, unspecified",
    "J18.9": "Pneumonia, unspecified organism",
    "I25.10": "Atherosclerotic heart disease of native coronary artery",
    "N39.0": "Urinary tract infection, site not specified",
    "K21.0": "Gastro-esophageal reflux disease with esophagitis",
}

# Procedure code -> (description, minimum billed amount, maximum billed amount).
PROCEDURE_CODES = {
    "99213": ("Office visit, established patient, low complexity", 120, 180),
    "99214": ("Office visit, established patient, moderate complexity", 180, 280),
    "99215": ("Office visit, established patient, high complexity", 250, 380),
    "93000": ("Electrocardiogram, routine ECG", 80, 130),
    "71046": ("Chest X-ray, 2 views", 120, 200),
    "80053": ("Comprehensive metabolic panel", 40, 80),
    "99232": ("Subsequent hospital care, moderate complexity", 180, 260),
    "27447": ("Total knee arthroplasty", 8000, 14000),
    "43239": ("Upper GI endoscopy with biopsy", 900, 1600),
    "70553": ("MRI brain with contrast", 1200, 2200),
}

# Provider id -> (display name, specialty, volume profile).
PROVIDERS = {
    "PRV-001": ("Dr. Sarah Mitchell", "Internal Medicine", "normal"),
    "PRV-002": ("Dr. James Chen", "Cardiology", "normal"),
    "PRV-003": ("Dr. Patricia Wong", "Family Medicine", "normal"),
    "PRV-004": ("Dr. Robert Garcia", "Orthopedics", "normal"),
    "PRV-005": ("Dr. Lisa Patel", "Gastroenterology", "normal"),
    "PRV-006": ("Dr. Michael Torres", "Family Medicine", "high_volume"),
    "PRV-007": ("Dr. Amanda Foster", "Internal Medicine", "normal"),
    "PRV-008": ("Dr. Kevin Park", "Neurology", "normal"),
}

# Patient ids PAT-0001 .. PAT-0200 (zero-padded to four digits).
PATIENTS = [f"PAT-{number:04d}" for number in range(1, 201)]

# Payer names a claim can be attributed to.
INSURERS = [
    "BlueCross BlueShield",
    "Aetna",
    "UnitedHealth",
    "Cigna",
    "Humana",
]
def generate_normal_claim(claim_id, date):
    """Build one baseline claim record labelled as non-anomalous.

    Provider, patient, diagnosis, procedure and insurer are each drawn
    independently and uniformly from the module's reference tables. The billed
    amount is drawn uniformly from the chosen procedure's (min, max) range and
    rounded to two decimals.

    Args:
        claim_id: Identifier stored verbatim in the ``claim_id`` field.
        date: Object exposing ``strftime``; stored as a ``YYYY-MM-DD`` string.

    Returns:
        dict: The claim fields, with ``anomaly_type="none"`` and
        ``anomaly_label=0``.
    """
    # The order of the random draws below fixes the seeded output; keep it.
    chosen_provider = random.choice(list(PROVIDERS))
    name, field_of_practice, _profile = PROVIDERS[chosen_provider]
    chosen_patient = random.choice(PATIENTS)

    # NOTE(review): diagnosis and procedure are sampled independently here;
    # nothing in this function enforces that the pair is clinically coherent.
    diagnosis = random.choice(list(DIAGNOSIS_CODES))
    procedure = random.choice(list(PROCEDURE_CODES))
    procedure_text, lowest, highest = PROCEDURE_CODES[procedure]
    billed = round(random.uniform(lowest, highest), 2)
    service_day = date.strftime("%Y-%m-%d")
    payer = random.choice(INSURERS)

    claim = {}
    claim["claim_id"] = claim_id
    claim["date_of_service"] = service_day
    claim["patient_id"] = chosen_patient
    claim["provider_id"] = chosen_provider
    claim["provider_name"] = name
    claim["specialty"] = field_of_practice
    claim["diagnosis_code"] = diagnosis
    claim["diagnosis_description"] = DIAGNOSIS_CODES[diagnosis]
    claim["procedure_code"] = procedure
    claim["procedure_description"] = procedure_text
    claim["billed_amount"] = billed
    claim["insurer"] = payer
    claim["anomaly_type"] = "none"
    claim["anomaly_label"] = 0
    return claim
def inject_anomalies(claims_df, n_anomalies=18):
    """Inject specific, labeled anomaly types into a subset of claims.

    Rows are picked at random (without replacement) and rewritten in place.
    Anomaly types are assigned round-robin in the order listed in
    ``anomaly_types``, so each type appears roughly equally often.

    Args:
        claims_df: DataFrame of claims with the columns produced by
            ``generate_normal_claim``. It is modified in place.
        n_anomalies: Number of rows to turn into anomalies (default 18).
            Clamped to the range ``[0, len(claims_df)]`` so small or empty
            frames no longer make ``random.sample`` raise ``ValueError``.

    Returns:
        The same ``claims_df`` object, with the selected rows overwritten and
        their ``anomaly_type`` / ``anomaly_label`` fields set.
    """
    n_rows = len(claims_df)
    # random.sample rejects k larger than the population (and negative k).
    sample_size = max(0, min(n_anomalies, n_rows))
    anomaly_indices = random.sample(range(n_rows), k=sample_size)
    anomaly_types = [
        "diagnosis_procedure_mismatch",
        "duplicate_claim",
        "high_frequency_billing",
        "upcoding",
        "unbundling",
        "impossible_date",
    ]
    for i, idx in enumerate(anomaly_indices):
        atype = anomaly_types[i % len(anomaly_types)]
        row = claims_df.iloc[idx].copy()
        if atype == "diagnosis_procedure_mismatch":
            # Knee replacement billed against an upper respiratory infection.
            proc_desc, min_amt, max_amt = PROCEDURE_CODES["27447"]
            row["procedure_code"] = "27447"
            row["procedure_description"] = proc_desc
            row["billed_amount"] = round(random.uniform(min_amt, max_amt), 2)
            row["diagnosis_code"] = "J06.9"
            row["diagnosis_description"] = DIAGNOSIS_CODES["J06.9"]
        elif atype == "duplicate_claim":
            # Same patient, same procedure, same date and amount as another claim.
            # NOTE(review): the base row may itself be (or later become) an
            # injected anomaly; it is not excluded from the candidates.
            base_idx = random.choice([j for j in range(n_rows) if j != idx])
            base = claims_df.iloc[base_idx]
            for field in (
                "patient_id",
                "procedure_code",
                "procedure_description",
                "date_of_service",
                "billed_amount",
            ):
                row[field] = base[field]
        elif atype == "high_frequency_billing":
            # High-volume provider billing an expensive procedure repeatedly.
            provider_name, specialty, _ = PROVIDERS["PRV-006"]
            proc_desc, min_amt, max_amt = PROCEDURE_CODES["70553"]
            row["provider_id"] = "PRV-006"
            row["provider_name"] = provider_name
            # Keep the specialty in step with the reassigned provider;
            # otherwise the row keeps the previous provider's specialty.
            row["specialty"] = specialty
            row["procedure_code"] = "70553"
            row["procedure_description"] = proc_desc
            row["billed_amount"] = round(random.uniform(min_amt, max_amt), 2)
        elif atype == "upcoding":
            # Bill a higher-complexity code than clinically warranted, at an
            # amount above that code's normal maximum from the reference table.
            proc_desc, _, normal_max = PROCEDURE_CODES["99215"]
            row["procedure_code"] = "99215"
            row["procedure_description"] = proc_desc
            row["billed_amount"] = round(random.uniform(normal_max, 500), 2)
        elif atype == "unbundling":
            # Separately bill components that should be bundled.
            row["billed_amount"] = round(random.uniform(900, 1400), 2)
            row["procedure_code"] = "80053"
            row["procedure_description"] = "Unbundled lab panel — individual components billed separately"
        elif atype == "impossible_date":
            # Future date of service, measured from the moment of the run.
            # NOTE: this depends on the wall clock, so this field is not
            # reproducible across runs even though the RNGs are seeded.
            future_date = datetime.now() + timedelta(days=random.randint(30, 180))
            row["date_of_service"] = future_date.strftime("%Y-%m-%d")
        row["anomaly_type"] = atype
        row["anomaly_label"] = 1
        claims_df.iloc[idx] = row
    return claims_df
def generate_claims_dataset(n=120):
    """Assemble the synthetic claims table, anomalies included.

    Creates ``n`` baseline claims numbered ``CLM-0001`` onwards, each dated a
    random 0-365 days after 2024-01-01, passes them through
    ``inject_anomalies``, then shuffles the rows with a fixed ``random_state``
    and renumbers the index.

    Args:
        n: Number of claims to generate (default 120).

    Returns:
        pandas.DataFrame: The shuffled claims with a fresh 0-based index.
    """
    first_day = datetime(2024, 1, 1)
    # Arguments are evaluated left to right, so the date offset is drawn
    # before each claim's own random draws, exactly one claim at a time.
    records = [
        generate_normal_claim(
            f"CLM-{sequence:04d}",
            first_day + timedelta(days=random.randint(0, 365)),
        )
        for sequence in range(1, n + 1)
    ]
    with_anomalies = inject_anomalies(pd.DataFrame(records))
    # Shuffle so the injected rows are not recognisable by position.
    shuffled = with_anomalies.sample(frac=1, random_state=42)
    return shuffled.reset_index(drop=True)
if __name__ == "__main__":
    # Script entry point: build the default dataset, then print a one-line
    # summary followed by the id, type and amount of every flagged claim.
    df = generate_claims_dataset()
    print(f"Generated {len(df)} claims — {df['anomaly_label'].sum()} flagged anomalies")
    is_flagged = df["anomaly_label"] == 1
    summary_columns = ["claim_id", "anomaly_type", "billed_amount"]
    print(df.loc[is_flagged, summary_columns].to_string())