""" Dynamic dataset generation for Government Fraud Detection tasks. This module perturbs the base synthetic documents at episode reset time to reduce memorization while keeping the task schema and key identifiers stable. """ from __future__ import annotations import copy import random from datetime import date, timedelta from typing import Any, Dict, Optional from .documents import TASK1_DOCUMENTS, TASK2_DOCUMENTS, TASK3_DOCUMENTS def _rng(seed: Optional[int] = None) -> random.Random: return random.Random(seed) if seed is not None else random.Random() def _shift_date(r: random.Random, iso_date: str, max_days: int = 21) -> str: year, month, day = [int(x) for x in iso_date.split("-")] d = date(year, month, day) return (d + timedelta(days=r.randint(-max_days, max_days))).isoformat() def _jitter_amount(r: random.Random, value: float, pct: float) -> float: factor = 1.0 + r.uniform(-pct, pct) return round(value * factor, 2) def _add_noise_documents( docs: Dict[str, Dict[str, Any]], r: random.Random, count: int, prefix: str, doc_type: str, title_prefix: str, preview_prefix: str, ) -> None: """Add distractor documents that look plausible but are irrelevant.""" for index in range(1, count + 1): suffix = r.randint(100, 999) doc_id = f"{prefix}-D{index:02d}-{suffix}" docs[doc_id] = { "doc_type": doc_type, "title": f"{title_prefix} {index:02d} / Ref {suffix}", "preview": f"{preview_prefix} {suffix}", "content": { "note": "Irrelevant distractor document generated for robustness testing", "reference": suffix, "seeded_variant": True, }, } def generate_dynamic_documents(task_id: str, seed: Optional[int] = None) -> Dict[str, Dict[str, Any]]: """ Return per-episode task documents with randomized surface details. Notes: - Key document IDs remain unchanged so existing policies and graders still work. - Ground-truth relationships remain intact; only surface details are varied. """ r = _rng(seed) if task_id == "duplicate_billing": docs = copy.deepcopy(TASK1_DOCUMENTS) patient_names = ["Robert Haines", "Martha Stone", "Peter Larson", "Nina Ortiz", "Daniel Brooks"] provider_names = ["MedCorp Associates LLC", "Summit Care Group", "Northfield Medical Partners"] patient_name = r.choice(patient_names) provider_name = r.choice(provider_names) patient_alias = f"{patient_name} (obfuscated)" if r.random() < 0.5 else patient_name provider_alias = r.choice([ provider_name, provider_name.replace("Associates", "Assoc."), provider_name.replace("Partners", "Ptnrs"), ]) service_date = _shift_date(r, "2024-03-15", max_days=30) near_service_date = _shift_date(r, service_date, max_days=2) billed_amount = _jitter_amount(r, 185.0, pct=0.12) for claim_id, doc in docs.items(): content = doc["content"] content["submitted_date"] = _shift_date(r, content["submitted_date"], max_days=30) content["billed_amount"] = _jitter_amount(r, float(content["billed_amount"]), pct=0.15) if claim_id in {"CLAIM-001", "CLAIM-002", "CLAIM-004"}: content["patient_name"] = patient_alias content["provider_name"] = provider_alias content["provider_id"] = "PRV-8821" content["procedure_code"] = "99213" content["billed_amount"] = billed_amount if claim_id in {"CLAIM-001", "CLAIM-002"}: content["service_date"] = service_date elif claim_id == "CLAIM-004": content["service_date"] = near_service_date doc["preview"] = ( f"Patient {content['patient_id']}, Procedure {content['procedure_code']}, " f"Date {content['service_date']}, ${content['billed_amount']:.2f}" ) _add_noise_documents( docs, r, count=2, prefix="CLAIM", doc_type="medicare_claim", title_prefix="Medicare Claim Appendix", preview_prefix="Extra claim reference", ) return docs if task_id == "shell_company": docs = copy.deepcopy(TASK2_DOCUMENTS) contract_1 = docs["CONTRACT-001"]["content"] contract_2 = docs["CONTRACT-002"]["content"] contract_1["award_date"] = _shift_date(r, contract_1["award_date"], max_days=60) contract_2["award_date"] = _shift_date(r, contract_2["award_date"], max_days=60) contract_1["project"] = r.choice([ "GSA Region 4 Facility Renovation - Atlanta", "Federal Archive Retrofit - Atlanta", "Regional Operations Building Upgrade - Atlanta", ]) contract_2["project"] = r.choice([ "DOT Regional Office Network Upgrade", "Transport Systems Security Refresh", "Regional Data Backbone Modernization", ]) vendor_reg = docs["VENDOR-REG-001"]["content"] vendor_reg["legal_name"] = r.choice(["FastBuild LLC", "FastBuild Group LLC", "FastBuild Holdings LLC"]) vendor_reg["annual_revenue_reported"] = int(_jitter_amount(r, float(vendor_reg["annual_revenue_reported"]), pct=0.35)) vendor_reg["employees_reported"] = max(2, int(round(_jitter_amount(r, float(vendor_reg["employees_reported"]), pct=0.4)))) docs["TRUST-DOC-001"]["content"]["trust_date"] = _shift_date(r, docs["TRUST-DOC-001"]["content"]["trust_date"], max_days=180) docs["INVOICE-001"]["content"]["invoice_date"] = _shift_date(r, docs["INVOICE-001"]["content"]["invoice_date"], max_days=45) docs["GOV-EMPLOYEE-001"]["content"]["title"] = r.choice([ "Senior Contracting Officer", "Contracting Officer IV", "Federal Procurement Officer", ]) docs["CONTRACT-001"]["preview"] = ( f"FastBuild LLC awarded ${contract_1['award_amount'] / 1_000_000:.2f}M for {contract_1['project']}" ) docs["CONTRACT-002"]["preview"] = ( f"FastBuild LLC awarded ${contract_2['award_amount'] / 1_000_000:.2f}M for {contract_2['project']}" ) docs["VENDOR-REG-001"]["preview"] = ( "FastBuild LLC, Delaware LLC, " f"reported revenue ${vendor_reg['annual_revenue_reported']:,}" ) docs["TRUST-DOC-001"]["preview"] = ( f"Trustee: {docs['TRUST-DOC-001']['content']['trustee']}. " f"Trust date {docs['TRUST-DOC-001']['content']['trust_date']}" ) _add_noise_documents( docs, r, count=2, prefix="CORP", doc_type="corporate_filing", title_prefix="State Filing Appendix", preview_prefix="Irrelevant state filing ref", ) return docs if task_id == "fca_complaint": docs = copy.deepcopy(TASK3_DOCUMENTS) tip = docs["ANON-TIP-001"]["content"] tip["received_date"] = _shift_date(r, tip["received_date"], max_days=75) tip["estimated_fraud_amount"] = r.choice([ "Could be in the high single-digit millions", "Likely over $8M based on claims volume", "Potentially between $8M and $12M", ]) tip["tipster_relation"] = r.choice([ "Former billing department employee", "Former coding specialist", "Former claims auditor", ]) claim_batch = docs["CMS-CLAIM-BATCH-001"]["content"] total_claims = int(round(_jitter_amount(r, float(claim_batch["total_claims"]), pct=0.1))) total_claims = max(700, min(950, total_claims)) claim_batch["total_claims"] = total_claims claim_batch["comparison_industry_avg_k0831_claims"] = max( 90, min(180, int(round(_jitter_amount(r, 120.0, pct=0.2)))), ) sampled_orders = docs["PHYSICIAN-ORDERS-001"]["content"] sampled_orders["orders_supporting_k0831"] = r.randint(10, 16) sampled_orders["orders_not_supporting_k0831"] = 50 - sampled_orders["orders_supporting_k0831"] sampled_orders["orders_missing_entirely"] = r.randint(5, 11) complaints = docs["PATIENT-COMPLAINT-001"]["content"] complaints["complaints_reviewed"] = r.randint(20, 35) expert = docs["EXPERT-ANALYSIS-001"]["content"]["findings"] expert["non_compliant_claims_estimated_pct"] = r.randint(70, 82) expert["estimated_non_compliant_claims"] = int( total_claims * (expert["non_compliant_claims_estimated_pct"] / 100.0) ) docs["CMS-CLAIM-BATCH-001"]["preview"] = ( f"{total_claims} claims for K0831 power wheelchair, ${claim_batch['total_billed'] / 1_000_000:.1f}M total" ) docs["PHYSICIAN-ORDERS-001"]["preview"] = ( f"{sampled_orders['orders_not_supporting_k0831']} of 50 sampled physician orders do not support K0831 level" ) docs["PATIENT-COMPLAINT-001"]["preview"] = ( f"{complaints['complaints_reviewed']} complaints: received wrong equipment or no equipment" ) docs["EXPERT-ANALYSIS-001"]["preview"] = ( "Expert estimates improper Medicare payments in multi-million range " f"with {expert['non_compliant_claims_estimated_pct']}% non-compliance" ) _add_noise_documents( docs, r, count=3, prefix="MISC", doc_type="misc_ledger", title_prefix="Miscellaneous Appendix", preview_prefix="Irrelevant operational note", ) return docs raise ValueError(f"Unsupported task_id for dynamic generation: {task_id}")