"""
Synthetic clinical oncology data generator for OncoAgent.
Generates OncoCoT-format samples for pipeline validation.
All data is 100% synthetic — zero real patient information.
"""
| import json | |
| import os | |
| import random | |
| from typing import List, Dict | |
# Reproducibility seed (Rule #22)
# NOTE(review): nothing in the visible part of this module draws from `random`;
# the call is kept because seeding the global RNG at import time is an
# observable module-level side effect that other code may rely on.
random.seed(42)

# Hand-written synthetic cases. Every entry is a dict with exactly three string
# fields: "history" (case presentation), "reasoning" (numbered chain-of-thought
# steps) and "conclusion" (assessment and recommendation). Adjacent string
# literals inside each pair of parentheses are concatenated into one string.
# The HIGH / MEDIUM / LOW grouping below exists only as comments; no risk label
# is stored in the data itself.
SYNTHETIC_ONCOCOT_SAMPLES: List[Dict[str, str]] = [
    # === HIGH RISK (5 cases) ===
    {
        "history": (
            "62-year-old female presents with persistent dry cough for 3 months, "
            "unintentional weight loss of 8 kg, and hemoptysis. Chest CT reveals a "
            "2.5 cm spiculated mass in the left upper lobe with associated pleural "
            "thickening and enlarged mediastinal lymph nodes measuring 1.2 cm. "
            "Patient is a former smoker with 30 pack-year history."
        ),
        "reasoning": (
            "1. Identify lesion characteristics: 2.5 cm mass classifies as T1c/T2a. "
            "2. Morphology: 'Spiculated' margins are highly indicative of malignancy "
            "(positive predictive value >90%). "
            "3. Nodal involvement: Mediastinal lymph nodes at 1.2 cm suggest N2 status. "
            "4. Clinical correlation: Hemoptysis + weight loss + smoking history "
            "significantly increase pre-test probability. "
            "5. Staging synthesis: T2aN2M0 → Stage IIIA per AJCC 8th edition."
        ),
        "conclusion": (
            "High suspicion for non-small cell lung cancer (NSCLC), likely Stage IIIA. "
            "Recommend urgent tissue biopsy (CT-guided or bronchoscopy) and PET-CT "
            "for comprehensive staging. Multidisciplinary tumor board consultation required."
        ),
    },
    {
        "history": (
            "55-year-old male with a palpable 3.5 cm mass in the right breast, "
            "skin dimpling, and axillary lymphadenopathy on the ipsilateral side. "
            "Mammography shows an irregular dense mass with microcalcifications. "
            "Family history positive for BRCA2 mutation in first-degree relative."
        ),
        "reasoning": (
            "1. Mass characteristics: 3.5 cm irregular mass with microcalcifications "
            "is highly suspicious (BI-RADS 5). "
            "2. Clinical signs: Skin dimpling indicates possible Cooper ligament involvement. "
            "3. Nodal status: Ipsilateral axillary lymphadenopathy suggests N1 involvement. "
            "4. Risk factors: Male breast cancer accounts for <1% of cases, but BRCA2 "
            "significantly increases risk (6-8% lifetime). "
            "5. Staging estimate: T2N1M0 → Stage IIB."
        ),
        "conclusion": (
            "High suspicion for male breast carcinoma, likely Stage IIB. "
            "Recommend core needle biopsy with receptor testing (ER/PR/HER2), "
            "BRCA genetic testing, and staging workup including chest/abdominal CT."
        ),
    },
    # NOTE(review): the reasoning below treats celiac nodes as M1 disease. Recent
    # AJCC editions are understood to classify celiac nodes as regional for
    # esophageal cancer — have a clinician confirm before this sample is used as
    # a staging reference. Text left unchanged.
    {
        "history": (
            "70-year-old male presents with progressive difficulty swallowing solids "
            "over 4 months, weight loss of 12 kg, and retrosternal pain. Upper "
            "endoscopy reveals a 4 cm circumferential mass in the distal esophagus "
            "with mucosal ulceration. CT shows thickened esophageal wall and "
            "suspicious celiac lymph nodes."
        ),
        "reasoning": (
            "1. Lesion: 4 cm circumferential mass with ulceration is T3 (adventitial invasion likely). "
            "2. Location: Distal esophagus suggests adenocarcinoma (Barrett's association). "
            "3. Nodal disease: Celiac lymph nodes represent M1 lymph node disease per AJCC. "
            "4. Symptoms: Progressive dysphagia + significant weight loss indicate advanced disease. "
            "5. Staging: T3N1M1(LYM) → Stage IVA."
        ),
        "conclusion": (
            "High suspicion for esophageal adenocarcinoma, Stage IVA. "
            "Recommend endoscopic biopsy with HER2 testing, PET-CT for complete staging, "
            "and referral for palliative chemoradiation consideration."
        ),
    },
    # NOTE(review): the history states no vascular invasion, extrahepatic spread
    # or performance status, which BCLC stage C is presumably based on — confirm
    # the stage assignment with a clinician. Text left unchanged.
    {
        "history": (
            "48-year-old female with recently discovered hepatic masses on "
            "ultrasound performed for right upper quadrant pain. CT reveals "
            "multiple bilobar liver lesions (largest 6 cm) with arterial enhancement "
            "and washout. AFP level is 850 ng/mL. History of hepatitis C cirrhosis."
        ),
        "reasoning": (
            "1. Imaging: Arterial enhancement with washout is pathognomonic for HCC (LI-RADS 5). "
            "2. Biomarker: AFP >400 ng/mL is highly specific for hepatocellular carcinoma. "
            "3. Risk factor: HCV cirrhosis is the leading cause of HCC. "
            "4. Extent: Bilobar disease precludes surgical resection. "
            "5. Staging: Beyond Milan criteria (single ≤5cm or ≤3 lesions each ≤3cm) → BCLC Stage C."
        ),
        "conclusion": (
            "Hepatocellular carcinoma confirmed by imaging criteria (LI-RADS 5) and AFP elevation. "
            "BCLC Stage C. Recommend systemic therapy (atezolizumab + bevacizumab per NCCN) "
            "and liver transplant evaluation if disease responds."
        ),
    },
    {
        "history": (
            "58-year-old male with iron-deficiency anemia, change in bowel habits "
            "for 6 months, and a 2 cm mass found in the sigmoid colon on colonoscopy. "
            "Biopsy confirms moderately differentiated adenocarcinoma. CT abdomen shows "
            "3 suspicious pericolonic lymph nodes and 2 small liver lesions."
        ),
        "reasoning": (
            "1. Primary tumor: 2 cm sigmoid adenocarcinoma, moderately differentiated. "
            "2. Local spread: Pericolonic lymph nodes suggest N1 disease. "
            "3. Distant metastasis: Liver lesions are concerning for M1a hepatic metastases. "
            "4. Presentation: Iron-deficiency anemia is classic for right-sided colon cancer "
            "but can occur in sigmoid lesions with chronic occult bleeding. "
            "5. Staging: T3N1M1a → Stage IVA (AJCC 8th edition)."
        ),
        "conclusion": (
            "Sigmoid colon adenocarcinoma, Stage IVA with hepatic metastases. "
            "Recommend molecular profiling (MSI, KRAS/NRAS/BRAF), "
            "liver MRI for surgical resectability assessment, and FOLFOX/FOLFIRI-based "
            "systemic therapy per NCCN guidelines."
        ),
    },
    # === MEDIUM RISK (3 cases) ===
    {
        "history": (
            "45-year-old female with a 1.5 cm solid thyroid nodule found incidentally "
            "on carotid ultrasound. Fine needle aspiration shows Bethesda IV "
            "(follicular neoplasm). No cervical lymphadenopathy. TSH is normal."
        ),
        "reasoning": (
            "1. Nodule: 1.5 cm solid nodule with Bethesda IV cytology. "
            "2. Risk of malignancy: Bethesda IV carries 15-30% cancer risk. "
            "3. Favorable factors: No lymphadenopathy, normal TSH. "
            "4. Cannot distinguish follicular adenoma from carcinoma on cytology alone. "
            "5. Assessment: Intermediate risk requiring diagnostic surgery."
        ),
        "conclusion": (
            "Indeterminate thyroid nodule (Bethesda IV) with moderate malignancy risk. "
            "Recommend molecular testing (Afirma or ThyroSeq) if available. "
            "If molecular testing is inconclusive, diagnostic lobectomy is indicated."
        ),
    },
    {
        "history": (
            "60-year-old male with a PSA level of 7.2 ng/mL on routine screening. "
            "Digital rectal exam reveals a firm nodule on the right lobe. "
            "MRI prostate shows a PI-RADS 4 lesion in the peripheral zone, "
            "15 mm in greatest dimension. No extraprostatic extension."
        ),
        "reasoning": (
            "1. PSA: 7.2 ng/mL is elevated (normal <4.0), PSA density should be calculated. "
            "2. DRE: Palpable nodule correlates with imaging finding. "
            "3. MRI: PI-RADS 4 has ~60-70% probability of clinically significant cancer. "
            "4. Confined disease: No extraprostatic extension is favorable. "
            "5. Assessment: High probability of Gleason 3+4 or higher prostate cancer."
        ),
        "conclusion": (
            "Probable clinically significant prostate cancer. "
            "Recommend MRI-targeted fusion biopsy (minimum 12 systematic + 2-3 targeted cores). "
            "If positive, staging with PSMA PET-CT per NCCN guidelines."
        ),
    },
    {
        "history": (
            "52-year-old female with a 2 cm pancreatic cystic lesion found on CT "
            "performed for back pain. MRI with MRCP shows a branch-duct IPMN in the "
            "pancreatic body with a mural nodule measuring 5 mm. CA 19-9 is 45 U/mL. "
            "No main duct dilation."
        ),
        "reasoning": (
            "1. Cyst type: Branch-duct IPMN is the most common pancreatic cystic neoplasm. "
            "2. Worrisome feature: Mural nodule (5 mm) is a 'worrisome feature' per Fukuoka criteria. "
            "3. Size: 2 cm is below the high-risk threshold of 3 cm. "
            "4. Biomarker: CA 19-9 of 45 is borderline (normal <37). "
            "5. Assessment: Moderate risk — warrants EUS for further characterization."
        ),
        "conclusion": (
            "Branch-duct IPMN with worrisome features (mural nodule). "
            "Recommend endoscopic ultrasound (EUS) with FNA for cytology and cyst fluid analysis. "
            "If high-grade dysplasia found, surgical resection is indicated."
        ),
    },
    # === LOW RISK (2 cases) ===
    {
        "history": (
            "35-year-old female with a 1 cm well-circumscribed, oval, hypoechoic "
            "breast mass found on screening ultrasound. BI-RADS 3. No family history "
            "of breast cancer. No skin changes or axillary lymphadenopathy."
        ),
        "reasoning": (
            "1. Mass morphology: Well-circumscribed, oval shape is characteristic of fibroadenoma. "
            "2. BI-RADS 3: Probably benign (<2% malignancy risk). "
            "3. Age: 35 years old — breast cancer is rare at this age without risk factors. "
            "4. No concerning features: No skin changes, no lymphadenopathy. "
            "5. Assessment: Low risk, likely fibroadenoma."
        ),
        "conclusion": (
            "Probably benign breast mass (BI-RADS 3), most likely fibroadenoma. "
            "Recommend short-interval follow-up ultrasound at 6 months. "
            "If stable at 2 years, reclassify as BI-RADS 2 (benign)."
        ),
    },
    {
        "history": (
            "28-year-old male with a small, well-circumscribed 8 mm pulmonary nodule "
            "found incidentally on chest X-ray performed for pre-employment screening. "
            "Non-smoker, no respiratory symptoms, no weight loss. CT confirms a smooth, "
            "round, calcified nodule in the right middle lobe."
        ),
        "reasoning": (
            "1. Nodule: 8 mm, smooth margins, calcified — benign morphology. "
            "2. Calcification pattern: Diffuse calcification is highly associated with granuloma. "
            "3. Risk factors: Non-smoker, young age, asymptomatic. "
            "4. Fleischner criteria: Calcified nodules are generally benign and do not "
            "require follow-up imaging. "
            "5. Assessment: Very low risk, most likely granuloma (infectious etiology)."
        ),
        "conclusion": (
            "Benign calcified pulmonary granuloma. No malignancy concern. "
            "No further imaging or follow-up required per Fleischner Society guidelines. "
            "Reassure patient."
        ),
    },
]
def generate_oncocot_samples(output_path: str = "data/samples/oncocot_synthetic.json") -> str:
    """
    Write the synthetic OncoCoT samples to a JSON file.

    The parent directory of ``output_path`` is created when it does not exist.
    A bare file name (no directory component) is written into the current
    working directory.

    Args:
        output_path: Path to the output JSON file.

    Returns:
        The absolute path to the generated file.
    """
    target = os.path.abspath(output_path)
    # Derive the directory from the absolute path: os.path.dirname() of a bare
    # file name is "", and os.makedirs("") raises FileNotFoundError.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters (e.g. "→", "≤") readable in the file.
        json.dump(SYNTHETIC_ONCOCOT_SAMPLES, f, ensure_ascii=False, indent=2)
    print(f"✅ Generated {len(SYNTHETIC_ONCOCOT_SAMPLES)} synthetic OncoCoT samples → {output_path}")
    return target
def generate_pmc_patients_format(
    output_path: str = "data/samples/pmc_patients_synthetic.json",
) -> str:
    """
    Convert the OncoCoT samples into a PMC-Patients-compatible format and write them as JSON.

    Each output record has the keys "patient", "medical_history", "reasoning"
    and "conclusion". The parent directory of ``output_path`` is created when
    it does not exist; a bare file name is written into the current working
    directory.

    Args:
        output_path: Path to the output JSON file.

    Returns:
        The absolute path to the generated file.
    """
    # "patient" and "medical_history" both carry the sample's history text;
    # the source samples have no separate field for either.
    pmc_samples: List[Dict[str, str]] = [
        {
            "patient": sample["history"],
            "medical_history": sample["history"],
            "reasoning": sample["reasoning"],
            "conclusion": sample["conclusion"],
        }
        for sample in SYNTHETIC_ONCOCOT_SAMPLES
    ]
    target = os.path.abspath(output_path)
    # Derive the directory from the absolute path: os.path.dirname() of a bare
    # file name is "", and os.makedirs("") raises FileNotFoundError.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, "w", encoding="utf-8") as f:
        json.dump(pmc_samples, f, ensure_ascii=False, indent=2)
    print(f"✅ Generated {len(pmc_samples)} PMC-Patients format samples → {output_path}")
    return target
def _main() -> None:
    """Script entry point: write both synthetic datasets to their default paths."""
    # Same order as before: OncoCoT file first, then the PMC-Patients-format file.
    for build in (generate_oncocot_samples, generate_pmc_patients_format):
        build()
    print("🚀 All synthetic data generated successfully.")


if __name__ == "__main__":
    _main()