File size: 13,589 Bytes
e1624f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
"""
Synthetic clinical oncology data generator for OncoAgent.
Generates OncoCoT-format samples for pipeline validation.
All data is 100% synthetic — zero real patient information.
"""

import json
import os
import random
from typing import List, Dict

# Reproducibility seed (Rule #22)
# Seeds the module-global `random` generator once, at import time.
# NOTE(review): nothing in the visible code draws from `random`; confirm the
# seed is still needed before relying on it or removing it.
random.seed(42)

# Hand-written synthetic cases in OncoCoT format. Every entry is a dict with
# exactly three string fields: "history" (clinical presentation), "reasoning"
# (numbered chain-of-thought steps) and "conclusion" (assessment and next steps).
# Cases are grouped by risk tier: 5 high, 3 medium, 2 low.
SYNTHETIC_ONCOCOT_SAMPLES: List[Dict[str, str]] = [
    # === HIGH RISK (5 cases) ===
    {
        "history": (
            "62-year-old female presents with persistent dry cough for 3 months, "
            "unintentional weight loss of 8 kg, and hemoptysis. Chest CT reveals a "
            "2.5 cm spiculated mass in the left upper lobe with associated pleural "
            "thickening and enlarged mediastinal lymph nodes measuring 1.2 cm. "
            "Patient is a former smoker with 30 pack-year history."
        ),
        "reasoning": (
            "1. Identify lesion characteristics: 2.5 cm mass classifies as T1c/T2a. "
            "2. Morphology: 'Spiculated' margins are highly indicative of malignancy "
            "(positive predictive value >90%). "
            "3. Nodal involvement: Mediastinal lymph nodes at 1.2 cm suggest N2 status. "
            "4. Clinical correlation: Hemoptysis + weight loss + smoking history "
            "significantly increase pre-test probability. "
            "5. Staging synthesis: T2aN2M0 → Stage IIIA per AJCC 8th edition."
        ),
        "conclusion": (
            "High suspicion for non-small cell lung cancer (NSCLC), likely Stage IIIA. "
            "Recommend urgent tissue biopsy (CT-guided or bronchoscopy) and PET-CT "
            "for comprehensive staging. Multidisciplinary tumor board consultation required."
        ),
    },
    {
        "history": (
            "55-year-old male with a palpable 3.5 cm mass in the right breast, "
            "skin dimpling, and axillary lymphadenopathy on the ipsilateral side. "
            "Mammography shows an irregular dense mass with microcalcifications. "
            "Family history positive for BRCA2 mutation in first-degree relative."
        ),
        "reasoning": (
            "1. Mass characteristics: 3.5 cm irregular mass with microcalcifications "
            "is highly suspicious (BI-RADS 5). "
            "2. Clinical signs: Skin dimpling indicates possible Cooper ligament involvement. "
            "3. Nodal status: Ipsilateral axillary lymphadenopathy suggests N1 involvement. "
            "4. Risk factors: Male breast cancer accounts for <1% of cases, but BRCA2 "
            "significantly increases risk (6-8% lifetime). "
            "5. Staging estimate: T2N1M0 → Stage IIB."
        ),
        "conclusion": (
            "High suspicion for male breast carcinoma, likely Stage IIB. "
            "Recommend core needle biopsy with receptor testing (ER/PR/HER2), "
            "BRCA genetic testing, and staging workup including chest/abdominal CT."
        ),
    },
    {
        "history": (
            "70-year-old male presents with progressive difficulty swallowing solids "
            "over 4 months, weight loss of 12 kg, and retrosternal pain. Upper "
            "endoscopy reveals a 4 cm circumferential mass in the distal esophagus "
            "with mucosal ulceration. CT shows thickened esophageal wall and "
            "suspicious celiac lymph nodes."
        ),
        "reasoning": (
            "1. Lesion: 4 cm circumferential mass with ulceration is T3 (adventitial invasion likely). "
            "2. Location: Distal esophagus suggests adenocarcinoma (Barrett's association). "
            "3. Nodal disease: Celiac lymph nodes represent M1 lymph node disease per AJCC. "
            "4. Symptoms: Progressive dysphagia + significant weight loss indicate advanced disease. "
            "5. Staging: T3N1M1(LYM) → Stage IVA."
        ),
        "conclusion": (
            "High suspicion for esophageal adenocarcinoma, Stage IVA. "
            "Recommend endoscopic biopsy with HER2 testing, PET-CT for complete staging, "
            "and referral for palliative chemoradiation consideration."
        ),
    },
    {
        "history": (
            "48-year-old female with recently discovered hepatic masses on "
            "ultrasound performed for right upper quadrant pain. CT reveals "
            "multiple bilobar liver lesions (largest 6 cm) with arterial enhancement "
            "and washout. AFP level is 850 ng/mL. History of hepatitis C cirrhosis."
        ),
        "reasoning": (
            "1. Imaging: Arterial enhancement with washout is pathognomonic for HCC (LI-RADS 5). "
            "2. Biomarker: AFP >400 ng/mL is highly specific for hepatocellular carcinoma. "
            "3. Risk factor: HCV cirrhosis is the leading cause of HCC. "
            "4. Extent: Bilobar disease precludes surgical resection. "
            "5. Staging: Beyond Milan criteria (single ≤5cm or ≤3 lesions each ≤3cm) → BCLC Stage C."
        ),
        "conclusion": (
            "Hepatocellular carcinoma confirmed by imaging criteria (LI-RADS 5) and AFP elevation. "
            "BCLC Stage C. Recommend systemic therapy (atezolizumab + bevacizumab per NCCN) "
            "and liver transplant evaluation if disease responds."
        ),
    },
    {
        "history": (
            "58-year-old male with iron-deficiency anemia, change in bowel habits "
            "for 6 months, and a 2 cm mass found in the sigmoid colon on colonoscopy. "
            "Biopsy confirms moderately differentiated adenocarcinoma. CT abdomen shows "
            "3 suspicious pericolonic lymph nodes and 2 small liver lesions."
        ),
        "reasoning": (
            "1. Primary tumor: 2 cm sigmoid adenocarcinoma, moderately differentiated. "
            "2. Local spread: Pericolonic lymph nodes suggest N1 disease. "
            "3. Distant metastasis: Liver lesions are concerning for M1a hepatic metastases. "
            "4. Presentation: Iron-deficiency anemia is classic for right-sided colon cancer "
            "but can occur in sigmoid lesions with chronic occult bleeding. "
            "5. Staging: T3N1M1a → Stage IVA (AJCC 8th edition)."
        ),
        "conclusion": (
            "Sigmoid colon adenocarcinoma, Stage IVA with hepatic metastases. "
            "Recommend molecular profiling (MSI, KRAS/NRAS/BRAF), "
            "liver MRI for surgical resectability assessment, and FOLFOX/FOLFIRI-based "
            "systemic therapy per NCCN guidelines."
        ),
    },
    # === MEDIUM RISK (3 cases) ===
    {
        "history": (
            "45-year-old female with a 1.5 cm solid thyroid nodule found incidentally "
            "on carotid ultrasound. Fine needle aspiration shows Bethesda IV "
            "(follicular neoplasm). No cervical lymphadenopathy. TSH is normal."
        ),
        "reasoning": (
            "1. Nodule: 1.5 cm solid nodule with Bethesda IV cytology. "
            "2. Risk of malignancy: Bethesda IV carries 15-30% cancer risk. "
            "3. Favorable factors: No lymphadenopathy, normal TSH. "
            "4. Cannot distinguish follicular adenoma from carcinoma on cytology alone. "
            "5. Assessment: Intermediate risk requiring diagnostic surgery."
        ),
        "conclusion": (
            "Indeterminate thyroid nodule (Bethesda IV) with moderate malignancy risk. "
            "Recommend molecular testing (Afirma or ThyroSeq) if available. "
            "If molecular testing is inconclusive, diagnostic lobectomy is indicated."
        ),
    },
    {
        "history": (
            "60-year-old male with a PSA level of 7.2 ng/mL on routine screening. "
            "Digital rectal exam reveals a firm nodule on the right lobe. "
            "MRI prostate shows a PI-RADS 4 lesion in the peripheral zone, "
            "15 mm in greatest dimension. No extraprostatic extension."
        ),
        "reasoning": (
            "1. PSA: 7.2 ng/mL is elevated (normal <4.0), PSA density should be calculated. "
            "2. DRE: Palpable nodule correlates with imaging finding. "
            "3. MRI: PI-RADS 4 has ~60-70% probability of clinically significant cancer. "
            "4. Confined disease: No extraprostatic extension is favorable. "
            "5. Assessment: High probability of Gleason 3+4 or higher prostate cancer."
        ),
        "conclusion": (
            "Probable clinically significant prostate cancer. "
            "Recommend MRI-targeted fusion biopsy (minimum 12 systematic + 2-3 targeted cores). "
            "If positive, staging with PSMA PET-CT per NCCN guidelines."
        ),
    },
    {
        "history": (
            "52-year-old female with a 2 cm pancreatic cystic lesion found on CT "
            "performed for back pain. MRI with MRCP shows a branch-duct IPMN in the "
            "pancreatic body with a mural nodule measuring 5 mm. CA 19-9 is 45 U/mL. "
            "No main duct dilation."
        ),
        "reasoning": (
            "1. Cyst type: Branch-duct IPMN is the most common pancreatic cystic neoplasm. "
            "2. Worrisome feature: Mural nodule (5 mm) is a 'worrisome feature' per Fukuoka criteria. "
            "3. Size: 2 cm is below the high-risk threshold of 3 cm. "
            "4. Biomarker: CA 19-9 of 45 is borderline (normal <37). "
            "5. Assessment: Moderate risk — warrants EUS for further characterization."
        ),
        "conclusion": (
            "Branch-duct IPMN with worrisome features (mural nodule). "
            "Recommend endoscopic ultrasound (EUS) with FNA for cytology and cyst fluid analysis. "
            "If high-grade dysplasia found, surgical resection is indicated."
        ),
    },
    # === LOW RISK (2 cases) ===
    {
        "history": (
            "35-year-old female with a 1 cm well-circumscribed, oval, hypoechoic "
            "breast mass found on screening ultrasound. BI-RADS 3. No family history "
            "of breast cancer. No skin changes or axillary lymphadenopathy."
        ),
        "reasoning": (
            "1. Mass morphology: Well-circumscribed, oval shape is characteristic of fibroadenoma. "
            "2. BI-RADS 3: Probably benign (<2% malignancy risk). "
            "3. Age: 35 years old — breast cancer is rare at this age without risk factors. "
            "4. No concerning features: No skin changes, no lymphadenopathy. "
            "5. Assessment: Low risk, likely fibroadenoma."
        ),
        "conclusion": (
            "Probably benign breast mass (BI-RADS 3), most likely fibroadenoma. "
            "Recommend short-interval follow-up ultrasound at 6 months. "
            "If stable at 2 years, reclassify as BI-RADS 2 (benign)."
        ),
    },
    {
        "history": (
            "28-year-old male with a small, well-circumscribed 8 mm pulmonary nodule "
            "found incidentally on chest X-ray performed for pre-employment screening. "
            "Non-smoker, no respiratory symptoms, no weight loss. CT confirms a smooth, "
            "round, calcified nodule in the right middle lobe."
        ),
        "reasoning": (
            "1. Nodule: 8 mm, smooth margins, calcified — benign morphology. "
            "2. Calcification pattern: Diffuse calcification is highly associated with granuloma. "
            "3. Risk factors: Non-smoker, young age, asymptomatic. "
            "4. Fleischner criteria: Calcified nodules are generally benign and do not "
            "require follow-up imaging. "
            "5. Assessment: Very low risk, most likely granuloma (infectious etiology)."
        ),
        "conclusion": (
            "Benign calcified pulmonary granuloma. No malignancy concern. "
            "No further imaging or follow-up required per Fleischner Society guidelines. "
            "Reassure patient."
        ),
    },
]


def generate_oncocot_samples(output_path: str = "data/samples/oncocot_synthetic.json") -> str:
    """
    Writes the synthetic OncoCoT samples to a JSON file.

    The parent directory of ``output_path`` is created if it does not exist.
    A bare filename (no directory component) is written relative to the
    current working directory.

    Args:
        output_path: Path to the output JSON file.

    Returns:
        The absolute path to the generated file.
    """
    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is named.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX escapes.
        json.dump(SYNTHETIC_ONCOCOT_SAMPLES, f, ensure_ascii=False, indent=2)
    print(f"✅ Generated {len(SYNTHETIC_ONCOCOT_SAMPLES)} synthetic OncoCoT samples → {output_path}")
    return os.path.abspath(output_path)


def generate_pmc_patients_format(
    output_path: str = "data/samples/pmc_patients_synthetic.json",
) -> str:
    """
    Converts the OncoCoT samples into a PMC-Patients-compatible format.

    Each output record carries the keys ``patient``, ``medical_history``,
    ``reasoning`` and ``conclusion``. The parent directory of ``output_path``
    is created if it does not exist; a bare filename is written relative to
    the current working directory.

    Args:
        output_path: Path to the output JSON file.

    Returns:
        The absolute path to the generated file.
    """
    pmc_samples: List[Dict[str, str]] = [
        {
            # Both fields are filled from the same source text: the synthetic
            # samples have a single "history" field and no separate summary.
            "patient": sample["history"],
            "medical_history": sample["history"],
            "reasoning": sample["reasoning"],
            "conclusion": sample["conclusion"],
        }
        for sample in SYNTHETIC_ONCOCOT_SAMPLES
    ]

    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is named.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False writes non-ASCII characters as-is instead of \uXXXX escapes.
        json.dump(pmc_samples, f, ensure_ascii=False, indent=2)
    print(f"✅ Generated {len(pmc_samples)} PMC-Patients format samples → {output_path}")
    return os.path.abspath(output_path)


# Script entry point: write both sample files to their default locations.
if __name__ == "__main__":
    generate_oncocot_samples()
    generate_pmc_patients_format()
    print("🚀 All synthetic data generated successfully.")