#!/usr/bin/env python3
"""Generate sample data for testing the Clinical Trial Matching Pipeline.

Running this module as a script writes three files into the current
working directory:

* ``sample_trials.csv``          - five fictional oncology trials
* ``sample_patient_notes.csv``   - a dated series of clinical notes for one
  fictional patient
* ``sample_patient_summary.txt`` - a pre-made plain-text summary of that
  patient

Each generator can also be called directly and accepts an optional
``output_dir`` so the files can be written somewhere else (for example a
temporary directory in tests).
"""

from datetime import datetime, timedelta
from pathlib import Path
from typing import Union

import pandas as pd

PathLike = Union[str, Path]


def _lines(*items: str) -> str:
    """Join *items* into one newline-separated block of text.

    List-like fields (eligibility criteria, exclusions, bullet lists) are
    written one item per line.  Joining explicitly keeps that structure
    independent of how this source file happens to be formatted.
    """
    return "\n".join(items)


def _output_path(output_dir: PathLike, filename: str) -> Path:
    """Return ``output_dir / filename``, creating ``output_dir`` if needed."""
    directory = Path(output_dir)
    directory.mkdir(parents=True, exist_ok=True)
    return directory / filename


def create_sample_trials(output_dir: PathLike = ".") -> pd.DataFrame:
    """Create a sample trial database CSV.

    Writes ``sample_trials.csv`` (no index column) into *output_dir* and
    prints a one-line confirmation.

    Args:
        output_dir: Directory to write into; defaults to the current
            working directory.  It is created if it does not exist.

    Returns:
        The DataFrame that was written, with columns ``nct_id``,
        ``this_space``, ``trial_text`` and ``trial_boilerplate_text``.
    """
    # NOTE(review): the column names (including ``this_space``) are
    # presumably what the matching pipeline expects - confirm before renaming.
    trials = [
        {
            "nct_id": "NCT12345678",
            "this_space": _lines(
                "Metastatic non-small cell lung cancer (NSCLC) with EGFR "
                "exon 19 deletion or L858R mutation",
                "Prior treatment: At least one prior platinum-based "
                "chemotherapy regimen",
                "ECOG performance status: 0-2",
                "Measurable disease per RECIST v1.1",
                "Adequate organ function",
            ),
            "trial_text": (
                "Phase III randomized study of osimertinib versus "
                "platinum-based chemotherapy in patients with EGFR-mutated "
                "metastatic NSCLC who have progressed on first-line EGFR "
                "TKI therapy. Primary endpoint is progression-free "
                "survival. Secondary endpoints include overall survival, "
                "objective response rate, and quality of life."
            ),
            "trial_boilerplate_text": _lines(
                "No active brain metastases requiring immediate intervention",
                "No prior treatment with third-generation EGFR TKIs",
                "No interstitial lung disease or pneumonitis",
                "No congestive heart failure NYHA class III-IV",
                "No HIV, hepatitis B, or hepatitis C infection",
            ),
        },
        {
            "nct_id": "NCT23456789",
            "this_space": _lines(
                "HER2-positive metastatic breast cancer",
                "Prior treatment: Trastuzumab and pertuzumab in any setting",
                "ECOG performance status: 0-1",
                "Brain metastases allowed if treated and stable",
                "LVEF ≥50%",
            ),
            "trial_text": (
                "Phase II study of trastuzumab deruxtecan in HER2-positive "
                "metastatic breast cancer patients who have received prior "
                "trastuzumab and pertuzumab. Primary endpoint is objective "
                "response rate. Key secondary endpoints include duration of "
                "response, progression-free survival, and safety."
            ),
            "trial_boilerplate_text": _lines(
                "No history of pneumonitis or interstitial lung disease",
                "No concurrent cardiac dysfunction",
                "No active hepatitis B or C infection",
                "No pregnancy or breastfeeding",
            ),
        },
        {
            "nct_id": "NCT34567890",
            "this_space": _lines(
                "Advanced melanoma with BRAF V600E or V600K mutation",
                "Treatment-naive for metastatic disease (adjuvant therapy "
                "allowed if completed >6 months prior)",
                "ECOG performance status: 0-1",
                "No active autoimmune disease requiring systemic therapy",
                "Adequate bone marrow, hepatic, and renal function",
            ),
            "trial_text": (
                "Phase III randomized trial comparing dabrafenib plus "
                "trametinib versus vemurafenib monotherapy in previously "
                "untreated BRAF-mutant metastatic melanoma. Primary "
                "endpoint is overall survival. Secondary endpoints include "
                "progression-free survival, response rate, and toxicity."
            ),
            "trial_boilerplate_text": _lines(
                "No prior systemic therapy for metastatic melanoma",
                "No active brain metastases (treated and stable brain "
                "metastases allowed)",
                "No history of inflammatory bowel disease",
                "No significant cardiac disease",
                "No HIV infection on antiretroviral therapy",
            ),
        },
        {
            "nct_id": "NCT45678901",
            "this_space": _lines(
                "Microsatellite instability-high (MSI-H) or mismatch repair "
                "deficient (dMMR) advanced solid tumors",
                "Progressive disease on or after prior standard therapy",
                "ECOG performance status: 0-2",
                "Measurable disease per RECIST v1.1",
                "No prior checkpoint inhibitor therapy",
            ),
            "trial_text": (
                "Phase II basket study of pembrolizumab in patients with "
                "MSI-H/dMMR advanced solid tumors. Primary endpoint is "
                "objective response rate by tumor type. Secondary endpoints "
                "include duration of response, progression-free survival, "
                "and overall survival."
            ),
            "trial_boilerplate_text": _lines(
                "No active autoimmune disease requiring systemic therapy",
                "No history of severe immune-related adverse events",
                "No active pneumonitis or interstitial lung disease",
                "No concurrent systemic corticosteroids (>10mg prednisone "
                "equivalent daily)",
                "No HIV, hepatitis B, or hepatitis C infection",
            ),
        },
        {
            "nct_id": "NCT56789012",
            "this_space": _lines(
                "Advanced or metastatic renal cell carcinoma (RCC), clear "
                "cell histology",
                "No prior systemic therapy for advanced disease",
                "Intermediate or poor risk per IMDC criteria",
                "ECOG performance status: 0-1",
                "Measurable disease per RECIST v1.1",
            ),
            "trial_text": (
                "Phase III randomized study of cabozantinib plus nivolumab "
                "versus sunitinib in previously untreated advanced RCC. "
                "Primary endpoint is progression-free survival. Secondary "
                "endpoints include overall survival, objective response "
                "rate, and safety."
            ),
            "trial_boilerplate_text": _lines(
                "No prior systemic therapy for metastatic RCC",
                "No active brain metastases",
                "No history of bowel perforation or fistula",
                "No poorly controlled hypertension",
                "No active hepatitis B or C infection",
                "No significant cardiovascular disease",
            ),
        },
    ]

    df = pd.DataFrame(trials)
    csv_path = _output_path(output_dir, "sample_trials.csv")
    df.to_csv(csv_path, index=False)
    print(f"✓ Created {csv_path} with {len(df)} trials")
    return df


def create_sample_patient_notes(output_dir: PathLike = ".") -> pd.DataFrame:
    """Create sample patient clinical notes CSV.

    Writes ``sample_patient_notes.csv`` (no index column) into *output_dir*
    and prints a one-line confirmation.  Note dates are expressed as day
    offsets from 2023-01-01.

    Args:
        output_dir: Directory to write into; defaults to the current
            working directory.  It is created if it does not exist.

    Returns:
        The DataFrame that was written, with columns ``date``, ``text`` and
        ``note_type``, in chronological order.
    """
    base_date = datetime(2023, 1, 1)

    # (days after base_date, note_type, text)
    timeline = [
        (
            0,
            "clinical_note",
            "Patient is a 67-year-old male with a 40 pack-year smoking "
            "history presenting with cough and weight loss. CT chest shows "
            "a 4.5 cm right upper lobe mass with mediastinal "
            "lymphadenopathy.",
        ),
        (
            7,
            "pathology_report",
            "CT-guided lung biopsy performed. Pathology shows "
            "adenocarcinoma, moderately differentiated.",
        ),
        (
            14,
            "imaging_report",
            "PET/CT shows FDG-avid right upper lobe mass (SUVmax 12.3), "
            "right hilar nodes (SUVmax 8.7), and mediastinal nodes (SUVmax "
            "9.2). No distant metastatic disease identified.",
        ),
        (
            21,
            "ngs_report",
            "Next-generation sequencing (NGS) performed on lung biopsy "
            "specimen. Results: EGFR exon 19 deletion (L747_A750delinsP) "
            "detected. Other findings: TP53 p.R273H mutation, MYC "
            "amplification (copy number gain). PD-L1 expression by "
            "immunohistochemistry: 75% tumor proportion score. TMB: 4 "
            "mutations/Mb (low). No ALK, ROS1, BRAF, MET, RET, or KRAS "
            "alterations detected.",
        ),
        (
            28,
            "pathology_report",
            "Mediastinoscopy with biopsy of station 4R and 7 lymph nodes. "
            "Pathology confirms metastatic adenocarcinoma. Clinical stage: "
            "T2aN2M0, stage IIIA.",
        ),
        (
            42,
            "clinical_note",
            "Patient underwent concurrent chemoradiation with "
            "carboplatin/pemetrexed and 60 Gy radiation to primary tumor "
            "and mediastinum. Tolerated well with grade 2 esophagitis.",
        ),
        (
            112,
            "imaging_report",
            "Post-treatment CT chest shows near-complete response of "
            "primary tumor (now 1.2 cm) and resolution of lymphadenopathy. "
            "Started consolidation durvalumab.",
        ),
        (
            280,
            "imaging_report",
            "Surveillance CT shows new liver lesions (segment 6 and 7, "
            "largest 2.3 cm) and increase in size of lung primary to 3.1 "
            "cm. Progression of disease.",
        ),
        (
            287,
            "clinical_note",
            "Patient now has metastatic NSCLC (stage IV). ECOG performance "
            "status 1. Discussed treatment options. Given EGFR mutation, "
            "recommend EGFR TKI therapy.",
        ),
        (
            294,
            "clinical_note",
            "Started osimertinib 80 mg daily for EGFR-mutant metastatic "
            "NSCLC.",
        ),
        (
            378,
            "imaging_report",
            "Restaging CT shows partial response. Liver lesions decreased "
            "to 1.2 and 0.9 cm. Primary lung tumor stable at 2.8 cm. "
            "Tolerating osimertinib well with mild diarrhea and dry skin.",
        ),
        (
            560,
            "clinical_note",
            "Patient reports increased fatigue and back pain over past 3 "
            "weeks.",
        ),
        (
            567,
            "imaging_report",
            _lines(
                "CT chest/abdomen/pelvis shows:",
                "- Progression of liver metastases (segment 6: 3.8 cm, "
                "previously 1.2 cm; segment 7: 2.9 cm, previously 0.9 cm)",
                "- New liver lesions in segments 4 and 5",
                "- Lung primary increased to 4.2 cm",
                "- New small pleural effusion",
                "Assessment: Progressive disease on osimertinib.",
            ),
        ),
        (
            574,
            "clinical_note",
            "MRI brain with contrast shows no brain metastases. Patient "
            "has progressive EGFR-mutant NSCLC after first-line "
            "osimertinib. ECOG PS 1. Discussing clinical trial options for "
            "second-line therapy.",
        ),
    ]

    # Key order fixes the CSV column order: date, text, note_type.
    notes = [
        {
            "date": base_date + timedelta(days=day_offset),
            "text": text,
            "note_type": note_type,
        }
        for day_offset, note_type, text in timeline
    ]

    df = pd.DataFrame(notes)
    csv_path = _output_path(output_dir, "sample_patient_notes.csv")
    df.to_csv(csv_path, index=False)
    print(f"✓ Created {csv_path} with {len(df)} notes")
    return df


def create_sample_patient_summary(output_dir: PathLike = ".") -> str:
    """Create a sample patient summary text file.

    Writes ``sample_patient_summary.txt`` (UTF-8) into *output_dir* and
    prints a one-line confirmation.

    Args:
        output_dir: Directory to write into; defaults to the current
            working directory.  It is created if it does not exist.

    Returns:
        The summary text exactly as written to the file.
    """
    # NOTE(review): a few dates below differ slightly from the note timeline
    # in create_sample_patient_notes (e.g. the brain MRI date); left as-is
    # because this is fixture data - confirm whether that is intentional.
    summary = _lines(
        "Age: 67",
        "Sex: Male",
        "Cancer type: Non-small cell lung cancer (NSCLC)",
        "Histology: Adenocarcinoma, moderately differentiated",
        "Stage at diagnosis: Stage IIIA (T2aN2M0)",
        "Current extent: Metastatic (stage IV) with liver metastases",
        "Biomarkers:",
        "- EGFR exon 19 deletion (L747_A750delinsP)",
        "- TP53 p.R273H mutation",
        "- MYC amplification",
        "- PD-L1 75% TPS",
        "- TMB: 4 mutations/Mb (low)",
        "Treatment history:",
        "# 1/28/2023 - 4/15/2023: Concurrent chemoradiation "
        "(carboplatin/pemetrexed with 60 Gy)",
        "# 4/22/2023 - 10/5/2023: Consolidation durvalumab",
        "# 10/19/2023 - present: Osimertinib 80 mg daily for metastatic "
        "disease",
        "Disease course:",
        "- Initial diagnosis: January 2023, stage IIIA",
        "- Near-complete response to chemoradiation",
        "- Progression to stage IV in September 2023 (liver metastases)",
        "- Partial response to osimertinib",
        "- Current progression on osimertinib (July 2024) after ~9 months "
        "of therapy",
        "Current status:",
        "- ECOG performance status: 1",
        "- Progressive disease with liver metastases",
        "- No brain metastases on recent MRI",
        "Boilerplate: No evidence of brain metastases (MRI brain "
        "7/22/2024). No history of pneumonitis, interstitial lung disease, "
        "congestive heart failure, HIV, or hepatitis infection documented. "
        "Adequate performance status (ECOG 1).",
        "",  # trailing newline at end of file
    )

    txt_path = _output_path(output_dir, "sample_patient_summary.txt")
    # Explicit encoding so the file is identical regardless of platform locale.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(summary)

    print(f"✓ Created {txt_path}")
    return summary


def main() -> None:
    """Generate all sample files in the current directory and report them."""
    print("Generating sample data for Clinical Trial Matching Pipeline...\n")

    trials = create_sample_trials()
    notes = create_sample_patient_notes()
    create_sample_patient_summary()

    print("\n✓ All sample files created successfully!")
    print("\nFiles generated:")
    # Counts come from the generated data so the report cannot drift from it.
    print(f" - sample_trials.csv ({len(trials)} clinical trials)")
    print(f" - sample_patient_notes.csv ({len(notes)} clinical notes)")
    print(" - sample_patient_summary.txt (pre-made summary)")
    print("\nYou can now use these files to test the Gradio application.")


if __name__ == "__main__":
    main()