# mm-ai-demo / create_sample_data.py
# kenlkehl's picture
# Upload 8 files
# 2ed7323 verified
#!/usr/bin/env python3
"""
Generate sample data for testing the Clinical Trial Matching Pipeline
"""
import pandas as pd
from datetime import datetime, timedelta
def create_sample_trials(output_path: str = 'sample_trials.csv') -> pd.DataFrame:
    """Create a sample trial database CSV.

    Builds five fictional oncology trial records, writes them to
    ``output_path`` as CSV (without the DataFrame index) and prints a
    one-line confirmation.

    Each record has four string columns: ``nct_id``, ``this_space``,
    ``trial_text`` and ``trial_boilerplate_text``.

    Args:
        output_path: Destination of the CSV file. Defaults to
            ``sample_trials.csv`` in the current working directory.

    Returns:
        The DataFrame that was written, one row per trial.
    """
    # NOTE(review): the column names are presumably the keys the downstream
    # matching pipeline expects -- confirm against the consumer before renaming.
    trials = [
        {
            'nct_id': 'NCT12345678',
            'this_space': '''Metastatic non-small cell lung cancer (NSCLC) with EGFR exon 19 deletion or L858R mutation
Prior treatment: At least one prior platinum-based chemotherapy regimen
ECOG performance status: 0-2
Measurable disease per RECIST v1.1
Adequate organ function''',
            'trial_text': '''Phase III randomized study of osimertinib versus platinum-based chemotherapy in patients with
EGFR-mutated metastatic NSCLC who have progressed on first-line EGFR TKI therapy. Primary endpoint is progression-free
survival. Secondary endpoints include overall survival, objective response rate, and quality of life.''',
            'trial_boilerplate_text': '''No active brain metastases requiring immediate intervention
No prior treatment with third-generation EGFR TKIs
No interstitial lung disease or pneumonitis
No congestive heart failure NYHA class III-IV
No HIV, hepatitis B, or hepatitis C infection''',
        },
        {
            'nct_id': 'NCT23456789',
            # The LVEF threshold uses a real ">=" sign; the previous text was
            # a mis-decoded (mojibake) version of that character.
            'this_space': '''HER2-positive metastatic breast cancer
Prior treatment: Trastuzumab and pertuzumab in any setting
ECOG performance status: 0-1
Brain metastases allowed if treated and stable
LVEF ≥50%''',
            'trial_text': '''Phase II study of trastuzumab deruxtecan in HER2-positive metastatic breast cancer patients
who have received prior trastuzumab and pertuzumab. Primary endpoint is objective response rate. Key secondary endpoints
include duration of response, progression-free survival, and safety.''',
            'trial_boilerplate_text': '''No history of pneumonitis or interstitial lung disease
No concurrent cardiac dysfunction
No active hepatitis B or C infection
No pregnancy or breastfeeding''',
        },
        {
            'nct_id': 'NCT34567890',
            'this_space': '''Advanced melanoma with BRAF V600E or V600K mutation
Treatment-naive for metastatic disease (adjuvant therapy allowed if completed >6 months prior)
ECOG performance status: 0-1
No active autoimmune disease requiring systemic therapy
Adequate bone marrow, hepatic, and renal function''',
            'trial_text': '''Phase III randomized trial comparing dabrafenib plus trametinib versus vemurafenib monotherapy
in previously untreated BRAF-mutant metastatic melanoma. Primary endpoint is overall survival. Secondary endpoints include
progression-free survival, response rate, and toxicity.''',
            'trial_boilerplate_text': '''No prior systemic therapy for metastatic melanoma
No active brain metastases (treated and stable brain metastases allowed)
No history of inflammatory bowel disease
No significant cardiac disease
No HIV infection on antiretroviral therapy''',
        },
        {
            'nct_id': 'NCT45678901',
            'this_space': '''Microsatellite instability-high (MSI-H) or mismatch repair deficient (dMMR) advanced solid tumors
Progressive disease on or after prior standard therapy
ECOG performance status: 0-2
Measurable disease per RECIST v1.1
No prior checkpoint inhibitor therapy''',
            'trial_text': '''Phase II basket study of pembrolizumab in patients with MSI-H/dMMR advanced solid tumors.
Primary endpoint is objective response rate by tumor type. Secondary endpoints include duration of response,
progression-free survival, and overall survival.''',
            'trial_boilerplate_text': '''No active autoimmune disease requiring systemic therapy
No history of severe immune-related adverse events
No active pneumonitis or interstitial lung disease
No concurrent systemic corticosteroids (>10mg prednisone equivalent daily)
No HIV, hepatitis B, or hepatitis C infection''',
        },
        {
            'nct_id': 'NCT56789012',
            'this_space': '''Advanced or metastatic renal cell carcinoma (RCC), clear cell histology
No prior systemic therapy for advanced disease
Intermediate or poor risk per IMDC criteria
ECOG performance status: 0-1
Measurable disease per RECIST v1.1''',
            'trial_text': '''Phase III randomized study of cabozantinib plus nivolumab versus sunitinib in previously
untreated advanced RCC. Primary endpoint is progression-free survival. Secondary endpoints include overall survival,
objective response rate, and safety.''',
            'trial_boilerplate_text': '''No prior systemic therapy for metastatic RCC
No active brain metastases
No history of bowel perforation or fistula
No poorly controlled hypertension
No active hepatitis B or C infection
No significant cardiovascular disease''',
        },
    ]

    df = pd.DataFrame(trials)
    # index=False: the positional index carries no meaning for consumers.
    df.to_csv(output_path, index=False)
    print(f"✓ Created {output_path} with {len(df)} trials")
    return df
def create_sample_patient_notes(output_path: str = 'sample_patient_notes.csv') -> pd.DataFrame:
    """Create sample patient clinical notes CSV.

    Builds a chronological list of 14 fictional notes for a single lung
    cancer patient, writes them to ``output_path`` as CSV (without the
    DataFrame index) and prints a one-line confirmation.

    Each note has three fields: ``date`` (a ``datetime`` expressed as an
    offset in days from 2023-01-01), ``text`` and ``note_type`` (one of
    ``clinical_note``, ``pathology_report``, ``imaging_report``,
    ``ngs_report``).

    Args:
        output_path: Destination of the CSV file. Defaults to
            ``sample_patient_notes.csv`` in the current working directory.

    Returns:
        The DataFrame that was written, one row per note.
    """
    # Every note date is derived from this anchor so the whole timeline can
    # be shifted by changing a single value.
    base_date = datetime(2023, 1, 1)

    notes = [
        {
            'date': base_date,
            'text': 'Patient is a 67-year-old male with a 40 pack-year smoking history presenting with cough and weight loss. CT chest shows a 4.5 cm right upper lobe mass with mediastinal lymphadenopathy.',
            'note_type': 'clinical_note',
        },
        {
            'date': base_date + timedelta(days=7),
            'text': 'CT-guided lung biopsy performed. Pathology shows adenocarcinoma, moderately differentiated.',
            'note_type': 'pathology_report',
        },
        {
            'date': base_date + timedelta(days=14),
            'text': 'PET/CT shows FDG-avid right upper lobe mass (SUVmax 12.3), right hilar nodes (SUVmax 8.7), and mediastinal nodes (SUVmax 9.2). No distant metastatic disease identified.',
            'note_type': 'imaging_report',
        },
        {
            'date': base_date + timedelta(days=21),
            'text': '''Next-generation sequencing (NGS) performed on lung biopsy specimen.
Results: EGFR exon 19 deletion (L747_A750delinsP) detected.
Other findings: TP53 p.R273H mutation, MYC amplification (copy number gain).
PD-L1 expression by immunohistochemistry: 75% tumor proportion score.
TMB: 4 mutations/Mb (low).
No ALK, ROS1, BRAF, MET, RET, or KRAS alterations detected.''',
            'note_type': 'ngs_report',
        },
        {
            'date': base_date + timedelta(days=28),
            'text': 'Mediastinoscopy with biopsy of station 4R and 7 lymph nodes. Pathology confirms metastatic adenocarcinoma. Clinical stage: T2aN2M0, stage IIIA.',
            'note_type': 'pathology_report',
        },
        {
            'date': base_date + timedelta(days=42),
            'text': 'Patient underwent concurrent chemoradiation with carboplatin/pemetrexed and 60 Gy radiation to primary tumor and mediastinum. Tolerated well with grade 2 esophagitis.',
            'note_type': 'clinical_note',
        },
        {
            'date': base_date + timedelta(days=112),
            'text': 'Post-treatment CT chest shows near-complete response of primary tumor (now 1.2 cm) and resolution of lymphadenopathy. Started consolidation durvalumab.',
            'note_type': 'imaging_report',
        },
        {
            'date': base_date + timedelta(days=280),
            'text': 'Surveillance CT shows new liver lesions (segment 6 and 7, largest 2.3 cm) and increase in size of lung primary to 3.1 cm. Progression of disease.',
            'note_type': 'imaging_report',
        },
        {
            'date': base_date + timedelta(days=287),
            'text': 'Patient now has metastatic NSCLC (stage IV). ECOG performance status 1. Discussed treatment options. Given EGFR mutation, recommend EGFR TKI therapy.',
            'note_type': 'clinical_note',
        },
        {
            'date': base_date + timedelta(days=294),
            'text': 'Started osimertinib 80 mg daily for EGFR-mutant metastatic NSCLC.',
            'note_type': 'clinical_note',
        },
        {
            'date': base_date + timedelta(days=378),
            'text': 'Restaging CT shows partial response. Liver lesions decreased to 1.2 and 0.9 cm. Primary lung tumor stable at 2.8 cm. Tolerating osimertinib well with mild diarrhea and dry skin.',
            'note_type': 'imaging_report',
        },
        {
            'date': base_date + timedelta(days=560),
            'text': 'Patient reports increased fatigue and back pain over past 3 weeks.',
            'note_type': 'clinical_note',
        },
        {
            'date': base_date + timedelta(days=567),
            'text': '''CT chest/abdomen/pelvis shows:
- Progression of liver metastases (segment 6: 3.8 cm, previously 1.2 cm; segment 7: 2.9 cm, previously 0.9 cm)
- New liver lesions in segments 4 and 5
- Lung primary increased to 4.2 cm
- New small pleural effusion
Assessment: Progressive disease on osimertinib.''',
            'note_type': 'imaging_report',
        },
        {
            'date': base_date + timedelta(days=574),
            'text': 'MRI brain with contrast shows no brain metastases. Patient has progressive EGFR-mutant NSCLC after first-line osimertinib. ECOG PS 1. Discussing clinical trial options for second-line therapy.',
            'note_type': 'clinical_note',
        },
    ]

    df = pd.DataFrame(notes)
    # index=False: the positional index carries no meaning for consumers.
    df.to_csv(output_path, index=False)
    print(f"✓ Created {output_path} with {len(df)} notes")
    return df
def create_sample_patient_summary(output_path: str = 'sample_patient_summary.txt') -> str:
    """Create a sample patient summary text file.

    Writes a fixed, hand-written summary of the fictional lung cancer
    patient to ``output_path`` and prints a one-line confirmation.

    Args:
        output_path: Destination of the text file. Defaults to
            ``sample_patient_summary.txt`` in the current working directory.

    Returns:
        The summary text exactly as written to the file.
    """
    # NOTE(review): the dates below are hand-written and do not exactly line
    # up with the offsets used in create_sample_patient_notes (e.g. osimertinib
    # start 10/19/2023 here vs. base_date + 294 days = 2023-10-22 there; brain
    # MRI 7/22/2024 here vs. base_date + 574 days = 2024-07-28 there). Left
    # untouched because it is unclear which side is authoritative -- confirm.
    summary = """Cancer type: Non-small cell lung cancer (NSCLC)
Histology: Adenocarcinoma, moderately differentiated
Stage at diagnosis: Stage IIIA (T2aN2M0)
Current extent: Metastatic (stage IV) with liver metastases
Biomarkers:
- EGFR exon 19 deletion (L747_A750delinsP)
- TP53 p.R273H mutation
- MYC amplification
- PD-L1 75% TPS
- TMB: 4 mutations/Mb (low)
Treatment history:
# 1/28/2023 - 4/15/2023: Concurrent chemoradiation (carboplatin/pemetrexed with 60 Gy)
# 4/22/2023 - 10/5/2023: Consolidation durvalumab
# 10/19/2023 - present: Osimertinib 80 mg daily for metastatic disease
Disease course:
- Initial diagnosis: January 2023, stage IIIA
- Near-complete response to chemoradiation
- Progression to stage IV in September 2023 (liver metastases)
- Partial response to osimertinib
- Current progression on osimertinib (July 2024) after ~9 months of therapy
Current status:
- ECOG performance status: 1
- Progressive disease with liver metastases
- No brain metastases on recent MRI
Boilerplate:
No evidence of brain metastases (MRI brain 7/22/2024).
No history of pneumonitis, interstitial lung disease, congestive heart failure, HIV, or hepatitis infection documented.
Adequate performance status (ECOG 1).
"""
    # Explicit encoding so the output does not depend on the platform's
    # locale default.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(summary)
    print(f"✓ Created {output_path}")
    return summary
def main():
    """Generate every sample file and print a short report of what was written.

    Calls the three ``create_sample_*`` functions with their default output
    paths, so the files land in the current working directory.
    """
    print("Generating sample data for Clinical Trial Matching Pipeline...\n")
    trials_df = create_sample_trials()
    notes_df = create_sample_patient_notes()
    create_sample_patient_summary()
    print("\n✓ All sample files created successfully!")
    print("\nFiles generated:")
    # Report the real row counts rather than hard-coded numbers so this
    # listing cannot drift from the data defined in the generator functions.
    print(f" - sample_trials.csv ({len(trials_df)} clinical trials)")
    print(f" - sample_patient_notes.csv ({len(notes_df)} clinical notes)")
    print(" - sample_patient_summary.txt (pre-made summary)")
    print("\nYou can now use these files to test the Gradio application.")


if __name__ == "__main__":
    main()