|
|
|
|
|
""" |
|
|
Generate synthetic test medical PDFs without a Synthea dependency.

Fake patient records are created with Faker, rendered as PDF documents, and
paired with JSON annotation files for initial testing.
|
|
""" |
|
|
|
|
|
import sys |
|
|
import os |
|
|
from pathlib import Path |
|
|
from faker import Faker |
|
|
import random |
|
|
import json |
|
|
|
|
|
def generate_fake_patient_data(faker, num_patients=100):
    """Generate fake patient records without Synthea.

    All randomness is drawn from ``faker`` rather than the global ``random``
    module, so seeding Faker is enough to make the output reproducible.
    Condition, medication and allergy names are sampled without replacement,
    so a single patient never lists the same entry twice.

    Args:
        faker: A ``faker.Faker`` instance (or any object exposing the same
            provider methods used below).
        num_patients: Number of patient records to create. Zero or a
            negative value yields an empty list.

    Returns:
        A list of dicts, one per patient, holding the identifying fields
        (``name``, ``birth_date``, ``ssn``, ``phone``, ``email``, ``address``,
        ``mrn``, ``insurance_id``), the generated ``conditions``,
        ``medications`` and ``allergies`` lists, and empty ``procedures``,
        ``encounters``, ``immunizations`` and ``observations`` lists.
    """
    condition_names = ['Hypertension', 'Diabetes Type 2', 'Asthma', 'COPD', 'CAD']
    medication_names = ['Lisinopril 10mg', 'Metformin 500mg', 'Atorvastatin 20mg', 'Omeprazole 20mg']
    allergy_substances = ['Penicillin', 'Sulfa drugs', 'Aspirin', 'Latex']
    severities = ['mild', 'moderate', 'severe']
    frequencies = ['daily', 'twice daily', 'three times daily']

    def pick_unique(options, low, high):
        """Return between ``low`` and ``high`` distinct entries of ``options``."""
        count = faker.random_int(min=low, max=high)
        if count == 0:
            return []
        return list(faker.random_elements(elements=options, length=count, unique=True))

    patients = []
    for _ in range(num_patients):
        conditions = [
            {
                'code': code,
                'onset': faker.date_between(start_date='-10y', end_date='today').strftime('%Y-%m-%d'),
                'status': 'active',
            }
            for code in pick_unique(condition_names, 1, 3)
        ]
        medications = [
            {
                'name': med_name,
                'dosage': f'Take {faker.random_int(min=1, max=3)} tablet(s) {faker.random_element(frequencies)}',
                'prescriber': f'Dr. {faker.last_name()}',
            }
            for med_name in pick_unique(medication_names, 1, 4)
        ]
        allergies = [
            {
                'substance': substance,
                'severity': faker.random_element(severities),
            }
            for substance in pick_unique(allergy_substances, 0, 2)
        ]

        patients.append({
            'name': faker.name(),
            'birth_date': faker.date_of_birth(minimum_age=18, maximum_age=90).strftime('%Y-%m-%d'),
            'ssn': faker.ssn(),
            'phone': faker.phone_number(),
            'email': faker.email(),
            # Faker addresses are multi-line; flatten them to a single line.
            'address': faker.address().replace('\n', ', '),
            'mrn': f"MRN-{faker.random_number(digits=8, fix_len=True)}",
            'insurance_id': f"INS-{faker.random_number(digits=10, fix_len=True)}",
            'conditions': conditions,
            'medications': medications,
            'allergies': allergies,
            'procedures': [],
            'encounters': [],
            'immunizations': [],
            'observations': [],
        })

    return patients
|
|
|
|
|
|
|
|
def main():
    """Command-line entry point: generate synthetic patients, PDFs and annotations.

    Parses the command-line options, builds fake patient records, then for
    each requested document renders a single-table PDF for a randomly chosen
    patient and writes a JSON annotation file listing the identifying values
    that were placed in that PDF. Both Faker and the ``random`` module are
    seeded from ``--seed`` so repeated runs pick the same patients and
    document types.

    Exits with an argparse usage error if documents are requested but
    ``--num-patients`` is less than 1.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Generate test medical PDFs')
    parser.add_argument('--num-patients', type=int, default=100, help='Number of patients')
    parser.add_argument('--num-documents', type=int, default=500, help='Number of PDFs to generate')
    parser.add_argument('--output-dir', type=str, default='./data/pdfs', help='Output directory')
    parser.add_argument('--annotations-dir', type=str, default='./data/annotations', help='Annotations directory')
    parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducible output')

    args = parser.parse_args()

    # Each document needs a patient to draw from; without this check
    # random.choice() would fail with an IndexError on the empty list.
    if args.num_documents > 0 and args.num_patients < 1:
        parser.error('--num-patients must be at least 1 when generating documents')

    print("=" * 60)
    print("Generating Test Medical PDFs (without Synthea)")
    print("=" * 60)
    print(f"Patients: {args.num_patients}")
    print(f"Documents: {args.num_documents}")
    print(f"Output: {args.output_dir}")
    print()

    # Seed both random sources: Faker drives the patient fields, the stdlib
    # `random` module drives patient / document-type selection below.
    Faker.seed(args.seed)
    random.seed(args.seed)
    faker = Faker()

    print("Generating synthetic patient data...")
    patients = generate_fake_patient_data(faker, args.num_patients)
    print(f"✓ Generated {len(patients)} patients")

    # Imported here, after argument parsing, so `--help` and argument errors
    # do not require reportlab to be importable.
    from datetime import datetime
    from reportlab.lib.pagesizes import letter
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
    from reportlab.lib import colors
    from reportlab.lib.styles import getSampleStyleSheet
    from reportlab.lib.units import inch

    output_dir = Path(args.output_dir)
    annotations_dir = Path(args.annotations_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    annotations_dir.mkdir(parents=True, exist_ok=True)

    document_types = ['prescription', 'lab_report', 'insurance_claim']

    # Loop-invariant layout objects are built once instead of per document.
    styles = getSampleStyleSheet()
    table_style = TableStyle([
        ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 10),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
    ])

    print(f"\nGenerating {args.num_documents} PDFs...")

    for i in range(args.num_documents):
        patient = random.choice(patients)
        doc_type = random.choice(document_types)

        # One clock reading per document keeps the file name, the printed
        # date and the annotation timestamp consistent with each other.
        now = datetime.now()
        pdf_name = f"{doc_type}_{i:04d}_{now.strftime('%Y%m%d_%H%M%S')}.pdf"
        pdf_path = output_dir / pdf_name

        # Document title followed by a two-column table of patient details.
        story = [
            Paragraph(f"<b>{doc_type.upper().replace('_', ' ')}</b>", styles['Heading1']),
            Spacer(1, 0.2 * inch),
        ]

        patient_data = [
            ['Patient Name:', patient['name']],
            ['Date of Birth:', patient['birth_date']],
            ['MRN:', patient['mrn']],
            ['Phone:', patient['phone']],
            ['Date:', now.strftime('%Y-%m-%d')],
        ]
        patient_table = Table(patient_data, colWidths=[2 * inch, 4 * inch])
        patient_table.setStyle(table_style)
        story.append(patient_table)

        SimpleDocTemplate(str(pdf_path), pagesize=letter).build(story)

        # Record the identifying values written into this PDF.
        annotations = [
            {'type': 'name', 'value': patient['name'], 'page': 1},
            {'type': 'date', 'value': patient['birth_date'], 'page': 1},
            {'type': 'mrn', 'value': patient['mrn'], 'page': 1},
            {'type': 'phone', 'value': patient['phone'], 'page': 1},
        ]

        annotation_file = annotations_dir / f"{pdf_name}.json"
        with open(annotation_file, 'w', encoding='utf-8') as f:
            json.dump({
                'document': pdf_name,
                'annotations': annotations,
                'timestamp': now.isoformat(),
            }, f, indent=2)

        if (i + 1) % 100 == 0:
            print(f"  Generated {i + 1}/{args.num_documents} PDFs...")

    print("\n✓ Generation complete!")
    print(f"  PDFs: {output_dir}")
    print(f"  Annotations: {annotations_dir}")
    # These counts cover every matching file in the directories, including
    # any left over from earlier runs.
    print("\nGenerated files:")
    print(f"  {len(list(output_dir.glob('*.pdf')))} PDFs")
    print(f"  {len(list(annotations_dir.glob('*.json')))} annotations")

    print("\n" + "=" * 60)
    print("Next Steps:")
    print("=" * 60)
    # Point at the directory actually used rather than the default path.
    print(f"1. Review PDFs: ls {output_dir}")
    print("2. Download model: python scripts/download_model.py")
    print("3. Train LoRA: python src/training/train_lora.py")
    print("=" * 60)
|
|
|
|
|
|
|
|
# Script entry point: run the generator only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|