File size: 6,581 Bytes
a6b8ecc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
#!/usr/bin/env python3
"""
Generate test medical PDFs without Synthea dependency.
This creates synthetic medical documents for initial testing.
"""
import sys
import os
from pathlib import Path
from faker import Faker
import random
import json
def generate_fake_patient_data(faker, num_patients=100):
    """Generate fake patient data without Synthea.

    Every random decision (how many conditions a patient has, which ones,
    dosage wording, ...) is drawn from ``faker``'s own random generator rather
    than the global ``random`` module, so seeding Faker is enough to make the
    output reproducible.

    Args:
        faker: A ``Faker`` instance (or any object exposing the same provider
            methods used below, e.g. ``name``, ``random_int``,
            ``random_element`` and ``random_elements``).
        num_patients: Number of patient records to create. Values below 1
            yield an empty list.

    Returns:
        A list of dicts, one per patient, holding identifying fields
        (``name``, ``birth_date``, ``ssn``, ``phone``, ``email``, ``address``,
        ``mrn``, ``insurance_id``), populated ``conditions``, ``medications``
        and ``allergies`` lists, and empty ``procedures``, ``encounters``,
        ``immunizations`` and ``observations`` lists.
    """
    condition_names = ('Hypertension', 'Diabetes Type 2', 'Asthma', 'COPD', 'CAD')
    medication_names = ('Lisinopril 10mg', 'Metformin 500mg', 'Atorvastatin 20mg', 'Omeprazole 20mg')
    allergens = ('Penicillin', 'Sulfa drugs', 'Aspirin', 'Latex')
    severities = ('mild', 'moderate', 'severe')
    frequencies = ('daily', 'twice daily', 'three times daily')

    def pick_distinct(options, low, high):
        """Return between ``low`` and ``high`` distinct entries of ``options``."""
        count = faker.random_int(min=low, max=high)
        if count == 0:
            return []
        # unique=True samples without replacement, so a patient never lists
        # the same condition, medication or allergen twice.
        return list(faker.random_elements(elements=options, length=count, unique=True))

    patients = []
    for _ in range(num_patients):
        conditions = [
            {
                'code': code,
                'onset': faker.date_between(start_date='-10y', end_date='today').strftime('%Y-%m-%d'),
                'status': 'active',
            }
            for code in pick_distinct(condition_names, 1, 3)
        ]
        medications = [
            {
                'name': name,
                'dosage': f'Take {faker.random_int(min=1, max=3)} tablet(s) {faker.random_element(frequencies)}',
                'prescriber': f'Dr. {faker.last_name()}',
            }
            for name in pick_distinct(medication_names, 1, 4)
        ]
        allergies = [
            {
                'substance': substance,
                'severity': faker.random_element(severities),
            }
            for substance in pick_distinct(allergens, 0, 2)
        ]

        patients.append({
            'name': faker.name(),
            'birth_date': faker.date_of_birth(minimum_age=18, maximum_age=90).strftime('%Y-%m-%d'),
            'ssn': faker.ssn(),
            'phone': faker.phone_number(),
            'email': faker.email(),
            # Faker addresses are multi-line; flatten them to a single line.
            'address': faker.address().replace('\n', ', '),
            'mrn': f"MRN-{faker.random_number(digits=8, fix_len=True)}",
            'insurance_id': f"INS-{faker.random_number(digits=10, fix_len=True)}",
            # Medical data
            'conditions': conditions,
            'medications': medications,
            'allergies': allergies,
            'procedures': [],
            'encounters': [],
            'immunizations': [],
            'observations': [],
        })
    return patients
def main():
    """Command-line entry point: write synthetic medical PDFs plus annotations.

    Parses ``--num-patients``, ``--num-documents``, ``--output-dir``,
    ``--annotations-dir`` and ``--seed``; builds a pool of fake patients with
    ``generate_fake_patient_data``; then, for each document, picks a random
    patient and document type, renders a PDF containing a heading and a
    patient-information table, and writes a JSON file listing the
    identifying values placed in that PDF.

    Both Faker and the global ``random`` module are seeded from ``--seed``
    so that patient data and the patient/document-type selection are
    reproducible between runs. File names still embed the current time.

    Exits with status 2 (via ``argparse``) when documents are requested but
    ``--num-patients`` is less than 1, since there would be no patient to
    put in them.
    """
    import argparse
    from datetime import datetime

    parser = argparse.ArgumentParser(description='Generate test medical PDFs')
    parser.add_argument('--num-patients', type=int, default=100, help='Number of patients')
    parser.add_argument('--num-documents', type=int, default=500, help='Number of PDFs to generate')
    parser.add_argument('--output-dir', type=str, default='./data/pdfs', help='Output directory')
    parser.add_argument('--annotations-dir', type=str, default='./data/annotations', help='Annotations directory')
    parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducible output')
    args = parser.parse_args()

    # random.choice() on an empty patient pool would raise IndexError deep
    # inside the generation loop; reject the combination up front instead.
    if args.num_documents > 0 and args.num_patients < 1:
        parser.error('--num-patients must be at least 1 when --num-documents is greater than 0')

    # Imported lazily (after argument parsing) so --help works even when
    # reportlab is not installed, and a missing dependency fails before any
    # work is done.
    from reportlab.lib import colors
    from reportlab.lib.pagesizes import letter
    from reportlab.lib.styles import getSampleStyleSheet
    from reportlab.lib.units import inch
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle

    print("="*60)
    print("Generating Test Medical PDFs (without Synthea)")
    print("="*60)
    print(f"Patients: {args.num_patients}")
    print(f"Documents: {args.num_documents}")
    print(f"Output: {args.output_dir}")
    print()

    # Initialize Faker. Faker.seed() only seeds Faker's own generator, so the
    # global random module (used below to pick patients and document types)
    # is seeded separately.
    faker = Faker()
    Faker.seed(args.seed)
    random.seed(args.seed)

    # Generate fake patient data
    print("Generating synthetic patient data...")
    patients = generate_fake_patient_data(faker, args.num_patients)
    print(f"✓ Generated {len(patients)} patients")

    # The PDFs are built directly with reportlab here rather than through a
    # generator that expects Synthea output.
    output_dir = Path(args.output_dir)
    annotations_dir = Path(args.annotations_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    annotations_dir.mkdir(parents=True, exist_ok=True)

    document_types = ['prescription', 'lab_report', 'insurance_claim']

    # Loop-invariant styling, built once and reused for every document.
    styles = getSampleStyleSheet()
    table_style = TableStyle([
        ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 10),
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
    ])

    print(f"\nGenerating {args.num_documents} PDFs...")
    for i in range(args.num_documents):
        patient = random.choice(patients)
        doc_type = random.choice(document_types)

        # Sample the clock once so the file name, the printed date and the
        # annotation timestamp all agree for a given document.
        now = datetime.now()
        pdf_name = f"{doc_type}_{i:04d}_{now.strftime('%Y%m%d_%H%M%S')}.pdf"
        pdf_path = output_dir / pdf_name

        # Create simple PDF: heading followed by the patient table.
        doc = SimpleDocTemplate(str(pdf_path), pagesize=letter)
        story = [
            Paragraph(f"<b>{doc_type.upper().replace('_', ' ')}</b>", styles['Heading1']),
            Spacer(1, 0.2 * inch),
        ]

        # Patient info
        patient_data = [
            ['Patient Name:', patient['name']],
            ['Date of Birth:', patient['birth_date']],
            ['MRN:', patient['mrn']],
            ['Phone:', patient['phone']],
            ['Date:', now.strftime('%Y-%m-%d')],
        ]
        patient_table = Table(patient_data, colWidths=[2 * inch, 4 * inch])
        patient_table.setStyle(table_style)
        story.append(patient_table)
        doc.build(story)

        # Create annotation: the identifying values written into this PDF.
        annotations = [
            {'type': 'name', 'value': patient['name'], 'page': 1},
            {'type': 'date', 'value': patient['birth_date'], 'page': 1},
            {'type': 'mrn', 'value': patient['mrn'], 'page': 1},
            {'type': 'phone', 'value': patient['phone'], 'page': 1},
        ]
        annotation_file = annotations_dir / f"{pdf_name}.json"
        with open(annotation_file, 'w', encoding='utf-8') as f:
            json.dump({
                'document': pdf_name,
                'annotations': annotations,
                'timestamp': now.isoformat(),
            }, f, indent=2)

        if (i + 1) % 100 == 0:
            print(f" Generated {i + 1}/{args.num_documents} PDFs...")

    print("\n✓ Generation complete!")
    print(f" PDFs: {output_dir}")
    print(f" Annotations: {annotations_dir}")

    # These counts cover everything currently in the directories, including
    # files left over from earlier runs.
    print("\nGenerated files:")
    print(f" {sum(1 for _ in output_dir.glob('*.pdf'))} PDFs")
    print(f" {sum(1 for _ in annotations_dir.glob('*.json'))} annotations")

    print("\n" + "="*60)
    print("Next Steps:")
    print("="*60)
    # Point at the directory actually used (prints "data/pdfs" by default).
    print(f"1. Review PDFs: ls {output_dir}")
    print("2. Download model: python scripts/download_model.py")
    print("3. Train LoRA: python src/training/train_lora.py")
    print("="*60)
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|