# Convert Synthea output to medical PDFs with PHI annotations.
#
# Positional arguments (all optional):
#   $1  Synthea output directory, passed to the generator as --synthea-output
#       (default: ./data/synthetic/patients)
#   $2  Directory that receives the PDFs, passed as --pdf-output
#       (default: ./data/pdfs)
#   $3  Directory that receives the annotations, passed as --annotations-output
#       (default: ./data/annotations)
#   $4  Number of documents to generate, passed as --num-documents
#       (default: 500)
#
# Exit status: 0 on success; otherwise the exit status of the failing step
# (directory creation or the Python generator).

# Stop on unhandled command failures and on use of unset variables so a
# failed step can never be followed by the success banner below.
set -eu

SYNTHEA_DIR=${1:-./data/synthetic/patients}
PDF_DIR=${2:-./data/pdfs}
ANNOTATION_DIR=${3:-./data/annotations}
NUM_DOCS=${4:-500}

echo "Converting Synthea data to medical PDFs..."
echo "Synthea output: $SYNTHEA_DIR"
echo "PDF output: $PDF_DIR"
echo "Annotations: $ANNOTATION_DIR"
echo "Documents to generate: $NUM_DOCS"
echo ""

# Create output directories
mkdir -p "$PDF_DIR"
mkdir -p "$ANNOTATION_DIR"

# Run the PDF generator. The status is captured explicitly (rather than left
# to `set -e`) so the failure can be reported before exiting with the same code.
generator_status=0
python src/data_generation/synthea_to_pdf.py \
  --synthea-output "$SYNTHEA_DIR" \
  --pdf-output "$PDF_DIR" \
  --annotations-output "$ANNOTATION_DIR" \
  --num-documents "$NUM_DOCS" || generator_status=$?

if [ "$generator_status" -ne 0 ]; then
  echo "✗ PDF generation failed (exit code $generator_status)" >&2
  exit "$generator_status"
fi

echo ""
echo "✓ PDF generation complete!"
echo ""
echo "Generated files:"
# `xargs echo` also trims the leading padding some `wc` implementations emit.
find "$PDF_DIR" -name "*.pdf" | wc -l | xargs echo " PDFs:"
find "$ANNOTATION_DIR" -name "*.json" | wc -l | xargs echo " Annotations:"
echo ""
echo "Next steps:"
echo " 1. Review PDFs: ls $PDF_DIR"
echo " 2. Check annotations: ls $ANNOTATION_DIR"
echo " 3. Start training: python src/training/train_lora.py"