#!/bin/bash
# Convert Synthea output to medical PDFs with PHI annotations.
#
# Usage: <this-script> [SYNTHEA_DIR] [PDF_DIR] [ANNOTATION_DIR] [NUM_DOCS]
#   SYNTHEA_DIR     passed to the generator as --synthea-output
#                   (default: ./data/synthetic/patients)
#   PDF_DIR         passed as --pdf-output; created if missing
#                   (default: ./data/pdfs)
#   ANNOTATION_DIR  passed as --annotations-output; created if missing
#                   (default: ./data/annotations)
#   NUM_DOCS        passed as --num-documents (default: 500)
#
# The generator path below is relative, so run this from the directory that
# contains src/.
#
# Exit status: 0 on success; otherwise the generator's own exit code (or the
# status of whichever setup command failed).

# Abort on any failing command, unset variable, or failing pipeline stage so a
# broken run can never fall through to the success banner.
set -euo pipefail

SYNTHEA_DIR=${1:-./data/synthetic/patients}
PDF_DIR=${2:-./data/pdfs}
ANNOTATION_DIR=${3:-./data/annotations}
NUM_DOCS=${4:-500}

# printf '%s' keeps user-supplied values literal (e.g. a path starting with
# '-n' or containing backslashes), which plain echo does not guarantee.
echo "Converting Synthea data to medical PDFs..."
printf '%s\n' "Synthea output: $SYNTHEA_DIR"
printf '%s\n' "PDF output: $PDF_DIR"
printf '%s\n' "Annotations: $ANNOTATION_DIR"
printf '%s\n' "Documents to generate: $NUM_DOCS"
echo ""

# Create output directories
mkdir -p "$PDF_DIR"
mkdir -p "$ANNOTATION_DIR"

# Run the PDF generator. Its status is captured with `|| ...` (rather than
# letting `set -e` kill the script silently) so the failure is reported on
# stderr and the original exit code is handed back to the caller.
generator_status=0
python src/data_generation/synthea_to_pdf.py \
  --synthea-output "$SYNTHEA_DIR" \
  --pdf-output "$PDF_DIR" \
  --annotations-output "$ANNOTATION_DIR" \
  --num-documents "$NUM_DOCS" || generator_status=$?

if ((generator_status != 0)); then
  printf 'Error: PDF generation failed (exit code %d)\n' "$generator_status" >&2
  exit "$generator_status"
fi

echo ""
echo "✓ PDF generation complete!"
echo ""
echo "Generated files:"
# xargs strips the padding some wc implementations put around the count.
find "$PDF_DIR" -name "*.pdf" | wc -l | xargs echo " PDFs:"
find "$ANNOTATION_DIR" -name "*.json" | wc -l | xargs echo " Annotations:"
echo ""
echo "Next steps:"
printf '%s\n' " 1. Review PDFs: ls $PDF_DIR"
printf '%s\n' " 2. Check annotations: ls $ANNOTATION_DIR"
echo " 3. Start training: python src/training/train_lora.py"