vision-token-masking-phi / scripts /convert_to_pdfs.sh

Ric

Initial commit: Justitia - Selective Vision Token Masking for PHI-Compliant OCR

a6b8ecc 2 months ago

1.09 kB

	#!/bin/bash

	# Convert Synthea output to medical PDFs with PHI annotations.
	#
	# Usage: convert_to_pdfs.sh [SYNTHEA_DIR] [PDF_DIR] [ANNOTATION_DIR] [NUM_DOCS]
	#   SYNTHEA_DIR     passed as --synthea-output     (default: ./data/synthetic/patients)
	#   PDF_DIR         passed as --pdf-output         (default: ./data/pdfs)
	#   ANNOTATION_DIR  passed as --annotations-output (default: ./data/annotations)
	#   NUM_DOCS        passed as --num-documents      (default: 500)
	#
	# The generator script is referenced by a relative path, so run this from
	# the directory that contains src/.
	#
	# Exit status: 0 on success; 1 if an output directory cannot be created;
	# otherwise the generator's own non-zero exit status.

	SYNTHEA_DIR=${1:-./data/synthetic/patients}
	PDF_DIR=${2:-./data/pdfs}
	ANNOTATION_DIR=${3:-./data/annotations}
	NUM_DOCS=${4:-500}

	echo "Converting Synthea data to medical PDFs..."
	echo "Synthea output: $SYNTHEA_DIR"
	echo "PDF output: $PDF_DIR"
	echo "Annotations: $ANNOTATION_DIR"
	echo "Documents to generate: $NUM_DOCS"
	echo ""

	# Create output directories; stop early if either cannot be created.
	mkdir -p "$PDF_DIR" "$ANNOTATION_DIR" || {
		echo "Error: could not create output directories" >&2
		exit 1
	}

	# Run the PDF generator. On failure, propagate its exit status instead of
	# falling through to the success banner below.
	python src/data_generation/synthea_to_pdf.py \
		--synthea-output "$SYNTHEA_DIR" \
		--pdf-output "$PDF_DIR" \
		--annotations-output "$ANNOTATION_DIR" \
		--num-documents "$NUM_DOCS" || {
		status=$?
		echo "Error: PDF generation failed (exit code $status)" >&2
		exit "$status"
	}

	echo ""
	echo "✓ PDF generation complete!"
	echo ""
	echo "Generated files:"
	# Count matching files; xargs trims any padding wc adds before echo prints
	# the label followed by the number.
	find "$PDF_DIR" -name "*.pdf" | wc -l | xargs echo " PDFs:"
	find "$ANNOTATION_DIR" -name "*.json" | wc -l | xargs echo " Annotations:"
	echo ""
	echo "Next steps:"
	echo " 1. Review PDFs: ls $PDF_DIR"
	echo " 2. Check annotations: ls $ANNOTATION_DIR"
	echo " 3. Start training: python src/training/train_lora.py"