#!/bin/bash
#
# Set up Synthea for the Justitia PHI-OCR project.
#
# Run this from the project root. Relative to that directory it will:
#   1. check that java and git are available;
#   2. clone (or update) Synthea into external/synthea and build it;
#   3. write a custom properties file into the Synthea checkout;
#   4. write scripts/generate_synthea_data.sh, a wrapper around run_synthea;
#   5. run a 5-patient smoke test into data/synthetic/test.

set -euo pipefail

# Every path is anchored to the start directory instead of relying on chains
# of "cd ../..", so later writes always land where they are meant to.
readonly PROJECT_ROOT="$PWD"
readonly EXTERNAL_DIR="external"
readonly SYNTHEA_DIR="${PROJECT_ROOT}/${EXTERNAL_DIR}/synthea"
readonly SYNTHEA_REPO_URL="https://github.com/synthetichealth/synthea.git"
readonly SYNTHEA_CONFIG="src/main/resources/synthea_phi_config.properties"
readonly SCRIPTS_DIR="${PROJECT_ROOT}/scripts"
readonly GENERATOR_SCRIPT="${SCRIPTS_DIR}/generate_synthea_data.sh"
readonly TEST_OUTPUT_DIR="${PROJECT_ROOT}/data/synthetic/test"
readonly MIN_JAVA_MAJOR=11

# Print a message to stderr and abort the script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the opening banner.
print_banner() {
  echo "=========================================="
  echo "Setting up Synthea for Justitia PHI-OCR"
  echo "=========================================="
}

# Verify that java and git are on PATH, report the Java version, and warn
# (without aborting) when that version looks older than MIN_JAVA_MAJOR.
check_prerequisites() {
  local version_output version_re java_version java_major

  echo "Checking Java installation..."
  if ! command -v java >/dev/null 2>&1; then
    echo "Error: Java is not installed. Please install Java JDK 11 or newer." >&2
    echo "Visit: https://www.oracle.com/java/technologies/downloads/" >&2
    exit 1
  fi

  # "java -version" reports on stderr. Capture the whole output and take the
  # first double-quoted token, so an extra leading line cannot be mistaken
  # for the version string.
  version_output=$(java -version 2>&1) || die "Error: 'java -version' failed."
  version_re='"([^"]+)"'
  if [[ "$version_output" =~ $version_re ]]; then
    java_version="${BASH_REMATCH[1]}"
  else
    java_version="unknown"
  fi
  echo "Found Java version: $java_version"

  # Leading digits are the major version, except for the legacy "1.x"
  # numbering where the second component is (1.8.0_292 -> 8).
  java_major="${java_version%%[!0-9]*}"
  if [[ "$java_major" == "1" ]]; then
    java_major="${java_version#1.}"
    java_major="${java_major%%[!0-9]*}"
  fi

  # Deliberately a warning only: the Gradle build below is the real test.
  if [[ -z "$java_major" ]]; then
    echo "Warning: could not determine the Java major version; continuing." >&2
  elif ((10#$java_major < MIN_JAVA_MAJOR)); then
    echo "Warning: Java $java_version is older than JDK ${MIN_JAVA_MAJOR}; the build may fail." >&2
  fi

  command -v git >/dev/null 2>&1 ||
    die "Error: git is not installed. It is required to fetch Synthea."
}

# Clone Synthea into external/synthea, or pull when the checkout exists.
fetch_synthea() {
  if [[ ! -d "${PROJECT_ROOT}/${EXTERNAL_DIR}" ]]; then
    echo "Creating external directory..."
    mkdir -p "${PROJECT_ROOT}/${EXTERNAL_DIR}"
  fi

  if [[ -d "$SYNTHEA_DIR" ]]; then
    echo "Synthea directory already exists. Updating..."
    cd "$SYNTHEA_DIR"
    git pull
  else
    echo "Cloning Synthea repository..."
    git clone "$SYNTHEA_REPO_URL" "$SYNTHEA_DIR"
  fi
}

# Build Synthea with its Gradle wrapper, skipping the test task.
build_synthea() {
  echo "Building Synthea (this may take a few minutes)..."
  cd "$SYNTHEA_DIR"
  ./gradlew build -x test
}

# Write the project's custom properties file into the Synthea checkout.
# NOTE(review): no run_synthea invocation in this script passes this file to
# Synthea; confirm how it is meant to be picked up.
write_synthea_config() {
  echo "Creating custom Synthea configuration..."
  cd "$SYNTHEA_DIR"
  cat >"$SYNTHEA_CONFIG" <<'EOF'
# Custom Synthea settings for Justitia PHI-OCR (written by the setup script).

generate.demographics = true
generate.vital_signs = true
generate.medications = true
generate.conditions = true
generate.allergies = true
generate.procedures = true
generate.immunizations = true
generate.encounters = true
generate.imaging_studies = true
generate.devices = true
generate.supplies = true

exporter.ccda.export = true
exporter.fhir.export = true
exporter.csv.export = true
exporter.text.export = true
exporter.pdf.export = false

exporter.csv.included_files = patients,encounters,conditions,medications,procedures,immunizations,allergies,devices,supplies
exporter.csv.append_mode = false

generate.demographics.socioeconomic.weights.income = 1,1,1,1,1
generate.demographics.socioeconomic.weights.education = 1,1,1,1,1

generate.geography.country = United States
generate.geography.state = Massachusetts

generate.demographics.min_age = 0
generate.demographics.max_age = 100

generate.log_patients = true
generate.keep_patients = true
EOF
}

# Write scripts/generate_synthea_data.sh and mark it executable. The heredoc
# delimiter is quoted, so the body is written literally and its variables are
# expanded only when the generated script itself runs.
write_generator_script() {
  echo "Creating generation script..."
  mkdir -p "$SCRIPTS_DIR"
  cat >"$GENERATOR_SCRIPT" <<'EOF'
#!/bin/bash
#
# Generate synthetic patients with Synthea.
#
# Usage: ./scripts/generate_synthea_data.sh [num_patients] [state] [output_dir]
#   num_patients  number of patients to generate (default: 100)
#   state         state passed to run_synthea -s (default: Massachusetts)
#   output_dir    output directory, relative to the current directory or
#                 absolute (default: ./data/synthetic/synthea)

set -euo pipefail

PATIENTS=${1:-100}
STATE=${2:-Massachusetts}
OUTPUT_DIR=${3:-./data/synthetic/synthea}

# This script lives in <project root>/scripts, so the Synthea checkout can be
# located from the script's own path regardless of the caller's directory.
PROJECT_ROOT=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." >/dev/null && pwd)

echo "Generating $PATIENTS synthetic patients from $STATE..."

mkdir -p "$OUTPUT_DIR"
# Resolve to an absolute path before changing directory, so both relative and
# absolute output directories end up pointing at the same place afterwards.
OUTPUT_DIR_ABS=$(cd -- "$OUTPUT_DIR" >/dev/null && pwd)

cd "$PROJECT_ROOT/external/synthea"
./run_synthea -p "$PATIENTS" -s "$STATE" --exporter.baseDirectory="$OUTPUT_DIR_ABS"

echo "Generation complete. Output saved to $OUTPUT_DIR"

echo "Generated files:"
find "$OUTPUT_DIR_ABS" -type f -name "*.json" | wc -l | xargs echo " FHIR bundles:"
find "$OUTPUT_DIR_ABS" -type f -name "*.xml" | wc -l | xargs echo " C-CDA documents:"
find "$OUTPUT_DIR_ABS" -type f -name "*.csv" | wc -l | xargs echo " CSV files:"

echo "Done!"
EOF

  chmod +x "$GENERATOR_SCRIPT"
}

# Generate 5 patients into data/synthetic/test and confirm the directory
# exists afterwards. Under "set -e" a failing run_synthea already aborts the
# script, so the directory check is a second guard.
run_smoke_test() {
  echo "Testing Synthea with 5 patients..."
  cd "$SYNTHEA_DIR"
  ./run_synthea -p 5 -s Massachusetts --exporter.baseDirectory="../../data/synthetic/test"

  if [[ -d "$TEST_OUTPUT_DIR" ]]; then
    echo "✓ Synthea test successful!"
    echo " Generated test files in data/synthetic/test/"
  else
    echo "✗ Synthea test failed. Please check the output above for errors." >&2
    exit 1
  fi
}

# Print the closing banner with usage hints and next steps.
print_summary() {
  cat <<'EOF'

==========================================
Synthea Setup Complete!
==========================================

To generate synthetic patients, run:
 ./scripts/generate_synthea_data.sh [num_patients] [state] [output_dir]

Example:
 ./scripts/generate_synthea_data.sh 1000 California ./data/synthetic/patients

Next steps:
1. Generate synthetic patient data
2. Run the PDF generation script to convert to PDFs with PHI
3. Use the PDFs for training the LoRA adapter

EOF
}

# Run every setup step in order; any failing step aborts the script.
main() {
  print_banner
  check_prerequisites
  fetch_synthea
  build_synthea
  write_synthea_config
  write_generator_script
  run_smoke_test
  cd "$PROJECT_ROOT"
  print_summary
}

main