File size: 4,751 Bytes
a6b8ecc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
#!/bin/bash
# Setup script for Synthea synthetic patient generator
# This script downloads and configures Synthea for generating synthetic medical records
# Abort the whole setup as soon as any command exits with a non-zero status
set -o errexit

# Opening banner: title framed by two identical rule lines
banner_rule="=========================================="
printf '%s\n' \
  "$banner_rule" \
  "Setting up Synthea for Justitia PHI-OCR" \
  "$banner_rule"
# Check Java installation.
# Fails early when no `java` is on PATH or when the detected major version is
# below 11 (the minimum named in the error text). An unparseable version only
# produces a warning so unusual vendor banners do not block the setup.
echo "Checking Java installation..."
if ! command -v java > /dev/null 2>&1; then
  echo "Error: Java is not installed. Please install Java JDK 11 or newer." >&2
  echo "Visit: https://www.oracle.com/java/technologies/downloads/" >&2
  exit 1
fi

# Capture the complete banner instead of only its first line: the JVM may print
# other lines first (e.g. "Picked up JAVA_TOOL_OPTIONS: ..."). `|| true` keeps
# `set -e` from aborting here on an unexpected exit status; the parse below
# decides what to do.
java_banner=$(java -version 2>&1 || true)
java_version_re='version "([^"]+)"'
if [[ $java_banner =~ $java_version_re ]]; then
  JAVA_VERSION=${BASH_REMATCH[1]}
else
  JAVA_VERSION="unknown"
fi
echo "Found Java version: $JAVA_VERSION"

# Legacy releases report "1.<major>.x" (e.g. 1.8.0_292); newer ones report
# "<major>.x.y" or just "<major>". Drop the legacy prefix, then keep the
# leading run of digits.
java_major=${JAVA_VERSION#1.}
java_major=${java_major%%[!0-9]*}
if [[ -z $java_major ]]; then
  echo "Warning: could not determine the Java major version; continuing anyway." >&2
elif (( 10#$java_major < 11 )); then
  echo "Error: Java $JAVA_VERSION is too old. Please install Java JDK 11 or newer." >&2
  exit 1
fi
# Third-party checkouts live in ./external (relative to the current directory);
# create it on first use, then work from inside it
EXTERNAL_DIR="external"
[[ -d "$EXTERNAL_DIR" ]] || {
  echo "Creating external directory..."
  mkdir -p "$EXTERNAL_DIR"
}
cd "$EXTERNAL_DIR"

# Obtain the Synthea sources: refresh an existing checkout, otherwise clone one.
# Either way the shell ends up inside external/synthea.
if [[ -d synthea ]]; then
  echo "Synthea directory already exists. Updating..."
  cd synthea
  git pull
else
  echo "Cloning Synthea repository..."
  git clone https://github.com/synthetichealth/synthea.git
  cd synthea
fi

# Compile with the bundled Gradle wrapper, excluding the `test` task
echo "Building Synthea (this may take a few minutes)..."
./gradlew build -x test
# Create custom configuration for PHI-heavy output.
# Writes a Java .properties file below the current directory (the Synthea
# checkout at this point in the script). The here-doc delimiter is quoted, so
# the content is written literally with no shell expansion.
#
# Comments in the file sit on their own lines on purpose: the .properties
# format only recognises '#' at the start of a line, so a trailing
# "value # note" would make the note part of the value.
#
# NOTE(review): neither run_synthea invocation in this script passes this file
# to Synthea (for example via -c) - confirm how it is meant to be loaded.
echo "Creating custom Synthea configuration..."
PHI_CONFIG_FILE="src/main/resources/synthea_phi_config.properties"
# Expected to be a no-op inside a Synthea checkout; guards against the
# resources directory being absent.
mkdir -p "$(dirname "$PHI_CONFIG_FILE")"
cat > "$PHI_CONFIG_FILE" << 'EOF'
# Custom Synthea configuration for PHI detection training
# Generate complete patient information
generate.demographics = true
generate.vital_signs = true
generate.medications = true
generate.conditions = true
generate.allergies = true
generate.procedures = true
generate.immunizations = true
generate.encounters = true
generate.imaging_studies = true
generate.devices = true
generate.supplies = true
# Export formats (we want multiple formats for diverse training)
exporter.ccda.export = true
exporter.fhir.export = true
exporter.csv.export = true
exporter.text.export = true
# PDF export stays off: we'll generate our own PDFs
exporter.pdf.export = false
# Include all PHI fields
exporter.csv.included_files = patients,encounters,conditions,medications,procedures,immunizations,allergies,devices,supplies
exporter.csv.append_mode = false
# Generate diverse demographics
generate.demographics.socioeconomic.weights.income = 1,1,1,1,1
generate.demographics.socioeconomic.weights.education = 1,1,1,1,1
# Location settings (generate diverse addresses)
generate.geography.country = United States
# State to generate for (can be changed)
generate.geography.state = Massachusetts
# Age distribution
generate.demographics.min_age = 0
generate.demographics.max_age = 100
# Keep all history
generate.log_patients = true
generate.keep_patients = true
EOF
# Create a generation script.
# The earlier steps leave the shell in external/synthea, so step back two
# levels to the directory the setup was started from and write the helper to
# ./scripts there. The here-doc delimiter is quoted: every $... below is
# written literally and only expands when the helper itself runs.
echo "Creating generation script..."
cd ../..
# Do not assume scripts/ already exists
mkdir -p scripts
cat > scripts/generate_synthea_data.sh << 'EOF'
#!/bin/bash
# Script to generate synthetic patient data using Synthea
#
# Usage: generate_synthea_data.sh [num_patients] [state] [output_dir]
#   num_patients  number of patients to generate (default: 100)
#   state         state passed to Synthea via -s (default: Massachusetts)
#   output_dir    export directory, relative or absolute
#                 (default: ./data/synthetic/synthea)
#
# Must be run from the directory that contains external/synthea.
set -e # Stop at the first failing step instead of reporting false success

PATIENTS=${1:-100}
STATE=${2:-Massachusetts}
OUTPUT_DIR=${3:-./data/synthetic/synthea}
SYNTHEA_DIR="external/synthea"

if [ ! -x "$SYNTHEA_DIR/run_synthea" ]; then
  echo "Error: $SYNTHEA_DIR/run_synthea not found or not executable." >&2
  echo "Run this script from the project root after the Synthea setup has completed." >&2
  exit 1
fi

echo "Generating $PATIENTS synthetic patients from $STATE..."

# Ensure output directory exists, then resolve it to an absolute path so it
# still names the same place after changing into the Synthea checkout
# (this also makes absolute output_dir arguments work)
mkdir -p "$OUTPUT_DIR"
OUTPUT_DIR_ABS=$(cd "$OUTPUT_DIR" && pwd)

# Run Synthea
cd "$SYNTHEA_DIR"
./run_synthea -p "$PATIENTS" -s "$STATE" --exporter.baseDirectory="$OUTPUT_DIR_ABS"

echo "Generation complete. Output saved to $OUTPUT_DIR"

# Count generated files
echo "Generated files:"
find "$OUTPUT_DIR_ABS" -type f -name "*.json" | wc -l | xargs echo " FHIR bundles:"
find "$OUTPUT_DIR_ABS" -type f -name "*.xml" | wc -l | xargs echo " C-CDA documents:"
find "$OUTPUT_DIR_ABS" -type f -name "*.csv" | wc -l | xargs echo " CSV files:"
echo "Done!"
EOF
chmod +x scripts/generate_synthea_data.sh
# Test Synthea with a small generation.
# Generates 5 patients into data/synthetic/test (relative to the directory the
# setup was started from) and reports the outcome.
echo "Testing Synthea with 5 patients..."
cd external/synthea

# Run the generator inside the `if` condition: with `set -e` active, a bare
# failing command would abort the script before the failure message below
# could be printed. The directory check is kept as a second condition.
if ./run_synthea -p 5 -s Massachusetts --exporter.baseDirectory="../../data/synthetic/test" \
  && [ -d "../../data/synthetic/test" ]; then
  echo "✓ Synthea test successful!"
  echo " Generated test files in data/synthetic/test/"
else
  echo "✗ Synthea test failed. Please check the output above for errors." >&2
  exit 1
fi

# Return to the starting directory
cd ../..
# Print the closing banner: completion notice, usage of the generated helper
# script, an example invocation and the follow-up steps.
# Outputs: 16 lines on stdout (quoted here-doc, written literally).
print_setup_summary() {
  cat << 'EOF'

==========================================
Synthea Setup Complete!
==========================================

To generate synthetic patients, run:
 ./scripts/generate_synthea_data.sh [num_patients] [state] [output_dir]

Example:
 ./scripts/generate_synthea_data.sh 1000 California ./data/synthetic/patients

Next steps:
1. Generate synthetic patient data
2. Run the PDF generation script to convert to PDFs with PHI
3. Use the PDFs for training the LoRA adapter

EOF
}

print_setup_summary