File size: 4,751 Bytes
a6b8ecc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/bin/bash

# Setup script for Synthea synthetic patient generator
#
# Steps performed by this script:
#   1. verify that a suitable Java runtime is available
#   2. clone (or update) Synthea under ./external/synthea and build it
#   3. write a custom properties file and scripts/generate_synthea_data.sh
#   4. run a small test generation
#
# All paths are relative to the current working directory, so run it from the
# directory that holds (or should hold) external/ and scripts/.

# Abort on command failure, on use of an unset variable, and when any stage of
# a pipeline fails.
set -euo pipefail

# Oldest Java major version accepted (matches the "JDK 11 or newer" message).
readonly MIN_JAVA_MAJOR=11

echo "=========================================="
echo "Setting up Synthea for Justitia PHI-OCR"
echo "=========================================="

# Check Java installation
echo "Checking Java installation..."
if ! command -v java &> /dev/null; then
    echo "Error: Java is not installed. Please install Java JDK 11 or newer." >&2
    echo "Visit: https://www.oracle.com/java/technologies/downloads/" >&2
    exit 1
fi

# `java -version` reports on stderr. Capture the whole output and search it for
# the quoted version string instead of trusting the first line, which can be an
# unrelated notice (for example "Picked up JAVA_TOOL_OPTIONS: ...").
# A failing `java -version` is tolerated here; it surfaces below as an
# undetectable version.
java_version_output=$(java -version 2>&1) || true
java_version_re='version "([^"]+)"'
JAVA_VERSION=""
if [[ "$java_version_output" =~ $java_version_re ]]; then
    JAVA_VERSION="${BASH_REMATCH[1]}"
fi
echo "Found Java version: $JAVA_VERSION"

# Derive the major version: legacy releases report "1.8.0_292" (major 8),
# newer ones report "11.0.2", "17" or "21-ea".
java_major=${JAVA_VERSION#1.}
java_major=${java_major%%[!0-9]*}
if [[ -z "$java_major" ]]; then
    # Best effort only: an unparseable version string should not block setup.
    echo "Warning: could not determine the Java major version; continuing." >&2
elif (( java_major < MIN_JAVA_MAJOR )); then
    echo "Error: Java $JAVA_VERSION is too old. Please install Java JDK 11 or newer." >&2
    exit 1
fi

# Work inside ./external, creating the directory on first use
EXTERNAL_DIR="external"
[[ -d "$EXTERNAL_DIR" ]] || {
    echo "Creating external directory..."
    mkdir -p "$EXTERNAL_DIR"
}

cd "$EXTERNAL_DIR"

# Obtain the Synthea sources: refresh an existing checkout, otherwise clone a
# fresh one. Either way the working directory ends up inside the checkout.
if [[ -d "synthea" ]]; then
    echo "Synthea directory already exists. Updating..."
    cd synthea
    git pull
else
    echo "Cloning Synthea repository..."
    git clone https://github.com/synthetichealth/synthea.git
    cd synthea
fi

# Build Synthea (-x test excludes the Gradle test task from the build)
echo "Building Synthea (this may take a few minutes)..."
./gradlew build -x test

# Create custom configuration for PHI-heavy output.
# The heredoc delimiter is quoted, so the content is written verbatim.
# Java .properties files only recognise '#' as a comment marker at the start
# of a line; a trailing "# ..." after a value becomes part of that value.
# Every comment below therefore sits on its own line.
# NOTE(review): no command visible in this script passes this file to Synthea
# (for example via run_synthea -c) - confirm how it is meant to be loaded.
echo "Creating custom Synthea configuration..."
cat > src/main/resources/synthea_phi_config.properties << 'EOF'
# Custom Synthea configuration for PHI detection training

# Generate complete patient information
generate.demographics = true
generate.vital_signs = true
generate.medications = true
generate.conditions = true
generate.allergies = true
generate.procedures = true
generate.immunizations = true
generate.encounters = true
generate.imaging_studies = true
generate.devices = true
generate.supplies = true

# Export formats (we want multiple formats for diverse training)
exporter.ccda.export = true
exporter.fhir.export = true
exporter.csv.export = true
exporter.text.export = true
# We'll generate our own PDFs
exporter.pdf.export = false

# Include all PHI fields
exporter.csv.included_files = patients,encounters,conditions,medications,procedures,immunizations,allergies,devices,supplies
exporter.csv.append_mode = false

# Generate diverse demographics
generate.demographics.socioeconomic.weights.income = 1,1,1,1,1
generate.demographics.socioeconomic.weights.education = 1,1,1,1,1

# Location settings (generate diverse addresses)
generate.geography.country = United States
# Can be changed
generate.geography.state = Massachusetts

# Age distribution
generate.demographics.min_age = 0
generate.demographics.max_age = 100

# Keep all history
generate.log_patients = true
generate.keep_patients = true
EOF

# Create a generation script.
# Step back out of external/synthea to the directory the setup started in and
# write scripts/generate_synthea_data.sh there. The heredoc delimiter is
# quoted, so the script text is written verbatim (no expansion happens here).
echo "Creating generation script..."
cd ../..
mkdir -p scripts  # the redirect below fails if the directory is missing
cat > scripts/generate_synthea_data.sh << 'EOF'
#!/bin/bash

# Script to generate synthetic patient data using Synthea
#
# Usage: generate_synthea_data.sh [num_patients] [state] [output_dir]
#   num_patients  population size                  (default: 100)
#   state         state to generate patients for   (default: Massachusetts)
#   output_dir    destination, absolute or relative to the current directory
#                 (default: ./data/synthetic/synthea)
#
# Must be run from the directory that contains external/synthea.

set -euo pipefail

PATIENTS=${1:-100}
STATE=${2:-Massachusetts}
OUTPUT_DIR=${3:-./data/synthetic/synthea}

# Fail early, before creating any output directory in the wrong place.
if [[ ! -d external/synthea ]]; then
    echo "Error: external/synthea not found. Run this script from the directory that contains it." >&2
    exit 1
fi

echo "Generating $PATIENTS synthetic patients from $STATE..."

# Ensure output directory exists
mkdir -p "$OUTPUT_DIR"

# Synthea runs from external/synthea, two levels below the current directory.
# A relative output path must be re-anchored from there; an absolute path is
# passed through unchanged.
case "$OUTPUT_DIR" in
    /*) SYNTHEA_OUTPUT_DIR="$OUTPUT_DIR" ;;
    *)  SYNTHEA_OUTPUT_DIR="../../$OUTPUT_DIR" ;;
esac

# Run Synthea. The state is a positional argument; the -s option of
# run_synthea is the random seed and expects a number.
cd external/synthea
./run_synthea -p "$PATIENTS" --exporter.baseDirectory="$SYNTHEA_OUTPUT_DIR" "$STATE"

echo "Generation complete. Output saved to $OUTPUT_DIR"

# Count generated files
echo "Generated files:"
find "$SYNTHEA_OUTPUT_DIR" -type f -name "*.json" | wc -l | xargs echo "  FHIR bundles:"
find "$SYNTHEA_OUTPUT_DIR" -type f -name "*.xml" | wc -l | xargs echo "  C-CDA documents:"
find "$SYNTHEA_OUTPUT_DIR" -type f -name "*.csv" | wc -l | xargs echo "  CSV files:"

echo "Done!"
EOF

chmod +x scripts/generate_synthea_data.sh

# Test Synthea with a small generation
echo "Testing Synthea with 5 patients..."
cd external/synthea

# Output location of the test run, relative to external/synthea.
TEST_OUTPUT_DIR="../../data/synthetic/test"

# The state is passed positionally: the -s option of run_synthea is the random
# seed and expects a number.
# The command runs inside the condition so that, under `set -e`, a failing run
# reaches the error message below instead of aborting the script silently.
if ./run_synthea -p 5 --exporter.baseDirectory="$TEST_OUTPUT_DIR" Massachusetts \
        && [ -d "$TEST_OUTPUT_DIR" ]; then
    echo "✓ Synthea test successful!"
    echo "  Generated test files in data/synthetic/test/"
else
    echo "✗ Synthea test failed. Please check the output above for errors." >&2
    exit 1
fi

# Return to the directory the setup was started from
cd ../..

# Closing banner: usage of the generated script plus follow-up steps.
# The quoted delimiter keeps the text literal; the blank first and last lines
# are part of the output.
cat << 'EOF'

==========================================
Synthea Setup Complete!
==========================================

To generate synthetic patients, run:
  ./scripts/generate_synthea_data.sh [num_patients] [state] [output_dir]

Example:
  ./scripts/generate_synthea_data.sh 1000 California ./data/synthetic/patients

Next steps:
1. Generate synthetic patient data
2. Run the PDF generation script to convert to PDFs with PHI
3. Use the PDFs for training the LoRA adapter

EOF