#!/bin/bash
# Download all training data sources
# Run: bash scripts/download_datasets.sh
# Abort on the first command that exits non-zero.
set -o errexit

# Make sure every raw-data directory exists before anything is written into it.
# The brace list expands to one path per dataset under data/raw/.
mkdir -p data/raw/{wi_locness,jfleg,gyafc,custom_dyslexia}
# --- JFLEG -------------------------------------------------------------------
# Clone the JFLEG repository (once) and copy its test-split source/reference
# files into data/raw/jfleg/.
#
# The copy step runs on every invocation, not only right after a fresh clone,
# so a rerun repairs a missing or partially populated data/raw/jfleg/.
# Copy problems are reported on stderr instead of being silently discarded,
# and the success line is printed only when at least one file was copied.
echo "=== Downloading JFLEG (JHU Fluency-Extended GUG) ==="
jfleg_repo="data/raw/jfleg_repo"
jfleg_dir="data/raw/jfleg"

if [ ! -d "${jfleg_repo}" ]; then
  git clone https://github.com/keisks/jfleg.git "${jfleg_repo}"
  jfleg_status=" ✓ JFLEG downloaded"
else
  jfleg_status=" ✓ JFLEG already exists"
fi

# Harmless if the directory already exists; keeps this step self-contained.
mkdir -p "${jfleg_dir}"

jfleg_copied=0
for jfleg_file in "${jfleg_repo}"/test/*.src "${jfleg_repo}"/test/*.ref*; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -f "${jfleg_file}" ] || continue
  if cp -- "${jfleg_file}" "${jfleg_dir}/"; then
    jfleg_copied=$((jfleg_copied + 1))
  else
    # Best-effort copy: warn and keep going rather than aborting the script.
    echo " ! JFLEG: failed to copy ${jfleg_file}" >&2
  fi
done

if [ "${jfleg_copied}" -gt 0 ]; then
  echo "${jfleg_status}"
else
  echo " ! JFLEG: no test/*.src or test/*.ref* files copied from ${jfleg_repo}" >&2
fi
# Print the instructions for datasets that cannot be fetched automatically
# (they require registration or an access request), followed by the closing
# summary. Each argument becomes one output line; "" yields a blank line.
printf '%s\n' \
  "" \
  "=== Manual Downloads Required ===" \
  "" \
  "W&I+LOCNESS (35k pairs, gold standard GEC):" \
  " → Register at: https://www.cl.cam.ac.uk/research/nl/bea2019st/" \
  " → Place files in: data/raw/wi_locness/" \
  "" \
  "GYAFC (105k pairs, formality transfer):" \
  " → Request access at: https://github.com/raosudha89/GYAFC-corpus" \
  " → Place files in: data/raw/gyafc/" \
  "" \
  "=== Dataset download complete ===" \
  "Check manually downloaded datasets before proceeding."
|