rewrite / scripts /download_datasets.sh
morpheuslord's picture
Add files using upload-large-folder tool
3df5819 verified
#!/bin/bash
#
# Download all training data sources.
#
# Usage: bash scripts/download_datasets.sh
#
# Every path used below is relative to the current working directory, so
# invoke the script from the directory that should contain data/ (the usage
# line above implies the project root).

# Strict mode: stop on the first failing command (-e), treat expansion of an
# unset variable as an error (-u), and make a pipeline fail when any stage
# fails rather than only the last one (pipefail).
set -euo pipefail

# One directory per raw data source; -p keeps repeated runs harmless.
mkdir -p \
  data/raw/wi_locness \
  data/raw/jfleg \
  data/raw/gyafc \
  data/raw/custom_dyslexia
#######################################
# Fetch the JFLEG corpus and stage its test split.
#
# Clones the upstream repository into data/raw/jfleg_repo unless that
# directory already exists, then copies test/*.src and test/*.ref* from the
# clone into data/raw/jfleg/. The copy runs on every invocation, so a
# previous run that cloned the repository but never staged the files is
# repaired instead of being reported as "already exists".
#
# Outputs: status line on stdout; warnings and errors on stderr
# Returns: 0 on success, or when no matching files exist (warning only);
#          1 if the clone, the mkdir, or the copy fails
#######################################
download_jfleg() {
  local repo_dir="data/raw/jfleg_repo"
  local dest_dir="data/raw/jfleg"
  local done_msg=" ✓ JFLEG already exists"
  local -a found=()
  local path

  if [[ ! -d "${repo_dir}" ]]; then
    if ! git clone https://github.com/keisks/jfleg.git "${repo_dir}"; then
      echo " ✗ JFLEG: git clone failed" >&2
      return 1
    fi
    done_msg=" ✓ JFLEG downloaded"
  fi

  # Collect only paths that really exist: an unmatched glob stays as the
  # literal pattern, which must not be handed to cp.
  for path in "${repo_dir}"/test/*.src "${repo_dir}"/test/*.ref*; do
    if [[ -f "${path}" ]]; then
      found+=("${path}")
    fi
  done

  # Staging stays best-effort (the script goes on to print the manual-download
  # instructions), but a missing test split is reported instead of being
  # hidden behind a success message.
  if (( ${#found[@]} == 0 )); then
    echo " ! JFLEG: no test/*.src or test/*.ref* files in ${repo_dir}; nothing copied" >&2
    return 0
  fi

  mkdir -p "${dest_dir}" || return 1
  if ! cp -- "${found[@]}" "${dest_dir}/"; then
    echo " ✗ JFLEG: copying files into ${dest_dir} failed" >&2
    return 1
  fi

  echo "${done_msg}"
}

echo "=== Downloading JFLEG (JHU Fluency-Extended GUG) ==="
download_jfleg || exit 1
#######################################
# Print where to obtain the datasets that this script does not fetch itself,
# followed by the closing summary.
# Outputs: the instructions on stdout
#######################################
print_manual_download_instructions() {
  # Quoted delimiter: the text is emitted literally, with no expansion.
  cat <<'EOF'

=== Manual Downloads Required ===

W&I+LOCNESS (35k pairs, gold standard GEC):
 → Register at: https://www.cl.cam.ac.uk/research/nl/bea2019st/
 → Place files in: data/raw/wi_locness/

GYAFC (105k pairs, formality transfer):
 → Request access at: https://github.com/raosudha89/GYAFC-corpus
 → Place files in: data/raw/gyafc/

=== Dataset download complete ===
Check manually downloaded datasets before proceeding.
EOF
}

print_manual_download_instructions