#!/bin/bash
#
# Download and unpack the raw Syriac/Aramaic data sources into the current
# working directory:
#   - the Aramaic Wikipedia dump, extracted as JSON into ./extracted
#   - the OPUS Bible (uedin) Syriac text, saved as syriac_bible_corpus.txt
#   - the outputs of fetch_syriac_corpus.py and fetch_sedra_vocalised.py
#     (see those scripts for what they write)
#
# Usage: run from the directory that contains the fetch_*.py helpers; they
# are invoked by relative path. Requires wget, gzip, python3 and the
# wikiextractor Python module.

# Stop at the first failing step. Without this, a failed download or a failed
# extraction would still delete the Wikipedia dump and report success.
set -euo pipefail

readonly WIKI_DUMP_URL='https://dumps.wikimedia.org/arcwiki/latest/arcwiki-latest-pages-articles.xml.bz2'
readonly WIKI_DUMP_FILE='arcwiki-latest-pages-articles.xml.bz2'
readonly WIKI_OUTPUT_DIR='extracted'
readonly BIBLE_URL='https://object.pouta.csc.fi/OPUS-bible-uedin/v1/mono/syr.txt.gz'
readonly BIBLE_ARCHIVE='syr.txt.gz'
readonly BIBLE_EXTRACTED='syr.txt'
readonly BIBLE_CORPUS='syriac_bible_corpus.txt'

# Fail early, before any long download, if a required tool is missing.
for tool in wget gzip python3; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    printf 'error: required command not found: %s\n' "$tool" >&2
    exit 1
  fi
done

# Download data sources
# -O pins the output name so a leftover file from an interrupted run is
# overwritten instead of wget saving the new copy as "<name>.1".
echo "Downloading Aramaic Wikipedia dump..."
wget -O "$WIKI_DUMP_FILE" "$WIKI_DUMP_URL"

echo "Downloading Syriac corpus from Opus' Bible dataset..."
wget -O "$BIBLE_ARCHIVE" "$BIBLE_URL"

echo "Fetching Syriac texts from Digital Syriac Corpus (GitHub TEI files)..."
python3 fetch_syriac_corpus.py

# Unzip/extract data
echo "Extracting Wikipedia articles..."
python3 -m wikiextractor.WikiExtractor "$WIKI_DUMP_FILE" --output "$WIKI_OUTPUT_DIR" --json
# Only reached when extraction succeeded (set -e), so the dump is never
# discarded after a failed extraction.
rm -- "$WIKI_DUMP_FILE"

echo "Extracting Syriac sentences from the Bible corpus..."
# -f lets gzip replace a stale syr.txt left behind by an earlier aborted run.
gzip -df -- "$BIBLE_ARCHIVE"
mv -- "$BIBLE_EXTRACTED" "$BIBLE_CORPUS"

echo "Fetching SEDRA vocalised Syriac lexicon..."
echo "This provides accurate vocalisation for ~60k word forms (takes ~20-30 min)"
python3 fetch_sedra_vocalised.py

echo "Data download complete!"
echo "Next step: Run generate_syr_lat_pairs.py to process the combined corpus."
echo "Note: SEDRA lookup will be used automatically for accurate vocalisations."