#!/bin/bash
# Download and prepare the raw data sources (Aramaic Wikipedia, OPUS Syriac
# Bible, Digital Syriac Corpus, SEDRA lexicon) for the corpus pipeline.
#
# Requires: wget, gzip, python3 with the wikiextractor module, and the
# fetch_syriac_corpus.py / fetch_sedra_vocalised.py helper scripts alongside.

# Abort on any failed command or unset variable; without this a failed wget
# would cascade (e.g. the rm below would delete a dump that never extracted).
set -euo pipefail

echo "Downloading Aramaic Wikipedia dump..."
wget https://dumps.wikimedia.org/arcwiki/latest/arcwiki-latest-pages-articles.xml.bz2

echo "Downloading Syriac corpus from Opus' Bible dataset..."
wget https://object.pouta.csc.fi/OPUS-bible-uedin/v1/mono/syr.txt.gz

echo "Fetching Syriac texts from Digital Syriac Corpus (GitHub TEI files)..."
python3 fetch_syriac_corpus.py

# Unzip/extract data
echo "Extracting Wikipedia articles..."
python3 -m wikiextractor.WikiExtractor arcwiki-latest-pages-articles.xml.bz2 --output extracted --json
# Safe to remove only because set -e guarantees the extraction above succeeded.
rm arcwiki-latest-pages-articles.xml.bz2

echo "Extracting Syriac sentences from the Bible corpus..."
gzip -d syr.txt.gz
mv syr.txt syriac_bible_corpus.txt

echo "Fetching SEDRA vocalised Syriac lexicon..."
echo "This provides accurate vocalisation for ~60k word forms (takes ~20-30 min)"
python3 fetch_sedra_vocalised.py

echo "Data download complete!"
echo "Next step: Run generate_syr_lat_pairs.py to process the combined corpus."
echo "Note: SEDRA lookup will be used automatically for accurate vocalisations."