#!/bin/bash
#
# Downloads and unpacks the raw corpora used to build Syriac/Latin pairs:
#   1. the Aramaic Wikipedia dump (arcwiki), extracted to ./extracted as JSON,
#   2. the OPUS bible-uedin Syriac monolingual text, saved as
#      syriac_bible_corpus.txt,
#   3. whatever fetch_syriac_corpus.py and fetch_sedra_vocalised.py produce.
#
# All files are written to the current working directory, and the two Python
# helpers are invoked by relative path.
# NOTE(review): this presumably must be run from the directory that contains
# the helper scripts -- confirm.
#
# Requires: wget, gzip, python3 with the wikiextractor package installed.

# Abort on the first failing command so later steps never run against
# missing or truncated inputs and the "complete" banner is only printed on
# success.
set -euo pipefail
trap 'echo "Error: command failed on line ${LINENO}; aborting." >&2' ERR

readonly WIKI_DUMP="arcwiki-latest-pages-articles.xml.bz2"
readonly WIKI_DUMP_URL="https://dumps.wikimedia.org/arcwiki/latest/${WIKI_DUMP}"
readonly BIBLE_ARCHIVE="syr.txt.gz"
readonly BIBLE_ARCHIVE_URL="https://object.pouta.csc.fi/OPUS-bible-uedin/v1/mono/${BIBLE_ARCHIVE}"
readonly BIBLE_CORPUS="syriac_bible_corpus.txt"

echo "Downloading Aramaic Wikipedia dump..."
# -O pins the output name: without it, a leftover file from an interrupted
# run makes wget save to "<name>.1" and the stale file would be processed.
wget -O "${WIKI_DUMP}" "${WIKI_DUMP_URL}"

echo "Downloading Syriac corpus from Opus' Bible dataset..."
wget -O "${BIBLE_ARCHIVE}" "${BIBLE_ARCHIVE_URL}"

echo "Fetching Syriac texts from Digital Syriac Corpus (GitHub TEI files)..."
python3 fetch_syriac_corpus.py

echo "Extracting Wikipedia articles..."
python3 -m wikiextractor.WikiExtractor "${WIKI_DUMP}" --output extracted --json
# Only reached when extraction succeeded (set -e), so a failed extraction
# keeps the dump around for a retry.
rm -- "${WIKI_DUMP}"

echo "Extracting Syriac sentences from the Bible corpus..."
# -f overwrites a syr.txt left behind by an earlier interrupted run instead
# of refusing or prompting.
gzip -df -- "${BIBLE_ARCHIVE}"
mv -- "${BIBLE_ARCHIVE%.gz}" "${BIBLE_CORPUS}"

echo "Fetching SEDRA vocalised Syriac lexicon..."
echo "This provides accurate vocalisation for ~60k word forms (takes ~20-30 min)"
python3 fetch_sedra_vocalised.py

echo "Data download complete!"
echo "Next step: Run generate_syr_lat_pairs.py to process the combined corpus."
echo "Note: SEDRA lookup will be used automatically for accurate vocalisations."