#!/bin/bash
#
# Shuffle and clean the West and East Syriac corpora.
#
# For each variant (west, east), run in the current directory:
#   syriac_<variant>_corpus.jsonl  ->  syriac_<variant>_clean_corpus.jsonl
#
# Cleaning removes every U+200F RIGHT-TO-LEFT MARK and drops lines that
# contain no non-whitespace character; the surviving lines are written in
# random order. The source corpus is deleted only after its clean
# counterpart has been written successfully.
#
# Requires: sed, grep, shuf (GNU coreutils).
# Exit status: 0 on success, non-zero if any corpus could not be processed.

# Failures are checked explicitly below rather than relying on `set -e`.
set -u

# UTF-8 byte sequence of U+200F (RIGHT-TO-LEFT MARK).
readonly RLM=$'\xe2\x80\x8f'

#######################################
# Clean and shuffle one corpus, then remove its source file.
# Globals:   RLM (read)
# Arguments: $1 - corpus variant, used in the file names (e.g. "west")
# Outputs:   diagnostics on stderr
# Returns:   0 on success; 1 on failure, in which case the source corpus
#            is left untouched and no partial output remains.
#######################################
clean_corpus() {
  local variant=$1
  local src="syriac_${variant}_corpus.jsonl"
  local dst="syriac_${variant}_clean_corpus.jsonl"
  local partial="${dst}.partial"
  local -a stages

  if [[ ! -f "$src" ]]; then
    printf 'error: %s: no such file\n' "$src" >&2
    return 1
  fi

  # Strip the marks *before* filtering so that a line holding nothing but
  # U+200F is dropped instead of surviving as an empty line. sed runs in the
  # C locale so the mark is matched byte-wise whatever the ambient locale.
  LC_ALL=C sed "s/${RLM}//g" < "$src" \
    | grep '[^[:space:]]' \
    | shuf > "$partial"
  stages=("${PIPESTATUS[@]}")

  # grep exits 1 when no line matched (an empty corpus); that is not an error.
  if (( stages[0] != 0 || stages[1] > 1 || stages[2] != 0 )); then
    printf 'error: failed to clean %s\n' "$src" >&2
    rm -f -- "$partial"
    return 1
  fi

  # Publish the result with a rename so the clean corpus is never seen
  # half-written.
  if ! mv -- "$partial" "$dst"; then
    printf 'error: could not write %s\n' "$dst" >&2
    rm -f -- "$partial"
    return 1
  fi

  rm -- "$src"
}

#######################################
# Process both corpora and print a summary of the generated files.
# Returns: 0 on success, 1 as soon as one corpus fails.
#######################################
main() {
  echo "Shuffling and cleaning the West Syriac corpus..."
  clean_corpus west || return 1

  echo "Shuffling and cleaning the East Syriac corpus..."
  clean_corpus east || return 1

  echo ""
  echo "Generated corpora:"
  echo " - syriac_west_clean_corpus.jsonl (West Syriac / Serto)"
  echo " - syriac_east_clean_corpus.jsonl (East Syriac / Madnḥaya)"
  echo ""
  echo "Next step: Run train_tokeniser.py to train the tokeniser on the clean corpus."
}

# Last command: its status becomes the script's exit status.
main