#!/bin/bash
#
# Shuffle and clean the West and East Syriac JSONL corpora found in the
# current directory.
#
# For each dialect <d> in {west, east} the script reads
# syriac_<d>_corpus.jsonl, shuffles its lines, drops lines that contain no
# non-whitespace character, strips every U+200F RIGHT-TO-LEFT MARK (UTF-8
# bytes e2 80 8f) and writes the result to syriac_<d>_clean_corpus.jsonl.
# The source corpus is deleted only after the clean file is in place.
#
# Requires GNU shuf, grep and sed (for the `\S` and `\xHH` regex escapes).
#
# Usage: run from the directory that contains the corpora; takes no arguments.
# Exit status: 0 if both corpora were processed, non-zero otherwise.

# `set -e` is deliberately not used: every failure is handled explicitly so
# that a problem with one corpus does not prevent the other being processed.
set -u

#######################################
# Shuffle and clean a single corpus.
# Arguments:
#   $1 - dialect key used in the file names ("west" or "east")
#   $2 - dialect name used in the progress message ("West" or "East")
# Outputs:
#   Progress message on stdout, diagnostics on stderr.
# Returns:
#   0 on success; non-zero if the input is missing or any step fails. On
#   failure the source corpus is left untouched and any existing clean corpus
#   is not overwritten.
#######################################
process_corpus() {
  local -r dialect=$1
  local -r label=$2
  local -r source_file="syriac_${dialect}_corpus.jsonl"
  local -r clean_file="syriac_${dialect}_clean_corpus.jsonl"
  # Written first and renamed on success, so an interrupted or failed run
  # never leaves a truncated clean corpus behind.
  local -r partial_file="${clean_file}.partial"
  local -a stage_status

  echo "Shuffling and cleaning the ${label} Syriac corpus..."

  if [[ ! -f "$source_file" ]]; then
    printf 'Error: input corpus not found: %s\n' "$source_file" >&2
    return 1
  fi

  # Same order as before: shuffle, keep non-blank lines, then strip U+200F.
  shuf -- "$source_file" \
    | grep '\S' \
    | sed 's/\xe2\x80\x8f//g' > "$partial_file"
  stage_status=("${PIPESTATUS[@]}")

  # grep exits 1 when no line matched (empty or all-blank corpus); that is a
  # valid, empty result. Only a status above 1 is a real grep error.
  if ((stage_status[0] != 0 || stage_status[1] > 1 || stage_status[2] != 0)); then
    printf 'Error: failed to clean %s (shuf=%d, grep=%d, sed=%d)\n' \
      "$source_file" "${stage_status[@]}" >&2
    rm -f -- "$partial_file"
    return 1
  fi

  if ! mv -- "$partial_file" "$clean_file"; then
    printf 'Error: could not write %s\n' "$clean_file" >&2
    rm -f -- "$partial_file"
    return 1
  fi

  # Safe to delete the source only now that the clean corpus is complete.
  rm -- "$source_file"
}

#######################################
# Process both corpora and print a summary.
# Returns:
#   0 if both corpora were processed, 1 if either failed.
#######################################
main() {
  local failed=0

  process_corpus west West || failed=1
  process_corpus east East || failed=1

  if ((failed != 0)); then
    printf 'Error: one or more corpora could not be processed.\n' >&2
    return 1
  fi

  echo ""
  echo "Generated corpora:"
  echo " - syriac_west_clean_corpus.jsonl (West Syriac / Serto)"
  echo " - syriac_east_clean_corpus.jsonl (East Syriac / Madnḥaya)"
  echo ""
  echo "Next step: Run train_tokeniser.py to train the tokeniser on the clean corpus."
}

main "$@"