aramt5 / src /data /generate_clean_corpus.sh
crossroderick's picture
Initial commit
a4462f5
raw
history blame contribute delete
952 Bytes
#!/bin/bash
#
# Shuffle and clean the West and East Syriac corpora in the current directory.
#
# For each variant, syriac_<variant>_corpus.jsonl is shuffled, lines that
# contain no non-whitespace character are dropped, every U+200F
# (RIGHT-TO-LEFT MARK, UTF-8 bytes e2 80 8f) is stripped, and the result is
# written to syriac_<variant>_clean_corpus.jsonl. The input file is deleted,
# but only after its clean counterpart has been written successfully.
#
# Requires GNU shuf, grep and sed (\S and \xHH escapes are GNU extensions).
# Exit status: 0 if both corpora were generated, non-zero otherwise.

# No `set -e` on purpose: every step is checked explicitly so that a failure
# in one corpus is reported without skipping the other one.
set -uo pipefail

#######################################
# Copy stdin to stdout, dropping lines without any non-whitespace character.
# Returns: 0 on success (even if no line is kept), non-zero on a grep error.
#######################################
drop_blank_lines() {
  grep '\S'
  local rc=$?
  # grep exits 1 when it selects no line; that is a valid (empty) result.
  # Only exit codes >= 2 indicate a real error.
  ((rc <= 1))
}

#######################################
# Shuffle and clean a single corpus.
# Arguments: $1 - variant used in the file names (e.g. "west")
#            $2 - label used in the progress message (e.g. "West")
# Outputs:   progress message on stdout, errors on stderr
# Returns:   0 on success; non-zero on failure, in which case the input
#            file is left untouched and no partial output is kept.
#######################################
clean_corpus() {
  local variant=$1
  local label=$2
  local src="syriac_${variant}_corpus.jsonl"
  local dst="syriac_${variant}_clean_corpus.jsonl"
  local tmp="${dst}.tmp"

  echo "Shuffling and cleaning the ${label} Syriac corpus..."

  if [[ ! -f "$src" ]]; then
    printf 'Error: input corpus %s not found\n' "$src" >&2
    return 1
  fi

  # Build the result in a temporary file so that a failure half-way through
  # never leaves a truncated clean corpus behind. sed runs in the C locale so
  # the three bytes are matched literally regardless of the caller's locale.
  if ! shuf -- "$src" \
    | drop_blank_lines \
    | LC_ALL=C sed 's/\xe2\x80\x8f//g' >"$tmp"; then
    printf 'Error: failed to shuffle/clean %s\n' "$src" >&2
    rm -f -- "$tmp"
    return 1
  fi

  if ! mv -- "$tmp" "$dst"; then
    printf 'Error: failed to write %s\n' "$dst" >&2
    rm -f -- "$tmp"
    return 1
  fi

  # The raw corpus is only discarded once the clean one is safely in place.
  rm -- "$src"
}

#######################################
# Process both corpora and print the summary.
# Returns: 0 if both corpora were generated, 1 otherwise.
#######################################
main() {
  local status=0

  clean_corpus west West || status=1
  clean_corpus east East || status=1

  if ((status != 0)); then
    printf 'Error: one or more corpora could not be generated.\n' >&2
    return 1
  fi

  echo ""
  echo "Generated corpora:"
  echo " - syriac_west_clean_corpus.jsonl (West Syriac / Serto)"
  echo " - syriac_east_clean_corpus.jsonl (East Syriac / Madnḥaya)"
  echo ""
  echo "Next step: Run train_tokeniser.py to train the tokeniser on the clean corpus."
}

main "$@"