# aramt5 — src/test_tokeniser.py
# Quick smoke test for the Syriac T5 tokeniser.
# (Initial commit, a4462f5)
import sys

from transformers import T5TokenizerFast

# Make the project root importable so `src.*` resolves when run from the repo root.
sys.path.insert(0, ".")
from src.data.generate_syr_lat_pairs import SyriacDialect, transliterate_syriac

# Load the trained tokeniser from its local checkpoint directory.
tokeniser = T5TokenizerFast.from_pretrained("src/tokeniser/")


def _report(label, text):
    """Echo *text* under *label*, then print its token split and a blank line."""
    print(f"{label}: {text}")
    print(f"Tokens: {tokeniser.tokenize(text)}")
    print()


print("=== Syriac Tokeniser Test ===\n")

# Native Syriac-script sample.
syriac_text = "ܡܠܟܐ ܕܒܝܬ ܢܗܪܝܢ"
_report("Syriac", syriac_text)

# Latin transliterations generated by the same routine used to build the
# training data, so the tokeniser sees exactly the training-time format.
west_text = transliterate_syriac(syriac_text, SyriacDialect.WEST)
east_text = transliterate_syriac(syriac_text, SyriacDialect.EAST)
_report("West Syriac Latin", west_text)
_report("East Syriac Latin", east_text)

# Task prefixes — each prefix is expected to tokenise as a single token.
west_task = "Syriac2WestLatin: ܐܠܗܐ"
east_task = "Syriac2EastLatin: ܐܠܗܐ"
print(f"West task prefix: {tokeniser.tokenize(west_task)}")
print(f"East task prefix: {tokeniser.tokenize(east_task)}")
print()

# Basic vocabulary statistics.
print(f"Vocabulary size: {tokeniser.vocab_size}")