"""Smoke-test the Syriac T5 tokeniser.

Tokenises raw Syriac script, its West/East Syriac Latin transliterations
(generated with the same routine used for training data), and the task
prefixes, printing the resulting token sequences for manual inspection.
"""

import sys

from transformers import T5TokenizerFast

# The local-package import below requires the repo root on sys.path,
# so this insert must run before `from src.data...`.
sys.path.insert(0, ".")

from src.data.generate_syr_lat_pairs import SyriacDialect, transliterate_syriac


def _show(tokeniser: T5TokenizerFast, label: str, text: str) -> None:
    """Print *text* under *label* together with its token sequence."""
    print(f"{label}: {text}")
    print(f"Tokens: {tokeniser.tokenize(text)}")
    print()


def main() -> None:
    """Run the tokeniser smoke test and print results to stdout."""
    # Load tokeniser from the project-local directory.
    tokeniser = T5TokenizerFast.from_pretrained("src/tokeniser/")

    print("=== Syriac Tokeniser Test ===\n")

    # Test Syriac script.
    syriac_text = "ܡܠܟܐ ܕܒܝܬ ܢܗܪܝܢ"
    _show(tokeniser, "Syriac", syriac_text)

    # Generate actual transliterations from the script
    # (matching training data format).
    west_text = transliterate_syriac(syriac_text, SyriacDialect.WEST)
    east_text = transliterate_syriac(syriac_text, SyriacDialect.EAST)
    _show(tokeniser, "West Syriac Latin", west_text)
    _show(tokeniser, "East Syriac Latin", east_text)

    # Test task prefixes (should be single tokens).
    west_task = "Syriac2WestLatin: ܐܠܗܐ"
    east_task = "Syriac2EastLatin: ܐܠܗܐ"
    print(f"West task prefix: {tokeniser.tokenize(west_task)}")
    print(f"East task prefix: {tokeniser.tokenize(east_task)}")
    print()

    # Vocabulary info.
    print(f"Vocabulary size: {tokeniser.vocab_size}")


if __name__ == "__main__":
    main()