"""Smoke-test the Syriac T5 tokeniser.

Loads the fast tokeniser from ``src/tokeniser/`` and prints its token
splits for: native Syriac script, West/East-dialect Latin transliterations,
and task-prefixed inputs, plus the vocabulary size.
"""

import sys

from transformers import T5TokenizerFast

# Make the project root importable when the script is run from the repo top.
sys.path.insert(0, ".")
from src.data.generate_syr_lat_pairs import SyriacDialect, transliterate_syriac


def main() -> None:
    """Load the tokeniser and print tokenisations for sample inputs."""
    tokeniser = T5TokenizerFast.from_pretrained("src/tokeniser/")

    print("=== Syriac Tokeniser Test ===\n")

    # Native Syriac script sample.
    syriac_text = "ܡܠܟܐ ܕܒܝܬ ܢܗܪܝܢ"
    print(f"Syriac: {syriac_text}")
    print(f"Tokens: {tokeniser.tokenize(syriac_text)}")
    print()

    # Latin transliterations of the same text for both dialects.
    west_text = transliterate_syriac(syriac_text, SyriacDialect.WEST)
    east_text = transliterate_syriac(syriac_text, SyriacDialect.EAST)

    print(f"West Syriac Latin: {west_text}")
    print(f"Tokens: {tokeniser.tokenize(west_text)}")
    print()

    print(f"East Syriac Latin: {east_text}")
    print(f"Tokens: {tokeniser.tokenize(east_text)}")
    print()

    # Task-prefixed inputs (presumably the prefixes used during training —
    # confirm against the data-generation code).
    west_task = "Syriac2WestLatin: ܐܠܗܐ"
    east_task = "Syriac2EastLatin: ܐܠܗܐ"
    print(f"West task prefix: {tokeniser.tokenize(west_task)}")
    print(f"East task prefix: {tokeniser.tokenize(east_task)}")
    print()

    print(f"Vocabulary size: {tokeniser.vocab_size}")


if __name__ == "__main__":
    main()