File size: 1,182 Bytes
a4462f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
"""Smoke-test the Syriac tokeniser.

Tokenises a Syriac-script sample, its West and East Latin transliterations
(generated with the same code that produced the training data), and the two
task prefixes, then reports the vocabulary size. All output goes to stdout.
"""

import sys

from transformers import T5TokenizerFast

# Make the project root importable when the script is run from the repo root.
sys.path.insert(0, ".")
from src.data.generate_syr_lat_pairs import SyriacDialect, transliterate_syriac


def _report(tokeniser: T5TokenizerFast, label: str, text: str) -> None:
    """Print *text* under *label*, its token sequence, and a blank line."""
    print(f"{label}: {text}")
    print(f"Tokens: {tokeniser.tokenize(text)}")
    print()


def main() -> None:
    """Run the tokeniser smoke test, printing each section to stdout."""
    # Load tokeniser
    tokeniser = T5TokenizerFast.from_pretrained("src/tokeniser/")

    print("=== Syriac Tokeniser Test ===\n")

    # Test Syriac script
    syriac_text = "ܡܠܟܐ ܕܒܝܬ ܢܗܪܝܢ"
    _report(tokeniser, "Syriac", syriac_text)

    # Generate actual transliterations from the script (matching training data format)
    west_text = transliterate_syriac(syriac_text, SyriacDialect.WEST)
    east_text = transliterate_syriac(syriac_text, SyriacDialect.EAST)
    _report(tokeniser, "West Syriac Latin", west_text)
    _report(tokeniser, "East Syriac Latin", east_text)

    # Test task prefixes (should be single tokens)
    west_task = "Syriac2WestLatin: ܐܠܗܐ"
    east_task = "Syriac2EastLatin: ܐܠܗܐ"
    print(f"West task prefix: {tokeniser.tokenize(west_task)}")
    print(f"East task prefix: {tokeniser.tokenize(east_task)}")
    print()

    # Vocabulary info
    print(f"Vocabulary size: {tokeniser.vocab_size}")


if __name__ == "__main__":
    main()