| from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline |
|
|
| |
# Hugging Face Hub identifier of the checkpoint loaded by this script.
model_path = "crossroderick/aramt5"

# Unicode bidirectional isolate controls: RLI opens a right-to-left isolated
# run and PDI closes it again.
RLI = "\N{RIGHT-TO-LEFT ISOLATE}"
PDI = "\N{POP DIRECTIONAL ISOLATE}"
|
|
|
|
def rtl(text: str) -> str:
    """Return ``text`` enclosed between the RLI and PDI isolate markers.

    The markers ask a bidi-aware terminal to lay the enclosed run out
    right-to-left without disturbing the surrounding left-to-right output.
    """
    return "{}{}{}".format(RLI, text, PDI)
|
|
|
|
| |
# Load the tokeniser and the T5 weights once at start-up, then wrap both in a
# single text2text-generation pipeline that every later request reuses.
print("Loading model and tokeniser...")
tokeniser = AutoTokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokeniser,
)
print("Model loaded successfully.\n")
|
|
|
|
def transliterate(text: str, dialect: str = "west") -> str:
    """
    Transliterate Syriac text to Latin script.

    The text is prepended with a task prefix selecting the target dialect and
    passed to the module-level ``pipe`` generation pipeline.

    Args:
        text: Syriac text to transliterate
        dialect: 'east' for East Syriac (Madnḥaya); any other value, including
            the default 'west', selects West Syriac (Serto)

    Returns:
        Transliterated Latin text, or an empty string when ``text`` is empty
        or contains only whitespace
    """
    # With nothing to transliterate the model would generate from the bare
    # task prefix alone, so skip the (expensive) call and return nothing.
    if not text or not text.strip():
        return ""

    prefix = "Syriac2EastLatin: " if dialect == "east" else "Syriac2WestLatin: "
    input_prompt = f"{prefix}{text}"

    # Deterministic decoding: beam search over 4 beams with sampling disabled,
    # capped at 128 newly generated tokens.
    outputs = pipe(
        input_prompt,
        max_new_tokens=128,
        num_beams=4,
        do_sample=False,
    )
    # The first returned candidate carries the transliteration.
    return outputs[0]["generated_text"]
|
|
|
|
| |
# Demo inputs as (Syriac text, dialect, English description) rows; they are
# expanded below into the dict records consumed by the demo loop.
_SAMPLE_ROWS = (
    # Single words, West Syriac
    ("ܫܠܡܐ", "west", "Peace (West)"),
    ("ܐܠܗܐ", "west", "God (West)"),
    ("ܡܫܝܚܐ", "west", "Messiah/Christ (West)"),
    ("ܡܠܟܐ", "west", "King (West)"),
    ("ܒܝܬܐ", "west", "House (West)"),
    # Single words, East Syriac
    ("ܫܠܡܐ", "east", "Peace (East)"),
    ("ܐܠܗܐ", "east", "God (East)"),
    ("ܡܫܝܚܐ", "east", "Messiah/Christ (East)"),
    # Words carrying a one-letter prefix
    ("ܒܒܝܬܐ", "west", "In the house (West)"),
    ("ܘܡܠܟܐ", "west", "And the king (West)"),
    ("ܕܐܠܗܐ", "west", "Of God (West)"),
    ("ܠܡܠܟܐ", "west", "To the king (West)"),
    # Short phrases
    ("ܐܒܘܢ ܕܒܫܡܝܐ", "west", "Our Father in heaven (West)"),
    ("ܫܠܡܐ ܥܡܟ", "west", "Peace be with you (West)"),
)

test_samples = [
    {"text": syriac, "dialect": variety, "description": gloss}
    for syriac, variety, gloss in _SAMPLE_ROWS
]
|
|
# Run every built-in sample through the model and print the Syriac source
# next to its Latin transliteration.
banner = "=" * 50
print(banner)
print("AramT5 Syriac Transliteration Test")
print(banner)

for sample in test_samples:
    source_text = sample["text"]
    heading = sample["description"]
    result = transliterate(source_text, sample["dialect"])
    print(f"\n{heading}:")
    print(f" Syriac: {rtl(source_text)}")
    print(f" Latin: {result}")
|
|
print("\n" + "=" * 50)
print("Interactive mode - enter Syriac text to transliterate")
print("Format: [e/w] text (e=east, w=west, default=west)")
print("Enter 'q' to quit")
print("=" * 50)

# Read-eval-print loop: each line is an optional dialect flag followed by the
# Syriac text to transliterate.
while True:
    try:
        user_input = input("\n> ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D, exhausted piped stdin or Ctrl-C: leave quietly instead of
        # dumping a traceback; the bare print ends the prompt line.
        print()
        break
    if user_input.lower() == "q":
        break

    # A leading "e" or "w" followed by a space selects the dialect. The flag
    # is matched case-insensitively, consistent with the quit command.
    flag, separator, remainder = user_input.partition(" ")
    flag = flag.lower()
    if separator and flag in ("e", "w"):
        dialect = "east" if flag == "e" else "west"
        # Drop any extra spaces between the flag and the text so they do not
        # end up in the model prompt.
        text = remainder.strip()
    else:
        dialect = "west"
        text = user_input

    if not text:
        continue

    result = transliterate(text, dialect)
    dialect_name = "East" if dialect == "east" else "West"
    print(f" [{dialect_name}] {rtl(text)} → {result}")
|
|