"""Sample run and interactive prompt for the AramT5 Syriac-to-Latin model.

Run as a script, this loads the model from the Hugging Face Hub,
transliterates a fixed list of sample words and phrases, then reads
further input from the terminal until the user quits. Importing the module
has no side effects; the model is loaded on first use.
"""

from functools import lru_cache
from typing import Tuple

# HF Hub path config
model_path = "crossroderick/aramt5"

# Unicode directional formatting for RTL text (Syriac)
RLI = "\u2067"  # Right-to-Left Isolate
PDI = "\u2069"  # Pop Directional Isolate

# Task prefixes prepended to the input text, one per dialect.
_EAST_PREFIX = "Syriac2EastLatin: "
_WEST_PREFIX = "Syriac2WestLatin: "

# One-letter markers accepted at the start of an interactive line.
_INPUT_DIALECTS = {"e": "east", "w": "west"}
_DEFAULT_DIALECT = "west"


def rtl(text: str) -> str:
    """Wrap text in RTL isolate markers for correct terminal display."""
    return f"{RLI}{text}{PDI}"


@lru_cache(maxsize=1)
def _get_pipeline():
    """Load the tokeniser and model and return the generation pipeline.

    The result is cached, so the load happens at most once per process;
    if loading raises, nothing is cached and the next call retries.
    """
    # Imported here rather than at module level so that importing this file
    # (for example to reuse ``rtl``) does not require loading transformers.
    from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline

    print("Loading model and tokeniser...")
    tokeniser = AutoTokenizer.from_pretrained(model_path)
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    pipe = pipeline("text2text-generation", model=model, tokenizer=tokeniser)
    print("Model loaded successfully.\n")
    return pipe


def transliterate(text: str, dialect: str = "west") -> str:
    """
    Transliterate Syriac text to Latin script.

    Args:
        text: Syriac text to transliterate
        dialect: 'east' for East Syriac (Madnḥaya); any other value,
            including the default 'west', selects West Syriac (Serto)

    Returns:
        Transliterated Latin text
    """
    # Only the exact string "east" selects the East prefix; everything else
    # falls back to West.
    prefix = _EAST_PREFIX if dialect == "east" else _WEST_PREFIX
    input_prompt = f"{prefix}{text}"

    # Simple generation - beam search without sampling, with output capped
    # at 128 new tokens.
    outputs = _get_pipeline()(
        input_prompt,
        max_new_tokens=128,
        num_beams=4,
        do_sample=False,
    )
    return outputs[0]["generated_text"]


def parse_request(user_input: str) -> Tuple[str, str]:
    """Split one interactive input line into ``(dialect, text)``.

    A leading ``e`` or ``w`` (either case) followed by a space selects the
    East or West dialect and is removed from the text. Without such a
    marker the whole line is the text and the dialect is West. The returned
    text has surrounding whitespace stripped, so extra spaces after the
    marker never reach the model prompt.
    """
    marker, separator, remainder = user_input.partition(" ")
    # A bare "e" or "w" with no following space is ordinary text, not a marker.
    dialect = _INPUT_DIALECTS.get(marker.lower()) if separator else None
    if dialect is None:
        return _DEFAULT_DIALECT, user_input.strip()
    return dialect, remainder.strip()


# Test examples - mix of words and sentences
test_samples = [
    # Single words - West Syriac
    {"text": "ܫܠܡܐ", "dialect": "west", "description": "Peace (West)"},
    {"text": "ܐܠܗܐ", "dialect": "west", "description": "God (West)"},
    {"text": "ܡܫܝܚܐ", "dialect": "west", "description": "Messiah/Christ (West)"},
    {"text": "ܡܠܟܐ", "dialect": "west", "description": "King (West)"},
    {"text": "ܒܝܬܐ", "dialect": "west", "description": "House (West)"},
    # Single words - East Syriac
    {"text": "ܫܠܡܐ", "dialect": "east", "description": "Peace (East)"},
    {"text": "ܐܠܗܐ", "dialect": "east", "description": "God (East)"},
    {"text": "ܡܫܝܚܐ", "dialect": "east", "description": "Messiah/Christ (East)"},
    # Proclitic examples
    {"text": "ܒܒܝܬܐ", "dialect": "west", "description": "In the house (West)"},
    {"text": "ܘܡܠܟܐ", "dialect": "west", "description": "And the king (West)"},
    {"text": "ܕܐܠܗܐ", "dialect": "west", "description": "Of God (West)"},
    {"text": "ܠܡܠܟܐ", "dialect": "west", "description": "To the king (West)"},
    # Short phrases
    {
        "text": "ܐܒܘܢ ܕܒܫܡܝܐ",
        "dialect": "west",
        "description": "Our Father in heaven (West)",
    },
    {"text": "ܫܠܡܐ ܥܡܟ", "dialect": "west", "description": "Peace be with you (West)"},
]


def run_samples() -> None:
    """Transliterate every entry of ``test_samples`` and print the results."""
    print("=" * 50)
    print("AramT5 Syriac Transliteration Test")
    print("=" * 50)

    for sample in test_samples:
        result = transliterate(sample["text"], sample["dialect"])
        print(f"\n{sample['description']}:")
        print(f" Syriac: {rtl(sample['text'])}")
        print(f" Latin: {result}")


def interactive_loop() -> None:
    """Read lines from stdin and print their transliteration until quit.

    The loop ends when the user enters ``q`` (either case), when stdin
    reaches end-of-file, or on Ctrl-C.
    """
    print("\n" + "=" * 50)
    print("Interactive mode - enter Syriac text to transliterate")
    print("Format: [e/w] text (e=east, w=west, default=west)")
    print("Enter 'q' to quit")
    print("=" * 50)

    while True:
        try:
            user_input = input("\n> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed or exhausted stdin, or Ctrl-C: leave quietly instead of
            # ending the session with a traceback.
            print()
            break

        if user_input.lower() == "q":
            break

        dialect, text = parse_request(user_input)
        if not text:
            continue

        result = transliterate(text, dialect)
        dialect_name = "East" if dialect == "east" else "West"
        print(f" [{dialect_name}] {rtl(text)} → {result}")


def main() -> None:
    """Load the model, run the sample set, then start the interactive loop."""
    # Load up front so the loading messages appear before the test header.
    _get_pipeline()
    run_samples()
    interactive_loop()


if __name__ == "__main__":
    main()