# File size: 3,802 Bytes

from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline

# Hugging Face Hub repository ID handed to from_pretrained() further down
model_path = "crossroderick/aramt5"

# Unicode bidirectional isolate controls used to wrap RTL (Syriac) text
RLI = "\N{RIGHT-TO-LEFT ISOLATE}"  # U+2067
PDI = "\N{POP DIRECTIONAL ISOLATE}"  # U+2069


def rtl(text: str) -> str:
    """Return ``text`` enclosed between the RLI and PDI control characters.

    The isolate pair makes a bidi-aware terminal lay the wrapped text out
    right-to-left without affecting the surrounding output.
    """
    return "{}{}{}".format(RLI, text, PDI)


# Fetch the tokeniser and model weights from model_path, then bundle both
# into a text2text-generation pipeline
print("Loading model and tokeniser...")
tokeniser = AutoTokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokeniser,
)
print("Model loaded successfully.", end="\n\n")


def transliterate(text: str, dialect: str = "west") -> str:
    """
    Transliterate Syriac text to Latin script.

    Args:
        text: Syriac text to transliterate.
        dialect: 'west' for West Syriac (Serto) or 'east' for East Syriac
            (Madnḥaya). Matched case-insensitively, ignoring surrounding
            whitespace; any other value falls back to West Syriac.

    Returns:
        Transliterated Latin text, or an empty string when ``text`` is empty
        or contains only whitespace (the model is not invoked in that case).
    """
    # Nothing to transliterate: skip the model call instead of feeding it a
    # bare task prefix and returning whatever it generates for that.
    if not text or not text.strip():
        return ""

    # Normalise so that "East" / " east " select the East prompt rather than
    # silently falling through to West.
    normalised = dialect.strip().lower() if isinstance(dialect, str) else ""
    if normalised == "east":
        prefix = "Syriac2EastLatin: "
    else:
        prefix = "Syriac2WestLatin: "

    # Sampling disabled with num_beams=4; output capped at 128 new tokens.
    generations = pipe(
        f"{prefix}{text}",
        max_new_tokens=128,
        num_beams=4,
        do_sample=False,
    )
    # The pipeline result is indexed as a list of dicts; take the first
    # candidate's generated text.
    return generations[0]["generated_text"]


# Demo inputs as (Syriac text, dialect, description) rows - a mix of single
# words, proclitic forms and short phrases
_SAMPLE_ROWS = (
    # Single words - West Syriac
    ("ܫܠܡܐ", "west", "Peace (West)"),
    ("ܐܠܗܐ", "west", "God (West)"),
    ("ܡܫܝܚܐ", "west", "Messiah/Christ (West)"),
    ("ܡܠܟܐ", "west", "King (West)"),
    ("ܒܝܬܐ", "west", "House (West)"),
    # Single words - East Syriac
    ("ܫܠܡܐ", "east", "Peace (East)"),
    ("ܐܠܗܐ", "east", "God (East)"),
    ("ܡܫܝܚܐ", "east", "Messiah/Christ (East)"),
    # Proclitic examples
    ("ܒܒܝܬܐ", "west", "In the house (West)"),
    ("ܘܡܠܟܐ", "west", "And the king (West)"),
    ("ܕܐܠܗܐ", "west", "Of God (West)"),
    ("ܠܡܠܟܐ", "west", "To the king (West)"),
    # Short phrases
    ("ܐܒܘܢ ܕܒܫܡܝܐ", "west", "Our Father in heaven (West)"),
    ("ܫܠܡܐ ܥܡܟ", "west", "Peace be with you (West)"),
)

# Expand each row into the dict shape the demo loop reads
test_samples = [
    {"text": syriac, "dialect": variety, "description": label}
    for syriac, variety, label in _SAMPLE_ROWS
]

# Header for the canned demo run
banner = "=" * 50
print(banner)
print("AramT5 Syriac Transliteration Test")
print(banner)

# Transliterate each sample, then show the description, the RTL-wrapped
# source text and the Latin output
for entry in test_samples:
    syriac_text = entry["text"]
    latin = transliterate(syriac_text, entry["dialect"])
    print(f"\n{entry['description']}:")
    print(f"  Syriac: {rtl(syriac_text)}")
    print(f"  Latin:  {latin}")

print("\n" + "=" * 50)
print("Interactive mode - enter Syriac text to transliterate")
print("Format: [e/w] text (e=east, w=west, default=west)")
print("Enter 'q' to quit")
print("=" * 50)

# Single-letter input prefix -> dialect name passed to transliterate()
_DIALECT_FLAGS = {"e": "east", "w": "west"}

# Read-transliterate-print loop; runs until the user enters 'q' or input ends
while True:
    try:
        user_input = input("\n> ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / closed stdin / Ctrl-C at the prompt: leave quietly instead
        # of dying with a traceback.
        print()
        break

    if user_input.lower() == "q":
        break

    # Parse the optional dialect prefix. It is matched case-insensitively,
    # like the quit command, and must be followed by whitespace and some text;
    # a lone "e" or "w" is treated as text, as before.
    parts = user_input.split(maxsplit=1)
    if len(parts) == 2 and parts[0].lower() in _DIALECT_FLAGS:
        dialect = _DIALECT_FLAGS[parts[0].lower()]
        text = parts[1]  # split() already dropped the separating whitespace
    else:
        dialect = "west"
        text = user_input

    # Blank lines are ignored and simply re-prompt
    if text:
        result = transliterate(text, dialect)
        dialect_name = "East" if dialect == "east" else "West"
        print(f"  [{dialect_name}] {rtl(text)} → {result}")