Spaces:

Norelad
/

coptic-translation-interface

Sleeping

Rogaton Claude committed on Apr 9

Commit

9ebe8e4

1 Parent(s): d7507b9

Load models directly instead of using pipeline API

- Use AutoTokenizer and AutoModelForSeq2SeqLM directly
- Implement Coptic-Greek character mapping from handler.py
- Add dialect tags (з for Sahidic, б for Bohairic)
- Properly preprocess input text before translation
- Fixes pipeline_tag and task type errors

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1) hide show

app.py +90 -31

app.py CHANGED Viewed

@@ -5,7 +5,8 @@ Supports Coptic↔English translation using megalaa models
 """
 import gradio as gr
-from transformers import pipeline
 # Coptic alphabet for virtual keyboard
 COPTIC_LETTERS = [
@@ -14,31 +15,61 @@ COPTIC_LETTERS = [
     'ϣ', 'ϥ', 'ϧ', 'ϩ', 'ϫ', 'ϭ', 'ϯ'
 ]
 # Model caching
-coptic_to_english_pipe = None
-english_to_coptic_pipe = None
 def load_coptic_to_english():
-    """Load Coptic → English translation pipeline"""
-    global coptic_to_english_pipe
-    if coptic_to_english_pipe is None:
-        coptic_to_english_pipe = pipeline(
-            task="text2text-generation",
-            model="megalaa/coptic-english-translator",
-            trust_remote_code=True
-        )
-    return coptic_to_english_pipe
 def load_english_to_coptic():
-    """Load English → Coptic translation pipeline"""
-    global english_to_coptic_pipe
-    if english_to_coptic_pipe is None:
-        english_to_coptic_pipe = pipeline(
-            task="text2text-generation",
-            model="megalaa/english-coptic-translator",
-            trust_remote_code=True
-        )
-    return english_to_coptic_pipe
 def translate_coptic_to_english(text, dialect):
     """Translate Coptic to English"""
@@ -46,15 +77,30 @@ def translate_coptic_to_english(text, dialect):
         return "Please enter Coptic text to translate."
     try:
-        pipe = load_coptic_to_english()
-        # Use from_bohairic parameter if Bohairic dialect selected
         if dialect == "Bohairic":
-            result = pipe(text, from_bohairic=True)
         else:
-            result = pipe(text)
-        return result['translation']
     except Exception as e:
         return f"Translation error: {str(e)}"
@@ -64,15 +110,28 @@ def translate_english_to_coptic(text, dialect):
         return "Please enter English text to translate."
     try:
-        pipe = load_english_to_coptic()
-        # Use to_bohairic parameter if Bohairic dialect selected
         if dialect == "Bohairic":
-            result = pipe(text, to_bohairic=True)
         else:
-            result = pipe(text)
-        return result['translation']
     except Exception as e:
         return f"Translation error: {str(e)}"

 """
 import gradio as gr
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import torch
 # Coptic alphabet for virtual keyboard
 COPTIC_LETTERS = [
     'ϣ', 'ϥ', 'ϧ', 'ϩ', 'ϫ', 'ϭ', 'ϯ'
 ]
+# Coptic-Greek character mappings (from handler.py)
+# Maps each Coptic letter to the symbol used in the Greek-based transcription.
+# The first four rows map to Greek letters; the last row maps the remaining
+# Coptic letters to Latin/IPA symbols. "ti" is the only two-character value.
+COPTIC_TO_GREEK = {
+    "ⲁ": "α", "ⲃ": "β", "ⲅ": "γ", "ⲇ": "δ", "ⲉ": "ε", "ⲋ": "ϛ",
+    "ⲍ": "ζ", "ⲏ": "η", "ⲑ": "θ", "ⲓ": "ι", "ⲕ": "κ", "ⲗ": "λ",
+    "ⲙ": "μ", "ⲛ": "ν", "ⲝ": "ξ", "ⲟ": "ο", "ⲡ": "π", "ⲣ": "ρ",
+    "ⲥ": "σ", "ⲧ": "τ", "ⲩ": "υ", "ⲫ": "φ", "ⲭ": "χ", "ⲯ": "ψ", "ⲱ": "ω",
+    "ϣ": "ʃ", "ϥ": "f", "ϧ": "x", "ϩ": "h", "ϫ": "ɟ", "ϭ": "c", "ϯ": "ti"
+}
+# Reverse lookup (transcription symbol -> Coptic letter), built by inverting
+# COPTIC_TO_GREEK.
+GREEK_TO_COPTIC = {v: k for k, v in COPTIC_TO_GREEK.items()}
+def greekify(coptic_text):
+    """Transliterate Coptic Unicode text into the Greek transcription.
+
+    Every character is lowercased individually and looked up in
+    COPTIC_TO_GREEK; a character with no entry is kept in its lowercased
+    form.
+    """
+    # Lowercase one character at a time (not the whole string at once).
+    lowered = (letter.lower() for letter in coptic_text)
+    return "".join(COPTIC_TO_GREEK.get(letter, letter) for letter in lowered)
+def degreekify(greek_text):
+    """Convert Greek transcription back to Coptic Unicode.
+
+    The two-letter sequence "ti" is matched first, case-insensitively,
+    because it is the only multi-character key in GREEK_TO_COPTIC. Every
+    other character is looked up on its own, first as written and then in
+    lowercase, so uppercase transcription letters are converted the same
+    way the "ti" digraph already is. Characters with no mapping are passed
+    through unchanged.
+
+    Returns the converted string; an empty input yields an empty string.
+    """
+    result = []
+    i = 0
+    length = len(greek_text)
+    while i < length:
+        pair = greek_text[i:i + 2]
+        # A one-character slice at the end of the text can never equal 'ti'.
+        if pair.lower() == 'ti':
+            result.append(GREEK_TO_COPTIC.get('ti', pair))
+            i += 2
+            continue
+        char = greek_text[i]
+        coptic = GREEK_TO_COPTIC.get(char)
+        if coptic is None:
+            # Fall back to the lowercase form; keep the original character
+            # when neither form has a mapping.
+            coptic = GREEK_TO_COPTIC.get(char.lower(), char)
+        result.append(coptic)
+        i += 1
+    return ''.join(result)
 # Model caching
+# Each cache starts as None and is filled with a (tokenizer, model) tuple
+# the first time its loader function runs.
+coptic_to_english_model = None
+english_to_coptic_model = None
+# Use the GPU when CUDA is available, otherwise fall back to the CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
 def load_coptic_to_english():
+    """Return the cached Coptic → English (tokenizer, model) pair.
+
+    On the first call the tokenizer and seq2seq model are loaded, the model
+    is moved to `device`, and the pair is stored in the module-level cache.
+    Later calls return the cached pair without reloading.
+    """
+    global coptic_to_english_model
+    if coptic_to_english_model is not None:
+        return coptic_to_english_model
+    repo_id = "megalaa/coptic-english-translator"
+    tokenizer = AutoTokenizer.from_pretrained(repo_id)
+    model = AutoModelForSeq2SeqLM.from_pretrained(repo_id).to(device)
+    coptic_to_english_model = (tokenizer, model)
+    return coptic_to_english_model
 def load_english_to_coptic():
+    """Return the cached English → Coptic (tokenizer, model) pair.
+
+    On the first call the tokenizer and seq2seq model are loaded, the model
+    is moved to `device`, and the pair is stored in the module-level cache.
+    Later calls return the cached pair without reloading.
+    """
+    global english_to_coptic_model
+    if english_to_coptic_model is not None:
+        return english_to_coptic_model
+    repo_id = "megalaa/english-coptic-translator"
+    tokenizer = AutoTokenizer.from_pretrained(repo_id)
+    model = AutoModelForSeq2SeqLM.from_pretrained(repo_id).to(device)
+    english_to_coptic_model = (tokenizer, model)
+    return english_to_coptic_model
 def translate_coptic_to_english(text, dialect):
     """Translate Coptic to English"""
         return "Please enter Coptic text to translate."
     try:
+        tokenizer, model = load_coptic_to_english()
+        # Preprocess: convert Coptic to Greek transcription
+        greek_text = greekify(text)
+        # Add dialect tag (from handler.py)
         if dialect == "Bohairic":
+            greek_text = "б " + greek_text  # Bohairic tag
         else:
+            greek_text = "з " + greek_text  # Sahidic tag
+        # Tokenize and generate
+        inputs = tokenizer(greek_text, return_tensors="pt", padding=True).to(device)
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=128,
+            num_beams=5,
+            early_stopping=True
+        )
+        # Decode
+        translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        return translation
     except Exception as e:
         return f"Translation error: {str(e)}"
         return "Please enter English text to translate."
     try:
+        tokenizer, model = load_english_to_coptic()
+        # Add dialect tag
         if dialect == "Bohairic":
+            input_text = "б " + text  # Bohairic tag
         else:
+            input_text = "з " + text  # Sahidic tag
+        # Tokenize and generate
+        inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=128,
+            num_beams=5,
+            early_stopping=True
+        )
+        # Decode and convert back to Coptic
+        greek_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        coptic_output = degreekify(greek_output)
+        return coptic_output
     except Exception as e:
         return f"Translation error: {str(e)}"