Rogaton Claude committed on
Commit ·
9ebe8e4
1
Parent(s): d7507b9
Load models directly instead of using pipeline API
Browse files
- Use AutoTokenizer and AutoModelForSeq2SeqLM directly
- Implement Coptic-Greek character mapping from handler.py
- Add dialect tags (з for Sahidic, б for Bohairic)
- Properly preprocess input text before translation
- Fixes pipeline_tag and task type errors
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
app.py
CHANGED
|
@@ -5,7 +5,8 @@ Supports Coptic↔English translation using megalaa models
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import gradio as gr
|
| 8 |
-
from transformers import
|
|
|
|
| 9 |
|
| 10 |
# Coptic alphabet for virtual keyboard
|
| 11 |
COPTIC_LETTERS = [
|
|
@@ -14,31 +15,61 @@ COPTIC_LETTERS = [
|
|
| 14 |
'ϣ', 'ϥ', 'ϧ', 'ϩ', 'ϫ', 'ϭ', 'ϯ'
|
| 15 |
]
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
# Model caching
|
| 18 |
-
|
| 19 |
-
|
|
|
|
| 20 |
|
| 21 |
def load_coptic_to_english():
|
| 22 |
-
"""Load Coptic → English translation
|
| 23 |
-
global
|
| 24 |
-
if
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
return coptic_to_english_pipe
|
| 31 |
|
| 32 |
def load_english_to_coptic():
|
| 33 |
-
"""Load English → Coptic translation
|
| 34 |
-
global
|
| 35 |
-
if
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
return english_to_coptic_pipe
|
| 42 |
|
| 43 |
def translate_coptic_to_english(text, dialect):
|
| 44 |
"""Translate Coptic to English"""
|
|
@@ -46,15 +77,30 @@ def translate_coptic_to_english(text, dialect):
|
|
| 46 |
return "Please enter Coptic text to translate."
|
| 47 |
|
| 48 |
try:
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
-
#
|
| 52 |
if dialect == "Bohairic":
|
| 53 |
-
|
| 54 |
else:
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
return result['translation']
|
| 58 |
except Exception as e:
|
| 59 |
return f"Translation error: {str(e)}"
|
| 60 |
|
|
@@ -64,15 +110,28 @@ def translate_english_to_coptic(text, dialect):
|
|
| 64 |
return "Please enter English text to translate."
|
| 65 |
|
| 66 |
try:
|
| 67 |
-
|
| 68 |
|
| 69 |
-
#
|
| 70 |
if dialect == "Bohairic":
|
| 71 |
-
|
| 72 |
else:
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
-
return result['translation']
|
| 76 |
except Exception as e:
|
| 77 |
return f"Translation error: {str(e)}"
|
| 78 |
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import gradio as gr
|
| 8 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| 9 |
+
import torch
|
| 10 |
|
| 11 |
# Coptic alphabet for virtual keyboard
|
| 12 |
COPTIC_LETTERS = [
|
|
|
|
| 15 |
'ϣ', 'ϥ', 'ϧ', 'ϩ', 'ϫ', 'ϭ', 'ϯ'
|
| 16 |
]
|
| 17 |
|
| 18 |
+
# Coptic-Greek character mappings (from handler.py).
# Keys are lower-case Coptic letters; values are the transcription symbols
# used in their place. Every value is a single character except "ti".
COPTIC_TO_GREEK = {
    "ⲁ": "α",
    "ⲃ": "β",
    "ⲅ": "γ",
    "ⲇ": "δ",
    "ⲉ": "ε",
    "ⲋ": "ϛ",
    "ⲍ": "ζ",
    "ⲏ": "η",
    "ⲑ": "θ",
    "ⲓ": "ι",
    "ⲕ": "κ",
    "ⲗ": "λ",
    "ⲙ": "μ",
    "ⲛ": "ν",
    "ⲝ": "ξ",
    "ⲟ": "ο",
    "ⲡ": "π",
    "ⲣ": "ρ",
    "ⲥ": "σ",
    "ⲧ": "τ",
    "ⲩ": "υ",
    "ⲫ": "φ",
    "ⲭ": "χ",
    "ⲯ": "ψ",
    "ⲱ": "ω",
    "ϣ": "ʃ",
    "ϥ": "f",
    "ϧ": "x",
    "ϩ": "h",
    "ϫ": "ɟ",
    "ϭ": "c",
    "ϯ": "ti",
}

# Inverse table (transcription symbol -> Coptic letter), in the same order.
GREEK_TO_COPTIC = dict(zip(COPTIC_TO_GREEK.values(), COPTIC_TO_GREEK.keys()))
|
| 28 |
+
|
| 29 |
+
def greekify(coptic_text):
    """Convert Coptic Unicode to Greek transcription.

    Each character is lower-cased and replaced by its COPTIC_TO_GREEK
    entry; characters without an entry are kept in lower-cased form.
    """
    # Lower-case one character at a time: lower-casing the whole string
    # first may apply context-dependent rules and give a different result.
    lowered = (symbol.lower() for symbol in coptic_text)
    return "".join(COPTIC_TO_GREEK.get(symbol, symbol) for symbol in lowered)
|
| 35 |
+
|
| 36 |
+
def degreekify(greek_text):
    """Convert Greek transcription back to Coptic Unicode.

    The text is scanned left to right. The two-character sequence "ti"
    (in any letter case) is replaced by its single GREEK_TO_COPTIC entry;
    every other character is looked up on its own.

    Lookups are case-insensitive, matching how greekify() case-folds its
    input, so an upper-case transcription letter is still converted.
    Characters with no entry are copied through unchanged.

    Args:
        greek_text: Transcribed text to convert.

    Returns:
        The converted string; "" for empty input.
    """
    coptic_chars = []
    pos = 0
    total = len(greek_text)
    while pos < total:
        # A slice shorter than two characters can never equal "ti", so no
        # separate end-of-string check is needed.
        pair = greek_text[pos:pos + 2]
        if pair.lower() == "ti":
            coptic_chars.append(GREEK_TO_COPTIC.get("ti", pair))
            pos += 2
            continue

        symbol = greek_text[pos]
        # Look up the lower-case form, but fall back to the original
        # character so unmapped text keeps its case.
        coptic_chars.append(GREEK_TO_COPTIC.get(symbol.lower(), symbol))
        pos += 1
    return "".join(coptic_chars)
|
| 48 |
+
|
| 49 |
# Model caching
|
| 50 |
+
coptic_to_english_model = None
|
| 51 |
+
english_to_coptic_model = None
|
| 52 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 53 |
|
| 54 |
def load_coptic_to_english():
    """Load Coptic → English translation model.

    The first call loads the tokenizer and seq2seq model from
    "megalaa/coptic-english-translator", moves the model to the
    module-level ``device`` and stores the pair in the module-level
    cache; later calls return the cached pair.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    global coptic_to_english_model

    # Already loaded: reuse the cached pair.
    if coptic_to_english_model is not None:
        return coptic_to_english_model

    repo_id = "megalaa/coptic-english-translator"
    tok = AutoTokenizer.from_pretrained(repo_id)
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(repo_id)
    seq2seq = seq2seq.to(device)
    coptic_to_english_model = (tok, seq2seq)
    return coptic_to_english_model
|
|
|
|
| 63 |
|
| 64 |
def load_english_to_coptic():
    """Load English → Coptic translation model.

    The first call loads the tokenizer and seq2seq model from
    "megalaa/english-coptic-translator", moves the model to the
    module-level ``device`` and stores the pair in the module-level
    cache; later calls return the cached pair.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    global english_to_coptic_model

    # Already loaded: reuse the cached pair.
    if english_to_coptic_model is not None:
        return english_to_coptic_model

    repo_id = "megalaa/english-coptic-translator"
    tok = AutoTokenizer.from_pretrained(repo_id)
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(repo_id)
    seq2seq = seq2seq.to(device)
    english_to_coptic_model = (tok, seq2seq)
    return english_to_coptic_model
|
|
|
|
| 73 |
|
| 74 |
def translate_coptic_to_english(text, dialect):
|
| 75 |
"""Translate Coptic to English"""
|
|
|
|
| 77 |
return "Please enter Coptic text to translate."
|
| 78 |
|
| 79 |
try:
|
| 80 |
+
tokenizer, model = load_coptic_to_english()
|
| 81 |
+
|
| 82 |
+
# Preprocess: convert Coptic to Greek transcription
|
| 83 |
+
greek_text = greekify(text)
|
| 84 |
|
| 85 |
+
# Add dialect tag (from handler.py)
|
| 86 |
if dialect == "Bohairic":
|
| 87 |
+
greek_text = "б " + greek_text # Bohairic tag
|
| 88 |
else:
|
| 89 |
+
greek_text = "з " + greek_text # Sahidic tag
|
| 90 |
+
|
| 91 |
+
# Tokenize and generate
|
| 92 |
+
inputs = tokenizer(greek_text, return_tensors="pt", padding=True).to(device)
|
| 93 |
+
outputs = model.generate(
|
| 94 |
+
**inputs,
|
| 95 |
+
max_new_tokens=128,
|
| 96 |
+
num_beams=5,
|
| 97 |
+
early_stopping=True
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
# Decode
|
| 101 |
+
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 102 |
+
return translation
|
| 103 |
|
|
|
|
| 104 |
except Exception as e:
|
| 105 |
return f"Translation error: {str(e)}"
|
| 106 |
|
|
|
|
| 110 |
return "Please enter English text to translate."
|
| 111 |
|
| 112 |
try:
|
| 113 |
+
tokenizer, model = load_english_to_coptic()
|
| 114 |
|
| 115 |
+
# Add dialect tag
|
| 116 |
if dialect == "Bohairic":
|
| 117 |
+
input_text = "б " + text # Bohairic tag
|
| 118 |
else:
|
| 119 |
+
input_text = "з " + text # Sahidic tag
|
| 120 |
+
|
| 121 |
+
# Tokenize and generate
|
| 122 |
+
inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
|
| 123 |
+
outputs = model.generate(
|
| 124 |
+
**inputs,
|
| 125 |
+
max_new_tokens=128,
|
| 126 |
+
num_beams=5,
|
| 127 |
+
early_stopping=True
|
| 128 |
+
)
|
| 129 |
+
|
| 130 |
+
# Decode and convert back to Coptic
|
| 131 |
+
greek_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 132 |
+
coptic_output = degreekify(greek_output)
|
| 133 |
+
return coptic_output
|
| 134 |
|
|
|
|
| 135 |
except Exception as e:
|
| 136 |
return f"Translation error: {str(e)}"
|
| 137 |
|