#!/usr/bin/env python3
"""
Coptic Translation Interface - Hugging Face Space
Supports Coptic↔English translation using megalaa models
"""

import json
import logging
import tempfile
from pathlib import Path
from typing import Optional

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

logger = logging.getLogger(__name__)

# Coptic alphabet for virtual keyboard
COPTIC_LETTERS = [
    'ⲁ', 'ⲃ', 'ⲅ', 'ⲇ', 'ⲉ', 'ⲍ', 'ⲏ', 'ⲑ',
    'ⲓ', 'ⲕ', 'ⲗ', 'ⲙ', 'ⲛ', 'ⲝ', 'ⲟ', 'ⲡ',
    'ⲣ', 'ⲥ', 'ⲧ', 'ⲩ', 'ⲫ', 'ⲭ', 'ⲯ', 'ⲱ',
    'ϣ', 'ϥ', 'ϧ', 'ϩ', 'ϫ', 'ϭ', 'ϯ',
]

# Number of keys per row of the virtual keyboard.
KEYBOARD_ROW_SIZE = 8

# Coptic-Greek character mappings (from handler.py)
COPTIC_TO_GREEK = {
    "ⲁ": "α", "ⲃ": "β", "ⲅ": "γ", "ⲇ": "δ", "ⲉ": "ε", "ⲋ": "ϛ",
    "ⲍ": "ζ", "ⲏ": "η", "ⲑ": "θ", "ⲓ": "ι", "ⲕ": "κ", "ⲗ": "λ",
    "ⲙ": "μ", "ⲛ": "ν", "ⲝ": "ξ", "ⲟ": "ο", "ⲡ": "π", "ⲣ": "ρ",
    "ⲥ": "σ", "ⲧ": "τ", "ⲩ": "υ", "ⲫ": "φ", "ⲭ": "χ", "ⲯ": "ψ",
    "ⲱ": "ω", "ϣ": "ʃ", "ϥ": "f", "ϧ": "x", "ϩ": "h", "ϫ": "ɟ",
    "ϭ": "c", "ϯ": "ti",
}

GREEK_TO_COPTIC = {v: k for k, v in COPTIC_TO_GREEK.items()}

# Hugging Face model identifiers for the two translation directions.
COPTIC_TO_ENGLISH_MODEL_ID = "megalaa/coptic-english-translator"
ENGLISH_TO_COPTIC_MODEL_ID = "megalaa/english-coptic-translator"

# Dialect tags prepended to the model input (from handler.py).
BOHAIRIC_TAG = "б"
SAHIDIC_TAG = "з"

# Generation settings shared by both translation directions.
MAX_NEW_TOKENS = 128
NUM_BEAMS = 5


def greekify(coptic_text: str) -> str:
    """Convert Coptic Unicode to Greek transcription.

    Every character is lowercased first; characters without an entry in
    COPTIC_TO_GREEK are passed through (lowercased).
    """
    return "".join(
        COPTIC_TO_GREEK.get(char.lower(), char.lower()) for char in coptic_text
    )


def degreekify(greek_text: str) -> str:
    """Convert Greek transcription back to Coptic Unicode.

    The two-character sequence "ti" (any case) is mapped to a single Coptic
    letter; every other character is looked up individually and passed
    through unchanged when it has no mapping.
    """
    result = []
    i = 0
    length = len(greek_text)
    while i < length:
        pair = greek_text[i:i + 2]
        # "ti" is the only multi-character transcription, so it has to be
        # matched before falling back to the single-character lookup.
        if i < length - 1 and pair.lower() == 'ti':
            result.append(GREEK_TO_COPTIC.get('ti', pair))
            i += 2
        else:
            char = greek_text[i]
            result.append(GREEK_TO_COPTIC.get(char, char))
            i += 1
    return ''.join(result)


# Model caching
coptic_to_english_model = None
english_to_coptic_model = None
device = "cuda" if torch.cuda.is_available() else "cpu"


def _load_translation_pair(model_id: str):
    """Load the tokenizer and seq2seq model for *model_id* onto `device`."""
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    model = model.to(device)
    return tokenizer, model


def load_coptic_to_english():
    """Load Coptic → English translation model.

    Returns a ``(tokenizer, model)`` tuple; the pair is loaded on first use
    and reused afterwards.
    """
    global coptic_to_english_model
    if coptic_to_english_model is None:
        coptic_to_english_model = _load_translation_pair(COPTIC_TO_ENGLISH_MODEL_ID)
    return coptic_to_english_model


def load_english_to_coptic():
    """Load English → Coptic translation model.

    Returns a ``(tokenizer, model)`` tuple; the pair is loaded on first use
    and reused afterwards.
    """
    global english_to_coptic_model
    if english_to_coptic_model is None:
        english_to_coptic_model = _load_translation_pair(ENGLISH_TO_COPTIC_MODEL_ID)
    return english_to_coptic_model


def _with_dialect_tag(text: str, dialect: str) -> str:
    """Prefix *text* with the dialect tag; anything but "Bohairic" is Sahidic."""
    tag = BOHAIRIC_TAG if dialect == "Bohairic" else SAHIDIC_TAG
    return f"{tag} {text}"


def _generate(tokenizer, model, text: str) -> str:
    """Tokenize *text*, run beam-search generation and decode the first result."""
    inputs = tokenizer(text, return_tensors="pt", padding=True).to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        num_beams=NUM_BEAMS,
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def translate_coptic_to_english(text: str, dialect: str) -> str:
    """Translate Coptic to English.

    Returns the translation, a prompt when *text* is empty, or a
    "Translation error: ..." message when anything fails (the UI shows the
    returned string either way).
    """
    if not text or not text.strip():
        return "Please enter Coptic text to translate."

    try:
        tokenizer, model = load_coptic_to_english()
        # Preprocess: the model input is the Greek transcription plus a dialect tag.
        greek_text = _with_dialect_tag(greekify(text), dialect)
        return _generate(tokenizer, model, greek_text)
    except Exception as e:  # UI boundary: report instead of crashing the handler
        logger.exception("Coptic → English translation failed")
        return f"Translation error: {str(e)}"


def translate_english_to_coptic(text: str, dialect: str) -> str:
    """Translate English to Coptic.

    Returns the Coptic text, a prompt when *text* is empty, or a
    "Translation error: ..." message when anything fails.
    """
    if not text or not text.strip():
        return "Please enter English text to translate."

    try:
        tokenizer, model = load_english_to_coptic()
        greek_output = _generate(tokenizer, model, _with_dialect_tag(text, dialect))
        # The model output is Greek transcription; convert back to Coptic.
        return degreekify(greek_output)
    except Exception as e:  # UI boundary: report instead of crashing the handler
        logger.exception("English → Coptic translation failed")
        return f"Translation error: {str(e)}"


def add_letter(current_text: Optional[str], letter: str) -> str:
    """Add a Coptic letter to the current text"""
    return current_text + letter if current_text else letter


def add_space(current_text: Optional[str]) -> str:
    """Add a space to the current text"""
    return current_text + " " if current_text else " "


def backspace(current_text: Optional[str]) -> str:
    """Remove last character from current text"""
    return current_text[:-1] if current_text else ""


def clear_text() -> str:
    """Clear all text"""
    return ""


def load_test_corpus():
    """Load the comprehensive Coptic test corpus.

    Returns the parsed JSON from ``coptic_test_corpus.json`` next to this
    file, or ``None`` when that file does not exist.
    """
    corpus_path = Path(__file__).parent / "coptic_test_corpus.json"
    try:
        with open(corpus_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        return None


# Parser cache: created on first use and shared by later requests.
_parser = None


def _get_parser():
    """Return the shared CopticParserCore, creating and loading it on first use.

    The import is deferred so the translation tabs keep working when the
    parser module is not importable.
    NOTE(review): assumes a loaded CopticParserCore can be reused across
    requests — confirm against coptic_parser_core.
    """
    global _parser
    if _parser is None:
        from coptic_parser_core import CopticParserCore

        parser = CopticParserCore()
        parser.load_parser()
        _parser = parser
    return _parser


def _format_prolog_validation(validation) -> str:
    """Render the 'prolog_validation' mapping of a parse result as Markdown."""
    patterns = validation.get('patterns_detected')
    warnings = validation.get('warnings')

    parts = ["## 🔍 Prolog Validation (Walter Till Grammar)\n\n"]
    if patterns:
        parts.append("### ✅ Detected Grammatical Patterns\n\n")
        for pattern in patterns:
            if isinstance(pattern, dict) and pattern.get('is_tripartite'):
                parts.append(f"- **Tripartite Sentence**: {pattern.get('description', '')}\n")
                parts.append(f" ```\n {pattern.get('pattern', '')}\n ```\n")
            else:
                parts.append(f"- {pattern}\n")
    if warnings:
        parts.append("\n### ⚠️ Grammatical Warnings\n\n")
        parts.extend(f"- {warning}\n" for warning in warnings)
    if not warnings and not patterns:
        parts.append("✓ No grammatical issues detected\n")
    return "".join(parts)


def _write_conllu(content: str) -> str:
    """Write *content* to a fresh temporary .conllu file and return its path.

    A unique file per request keeps concurrent users from overwriting (or
    downloading) each other's export. The file is left on disk so the
    download component can serve it.
    """
    with tempfile.NamedTemporaryFile(
        'w', encoding='utf-8', prefix='coptic_parse_', suffix='.conllu', delete=False
    ) as f:
        f.write(content)
        return f.name


def parse_coptic_text(text):
    """Parse Coptic text with neural-symbolic validation.

    Returns a 3-tuple for the (parse results, Prolog validation, CoNLL-U
    download) components. The download component is revealed only when a
    file was produced and hidden again otherwise.
    """
    no_download = gr.update(value=None, visible=False)

    if not text or not text.strip():
        return "Please enter Coptic text to parse.", "", no_download

    try:
        parser = _get_parser()

        # Parse the text
        result = parser.parse_text(text)
        if not result:
            return "❌ Parsing failed. Please check input.", "", no_download

        # Format main output. Built without leading indentation so Markdown
        # does not treat the text as an indented code block.
        main_output = (
            "\n## Parse Results\n\n"
            f"**Total Sentences**: {result['total_sentences']}\n"
            f"**Total Tokens**: {result['total_tokens']}\n\n"
            "### Dependency Structure\n\n"
            f"{parser.format_table(result)}\n"
        )

        # Format Prolog validation output
        if 'prolog_validation' in result and result['prolog_validation']:
            prolog_output_text = _format_prolog_validation(result['prolog_validation'])
        else:
            prolog_output_text = "ℹ️ Prolog validation not available (requires SWI-Prolog)"

        # Create CoNLL-U file for download
        conllu_path = _write_conllu(parser.format_conllu(result))

        return main_output, prolog_output_text, gr.update(value=conllu_path, visible=True)
    except Exception as e:  # UI boundary: report instead of crashing the handler
        logger.exception("Parsing failed")
        return f"❌ Error: {str(e)}", "", no_download


# Example texts organized by category

# SAHIDIC EXAMPLES
COPTIC_EXAMPLES_SIMPLE = [
    ["ⲁⲩⲱ ⲁϥⲙⲟⲩⲧⲉ ⲉⲣⲟϥ", "Sahidic"],  # and he called him
    ["ⲁⲛⲟⲕ ⲡⲉ ⲡⲛⲟⲩⲧⲉ ⲙⲡⲉⲕⲉⲓⲱⲧ", "Sahidic"],  # I am the God of your father
    ["ⲙⲡⲣⲣ ϩⲟⲧⲉ", "Sahidic"],  # Do not be afraid
    ["ⲡϫⲟⲉⲓⲥ ⲡⲉ ⲡⲁⲛⲟⲩⲧⲉ", "Sahidic"],  # The Lord is my God
    ["ⲁϥⲃⲱⲕ ⲉϩⲣⲁⲓ ⲉⲡⲉⲣⲡⲉ", "Sahidic"],  # he went up to the temple
]

COPTIC_EXAMPLES_COMPLEX = [
    ["ⲁⲩⲱ ⲛⲧⲉⲣⲉϥⲛⲁⲩ ⲉⲡⲙⲏⲏϣⲉ ⲁϥϣⲡϩⲧⲏϥ ⲉϩⲣⲁⲓ ⲉϫⲱⲟⲩ", "Sahidic"],  # when he saw the crowd
    ["ⲉϣⲱⲡⲉ ⲇⲉ ⲁⲩⲛⲁⲩ ⲉⲣⲟϥ ⲉϥⲙⲟⲟϣⲉ ϩⲓϫⲛ ⲧⲉⲑⲁⲗⲁⲥⲥⲁ ⲁⲩϣⲧⲟⲣⲧⲣ", "Sahidic"],  # when they saw him walking
    ["ⲁⲓⲉⲓ ⲅⲁⲣ ⲉⲙⲟⲩⲧⲉ ⲁⲛ ⲉⲛⲇⲓⲕⲁⲓⲟⲥ ⲁⲗⲗⲁ ⲛⲣⲉϥⲣⲛⲟⲃⲉ", "Sahidic"],  # I came not to call the righteous
]

COPTIC_EXAMPLES_TEXTS = [
    ["ⲛⲉⲩⲛⲟⲩⲙⲏⲏϣⲉ ⲇⲉ ⲛϣⲱⲛⲉ ⲉⲩⲛⲕⲟⲧⲕ ϩⲙ ⲡⲙⲁ ⲉⲧⲙⲙⲁⲩ· ⲛϩⲁⲛⲃⲗⲗⲉ ⲙⲛ ⲛϩⲁⲛϭⲁⲗⲉ ⲙⲛ ⲛϣⲟⲩⲱⲟⲩ·", "Sahidic"],  # Healing at the pool
    ["ⲉⲓⲥ ⲡⲉⲧϫⲟ ⲁϥⲉⲓ ⲉⲃⲟⲗ ⲉϫⲟ· ⲁⲩⲱ ⲛⲧⲉⲣⲉϥϫⲟ ϩⲟⲓⲛⲉ ⲙⲉⲛ ⲁⲩϩⲉ ϩⲁⲧⲏ ⲧⲉϩⲓⲏ·", "Sahidic"],  # The Sower parable
]

# BOHAIRIC EXAMPLES
BOHAIRIC_EXAMPLES_SIMPLE = [
    ["ⲟⲩⲟϩ ⲁϥⲙⲟⲩϯ ⲉⲣⲟϥ", "Bohairic"],  # and he called him
    ["ⲁⲛⲟⲕ ⲡⲉ ⲫϯ ⲛⲧⲉ ⲡⲉⲕⲓⲱⲧ", "Bohairic"],  # I am the God of your father
    ["ⲙⲡⲉⲣⲉⲣϩⲟϯ", "Bohairic"],  # Do not be afraid
    ["ⲡϭⲟⲓⲥ ⲡⲉ ⲡⲁⲛⲟⲩϯ", "Bohairic"],  # The Lord is my God
    ["ⲁϥϣⲉⲛⲁϥ ⲉⲡϣⲱⲓ ⲉⲡⲓⲉⲣⲫⲉⲓ", "Bohairic"],  # he went up to the temple
]

BOHAIRIC_EXAMPLES_COMPLEX = [
    ["ⲟⲩⲟϩ ⲉⲧⲁϥⲛⲁⲩ ⲉⲡⲓⲙⲏϣ ⲁϥϣⲉⲛϩⲏⲧ ϧⲁⲣⲱⲟⲩ", "Bohairic"],  # when he saw the crowd
    ["ⲡϭⲟⲓⲥ ⲡⲉⲧⲁⲙⲟⲛⲓ", "Bohairic"],  # The Lord is my shepherd (Psalm 23:1)
]

BOHAIRIC_EXAMPLES_TEXTS = [
    ["ⲛⲉ ⲟⲩⲟⲛ ⲟⲩⲙⲏϣ ⲛϣⲱⲛⲓ ⲉⲩⲉⲛⲕⲟⲧ ϧⲉⲛ ⲡⲓⲙⲁ ⲉⲧⲉⲙⲙⲁⲩ· ϩⲁⲛⲃⲉⲗⲗⲉⲩ ⲛⲉⲙ ϩⲁⲛϭⲁⲗⲉⲩ ⲛⲉⲙ ϩⲁⲛϣⲁⲩⲟⲩⲱⲟⲩ·", "Bohairic"],  # Healing at the pool (Bohairic)
]

ENGLISH_EXAMPLES = [
    ["The Lord is good", "Sahidic"],
    ["I am a teacher", "Sahidic"],
    ["We give thanks to God", "Sahidic"],
    ["Do not be afraid", "Sahidic"],
    ["He went to the house", "Sahidic"],
]


def _coptic_example_section(blurb, examples, label, inputs, output):
    """Render one caption plus a clickable example table for the Coptic → English tab."""
    gr.Markdown(blurb)
    gr.Examples(
        examples=examples,
        inputs=inputs,
        outputs=output,
        fn=translate_coptic_to_english,
        cache_examples=False,
        label=label,
    )


# Create Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("""
    # 🔮 Coptic Translation Interface

    Translate between Coptic and English using specialized models from [megalaa](https://huggingface.co/megalaa):

    - **Coptic → English**: `megalaa/coptic-english-translator`
    - **English → Coptic**: `megalaa/english-coptic-translator`

    Based on neural machine translation models trained on Coptic-English parallel corpus.
    """)

    with gr.Tabs():
        # Tab 1: Coptic → English
        with gr.TabItem("Coptic → English"):
            gr.Markdown("### Translate Coptic text to English")

            with gr.Row():
                with gr.Column(scale=1):
                    cop_input = gr.Textbox(
                        label="Coptic Text",
                        placeholder="Enter Coptic text or use the virtual keyboard below...",
                        lines=8,
                        max_lines=15,
                    )
                    cop_dialect = gr.Radio(
                        choices=["Sahidic", "Bohairic"],
                        value="Sahidic",
                        label="Coptic Dialect",
                    )

                    # Virtual Coptic Keyboard
                    with gr.Group():
                        gr.Markdown("**Virtual Coptic Keyboard**")

                        # Create keyboard in rows of 8
                        for row_start in range(0, len(COPTIC_LETTERS), KEYBOARD_ROW_SIZE):
                            with gr.Row():
                                for letter in COPTIC_LETTERS[row_start:row_start + KEYBOARD_ROW_SIZE]:
                                    btn = gr.Button(letter, size="sm", scale=1)
                                    # Bind the letter as a default argument:
                                    # a plain closure would see only the last
                                    # letter of the loop.
                                    btn.click(
                                        fn=lambda current, glyph=letter: add_letter(current, glyph),
                                        inputs=[cop_input],
                                        outputs=[cop_input],
                                    )

                        with gr.Row():
                            space_btn = gr.Button("Space", size="sm", scale=2)
                            back_btn = gr.Button("⌫ Backspace", size="sm", scale=2)
                            clear_btn = gr.Button("Clear", size="sm", scale=1)

                        space_btn.click(fn=add_space, inputs=[cop_input], outputs=[cop_input])
                        back_btn.click(fn=backspace, inputs=[cop_input], outputs=[cop_input])
                        clear_btn.click(fn=clear_text, outputs=[cop_input])

                    cop_translate_btn = gr.Button(
                        "🔄 Translate to English", variant="primary", size="lg"
                    )

                with gr.Column(scale=1):
                    cop_output = gr.Textbox(
                        label="English Translation",
                        lines=8,
                        max_lines=15,
                        interactive=False,
                    )

            with gr.Accordion("📖 Example Texts", open=True):
                cop_example_inputs = [cop_input, cop_dialect]

                gr.Markdown("### Sahidic Dialect (Literary Standard)")
                _coptic_example_section(
                    "**Simple Sentences**: Basic grammatical structures",
                    COPTIC_EXAMPLES_SIMPLE, "Sahidic Simple",
                    cop_example_inputs, cop_output,
                )
                _coptic_example_section(
                    "**Complex Sentences**: Multi-clause with subordination",
                    COPTIC_EXAMPLES_COMPLEX, "Sahidic Complex",
                    cop_example_inputs, cop_output,
                )
                _coptic_example_section(
                    "**Full Texts**: Connected discourse (paragraphs)",
                    COPTIC_EXAMPLES_TEXTS, "Sahidic Texts",
                    cop_example_inputs, cop_output,
                )

                gr.Markdown("---")

                gr.Markdown("### Bohairic Dialect (Northern/Liturgical)")
                _coptic_example_section(
                    "**Simple Sentences**: Basic grammatical structures",
                    BOHAIRIC_EXAMPLES_SIMPLE, "Bohairic Simple",
                    cop_example_inputs, cop_output,
                )
                _coptic_example_section(
                    "**Complex Sentences**: Multi-clause constructions",
                    BOHAIRIC_EXAMPLES_COMPLEX, "Bohairic Complex",
                    cop_example_inputs, cop_output,
                )
                _coptic_example_section(
                    "**Full Texts**: Connected discourse",
                    BOHAIRIC_EXAMPLES_TEXTS, "Bohairic Texts",
                    cop_example_inputs, cop_output,
                )

            cop_translate_btn.click(
                fn=translate_coptic_to_english,
                inputs=[cop_input, cop_dialect],
                outputs=cop_output,
            )

        # Tab 2: English → Coptic
        with gr.TabItem("English → Coptic"):
            gr.Markdown("### Translate English text to Coptic")

            with gr.Row():
                with gr.Column(scale=1):
                    eng_input = gr.Textbox(
                        label="English Text",
                        placeholder="Enter English text...",
                        lines=8,
                        max_lines=15,
                    )
                    eng_dialect = gr.Radio(
                        choices=["Sahidic", "Bohairic"],
                        value="Sahidic",
                        label="Target Coptic Dialect",
                    )
                    eng_translate_btn = gr.Button(
                        "🔄 Translate to Coptic", variant="primary", size="lg"
                    )

                with gr.Column(scale=1):
                    eng_output = gr.Textbox(
                        label="Coptic Translation",
                        lines=8,
                        max_lines=15,
                        interactive=False,
                    )

            gr.Examples(
                examples=ENGLISH_EXAMPLES,
                inputs=[eng_input, eng_dialect],
                outputs=eng_output,
                fn=translate_english_to_coptic,
                cache_examples=False,
                label="📖 Example English Texts",
            )

            eng_translate_btn.click(
                fn=translate_english_to_coptic,
                inputs=[eng_input, eng_dialect],
                outputs=eng_output,
            )

        # Tab 3: Dependency Parsing (Neural-Symbolic)
        with gr.TabItem("📊 Dependency Analysis"):
            gr.Markdown("""
            ### Neural-Symbolic Coptic Parser

            Hybrid architecture combining:

            - **Neural**: Stanza + DiaParser for dependency parsing
            - **Symbolic**: Prolog rules implementing Walter Till's grammar
            - **Lexicon**: Crum's Coptic Dictionary integration
            """)

            with gr.Row():
                with gr.Column(scale=1):
                    parse_input = gr.Textbox(
                        label="Coptic Text to Parse",
                        placeholder="Enter Coptic text for grammatical analysis...",
                        lines=6,
                        max_lines=10,
                    )
                    parse_btn = gr.Button("🔍 Parse & Validate", variant="primary", size="lg")

                with gr.Column(scale=1):
                    parse_output = gr.Markdown(
                        label="Dependency Parse Results",
                        value="Parse results will appear here...",
                    )

                    with gr.Accordion("Prolog Validation Results", open=False):
                        prolog_output = gr.Markdown(
                            value="Grammatical validation results will appear here..."
                        )

                    with gr.Accordion("Download Options", open=False):
                        # Hidden until parse_coptic_text produces a file.
                        conllu_download = gr.File(
                            label="Download CoNLL-U Format",
                            visible=False,
                        )

            with gr.Accordion("📖 Example Texts for Parsing", open=True):
                gr.Markdown("**Simple Structures** - Test basic dependency relations")
                simple_parse_examples = [
                    "ⲁⲩⲱ ⲁϥⲙⲟⲩⲧⲉ ⲉⲣⲟϥ",  # and he called him
                    "ⲁⲛⲟⲕ ⲡⲉ ⲡⲛⲟⲩⲧⲉ ⲙⲡⲉⲕⲉⲓⲱⲧ",  # Tripartite nominal
                    "ⲡϫⲟⲉⲓⲥ ⲡⲉ ⲡⲁⲛⲟⲩⲧⲉ",  # The Lord is my God
                ]
                gr.Examples(
                    examples=[[ex] for ex in simple_parse_examples],
                    inputs=parse_input,
                    label="Simple",
                )

                gr.Markdown("**Complex Structures** - Test subordination and coordination")
                complex_parse_examples = [
                    "ⲁⲩⲱ ⲛⲧⲉⲣⲉϥⲛⲁⲩ ⲉⲡⲙⲏⲏϣⲉ ⲁϥϣⲡϩⲧⲏϥ ⲉϩⲣⲁⲓ ⲉϫⲱⲟⲩ",  # Temporal clause
                    "ⲁⲓⲉⲓ ⲅⲁⲣ ⲉⲙⲟⲩⲧⲉ ⲁⲛ ⲉⲛⲇⲓⲕⲁⲓⲟⲥ ⲁⲗⲗⲁ ⲛⲣⲉϥⲣⲛⲟⲃⲉ",  # Purpose with negation
                ]
                gr.Examples(
                    examples=[[ex] for ex in complex_parse_examples],
                    inputs=parse_input,
                    label="Complex",
                )

                gr.Markdown("**Full Texts** - Test discourse-level parsing")
                text_parse_examples = [
                    "ⲛⲉⲩⲛⲟⲩⲙⲏⲏϣⲉ ⲇⲉ ⲛϣⲱⲛⲉ ⲉⲩⲛⲕⲟⲧⲕ ϩⲙ ⲡⲙⲁ ⲉⲧⲙⲙⲁⲩ· ⲛϩⲁⲛⲃⲗⲗⲉ ⲙⲛ ⲛϩⲁⲛϭⲁⲗⲉ ⲙⲛ ⲛϣⲟⲩⲱⲟⲩ·",
                ]
                gr.Examples(
                    examples=[[ex] for ex in text_parse_examples],
                    inputs=parse_input,
                    label="Texts",
                )

            parse_btn.click(
                fn=parse_coptic_text,
                inputs=parse_input,
                outputs=[parse_output, prolog_output, conllu_download],
            )

    gr.Markdown("""
    ---

    ### About This Research Interface

    **Translation Models**:

    - [megalaa/coptic-english-translator](https://huggingface.co/megalaa/coptic-english-translator) & [megalaa/english-coptic-translator](https://huggingface.co/megalaa/english-coptic-translator)
    - Based on work by Enis & Megalaa (2024)

    **Dependency Parser** (Neural-Symbolic Hybrid):

    - **Neural**: Stanza NLP pipeline + DiaParser for Coptic
    - **Symbolic**: Prolog implementation of Walter Till's Coptic grammar
    - **Lexicon**: Integration with Crum's Coptic Dictionary
    - **Error Detection**: Prolog validation catches neural parser hallucinations

    **Research Features**:

    - CoNLL-U format export for corpus analysis
    - Grammatical pattern detection (tripartite sentences, etc.)
    - Dialect-aware processing (Sahidic/Bohairic)
    """)


if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )