Spaces:

Norelad
/

coptic-translation-interface

Running

Rogaton Claude committed on Nov 17, 2025

Commit

e0d73ae

1 Parent(s): 5461265

Replace megalaa model with Norelad/coptic-megalaa-finetuned

- Update Coptic→English model to use the fine-tuned Norelad/coptic-megalaa-finetuned
- Fix preprocessing: add dialect tags (з for Sahidic, б for Bohairic)
- Fix greekify/degreekify character mappings to match model training:
  - ϣ → ʃ (IPA), ϧ → x, ϫ → ɟ, ϯ → ti
- Add transformers and sentencepiece to requirements.txt
- Update translate_coptic_to_english to accept dialect parameter

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2) hide show

apertus_ui.py +315 -89
requirements.txt +2 -0

apertus_ui.py CHANGED Viewed

@@ -2,9 +2,77 @@ import streamlit as st
 import os
 import xml.etree.ElementTree as ET
 import re
 from huggingface_hub import InferenceClient
 from coptic_parser_core import CopticParserCore
 # Coptic alphabet helper
 COPTIC_ALPHABET = {
     'Ⲁ': 'Alpha', 'Ⲃ': 'Beta', 'Ⲅ': 'Gamma', 'Ⲇ': 'Delta', 'Ⲉ': 'Epsilon', 'Ⲋ': 'Zeta',
@@ -102,6 +170,116 @@ def load_coptic_lexicon(file_path=None):
     return lexicon
 # Language detection and UI
 LANGUAGES = {
     'en': 'English', 'es': 'Español', 'fr': 'Français', 'de': 'Deutsch',
@@ -120,18 +298,24 @@ selected_lang = st.selectbox("Language / Langue / Idioma",
 with st.sidebar:
     st.header("Coptic Tools")
-    # HuggingFace API Token input
-    st.subheader("🔑 API Configuration")
-    hf_token_input = st.text_input(
-        "HuggingFace API Token",
-        type="password",
-        help="Required for Apertus-8B translation. Get your token at: https://huggingface.co/settings/tokens"
-    )
-    if hf_token_input:
-        st.success("✅ API token configured")
-    else:
-        st.warning("⚠️ Translation requires an API token")
-        st.markdown("[Get your free HF token →](https://huggingface.co/settings/tokens)")
     st.divider()
@@ -373,17 +557,10 @@ if prompt := st.chat_input("Type your message..."):
         st.stop()  # Don't continue to translation
-    # For translation tasks, check API token
-    if not hf_token_input:
-        st.error("⚠️ Please enter your HuggingFace API token in the sidebar to use translation.")
-        st.stop()
-    # Initialize inference client with user token
-    inference_client = get_inference_client(hf_token_input)
-    if not inference_client:
-        st.error("❌ Failed to initialize inference client. Please check your API token.")
-        st.stop()
     # Handle parse_and_translate mode
     if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and 'analysis_type' in locals() and analysis_type == 'parse_and_translate':
@@ -407,39 +584,51 @@ if prompt := st.chat_input("Type your message..."):
             st.divider()
             st.subheader(f"🌍 Translation to {LANGUAGES[target_lang]}")
-            # Get translation prompts
-            COPTIC_PROMPTS_TRANSLATE = get_coptic_prompts(target_language_name)
-            translate_prompt = f"{COPTIC_PROMPTS_TRANSLATE['translation']} {prompt}"
-            with st.spinner("🤖 Translating..."):
                 try:
-                    messages = [
-                        {"role": "system", "content": "You are a professional Coptic-to-modern-language translator. Provide only direct translations without explanations, commentary, or repeating the source text."},
-                        {"role": "user", "content": translate_prompt}
-                    ]
-                    response_stream = inference_client.chat_completion(
-                        model=MODEL_NAME,
-                        messages=messages,
-                        max_tokens=512,
-                        temperature=0.5,
-                        top_p=0.9,
-                        stream=True
-                    )
-                    # Stream the translation
-                    response_placeholder = st.empty()
-                    full_response = ""
-                    for message in response_stream:
-                        if message.choices[0].delta.content:
-                            full_response += message.choices[0].delta.content
-                            response_placeholder.markdown(full_response + "▌")
-                    response_placeholder.markdown(full_response)
-                    combined_response = f"Parse complete. Translation: {full_response}"
-                    st.session_state.messages.append({"role": "assistant", "content": combined_response})
                 except Exception as e:
                     st.error(f"❌ Translation error: {e}")
@@ -468,42 +657,79 @@ if prompt := st.chat_input("Type your message..."):
     with st.chat_message("user"):
         st.markdown(full_prompt)
-    # Generate response using HuggingFace Inference API
     with st.chat_message("assistant"):
         try:
-            with st.spinner("🤖 Generating response..."):
-                # Prepare messages with system instruction for better control
-                if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and 'analysis_type' in locals() and analysis_type == 'translation':
-                    # For translation: strict system message
-                    messages = [
-                        {"role": "system", "content": "You are a professional Coptic-to-modern-language translator. Provide only direct translations without explanations, commentary, or repeating the source text."},
-                        {"role": "user", "content": full_prompt}
-                    ]
                 else:
-                    # For other tasks: standard chat
-                    messages = [{"role": "user", "content": full_prompt}]
-                response_stream = inference_client.chat_completion(
-                    model=MODEL_NAME,
-                    messages=messages,
-                    max_tokens=512,
-                    temperature=0.5,  # Lower temperature for more focused translations
-                    top_p=0.9,
-                    stream=True
-                )
-                # Stream the response
-                response_placeholder = st.empty()
-                full_response = ""
-                for message in response_stream:
-                    if message.choices[0].delta.content:
-                        full_response += message.choices[0].delta.content
-                        response_placeholder.markdown(full_response + "▌")
-                response_placeholder.markdown(full_response)
-                st.session_state.messages.append({"role": "assistant", "content": full_response})
         except Exception as e:
-            st.error(f"❌ Error generating response: {str(e)}")
-            st.info("💡 Please verify your API token is valid and has not expired.")

 import os
 import xml.etree.ElementTree as ET
 import re
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from huggingface_hub import InferenceClient
 from coptic_parser_core import CopticParserCore
# ========================================
# COPTIC <-> GREEK TRANSCRIPTION TABLES
# ========================================
# The translation models in this file do not consume Coptic Unicode directly:
# Coptic input is transliterated with greekify() before tokenization, and
# model output is mapped back with degreekify() after decoding.

# Coptic Unicode -> transcription used as model input.
COPTIC_TO_GREEK = {
    # Lowercase letters that have a Greek counterpart
    "ⲁ": "α", "ⲃ": "β", "ⲅ": "γ", "ⲇ": "δ", "ⲉ": "ε",
    "ⲋ": "ϛ", "ⲍ": "ζ", "ⲏ": "η", "ⲑ": "θ", "ⲓ": "ι",
    "ⲕ": "κ", "ⲗ": "λ", "ⲙ": "μ", "ⲛ": "ν", "ⲝ": "ξ",
    "ⲟ": "ο", "ⲡ": "π", "ⲣ": "ρ", "ⲥ": "σ", "ⲧ": "τ",
    "ⲩ": "υ", "ⲫ": "φ", "ⲭ": "χ", "ⲯ": "ψ", "ⲱ": "ω",
    # Lowercase Coptic-specific letters (must match model training);
    # note that "ϯ" expands to the two-character sequence "ti"
    "ϣ": "ʃ", "ϥ": "f", "ϧ": "x", "ϩ": "h",
    "ϫ": "ɟ", "ϭ": "c", "ϯ": "ti",
    # Uppercase letters that have a Greek counterpart
    "Ⲁ": "Α", "Ⲃ": "Β", "Ⲅ": "Γ", "Ⲇ": "Δ", "Ⲉ": "Ε", "Ⲍ": "Ζ",
    "Ⲏ": "Η", "Ⲑ": "Θ", "Ⲓ": "Ι", "Ⲕ": "Κ", "Ⲗ": "Λ", "Ⲙ": "Μ",
    "Ⲛ": "Ν", "Ⲝ": "Ξ", "Ⲟ": "Ο", "Ⲡ": "Π", "Ⲣ": "Ρ", "Ⲥ": "Σ",
    "Ⲧ": "Τ", "Ⲩ": "Υ", "Ⲫ": "Φ", "Ⲭ": "Χ", "Ⲯ": "Ψ", "Ⲱ": "Ω",
    # Uppercase Coptic-specific letters
    "Ϣ": "Ʃ", "Ϥ": "F", "Ϧ": "X", "Ϩ": "H",
    "Ϫ": "Ɉ", "Ϭ": "C", "Ϯ": "TI",
}
# Transcription -> Coptic Unicode; consulted by degreekify() on model output.
GREEK_TO_COPTIC = {
    # Lowercase Greek letters; both sigma forms ("σ" and final "ς") map to "ⲥ"
    "α": "ⲁ", "β": "ⲃ", "γ": "ⲅ", "δ": "ⲇ", "ε": "ⲉ",
    "ϛ": "ⲋ", "ζ": "ⲍ", "η": "ⲏ", "θ": "ⲑ", "ι": "ⲓ",
    "κ": "ⲕ", "λ": "ⲗ", "μ": "ⲙ", "ν": "ⲛ", "ξ": "ⲝ",
    "ο": "ⲟ", "π": "ⲡ", "ρ": "ⲣ", "σ": "ⲥ", "ς": "ⲥ",
    "τ": "ⲧ", "υ": "ⲩ", "φ": "ⲫ", "χ": "ⲭ", "ψ": "ⲯ",
    "ω": "ⲱ",
    # Lowercase stand-ins for Coptic-specific letters (must match model
    # training); "ti" is the only two-character key
    "ʃ": "ϣ", "f": "ϥ", "x": "ϧ", "h": "ϩ",
    "ɟ": "ϫ", "c": "ϭ", "ti": "ϯ",
    # Uppercase Greek letters
    "Α": "Ⲁ", "Β": "Ⲃ", "Γ": "Ⲅ", "Δ": "Ⲇ", "Ε": "Ⲉ", "Ζ": "Ⲍ",
    "Η": "Ⲏ", "Θ": "Ⲑ", "Ι": "Ⲓ", "Κ": "Ⲕ", "Λ": "Ⲗ", "Μ": "Ⲙ",
    "Ν": "Ⲛ", "Ξ": "Ⲝ", "Ο": "Ⲟ", "Π": "Ⲡ", "Ρ": "Ⲣ", "Σ": "Ⲥ",
    "Τ": "Ⲧ", "Υ": "Ⲩ", "Φ": "Ⲫ", "Χ": "Ⲭ", "Ψ": "Ⲯ", "Ω": "Ⲱ",
    # Uppercase stand-ins for Coptic-specific letters
    "Ʃ": "Ϣ", "F": "Ϥ", "X": "Ϧ", "H": "Ϩ",
    "Ɉ": "Ϫ", "C": "Ϭ", "TI": "Ϯ",
}
def greekify(coptic_text):
    """Transliterate Coptic Unicode text into the Greek-based transcription.

    Each character is lowercased first and then looked up in
    ``COPTIC_TO_GREEK``; characters without an entry are passed through in
    their lowercased form. Because of the lowercasing, the result never
    contains the uppercase transcriptions from the table.

    Args:
        coptic_text: Text to transliterate (any iterable of characters).

    Returns:
        The transliterated string.
    """
    lowered_chars = (symbol.lower() for symbol in coptic_text)
    return "".join(COPTIC_TO_GREEK.get(symbol, symbol) for symbol in lowered_chars)
def degreekify(greek_text):
    """Convert Greek-based transcription back to Coptic Unicode.

    The text is scanned left to right. The two-character sequence ``ti``
    (in any letter case) is collapsed into a single Coptic letter; every
    other character is mapped individually through ``GREEK_TO_COPTIC`` and
    left unchanged when it has no entry.

    Case handling for the ``ti`` digraph: an exact-case table entry is
    preferred, so ``"TI"`` becomes uppercase ``Ϯ``. Mixed-case spellings
    (``"Ti"``, ``"tI"``) have no entry of their own and fall back to the
    lowercase mapping ``ϯ``.

    Args:
        greek_text: Transcribed text, e.g. decoded model output.

    Returns:
        The text with transcription characters replaced by Coptic letters.
    """
    result = []
    i = 0
    length = len(greek_text)
    while i < length:
        # Check for the two-character sequence first so that 't' and 'i'
        # are not converted separately.
        pair = greek_text[i:i + 2]
        if len(pair) == 2 and pair.lower() == 'ti':
            # Previously the lookup always used the lowercased pair, which
            # made the "TI" -> "Ϯ" table entry unreachable.
            coptic = GREEK_TO_COPTIC.get(pair)
            if coptic is None:
                coptic = GREEK_TO_COPTIC.get('ti', pair)
            result.append(coptic)
            i += 2
            continue
        # Single character
        char = greek_text[i]
        result.append(GREEK_TO_COPTIC.get(char, char))
        i += 1
    return ''.join(result)
 # Coptic alphabet helper
 COPTIC_ALPHABET = {
     'Ⲁ': 'Alpha', 'Ⲃ': 'Beta', 'Ⲅ': 'Gamma', 'Ⲇ': 'Delta', 'Ⲉ': 'Epsilon', 'Ⲋ': 'Zeta',
     return lexicon
+# ========================================
+# MEGALAA MODEL LOADING
+# ========================================
+# Load and cache megalaa translation models
@st.cache_resource
def _load_coptic_to_english_resources():
    """Download/load the Coptic → English model and cache the result.

    Raises on failure instead of returning a sentinel: ``st.cache_resource``
    stores whatever the function returns, so returning ``(None, None, None)``
    from here would pin a transient failure (e.g. a network error) in the
    cache and the model would never be retried.
    """
    with st.spinner("📥 Loading Coptic→English model (first time only, ~600MB)..."):
        model_name = "Norelad/coptic-megalaa-finetuned"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        # Move to GPU if available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)
        st.success(f"✅ Coptic→English model loaded on {device.upper()}")
        return tokenizer, model, device


def load_coptic_to_english_model():
    """Load Coptic → English translation model (Norelad's fine-tuned megalaa).

    Returns:
        ``(tokenizer, model, device)`` on success, where ``device`` is
        ``"cuda"`` or ``"cpu"``. On any loading error an error message is
        shown in the UI and ``(None, None, None)`` is returned; the failure
        is not cached, so the next call attempts the load again.
    """
    try:
        return _load_coptic_to_english_resources()
    except Exception as e:
        st.error(f"Failed to load Coptic→English model: {e}")
        return None, None, None


# Keep the cache-control handle that the decorated function used to expose.
load_coptic_to_english_model.clear = _load_coptic_to_english_resources.clear
@st.cache_resource
def _load_english_to_coptic_resources():
    """Download/load the English → Coptic model and cache the result.

    Raises on failure instead of returning a sentinel: ``st.cache_resource``
    stores whatever the function returns, so returning ``(None, None, None)``
    from here would pin a transient failure (e.g. a network error) in the
    cache and the model would never be retried.
    """
    with st.spinner("📥 Loading English→Coptic model (first time only, ~600MB)..."):
        model_name = "megalaa/english-coptic-translator"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        # Move to GPU if available
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = model.to(device)
        st.success(f"✅ English→Coptic model loaded on {device.upper()}")
        return tokenizer, model, device


def load_english_to_coptic_model():
    """Load megalaa English → Coptic translation model.

    Returns:
        ``(tokenizer, model, device)`` on success, where ``device`` is
        ``"cuda"`` or ``"cpu"``. On any loading error an error message is
        shown in the UI and ``(None, None, None)`` is returned; the failure
        is not cached, so the next call attempts the load again.
    """
    try:
        return _load_english_to_coptic_resources()
    except Exception as e:
        st.error(f"Failed to load English→Coptic model: {e}")
        return None, None, None


# Keep the cache-control handle that the decorated function used to expose.
load_english_to_coptic_model.clear = _load_english_to_coptic_resources.clear
def translate_coptic_to_english(text, dialect='cop-sa'):
    """Translate Coptic text to English.

    Uses the tokenizer/model returned by ``load_coptic_to_english_model()``.
    The input is lowercased, transliterated with ``greekify()`` and prefixed
    with a one-letter dialect tag before being passed to the model.

    Args:
        text: Coptic text to translate.
        dialect: ``'cop-sa'`` (Sahidic), ``'cop-bo'`` (Bohairic) or ``'cop'``;
            ``'cop'`` and any unrecognised value are treated as Sahidic.

    Returns:
        The decoded translation. Failures are reported as a returned string
        (``"Error: ..."`` when the model is unavailable, ``"Translation
        error: ..."`` when preprocessing or generation raises) rather than
        as an exception.
    """
    tokenizer, model, device = load_coptic_to_english_model()
    if model is None or tokenizer is None:
        return "Error: Model not loaded. Please check your internet connection."

    try:
        # Dialect tags (required by the Norelad/coptic-megalaa-finetuned
        # model): Cyrillic 'з' marks Sahidic, Cyrillic 'б' marks Bohairic.
        sahidic_tag = 'з'
        tag_by_dialect = {'cop-sa': sahidic_tag, 'cop-bo': 'б', 'cop': sahidic_tag}
        tag = tag_by_dialect.get(dialect, sahidic_tag)

        # Model input is "<tag> <transliterated lowercase text>".
        model_input = f"{tag} {greekify(text.lower())}"

        encoded = tokenizer(model_input, return_tensors="pt", padding=True).to(device)
        beam_search_options = dict(max_new_tokens=128, num_beams=5, early_stopping=True)
        generated = model.generate(**encoded, **beam_search_options)

        # Only the first returned sequence is decoded.
        return tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as e:
        return f"Translation error: {e}"
def translate_english_to_coptic(text):
    """Translate English text to Coptic.

    Uses the tokenizer/model returned by ``load_english_to_coptic_model()``.
    The English input is tokenized as-is; the decoded model output is then
    converted to Coptic Unicode with ``degreekify()``.

    Args:
        text: English text to translate.

    Returns:
        The translation in Coptic Unicode. Failures are reported as a
        returned string (``"Error: ..."`` when the model is unavailable,
        ``"Translation error: ..."`` when generation or postprocessing
        raises) rather than as an exception.
    """
    tokenizer, model, device = load_english_to_coptic_model()
    if model is None or tokenizer is None:
        return "Error: Model not loaded. Please check your internet connection."

    try:
        encoded = tokenizer(text, return_tensors="pt", padding=True).to(device)
        beam_search_options = dict(max_new_tokens=128, num_beams=5, early_stopping=True)
        generated = model.generate(**encoded, **beam_search_options)

        # Only the first returned sequence is decoded; the model emits the
        # Greek-based transcription, so map it back to Coptic letters.
        transcription = tokenizer.decode(generated[0], skip_special_tokens=True)
        return degreekify(transcription)
    except Exception as e:
        return f"Translation error: {e}"
 # Language detection and UI
 LANGUAGES = {
     'en': 'English', 'es': 'Español', 'fr': 'Français', 'de': 'Deutsch',
 with st.sidebar:
     st.header("Coptic Tools")
+    # Translation Model Selection
+    st.subheader("🤖 Translation Model")
+    st.info("✨ **NEW:** Using megalaa specialized Coptic models (free, no API token needed!)")
+    st.markdown("Models: `megalaa/coptic-english-translator` & `megalaa/english-coptic-translator`")
+    # Optional: HuggingFace API Token for advanced features
+    with st.expander("⚙️ Advanced: Use Apertus-8B (optional)"):
+        st.caption("For multi-language translation beyond English-Coptic")
+        hf_token_input = st.text_input(
+            "HuggingFace API Token",
+            type="password",
+            help="Optional: For Apertus-8B multi-language support"
+        )
+        use_apertus = st.checkbox("Use Apertus-8B instead of megalaa", value=False)
+        if hf_token_input and use_apertus:
+            st.success("✅ Apertus-8B enabled")
+        elif not use_apertus:
+            hf_token_input = None  # Disable API usage
     st.divider()
         st.stop()  # Don't continue to translation
+    # Initialize inference client if API token is provided (optional for megalaa)
+    inference_client = None
+    if hf_token_input:
+        inference_client = get_inference_client(hf_token_input)
     # Handle parse_and_translate mode
     if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and 'analysis_type' in locals() and analysis_type == 'parse_and_translate':
             st.divider()
             st.subheader(f"🌍 Translation to {LANGUAGES[target_lang]}")
+            with st.spinner("🤖 Translating with megalaa model..."):
                 try:
+                    # Use megalaa for Coptic→English translation
+                    if target_lang == 'en':
+                        translation = translate_coptic_to_english(prompt, dialect=selected_lang)
+                        st.markdown(translation)
+                        combined_response = f"Parse complete. Translation: {translation}"
+                        st.session_state.messages.append({"role": "assistant", "content": combined_response})
+                    else:
+                        # For non-English targets, need Apertus or show message
+                        if inference_client and hf_token_input:
+                            COPTIC_PROMPTS_TRANSLATE = get_coptic_prompts(target_language_name)
+                            translate_prompt = f"{COPTIC_PROMPTS_TRANSLATE['translation']} {prompt}"
+                            messages = [
+                                {"role": "system", "content": "You are a professional Coptic-to-modern-language translator. Provide only direct translations without explanations, commentary, or repeating the source text."},
+                                {"role": "user", "content": translate_prompt}
+                            ]
+                            response_stream = inference_client.chat_completion(
+                                model=MODEL_NAME,
+                                messages=messages,
+                                max_tokens=512,
+                                temperature=0.5,
+                                top_p=0.9,
+                                stream=True
+                            )
+                            # Stream the translation
+                            response_placeholder = st.empty()
+                            full_response = ""
+                            for message in response_stream:
+                                if message.choices[0].delta.content:
+                                    full_response += message.choices[0].delta.content
+                                    response_placeholder.markdown(full_response + "▌")
+                            response_placeholder.markdown(full_response)
+                            combined_response = f"Parse complete. Translation: {full_response}"
+                            st.session_state.messages.append({"role": "assistant", "content": combined_response})
+                        else:
+                            st.warning(f"⚠️ Translation to {target_language_name} requires Apertus-8B. Please enable it in the sidebar.")
+                            st.info("💡 Megalaa models currently support English↔Coptic only.")
                 except Exception as e:
                     st.error(f"❌ Translation error: {e}")
     with st.chat_message("user"):
         st.markdown(full_prompt)
+    # Generate response using megalaa models or Apertus API
     with st.chat_message("assistant"):
         try:
+            # Check if this is a Coptic→English translation task
+            if selected_lang in ['cop', 'cop-sa', 'cop-bo'] and 'analysis_type' in locals() and analysis_type == 'translation':
+                # Use megalaa models for Coptic translation
+                if 'target_lang' in locals() and target_lang == 'en':
+                    with st.spinner("🤖 Translating with megalaa model..."):
+                        translation = translate_coptic_to_english(prompt, dialect=selected_lang)
+                        st.markdown(translation)
+                        st.session_state.messages.append({"role": "assistant", "content": translation})
+                else:
+                    # Non-English target: requires Apertus
+                    if inference_client and hf_token_input:
+                        with st.spinner("🤖 Translating with Apertus-8B..."):
+                            messages = [
+                                {"role": "system", "content": "You are a professional Coptic-to-modern-language translator. Provide only direct translations without explanations, commentary, or repeating the source text."},
+                                {"role": "user", "content": full_prompt}
+                            ]
+                            response_stream = inference_client.chat_completion(
+                                model=MODEL_NAME,
+                                messages=messages,
+                                max_tokens=512,
+                                temperature=0.5,
+                                top_p=0.9,
+                                stream=True
+                            )
+                            response_placeholder = st.empty()
+                            full_response = ""
+                            for message in response_stream:
+                                if message.choices[0].delta.content:
+                                    full_response += message.choices[0].delta.content
+                                    response_placeholder.markdown(full_response + "▌")
+                            response_placeholder.markdown(full_response)
+                            st.session_state.messages.append({"role": "assistant", "content": full_response})
+                    else:
+                        st.warning(f"⚠️ Translation to {target_language_name} requires Apertus-8B.")
+                        st.info("💡 Enable Apertus-8B in the sidebar for multi-language support.")
+                        st.info("💡 Megalaa models currently support English↔Coptic only.")
+            # For non-translation tasks or other languages
+            else:
+                if inference_client and hf_token_input:
+                    with st.spinner("🤖 Generating response..."):
+                        messages = [{"role": "user", "content": full_prompt}]
+                        response_stream = inference_client.chat_completion(
+                            model=MODEL_NAME,
+                            messages=messages,
+                            max_tokens=512,
+                            temperature=0.5,
+                            top_p=0.9,
+                            stream=True
+                        )
+                        response_placeholder = st.empty()
+                        full_response = ""
+                        for message in response_stream:
+                            if message.choices[0].delta.content:
+                                full_response += message.choices[0].delta.content
+                                response_placeholder.markdown(full_response + "▌")
+                        response_placeholder.markdown(full_response)
+                        st.session_state.messages.append({"role": "assistant", "content": full_response})
                 else:
+                    st.warning("⚠️ This feature requires Apertus-8B. Please enable it in the sidebar.")
+                    st.info("💡 Coptic→English translation works without API token using megalaa models.")
         except Exception as e:
+            st.error(f"❌ Error: {str(e)}")
+            st.info("💡 If using Apertus-8B, please verify your API token is valid.")

requirements.txt CHANGED Viewed

@@ -3,3 +3,5 @@ huggingface_hub
 lxml
 stanza
 torch

 lxml
 stanza
 torch
+transformers>=4.30.0
+sentencepiece>=0.1.99