Spaces:

minte-two
/

GihonTech_Translation

Sleeping

App Files Files Community

Minte committed on Oct 9, 2025

Commit

f525548

1 Parent(s): 570f689

Refactor Swahili and Somali model configurations and update loading logic

Browse files

Files changed (2) hide show

app.py +75 -66
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -11,13 +11,13 @@ LANGUAGE_CONFIG = {
     },
     "Swahili": {
         "code": "swh",
-        "model_type": "swahili_mms",
-        "swahili_code": "swh"
     },
     "Somali": {
         "code": "som",
-        "model_type": "somali_m2m",
-        "somali_code": "so"
     },
     "Afan Oromo": {
         "code": "gaz",
@@ -40,35 +40,29 @@ LANGUAGE_CONFIG = {
 models = {}
 tokenizers = {}
-print("🚀 Initializing specialized translation models...")
-# Load Swahili MMS model
 try:
-    print("📥 Loading Swahili MMS model...")
-    swahili_model_id = "Benjamin-png/swahili-mms-tts-finetuned"
-    # Note: This appears to be a TTS model, so we'll need to check if it supports translation
-    # If not, we'll fall back to another approach
-    try:
-        tokenizers['swahili'] = AutoTokenizer.from_pretrained(swahili_model_id)
-        models['swahili'] = AutoModelForSeq2SeqLM.from_pretrained(swahili_model_id)
-        print("✅ Swahili MMS model loaded successfully!")
-    except:
-        print("⚠️  Swahili MMS model might be TTS-only, will use fallback")
-        models['swahili'] = None
 except Exception as e:
-    print(f"❌ Failed to load Swahili MMS model: {e}")
-    models['swahili'] = None
-# Load Somali M2M100 model
 try:
-    print("📥 Loading Somali M2M100 model...")
-    somali_model_id = "Ammad1Ali/m2m100_418M-2.0"
-    tokenizers['somali'] = AutoTokenizer.from_pretrained(somali_model_id)
-    models['somali'] = M2M100ForConditionalGeneration.from_pretrained(somali_model_id)
-    print("✅ Somali M2M100 model loaded successfully!")
 except Exception as e:
-    print(f"❌ Failed to load Somali M2M100 model: {e}")
-    models['somali'] = None
 # Load NLLB model for other languages
 try:
@@ -81,56 +75,71 @@ except Exception as e:
     print(f"❌ Failed to load NLLB model: {e}")
     models['nllb'] = None
-def translate_with_swahili_mms(text):
-    """Translate Swahili text using specialized model"""
     try:
-        if models.get('swahili') is None:
             return "Swahili translation model not available"
-        # For MMS models, we need to check the specific approach
-        # Since this might be a TTS model, we'll use a fallback to NLLB
-        if models['nllb'] is not None:
-            return translate_with_nllb(text, "swh_Latn")
-        else:
-            return "Translation service temporarily unavailable"
     except Exception as e:
-        print(f"Swahili translation error: {e}")
-        if models['nllb'] is not None:
             return translate_with_nllb(text, "swh_Latn")
         return f"Translation failed: {str(e)[:200]}"
-def translate_with_somali_m2m(text):
-    """Translate Somali text using M2M100 model"""
     try:
-        if models.get('somali') is None or tokenizers.get('somali') is None:
-            return "Somali translation model not available"
         # Set source language
-        tokenizers['somali'].src_lang = "so"
         # Tokenize input
-        inputs = tokenizers['somali'](text, return_tensors="pt", truncation=True, max_length=512)
         # Generate translation to English
         with torch.no_grad():
-            generated_tokens = models['somali'].generate(
                 **inputs,
-                forced_bos_token_id=tokenizers['somali'].get_lang_id("en"),
                 max_length=256,
-                num_beams=5,
                 early_stopping=True
             )
         # Decode
-        translation = tokenizers['somali'].batch_decode(generated_tokens, skip_special_tokens=True)[0]
         return translation
     except Exception as e:
-        print(f"Somali M2M100 translation error: {e}")
         # Fallback to NLLB if available
-        if models['nllb'] is not None:
-            return translate_with_nllb(text, "som_Latn")
         return f"Translation failed: {str(e)[:200]}"
 def translate_with_nllb(text, source_lang_code):
@@ -151,7 +160,7 @@ def translate_with_nllb(text, source_lang_code):
                 **inputs,
                 forced_bos_token_id=forced_bos_token_id,
                 max_length=256,
-                num_beams=5,
                 early_stopping=True
             )
@@ -174,10 +183,10 @@ def translate_text(text, source_language):
     config = LANGUAGE_CONFIG[source_language]
     try:
-        if config["model_type"] == "swahili_mms":
-            return translate_with_swahili_mms(text)
-        elif config["model_type"] == "somali_m2m":
-            return translate_with_somali_m2m(text)
         else:  # nllb
             return translate_with_nllb(text, config["nllb_code"])
@@ -301,11 +310,11 @@ with gr.Blocks(
             gr.Markdown("### 🔧 Model Information")
             # Create status display
-            swahili_status = "✅ Loaded" if models.get('swahili') else "❌ Failed"
-            somali_status = "✅ Loaded" if models.get('somali') else "❌ Failed"
             nllb_status = "✅ Loaded" if models.get('nllb') else "❌ Failed"
-            status_text = f"Swahili MMS: {swahili_status} | Somali M2M100: {somali_status} | NLLB: {nllb_status}"
             gr.Textbox(
                 value=status_text,
                 label="Model Status",
@@ -315,15 +324,15 @@ with gr.Blocks(
             # Create model info
             gr.Markdown(f"""
             **Specialized Models:**
-            - **Swahili:** Benjamin-png/swahili-mms-tts-finetuned
-            - **Somali:** Ammad1Ali/m2m100_418M-2.0
             - **Other Languages:** Facebook NLLB-200
             **Features:**
-            - High-quality specialized models for Swahili and Somali
-            - NLLB-200 for other supported languages
-            - Fast and accurate translations
-            - Automatic fallback to ensure service availability
             """)
     # Add CSS for better styling

     },
     "Swahili": {
         "code": "swh",
+        "model_type": "helsinki_swahili",
+        "helsinki_code": "swc"
     },
     "Somali": {
         "code": "som",
+        "model_type": "m2m",
+        "m2m_code": "so"
     },
     "Afan Oromo": {
         "code": "gaz",
 models = {}
 tokenizers = {}
+print("🚀 Initializing translation models...")
+# Load Helsinki-NLP Swahili model
 try:
+    print("📥 Loading Helsinki-NLP Swahili model...")
+    swahili_model_id = "Helsinki-NLP/opus-mt-swc-en"
+    tokenizers['helsinki_swahili'] = AutoTokenizer.from_pretrained(swahili_model_id)
+    models['helsinki_swahili'] = AutoModelForSeq2SeqLM.from_pretrained(swahili_model_id)
+    print("✅ Helsinki-NLP Swahili model loaded successfully!")
 except Exception as e:
+    print(f"❌ Failed to load Helsinki-NLP Swahili model: {e}")
+    models['helsinki_swahili'] = None
+# Load M2M100 model for Somali
 try:
+    print("📥 Loading M2M100 model for Somali...")
+    m2m_model_id = "facebook/m2m100_418M"
+    tokenizers['m2m'] = AutoTokenizer.from_pretrained(m2m_model_id)
+    models['m2m'] = M2M100ForConditionalGeneration.from_pretrained(m2m_model_id)
+    print("✅ M2M100 model loaded successfully!")
 except Exception as e:
+    print(f"❌ Failed to load M2M100 model: {e}")
+    models['m2m'] = None
 # Load NLLB model for other languages
 try:
     print(f"❌ Failed to load NLLB model: {e}")
     models['nllb'] = None
+def translate_with_helsinki_swahili(text):
+    """Translate Swahili text using Helsinki-NLP model"""
     try:
+        if models.get('helsinki_swahili') is None or tokenizers.get('helsinki_swahili') is None:
             return "Swahili translation model not available"
+        # Tokenize input
+        inputs = tokenizers['helsinki_swahili'](text, return_tensors="pt", truncation=True, max_length=512)
+        # Generate translation
+        with torch.no_grad():
+            generated_tokens = models['helsinki_swahili'].generate(
+                **inputs,
+                max_length=256,
+                num_beams=5,
+                early_stopping=True
+            )
+        # Decode
+        translation = tokenizers['helsinki_swahili'].batch_decode(generated_tokens, skip_special_tokens=True)[0]
+        return translation
     except Exception as e:
+        print(f"Helsinki Swahili translation error: {e}")
+        # Fallback to M2M100 if available
+        if models.get('m2m') is not None:
+            return translate_with_m2m(text, "sw")
+        # Fallback to NLLB if available
+        elif models.get('nllb') is not None:
             return translate_with_nllb(text, "swh_Latn")
         return f"Translation failed: {str(e)[:200]}"
+def translate_with_m2m(text, source_lang_code):
+    """Translate text using M2M100 model"""
     try:
+        if models.get('m2m') is None or tokenizers.get('m2m') is None:
+            return "M2M100 model not available"
         # Set source language
+        tokenizers['m2m'].src_lang = source_lang_code
         # Tokenize input
+        inputs = tokenizers['m2m'](text, return_tensors="pt", truncation=True, max_length=512)
         # Generate translation to English
         with torch.no_grad():
+            generated_tokens = models['m2m'].generate(
                 **inputs,
+                forced_bos_token_id=tokenizers['m2m'].get_lang_id("en"),
                 max_length=256,
+                num_beams=3,
                 early_stopping=True
             )
         # Decode
+        translation = tokenizers['m2m'].batch_decode(generated_tokens, skip_special_tokens=True)[0]
         return translation
     except Exception as e:
+        print(f"M2M100 translation error: {e}")
         # Fallback to NLLB if available
+        if models.get('nllb') is not None:
+            lang_map = {"so": "som_Latn", "sw": "swh_Latn"}
+            nllb_code = lang_map.get(source_lang_code, "eng_Latn")
+            return translate_with_nllb(text, nllb_code)
         return f"Translation failed: {str(e)[:200]}"
 def translate_with_nllb(text, source_lang_code):
                 **inputs,
                 forced_bos_token_id=forced_bos_token_id,
                 max_length=256,
+                num_beams=3,
                 early_stopping=True
             )
     config = LANGUAGE_CONFIG[source_language]
     try:
+        if config["model_type"] == "helsinki_swahili":
+            return translate_with_helsinki_swahili(text)
+        elif config["model_type"] == "m2m":
+            return translate_with_m2m(text, config["m2m_code"])
         else:  # nllb
             return translate_with_nllb(text, config["nllb_code"])
             gr.Markdown("### 🔧 Model Information")
             # Create status display
+            helsinki_status = "✅ Loaded" if models.get('helsinki_swahili') else "❌ Failed"
+            m2m_status = "✅ Loaded" if models.get('m2m') else "❌ Failed"
             nllb_status = "✅ Loaded" if models.get('nllb') else "❌ Failed"
+            status_text = f"Helsinki Swahili: {helsinki_status} | M2M100: {m2m_status} | NLLB: {nllb_status}"
             gr.Textbox(
                 value=status_text,
                 label="Model Status",
             # Create model info
             gr.Markdown(f"""
             **Specialized Models:**
+            - **Swahili:** Helsinki-NLP/opus-mt-swc-en (Specialized Swahili→English)
+            - **Somali:** Facebook M2M100
             - **Other Languages:** Facebook NLLB-200
             **Features:**
+            - High-quality specialized model for Swahili translation
+            - Optimized models for each language family
+            - Cross-model fallback for reliability
+            - Fast and accurate results
             """)
     # Add CSS for better styling

requirements.txt CHANGED Viewed

@@ -6,4 +6,5 @@ soundfile>=0.12.0
 resampy>=0.4.0
 numpy>=1.24.0
 accelerate>=0.20.0
-sentencepiece>=0.1.99

 resampy>=0.4.0
 numpy>=1.24.0
 accelerate>=0.20.0
+sentencepiece>=0.1.99
+protobuf>=3.20.0