Spaces:

minte-atnafu
/

GihonTech

Sleeping

App Files Community

Minte committed on Oct 6, 2025

Commit

cb4630e

1 Parent(s): 018fb8e

fix: refactor model loading and enhance ASR and translation functionality with SeamlessM4T integration

Browse files

Files changed (1) hide show

app.py +66 -82

app.py CHANGED Viewed

@@ -3,58 +3,44 @@ import soundfile as sf
 import torch
 import numpy as np
 from transformers import (
-    AutoProcessor, AutoModelForSpeechSeq2Seq,
-    pipeline, M2M100ForConditionalGeneration, M2M100Tokenizer,
-    VitsModel, AutoTokenizer
 )
 import gradio as gr
 import resampy
 import tempfile
 import subprocess
-# --- Load ASR model ---
 try:
     model_id = "facebook/seamless-m4t-v2-large"
     processor = AutoProcessor.from_pretrained(model_id)
-    asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id).to("cpu")
-    print("[INFO] ASR model loaded.")
 except Exception as e:
-    print("[ERROR] Failed to load ASR model:", e)
     traceback.print_exc()
-    asr_model = None
     processor = None
-# --- Load translation models ---
 try:
-    back_translate_model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_1.2B").to("cpu")
-    back_translate_tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B")
-    print("[INFO] Back translation model loaded.")
-except Exception as e:
-    print("[ERROR] Failed to load back translation model:", e)
-    traceback.print_exc()
-    back_translate_model = None
-    back_translate_tokenizer = None
-# --- Load other pipelines ---
-try:
-    translate_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
     chat_model = pipeline("text2text-generation", model="google/flan-t5-base")
-    print("[INFO] Translation and chat models loaded successfully.")
 except Exception as e:
-    print("[ERROR] Failed to load pipelines:", e)
     traceback.print_exc()
-    translate_to_en = None
     chat_model = None
 # --- Load TTS model (Facebook MMS for Amharic) ---
 try:
-    tts_processor = AutoProcessor.from_pretrained("facebook/mms-tts-amh")
     tts_model = VitsModel.from_pretrained("facebook/mms-tts-amh").to("cpu")
     print("[INFO] Facebook MMS TTS model for Amharic loaded successfully.")
 except Exception as e:
     print("[ERROR] Failed to load Facebook MMS TTS model:", e)
     traceback.print_exc()
-    tts_processor = None
     tts_model = None
 # --- Romanization helper ---
@@ -66,44 +52,65 @@ def romanize(text):
         print("[ERROR] Romanization failed:", e)
         return text  # fallback
-# --- ASR ---
 def transcribe_amharic(audio_file):
-    if asr_model is None or processor is None:
         return "ASR Model loading failed"
     try:
         audio, sr = sf.read(audio_file)
         if audio.ndim > 1:
             audio = audio.mean(axis=1)
         audio = resampy.resample(audio, sr, 16000)
         inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt")
         with torch.no_grad():
-            generated_ids = asr_model.generate(**inputs, tgt_lang="amh")
-        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
         return transcription.strip()
     except Exception as e:
         print("[ERROR] ASR transcription failed:", e)
         traceback.print_exc()
         return f"ASR failed: {str(e)[:50]}..."
-# --- Back translation ---
 def back_translate_en_to_am(en_text):
-    if back_translate_model is None or back_translate_tokenizer is None:
         return "Back translation model not loaded"
     try:
-        back_translate_tokenizer.src_lang = "en"
-        inputs = back_translate_tokenizer(en_text, return_tensors="pt")
         with torch.no_grad():
-            generated_tokens = back_translate_model.generate(
-                **inputs,
-                forced_bos_token_id=back_translate_tokenizer.get_lang_id("am"),
-                max_length=128,
-                num_beams=5,
-                no_repeat_ngram_size=2,
-                early_stopping=True,
-                repetition_penalty=1.5,
-                length_penalty=0.8
             )
-        am_response = back_translate_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
         return am_response.strip()
     except Exception as e:
         print("[ERROR] Back translation failed:", e)
@@ -117,19 +124,7 @@ def generate_chat_response(text):
     try:
         # Add context to make responses more meaningful
         prompt = f"Respond to this in a helpful and conversational way: {text}"
-        inputs = chat_model.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
-        with torch.no_grad():
-            outputs = chat_model.model.generate(
-                inputs.input_ids,
-                max_length=150,
-                num_beams=5,
-                no_repeat_ngram_size=3,
-                early_stopping=True,
-                repetition_penalty=2.0,
-                temperature=0.7,
-                do_sample=True
-            )
-        response = chat_model.tokenizer.decode(outputs[0], skip_special_tokens=True)
         return response.strip()
     except Exception as e:
         print("[ERROR] Chat generation failed:", e)
@@ -137,34 +132,28 @@ def generate_chat_response(text):
 # --- TTS with Facebook MMS ---
 def generate_tts(text):
-    if tts_model is None or tts_processor is None:
         print("[ERROR] TTS model not loaded")
         return None
     try:
         if not text.strip():
             return None
-        # Process text and generate speech
-        inputs = tts_processor(text=text, return_tensors="pt")
         with torch.no_grad():
-            speech = tts_model(**inputs).waveform
         # Convert to numpy and normalize
-        if isinstance(speech, torch.Tensor):
-            audio_data = speech.cpu().numpy()
-        else:
-            audio_data = speech
-        # Handle mono/stereo and normalize
-        if audio_data.ndim > 1:
-            audio_data = audio_data.squeeze()
         max_val = np.max(np.abs(audio_data))
         if max_val > 0:
             audio_data = audio_data / max_val
-        return audio_data, 16000  # MMS TTS typically uses 16kHz
     except Exception as e:
         print("[ERROR] MMS TTS generation failed:", e)
@@ -219,29 +208,24 @@ def create_wav_file(audio_array, sample_rate):
 def assistant_pipeline(audio):
     if not audio:
         return "No audio", "", "", "", None
     asr_result = transcribe_amharic(audio)
     print(f"ASR Result: {asr_result}")
-    # Translation
-    if translate_to_en is None:
-        en_text = "Translation model not loaded"
-    else:
-        try:
-            en_text = translate_to_en(asr_result)[0]["translation_text"]
-        except Exception as e:
-            print("[ERROR] Translation to English failed:", e)
-            en_text = f"Translation failed: {str(e)[:50]}..."
     print(f"English Translation: {en_text}")
-    # Chat
     en_response = generate_chat_response(en_text)
     print(f"Chat Response: {en_response}")
-    # Back translation
     am_response = back_translate_en_to_am(en_response)
     print(f"Amharic Response: {am_response}")
-    # TTS with multiple fallbacks
     audio_file_path = None
     if am_response and not am_response.startswith("Back translation failed"):
         # Try MMS TTS first

 import torch
 import numpy as np
 from transformers import (
+    SeamlessM4TModel, AutoProcessor,
+    pipeline, VitsModel, AutoTokenizer
 )
 import gradio as gr
 import resampy
 import tempfile
 import subprocess
+# --- Load SeamlessM4T model for ASR and translation ---
 try:
     model_id = "facebook/seamless-m4t-v2-large"
     processor = AutoProcessor.from_pretrained(model_id)
+    model = SeamlessM4TModel.from_pretrained(model_id).to("cpu")
+    print("[INFO] SeamlessM4T model loaded for ASR and translation.")
 except Exception as e:
+    print("[ERROR] Failed to load SeamlessM4T model:", e)
     traceback.print_exc()
+    model = None
     processor = None
+# --- Load chat model ---
 try:
     chat_model = pipeline("text2text-generation", model="google/flan-t5-base")
+    print("[INFO] Chat model loaded successfully.")
 except Exception as e:
+    print("[ERROR] Failed to load chat model:", e)
     traceback.print_exc()
     chat_model = None
 # --- Load TTS model (Facebook MMS for Amharic) ---
 try:
+    tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-amh")
     tts_model = VitsModel.from_pretrained("facebook/mms-tts-amh").to("cpu")
     print("[INFO] Facebook MMS TTS model for Amharic loaded successfully.")
 except Exception as e:
     print("[ERROR] Failed to load Facebook MMS TTS model:", e)
     traceback.print_exc()
+    tts_tokenizer = None
     tts_model = None
 # --- Romanization helper ---
         print("[ERROR] Romanization failed:", e)
         return text  # fallback
+# --- ASR with SeamlessM4T ---
 def transcribe_amharic(audio_file):
+    if model is None or processor is None:
         return "ASR Model loading failed"
     try:
         audio, sr = sf.read(audio_file)
         if audio.ndim > 1:
             audio = audio.mean(axis=1)
         audio = resampy.resample(audio, sr, 16000)
+        # Direct Amharic transcription
         inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt")
         with torch.no_grad():
+            generated_ids = model.generate(
+                **inputs,
+                tgt_lang="amh",
+                generate_speech=False
+            )
+        transcription = processor.decode(generated_ids[0], skip_special_tokens=True)
         return transcription.strip()
     except Exception as e:
         print("[ERROR] ASR transcription failed:", e)
         traceback.print_exc()
         return f"ASR failed: {str(e)[:50]}..."
+# --- Translation with SeamlessM4T (Amharic to English) ---
+def translate_am_to_en(amharic_text):
+    if model is None or processor is None:
+        return "Translation model not loaded"
+    try:
+        # Translate Amharic to English using SeamlessM4T
+        text_inputs = processor(text=amharic_text, src_lang="amh", return_tensors="pt")
+        with torch.no_grad():
+            output_tokens = model.generate(
+                **text_inputs,
+                tgt_lang="eng",
+                generate_speech=False
+            )
+        translated_text = processor.decode(output_tokens[0], skip_special_tokens=True)
+        return translated_text.strip()
+    except Exception as e:
+        print("[ERROR] Translation failed:", e)
+        traceback.print_exc()
+        return f"Translation failed: {str(e)[:50]}..."
+# --- Back translation with SeamlessM4T (English to Amharic) ---
 def back_translate_en_to_am(en_text):
+    if model is None or processor is None:
         return "Back translation model not loaded"
     try:
+        # Translate English back to Amharic using SeamlessM4T
+        text_inputs = processor(text=en_text, src_lang="eng", return_tensors="pt")
         with torch.no_grad():
+            output_tokens = model.generate(
+                **text_inputs,
+                tgt_lang="amh",
+                generate_speech=False
             )
+        am_response = processor.decode(output_tokens[0], skip_special_tokens=True)
         return am_response.strip()
     except Exception as e:
         print("[ERROR] Back translation failed:", e)
     try:
         # Add context to make responses more meaningful
         prompt = f"Respond to this in a helpful and conversational way: {text}"
+        response = chat_model(prompt, max_length=150, num_beams=5, temperature=0.7, do_sample=True)[0]['generated_text']
         return response.strip()
     except Exception as e:
         print("[ERROR] Chat generation failed:", e)
 # --- TTS with Facebook MMS ---
 def generate_tts(text):
+    if tts_model is None or tts_tokenizer is None:
         print("[ERROR] TTS model not loaded")
         return None
     try:
         if not text.strip():
             return None
+        # Tokenize text and generate speech
+        inputs = tts_tokenizer(text, return_tensors="pt")
         with torch.no_grad():
+            output = tts_model(**inputs)
+            speech = output.waveform
         # Convert to numpy and normalize
+        audio_data = speech.cpu().numpy().squeeze()
         max_val = np.max(np.abs(audio_data))
         if max_val > 0:
             audio_data = audio_data / max_val
+        return audio_data, tts_model.config.sampling_rate
     except Exception as e:
         print("[ERROR] MMS TTS generation failed:", e)
 def assistant_pipeline(audio):
     if not audio:
         return "No audio", "", "", "", None
+    # Step 1: ASR with SeamlessM4T
     asr_result = transcribe_amharic(audio)
     print(f"ASR Result: {asr_result}")
+    # Step 2: Translation with SeamlessM4T
+    en_text = translate_am_to_en(asr_result)
     print(f"English Translation: {en_text}")
+    # Step 3: Chat response
     en_response = generate_chat_response(en_text)
     print(f"Chat Response: {en_response}")
+    # Step 4: Back translation with SeamlessM4T
     am_response = back_translate_en_to_am(en_response)
     print(f"Amharic Response: {am_response}")
+    # Step 5: TTS
     audio_file_path = None
     if am_response and not am_response.startswith("Back translation failed"):
         # Try MMS TTS first