Spaces:

PlotweaverModel
/

Live-Football-Commentary

Sleeping

App Files Community

Added CTranslate2

by offiongbassey - opened Apr 3

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+99

-135

Files changed (1) hide show

app.py +99 -135

app.py CHANGED Viewed

@@ -2,8 +2,7 @@
 Live Football Commentary Pipeline — English → Yoruba
 =====================================================
 Gradio app for HuggingFace Spaces.
-Pipeline: ASR (Whisper) → MT (NLLB-200) → TTS (MMS-TTS Yoruba)
 """
 import torch
@@ -11,11 +10,9 @@ import numpy as np
 import re
 import time
 import gradio as gr
-from transformers import (
-    pipeline as hf_pipeline,
-    AutoTokenizer,
-    AutoModelForSeq2SeqLM,
-)
 # =============================================================================
 # Configuration
@@ -24,19 +21,40 @@ from transformers import (
 ASR_MODEL_ID = "PlotweaverAI/whisper-small-de-en"
 MT_MODEL_ID = "PlotweaverAI/nllb-200-distilled-600M-african-6lang"
 TTS_MODEL_ID = "PlotweaverAI/yoruba-mms-tts-new"
 MT_SRC_LANG = "eng_Latn"
 MT_TGT_LANG = "yor_Latn"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
 # =============================================================================
 # Load models (runs once at startup)
 # =============================================================================
-print(f"Device: {DEVICE} | Dtype: {TORCH_DTYPE}")
 print("Loading models...")
 # ASR
@@ -49,15 +67,16 @@ asr_pipe = hf_pipeline(
 )
 print("  ASR loaded ✓")
-# MT
-print(f"  Loading MT: {MT_MODEL_ID}")
 mt_tokenizer = AutoTokenizer.from_pretrained(MT_MODEL_ID)
-mt_model = AutoModelForSeq2SeqLM.from_pretrained(
-    MT_MODEL_ID,
-    torch_dtype=TORCH_DTYPE,
-).to(DEVICE)
-mt_tokenizer.src_lang = MT_SRC_LANG
-print("  MT loaded ✓")
 # TTS
 print(f"  Loading TTS: {TTS_MODEL_ID}")
@@ -72,7 +91,7 @@ print("All models loaded!")
 # =============================================================================
-# Pipeline functions (from working Colab notebook)
 # =============================================================================
 def split_into_sentences(text):
@@ -80,16 +99,10 @@ def split_into_sentences(text):
     text = text.strip()
     if not text:
         return []
-    # Normalize case
     text = '. '.join(s.strip().capitalize() for s in text.split('. ') if s.strip())
-    # If text has punctuation, split on it
     if re.search(r'[.!?]', text):
         sentences = re.split(r'(?<=[.!?])\s+', text)
         return [s.strip() for s in sentences if s.strip()]
-    # No punctuation — split into ~12 word chunks
     words = text.split()
     MAX_WORDS = 12
     sentences = []
@@ -113,31 +126,55 @@ def transcribe(audio_array, sample_rate=16000):
     return result["text"].strip()
-def translate_sentence(text, max_length=256):
-    """MT: Translate a single sentence from English to Yoruba."""
-    inputs = mt_tokenizer(text, return_tensors="pt", truncation=True).to(DEVICE)
-    tgt_lang_id = mt_tokenizer.convert_tokens_to_ids(MT_TGT_LANG)
-    with torch.no_grad():
-        output_ids = mt_model.generate(
-            **inputs,
-            max_length=max_length,
-            forced_bos_token_id=tgt_lang_id,
-            repetition_penalty=1.5,
-            no_repeat_ngram_size=3,
-            num_beams=4,
-            early_stopping=True,
         )
-    return mt_tokenizer.decode(output_ids[0], skip_special_tokens=True)
 def translate_long_text(text):
-    """Split into sentences and translate each individually."""
     sentences = split_into_sentences(text)
-    translations = []
-    for sent in sentences:
-        yo = translate_sentence(sent)
-        translations.append(yo)
     return ' '.join(translations), sentences, translations
@@ -154,99 +191,65 @@ def synthesize(text):
 # =============================================================================
 def process_audio(audio_input):
-    """
-    Full pipeline: English audio → Yoruba audio.
-    audio_input: tuple of (sample_rate, numpy_array) from Gradio.
-    """
     if audio_input is None:
         return None, "⚠️ No audio provided. Please upload or record audio."
     sample_rate, audio_array = audio_input
-    # Convert to float32 mono if needed
     audio_array = audio_array.astype(np.float32)
     if audio_array.ndim > 1:
         audio_array = audio_array.mean(axis=1)
-    # Normalize to [-1, 1] if integer audio
     if audio_array.max() > 1.0 or audio_array.min() < -1.0:
         audio_array = audio_array / max(abs(audio_array.max()), abs(audio_array.min()))
     total_start = time.time()
     log_lines = []
-    # Step 1: ASR
     t0 = time.time()
     english_text = transcribe(audio_array, sample_rate)
-    asr_time = time.time() - t0
-    log_lines.append(f"**🎤 ASR** ({asr_time:.2f}s)")
-    log_lines.append(f"English: {english_text}")
-    log_lines.append("")
     if not english_text:
-        return None, "⚠️ ASR returned empty text. Please try with clearer audio."
-    # Step 2: MT (sentence by sentence)
     t0 = time.time()
     yoruba_text, en_sentences, yo_sentences = translate_long_text(english_text)
-    mt_time = time.time() - t0
-    log_lines.append(f"**🔄 Translation** ({mt_time:.2f}s)")
     for en_s, yo_s in zip(en_sentences, yo_sentences):
         log_lines.append(f"  EN: {en_s}")
         log_lines.append(f"  YO: {yo_s}")
     log_lines.append("")
     if not yoruba_text:
         return None, "⚠️ Translation returned empty text."
-    # Step 3: TTS
     t0 = time.time()
     yoruba_audio, output_sr = synthesize(yoruba_text)
-    tts_time = time.time() - t0
-    log_lines.append(f"**🔊 TTS** ({tts_time:.2f}s) → {len(yoruba_audio)/output_sr:.2f}s of audio")
-    total = time.time() - total_start
-    log_lines.append("")
-    log_lines.append(f"**Total: {total:.2f}s**")
-    log_output = "\n".join(log_lines)
-    return (output_sr, yoruba_audio), log_output
 def process_text(english_text):
-    """
-    Text-only mode: English text → Yoruba text + audio.
-    Skips the ASR stage — useful for testing MT + TTS.
-    """
     if not english_text or not english_text.strip():
         return None, "⚠️ Please enter some English text."
     total_start = time.time()
     log_lines = []
-    # MT
     t0 = time.time()
     yoruba_text, en_sentences, yo_sentences = translate_long_text(english_text.strip())
-    mt_time = time.time() - t0
-    log_lines.append(f"**🔄 Translation** ({mt_time:.2f}s)")
     for en_s, yo_s in zip(en_sentences, yo_sentences):
         log_lines.append(f"  EN: {en_s}")
         log_lines.append(f"  YO: {yo_s}")
     log_lines.append("")
     if not yoruba_text:
         return None, "⚠️ Translation returned empty text."
-    # TTS
     t0 = time.time()
     yoruba_audio, output_sr = synthesize(yoruba_text)
-    tts_time = time.time() - t0
-    log_lines.append(f"**🔊 TTS** ({tts_time:.2f}s) → {len(yoruba_audio)/output_sr:.2f}s of audio")
-    total = time.time() - total_start
-    log_lines.append("")
-    log_lines.append(f"**Total: {total:.2f}s**")
     return (output_sr, yoruba_audio), "\n".join(log_lines)
@@ -257,12 +260,8 @@ def process_text(english_text):
 DESCRIPTION = """
 # 🏟️ Live Football Commentary — English → Yoruba
 Translate English football commentary into Yoruba speech in real-time.
-**Pipeline:** ASR (Whisper) → MT (NLLB-200) → TTS (MMS-TTS Yoruba)
-Upload or record English commentary audio, and get back Yoruba audio + full transcript.
 """
 EXAMPLES_TEXT = [
@@ -272,66 +271,32 @@ EXAMPLES_TEXT = [
     "He dribbles past two defenders and shoots! The ball hits the back of the net!",
 ]
-with gr.Blocks(
-    title="Football Commentary EN→YO",
-    theme=gr.themes.Soft(),
-) as demo:
     gr.Markdown(DESCRIPTION)
     with gr.Tabs():
-        # ---- Tab 1: Audio → Audio (Full Pipeline) ----
         with gr.TabItem("🎙️ Audio → Audio (Full Pipeline)"):
             gr.Markdown("Upload or record English commentary. The pipeline will transcribe, translate, and synthesize Yoruba audio.")
             with gr.Row():
                 with gr.Column():
-                    audio_input = gr.Audio(
-                        label="English Commentary Audio",
-                        type="numpy",
-                        sources=["upload", "microphone"],
-                    )
                     audio_submit_btn = gr.Button("Translate to Yoruba", variant="primary", size="lg")
                 with gr.Column():
                     audio_output = gr.Audio(label="Yoruba Commentary Audio", type="numpy")
                     audio_log = gr.Markdown(label="Pipeline Log")
-            audio_submit_btn.click(
-                fn=process_audio,
-                inputs=[audio_input],
-                outputs=[audio_output, audio_log],
-            )
-        # ---- Tab 2: Text → Audio (Skip ASR) ----
         with gr.TabItem("📝 Text → Audio (Translation + TTS)"):
-            gr.Markdown("Type or paste English text to translate to Yoruba and hear the result. Useful for testing without audio.")
             with gr.Row():
                 with gr.Column():
-                    text_input = gr.Textbox(
-                        label="English Text",
-                        placeholder="Type English football commentary here...",
-                        lines=4,
-                    )
                     text_submit_btn = gr.Button("Translate to Yoruba", variant="primary", size="lg")
-                    gr.Examples(
-                        examples=[[e] for e in EXAMPLES_TEXT],
-                        inputs=[text_input],
-                        label="Example Commentary",
-                    )
                 with gr.Column():
                     text_audio_output = gr.Audio(label="Yoruba Audio", type="numpy")
                     text_log = gr.Markdown(label="Pipeline Log")
-            text_submit_btn.click(
-                fn=process_text,
-                inputs=[text_input],
-                outputs=[text_audio_output, text_log],
-            )
     gr.Markdown("""
 ---
@@ -341,6 +306,5 @@ with gr.Blocks(
 [TTS: PlotweaverAI/yoruba-mms-tts-new](https://huggingface.co/PlotweaverAI/yoruba-mms-tts-new)
 """)
-# Launch
 if __name__ == "__main__":
-    demo.launch()

 Live Football Commentary Pipeline — English → Yoruba
 =====================================================
 Gradio app for HuggingFace Spaces.
+Pipeline: ASR (Whisper) → MT (NLLB-200 via CTranslate2) → TTS (MMS-TTS Yoruba)
 """
 import torch
 import re
 import time
 import gradio as gr
+import ctranslate2
+from transformers import AutoTokenizer
+from transformers import pipeline as hf_pipeline
 # =============================================================================
 # Configuration
 ASR_MODEL_ID = "PlotweaverAI/whisper-small-de-en"
 MT_MODEL_ID = "PlotweaverAI/nllb-200-distilled-600M-african-6lang"
 TTS_MODEL_ID = "PlotweaverAI/yoruba-mms-tts-new"
+CT2_MODEL_DIR = "./nllb_ct2"  # Local dir where converted model is saved
 MT_SRC_LANG = "eng_Latn"
 MT_TGT_LANG = "yor_Latn"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
+CT2_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+CT2_COMPUTE_TYPE = "int8_float16" if torch.cuda.is_available() else "int8"
+# =============================================================================
+# Convert MT model to CTranslate2 format (runs once at startup if needed)
+# =============================================================================
+import os
+if not os.path.exists(CT2_MODEL_DIR):
+    print(f"Converting {MT_MODEL_ID} to CTranslate2 format...")
+    import subprocess
+    subprocess.run([
+        "ct2-transformers-converter",
+        "--model", MT_MODEL_ID,
+        "--output_dir", CT2_MODEL_DIR,
+        "--quantization", "int8",   # int8 = fastest on CPU; use int8_float16 on GPU
+        "--force",
+    ], check=True)
+    print("Conversion done ✓")
 # =============================================================================
 # Load models (runs once at startup)
 # =============================================================================
+print(f"Device: {DEVICE} | CT2 Compute: {CT2_COMPUTE_TYPE}")
 print("Loading models...")
 # ASR
 )
 print("  ASR loaded ✓")
+# MT — CTranslate2 Translator (replaces AutoModelForSeq2SeqLM)
+print(f"  Loading MT (CTranslate2): {CT2_MODEL_DIR}")
 mt_tokenizer = AutoTokenizer.from_pretrained(MT_MODEL_ID)
+mt_translator = ctranslate2.Translator(
+    CT2_MODEL_DIR,
+    device=CT2_DEVICE,
+    compute_type=CT2_COMPUTE_TYPE,
+    inter_threads=2,   # allows parallel sentence translations
+)
+print("  MT (CTranslate2) loaded ✓")
 # TTS
 print(f"  Loading TTS: {TTS_MODEL_ID}")
 # =============================================================================
+# Pipeline functions
 # =============================================================================
 def split_into_sentences(text):
     text = text.strip()
     if not text:
         return []
     text = '. '.join(s.strip().capitalize() for s in text.split('. ') if s.strip())
     if re.search(r'[.!?]', text):
         sentences = re.split(r'(?<=[.!?])\s+', text)
         return [s.strip() for s in sentences if s.strip()]
     words = text.split()
     MAX_WORDS = 12
     sentences = []
     return result["text"].strip()
+def translate_batch_ct2(sentences):
+    """
+    MT: Translate a batch of sentences from English → Yoruba using CTranslate2.
+    Much faster than calling .generate() per sentence.
+    """
+    # Tokenize all sentences at once
+    mt_tokenizer.src_lang = MT_SRC_LANG
+    tgt_lang_token = MT_TGT_LANG
+    # Encode to token strings (CTranslate2 works with token lists, not IDs)
+    tokenized = [
+        mt_tokenizer.convert_ids_to_tokens(
+            mt_tokenizer.encode(s, add_special_tokens=True)
+        )
+        for s in sentences
+    ]
+    tgt_prefix = [[tgt_lang_token]] * len(sentences)
+    results = mt_translator.translate_batch(
+        tokenized,
+        target_prefix=tgt_prefix,
+        beam_size=4,
+        repetition_penalty=1.5,
+        no_repeat_ngram_size=3,
+        max_decoding_length=256,
+    )
+    translations = []
+    for result in results:
+        tokens = result.hypotheses[0]
+        # Remove the language token prefix if present
+        if tokens and tokens[0] == tgt_lang_token:
+            tokens = tokens[1:]
+        text = mt_tokenizer.decode(
+            mt_tokenizer.convert_tokens_to_ids(tokens),
+            skip_special_tokens=True,
         )
+        translations.append(text)
+    return translations
 def translate_long_text(text):
+    """Split into sentences and translate as a batch."""
     sentences = split_into_sentences(text)
+    if not sentences:
+        return "", [], []
+    translations = translate_batch_ct2(sentences)
     return ' '.join(translations), sentences, translations
 # =============================================================================
 def process_audio(audio_input):
     if audio_input is None:
         return None, "⚠️ No audio provided. Please upload or record audio."
     sample_rate, audio_array = audio_input
     audio_array = audio_array.astype(np.float32)
     if audio_array.ndim > 1:
         audio_array = audio_array.mean(axis=1)
     if audio_array.max() > 1.0 or audio_array.min() < -1.0:
         audio_array = audio_array / max(abs(audio_array.max()), abs(audio_array.min()))
     total_start = time.time()
     log_lines = []
     t0 = time.time()
     english_text = transcribe(audio_array, sample_rate)
+    log_lines.append(f"**🎤 ASR** ({time.time()-t0:.2f}s)")
+    log_lines.append(f"English: {english_text}\n")
     if not english_text:
+        return None, "⚠️ ASR returned empty text."
     t0 = time.time()
     yoruba_text, en_sentences, yo_sentences = translate_long_text(english_text)
+    log_lines.append(f"**🔄 Translation (CTranslate2)** ({time.time()-t0:.2f}s)")
     for en_s, yo_s in zip(en_sentences, yo_sentences):
         log_lines.append(f"  EN: {en_s}")
         log_lines.append(f"  YO: {yo_s}")
     log_lines.append("")
     if not yoruba_text:
         return None, "⚠️ Translation returned empty text."
     t0 = time.time()
     yoruba_audio, output_sr = synthesize(yoruba_text)
+    log_lines.append(f"**🔊 TTS** ({time.time()-t0:.2f}s) → {len(yoruba_audio)/output_sr:.2f}s of audio")
+    log_lines.append(f"\n**Total: {time.time()-total_start:.2f}s**")
+    return (output_sr, yoruba_audio), "\n".join(log_lines)
 def process_text(english_text):
     if not english_text or not english_text.strip():
         return None, "⚠️ Please enter some English text."
     total_start = time.time()
     log_lines = []
     t0 = time.time()
     yoruba_text, en_sentences, yo_sentences = translate_long_text(english_text.strip())
+    log_lines.append(f"**🔄 Translation (CTranslate2)** ({time.time()-t0:.2f}s)")
     for en_s, yo_s in zip(en_sentences, yo_sentences):
         log_lines.append(f"  EN: {en_s}")
         log_lines.append(f"  YO: {yo_s}")
     log_lines.append("")
     if not yoruba_text:
         return None, "⚠️ Translation returned empty text."
     t0 = time.time()
     yoruba_audio, output_sr = synthesize(yoruba_text)
+    log_lines.append(f"**🔊 TTS** ({time.time()-t0:.2f}s) → {len(yoruba_audio)/output_sr:.2f}s of audio")
+    log_lines.append(f"\n**Total: {time.time()-total_start:.2f}s**")
     return (output_sr, yoruba_audio), "\n".join(log_lines)
 DESCRIPTION = """
 # 🏟️ Live Football Commentary — English → Yoruba
 Translate English football commentary into Yoruba speech in real-time.
+**Pipeline:** ASR (Whisper) → MT (NLLB-200 via CTranslate2) → TTS (MMS-TTS Yoruba)
 """
 EXAMPLES_TEXT = [
     "He dribbles past two defenders and shoots! The ball hits the back of the net!",
 ]
+with gr.Blocks(title="Football Commentary EN→YO", theme=gr.themes.Soft()) as demo:
     gr.Markdown(DESCRIPTION)
     with gr.Tabs():
         with gr.TabItem("🎙️ Audio → Audio (Full Pipeline)"):
             gr.Markdown("Upload or record English commentary. The pipeline will transcribe, translate, and synthesize Yoruba audio.")
             with gr.Row():
                 with gr.Column():
+                    audio_input = gr.Audio(label="English Commentary Audio", type="numpy", sources=["upload", "microphone"])
                     audio_submit_btn = gr.Button("Translate to Yoruba", variant="primary", size="lg")
                 with gr.Column():
                     audio_output = gr.Audio(label="Yoruba Commentary Audio", type="numpy")
                     audio_log = gr.Markdown(label="Pipeline Log")
+            audio_submit_btn.click(fn=process_audio, inputs=[audio_input], outputs=[audio_output, audio_log])
         with gr.TabItem("📝 Text → Audio (Translation + TTS)"):
+            gr.Markdown("Type or paste English text to translate to Yoruba and hear the result.")
             with gr.Row():
                 with gr.Column():
+                    text_input = gr.Textbox(label="English Text", placeholder="Type English football commentary here...", lines=4)
                     text_submit_btn = gr.Button("Translate to Yoruba", variant="primary", size="lg")
+                    gr.Examples(examples=[[e] for e in EXAMPLES_TEXT], inputs=[text_input], label="Example Commentary")
                 with gr.Column():
                     text_audio_output = gr.Audio(label="Yoruba Audio", type="numpy")
                     text_log = gr.Markdown(label="Pipeline Log")
+            text_submit_btn.click(fn=process_text, inputs=[text_input], outputs=[text_audio_output, text_log])
     gr.Markdown("""
 ---
 [TTS: PlotweaverAI/yoruba-mms-tts-new](https://huggingface.co/PlotweaverAI/yoruba-mms-tts-new)
 """)
 if __name__ == "__main__":
+    demo.launch()