""" Live Football Commentary Pipeline — English → Yoruba ===================================================== Gradio app for HuggingFace Spaces. Pipeline: ASR (Whisper) → MT (NLLB-200 via CTranslate2) → TTS (MMS-TTS Yoruba) """ import torch import numpy as np import re import time import gradio as gr import ctranslate2 from transformers import AutoTokenizer from transformers import pipeline as hf_pipeline # ============================================================================= # Configuration # ============================================================================= ASR_MODEL_ID = "PlotweaverAI/whisper-small-de-en" MT_MODEL_ID = "PlotweaverAI/nllb-200-distilled-600M-african-6lang" TTS_MODEL_ID = "PlotweaverAI/yoruba-mms-tts-new" CT2_MODEL_DIR = "./nllb_ct2" # Local dir where converted model is saved MT_SRC_LANG = "eng_Latn" MT_TGT_LANG = "yor_Latn" DEVICE = "cuda" if torch.cuda.is_available() else "cpu" TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32 CT2_DEVICE = "cuda" if torch.cuda.is_available() else "cpu" CT2_COMPUTE_TYPE = "int8_float16" if torch.cuda.is_available() else "int8" # ============================================================================= # Convert MT model to CTranslate2 format (runs once at startup if needed) # ============================================================================= import os if not os.path.exists(CT2_MODEL_DIR): print(f"Converting {MT_MODEL_ID} to CTranslate2 format...") import subprocess subprocess.run([ "ct2-transformers-converter", "--model", MT_MODEL_ID, "--output_dir", CT2_MODEL_DIR, "--quantization", "int8", # int8 = fastest on CPU; use int8_float16 on GPU "--force", ], check=True) print("Conversion done ✓") # ============================================================================= # Load models (runs once at startup) # ============================================================================= print(f"Device: {DEVICE} | CT2 Compute: {CT2_COMPUTE_TYPE}") print("Loading models...") # ASR print(f" Loading ASR: {ASR_MODEL_ID}") asr_pipe = hf_pipeline( "automatic-speech-recognition", model=ASR_MODEL_ID, device=DEVICE, torch_dtype=TORCH_DTYPE, ) print(" ASR loaded ✓") # MT — CTranslate2 Translator (replaces AutoModelForSeq2SeqLM) print(f" Loading MT (CTranslate2): {CT2_MODEL_DIR}") mt_tokenizer = AutoTokenizer.from_pretrained(MT_MODEL_ID) mt_translator = ctranslate2.Translator( CT2_MODEL_DIR, device=CT2_DEVICE, compute_type=CT2_COMPUTE_TYPE, inter_threads=2, # allows parallel sentence translations ) print(" MT (CTranslate2) loaded ✓") # TTS print(f" Loading TTS: {TTS_MODEL_ID}") tts_pipe = hf_pipeline( "text-to-speech", model=TTS_MODEL_ID, device=DEVICE, torch_dtype=TORCH_DTYPE, ) print(" TTS loaded ✓") print("All models loaded!") # ============================================================================= # Pipeline functions # ============================================================================= def split_into_sentences(text): """Split raw ASR text into individual sentences for MT.""" text = text.strip() if not text: return [] text = '. '.join(s.strip().capitalize() for s in text.split('. ') if s.strip()) if re.search(r'[.!?]', text): sentences = re.split(r'(?<=[.!?])\s+', text) return [s.strip() for s in sentences if s.strip()] words = text.split() MAX_WORDS = 12 sentences = [] for i in range(0, len(words), MAX_WORDS): chunk = ' '.join(words[i:i + MAX_WORDS]) if not chunk.endswith(('.', '!', '?')): chunk += '.' chunk = chunk[0].upper() + chunk[1:] if len(chunk) > 1 else chunk.upper() sentences.append(chunk) return sentences def transcribe(audio_array, sample_rate=16000): """ASR: English audio → English text.""" result = asr_pipe( {"raw": audio_array, "sampling_rate": sample_rate}, chunk_length_s=15, batch_size=1, return_timestamps=False, ) return result["text"].strip() def translate_batch_ct2(sentences): """ MT: Translate a batch of sentences from English → Yoruba using CTranslate2. Much faster than calling .generate() per sentence. """ # Tokenize all sentences at once mt_tokenizer.src_lang = MT_SRC_LANG tgt_lang_token = MT_TGT_LANG # Encode to token strings (CTranslate2 works with token lists, not IDs) tokenized = [ mt_tokenizer.convert_ids_to_tokens( mt_tokenizer.encode(s, add_special_tokens=True) ) for s in sentences ] tgt_prefix = [[tgt_lang_token]] * len(sentences) results = mt_translator.translate_batch( tokenized, target_prefix=tgt_prefix, beam_size=4, repetition_penalty=1.5, no_repeat_ngram_size=3, max_decoding_length=256, ) translations = [] for result in results: tokens = result.hypotheses[0] # Remove the language token prefix if present if tokens and tokens[0] == tgt_lang_token: tokens = tokens[1:] text = mt_tokenizer.decode( mt_tokenizer.convert_tokens_to_ids(tokens), skip_special_tokens=True, ) translations.append(text) return translations def translate_long_text(text): """Split into sentences and translate as a batch.""" sentences = split_into_sentences(text) if not sentences: return "", [], [] translations = translate_batch_ct2(sentences) return ' '.join(translations), sentences, translations def synthesize(text): """TTS: Yoruba text → audio.""" result = tts_pipe(text) audio = np.array(result["audio"]).squeeze() sr = result["sampling_rate"] return audio, sr # ============================================================================= # Gradio interface functions # ============================================================================= def process_audio(audio_input): if audio_input is None: return None, "⚠️ No audio provided. Please upload or record audio." sample_rate, audio_array = audio_input audio_array = audio_array.astype(np.float32) if audio_array.ndim > 1: audio_array = audio_array.mean(axis=1) if audio_array.max() > 1.0 or audio_array.min() < -1.0: audio_array = audio_array / max(abs(audio_array.max()), abs(audio_array.min())) total_start = time.time() log_lines = [] t0 = time.time() english_text = transcribe(audio_array, sample_rate) log_lines.append(f"**🎤 ASR** ({time.time()-t0:.2f}s)") log_lines.append(f"English: {english_text}\n") if not english_text: return None, "⚠️ ASR returned empty text." t0 = time.time() yoruba_text, en_sentences, yo_sentences = translate_long_text(english_text) log_lines.append(f"**🔄 Translation (CTranslate2)** ({time.time()-t0:.2f}s)") for en_s, yo_s in zip(en_sentences, yo_sentences): log_lines.append(f" EN: {en_s}") log_lines.append(f" YO: {yo_s}") log_lines.append("") if not yoruba_text: return None, "⚠️ Translation returned empty text." t0 = time.time() yoruba_audio, output_sr = synthesize(yoruba_text) log_lines.append(f"**🔊 TTS** ({time.time()-t0:.2f}s) → {len(yoruba_audio)/output_sr:.2f}s of audio") log_lines.append(f"\n**Total: {time.time()-total_start:.2f}s**") return (output_sr, yoruba_audio), "\n".join(log_lines) def process_text(english_text): if not english_text or not english_text.strip(): return None, "⚠️ Please enter some English text." total_start = time.time() log_lines = [] t0 = time.time() yoruba_text, en_sentences, yo_sentences = translate_long_text(english_text.strip()) log_lines.append(f"**🔄 Translation (CTranslate2)** ({time.time()-t0:.2f}s)") for en_s, yo_s in zip(en_sentences, yo_sentences): log_lines.append(f" EN: {en_s}") log_lines.append(f" YO: {yo_s}") log_lines.append("") if not yoruba_text: return None, "⚠️ Translation returned empty text." t0 = time.time() yoruba_audio, output_sr = synthesize(yoruba_text) log_lines.append(f"**🔊 TTS** ({time.time()-t0:.2f}s) → {len(yoruba_audio)/output_sr:.2f}s of audio") log_lines.append(f"\n**Total: {time.time()-total_start:.2f}s**") return (output_sr, yoruba_audio), "\n".join(log_lines) # ============================================================================= # Gradio UI # ============================================================================= DESCRIPTION = """ # 🏟️ Live Football Commentary — English → Yoruba Translate English football commentary into Yoruba speech in real-time. **Pipeline:** ASR (Whisper) → MT (NLLB-200 via CTranslate2) → TTS (MMS-TTS Yoruba) """ EXAMPLES_TEXT = [ "And it's a brilliant goal from the striker!", "The referee has shown a yellow card. Corner kick for the home team.", "What a save by the goalkeeper! The match is heading into injury time.", "He dribbles past two defenders and shoots! The ball hits the back of the net!", ] with gr.Blocks(title="Football Commentary EN→YO", theme=gr.themes.Soft()) as demo: gr.Markdown(DESCRIPTION) with gr.Tabs(): with gr.TabItem("🎙️ Audio → Audio (Full Pipeline)"): gr.Markdown("Upload or record English commentary. The pipeline will transcribe, translate, and synthesize Yoruba audio.") with gr.Row(): with gr.Column(): audio_input = gr.Audio(label="English Commentary Audio", type="numpy", sources=["upload", "microphone"]) audio_submit_btn = gr.Button("Translate to Yoruba", variant="primary", size="lg") with gr.Column(): audio_output = gr.Audio(label="Yoruba Commentary Audio", type="numpy") audio_log = gr.Markdown(label="Pipeline Log") audio_submit_btn.click(fn=process_audio, inputs=[audio_input], outputs=[audio_output, audio_log]) with gr.TabItem("📝 Text → Audio (Translation + TTS)"): gr.Markdown("Type or paste English text to translate to Yoruba and hear the result.") with gr.Row(): with gr.Column(): text_input = gr.Textbox(label="English Text", placeholder="Type English football commentary here...", lines=4) text_submit_btn = gr.Button("Translate to Yoruba", variant="primary", size="lg") gr.Examples(examples=[[e] for e in EXAMPLES_TEXT], inputs=[text_input], label="Example Commentary") with gr.Column(): text_audio_output = gr.Audio(label="Yoruba Audio", type="numpy") text_log = gr.Markdown(label="Pipeline Log") text_submit_btn.click(fn=process_text, inputs=[text_input], outputs=[text_audio_output, text_log]) gr.Markdown(""" --- **Models used:** [ASR: PlotweaverAI/whisper-small-de-en](https://huggingface.co/PlotweaverAI/whisper-small-de-en) | [MT: PlotweaverAI/nllb-200-distilled-600M-african-6lang](https://huggingface.co/PlotweaverAI/nllb-200-distilled-600M-african-6lang) | [TTS: PlotweaverAI/yoruba-mms-tts-new](https://huggingface.co/PlotweaverAI/yoruba-mms-tts-new) """) if __name__ == "__main__": demo.launch()