"""Yaaba AI – French ↔ Mooré voice translator (Gradio app).

Pipeline: Whisper ASR → NLLB translation → Coqui XTTS speech synthesis.
"""

import os
import tempfile

# Must be set BEFORE importing TTS so the Coqui license prompt is skipped
# in non-interactive environments (e.g. Hugging Face Spaces).
os.environ["COQUI_TOS_AGREED"] = "1"

import gradio as gr
import torch
import whisper
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from TTS.api import TTS

# ---------------------------
# Load all models once (Hugging Face caching will persist)
# ---------------------------
whisper_model = whisper.load_model("small")

tokenizer = AutoTokenizer.from_pretrained(
    "mafromedia/yaaba-fr-mo-nllb600M",
    use_fast=True,
    trust_remote_code=True,
)
translator_model = AutoModelForSeq2SeqLM.from_pretrained(
    "mafromedia/yaaba-fr-mo-nllb600M"
)

tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    progress_bar=False,
    gpu=False,
)


# ---------------------------
# Translate text
# ---------------------------
def translate_text(input_text, direction):
    """Translate text between French and Mooré with the fine-tuned NLLB model.

    Args:
        input_text: Source-language text to translate.
        direction: "French → Mooré" or "Mooré → French" (from the UI radio).

    Returns:
        The translated string, or an error message for empty input.
    """
    if not input_text.strip():
        return "Error: Empty input text."

    # BUG FIX: both radio labels contain the substring "French", so the old
    # check `"French" in direction` was always True and the Mooré → French
    # direction was silently translated the wrong way. Match the label
    # prefix instead.
    if direction.startswith("French"):
        src_lang, tgt_lang = "fra_Latn", "mos_Latn"
    else:
        src_lang, tgt_lang = "mos_Latn", "fra_Latn"

    # BUG FIX: NLLB tokenizers prepend a source-language token; without
    # setting src_lang the tokenizer assumes its default source language
    # and the model mistranslates. The old code computed src_lang but
    # never used it.
    tokenizer.src_lang = src_lang

    inputs = tokenizer(input_text, return_tensors="pt")
    translated_tokens = translator_model.generate(
        **inputs,
        # Force the decoder to start with the target-language token.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
    )
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)


# ---------------------------
# Speech → Speech Translation
# ---------------------------
def voice_translate(audio, direction):
    """Speech-to-speech translation: transcribe, translate, synthesize.

    Args:
        audio: Filepath of recorded/uploaded audio (Gradio ``type="filepath"``).
        direction: "French → Mooré" or "Mooré → French".

    Returns:
        Tuple of (translated text, transcribed text, path to synthesized WAV),
        or an error message with empty fields when no audio was provided.
    """
    if audio is None:
        return "Error: No audio input detected.", "", None

    # Step 1: Transcribe audio → text
    result = whisper_model.transcribe(audio)
    detected_text = result["text"]

    # Step 2: Translate
    translated_text = translate_text(detected_text, direction)

    # Step 3: TTS synthesis. Close the handle before TTS reopens the path
    # for writing (required on Windows); delete=False keeps the file on
    # disk so Gradio can serve it.
    # NOTE(review): xtts_v2 is multilingual and normally requires a
    # `language=` argument and one of its named speakers — confirm that
    # speaker="female-en" actually works with this model.
    tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_wav.close()
    tts.tts_to_file(text=translated_text, file_path=tmp_wav.name, speaker="female-en")
    return translated_text, detected_text, tmp_wav.name
# ---------------------------
# Text-only translation tab
# ---------------------------
def text_translate(input_text, direction):
    """Translate typed text and synthesize the translation as speech.

    Args:
        input_text: Text entered by the user.
        direction: "French → Mooré" or "Mooré → French".

    Returns:
        Tuple of (translated text, path to synthesized WAV file).
    """
    translated_text = translate_text(input_text, direction)
    # Close the handle before TTS reopens the path for writing (required
    # on Windows); delete=False keeps the file on disk for Gradio to serve.
    tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_wav.close()
    tts.tts_to_file(text=translated_text, file_path=tmp_wav.name, speaker="female-en")
    return translated_text, tmp_wav.name


# ---------------------------
# Build Gradio Interface
# ---------------------------
description = (
    "Yaaba AI bridges African and global communities through real-time French ↔ Mooré translation. "
    "Speak or type in one language to instantly hear and read the translation in the other."
)

with gr.Blocks(title="🎙 Yaaba AI – French ↔ Mooré Voice Translator") as demo:
    gr.Markdown("# 🎙 Yaaba AI – French ↔ Mooré Voice Translator")
    gr.Markdown(description)

    with gr.Tabs():
        # Tab 1: speech in, speech + text out
        with gr.Tab("Voice Translator"):
            direction_select = gr.Radio(
                ["French → Mooré", "Mooré → French"],
                label="Translation Direction",
                value="French → Mooré",
            )
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="Record or Upload Speech",
            )
            translate_btn = gr.Button("Translate Speech")
            translated_box = gr.Textbox(label="Translated Text")
            detected_box = gr.Textbox(label="Detected Speech")
            audio_output = gr.Audio(label="Translated Speech")
            translate_btn.click(
                fn=voice_translate,
                inputs=[audio_input, direction_select],
                outputs=[translated_box, detected_box, audio_output],
            )

        # Tab 2: text in, text + speech out
        with gr.Tab("Text-only"):
            direction_text = gr.Radio(
                ["French → Mooré", "Mooré → French"],
                label="Translation Direction",
                value="French → Mooré",
            )
            text_box = gr.Textbox(label="Enter Text", placeholder="Type your text here...")
            text_btn = gr.Button("Translate Text")
            text_out = gr.Textbox(label="Translated Text")
            audio_out = gr.Audio(label="Spoken Translation")
            text_btn.click(
                fn=text_translate,
                inputs=[text_box, direction_text],
                outputs=[text_out, audio_out],
            )

    gr.Markdown("---")
    gr.Markdown("Built by GO AI Corp – Yaaba AI Initiative")
# ---------------------------
# Launch app (important for Spaces)
# ---------------------------
if __name__ == "__main__":
    demo.launch()