"""HealthAtlas Nigerian TTS service.

Gradio app wrapping Meta's MMS VITS text-to-speech checkpoints for
Yoruba and Hausa. Both models are loaded once at startup; the app then
serves a simple text -> audio-file interface.
"""
import os
import tempfile

import torch
import gradio as gr
import soundfile as sf
from transformers import AutoTokenizer, VitsModel

HF_TOKEN = os.getenv("HF_TOKEN")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Language name (lowercase, as shown in the UI dropdown) -> HF model id.
TTS_MODELS = {
    "yoruba": "facebook/mms-tts-yor",
    "hausa": "facebook/mms-tts-hau",
}

tts_engines = {}
for lang, model_id in TTS_MODELS.items():
    print(f"Loading TTS model for {lang}...")
    # MMS-TTS checkpoints are documented with AutoTokenizer + VitsModel;
    # they ship a tokenizer, not a processor, so AutoTokenizer is the
    # correct loader here (AutoProcessor is not part of the VITS API).
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
    model = VitsModel.from_pretrained(model_id, token=HF_TOKEN).to(DEVICE)
    model.eval()
    # Key kept as "processor" for backward compatibility with any code
    # that reaches into tts_engines directly.
    tts_engines[lang] = {"processor": tokenizer, "model": model}
print("All TTS models loaded successfully")


def synthesize_speech(text, language):
    """Synthesize `text` in `language` and return a WAV file path.

    Args:
        text: Input text to speak. Blank/whitespace-only text is rejected.
        language: Language name, case-insensitive ("yoruba" or "hausa").

    Returns:
        Path to a newly written WAV file, or None when the text is empty
        or the language is not supported.
    """
    if not text.strip():
        return None
    language = language.lower()
    if language not in tts_engines:
        return None

    engine = tts_engines[language]
    inputs = engine["processor"](text=text, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        output = engine["model"](**inputs)
    audio = output.waveform.squeeze().cpu().numpy()

    # A unique temp file per request: the previous fixed "tts_output.wav"
    # would be clobbered when Gradio serves concurrent requests.
    fd, output_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    # Use the checkpoint's own sampling rate instead of hard-coding 16000,
    # so a different MMS model added to TTS_MODELS still plays correctly.
    sf.write(output_path, audio, engine["model"].config.sampling_rate)
    return output_path


demo = gr.Interface(
    fn=synthesize_speech,
    inputs=[
        gr.Textbox(label="Text"),
        gr.Dropdown(
            choices=["yoruba", "hausa"],
            label="Language",
        ),
    ],
    outputs=gr.Audio(type="filepath", label="Generated Speech"),
    title="HealthAtlas Nigerian TTS Service",
    description="Text → Speech (Yoruba & Hausa)",
)

if __name__ == "__main__":
    demo.launch()