"""Gradio TTS server for Yucatec Maya (yua) built on Facebook's MMS VITS model."""

import gradio as gr
import numpy as np
import torch
from transformers import AutoTokenizer, VitsModel

# Model configuration: MMS text-to-speech checkpoint for Yucatec Maya.
model_id = "facebook/mms-tts-yua"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Cargando modelo en: {device}")

# Loaded once at startup; reused for every request.
model = VitsModel.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)


def generate_tts(text: str):
    """Synthesize speech for ``text`` in Yucatec Maya.

    Parameters
    ----------
    text : str
        Input text to synthesize.

    Returns
    -------
    tuple[int, numpy.ndarray] | None
        ``(sample_rate, waveform)`` in the format expected by
        ``gr.Audio``, or ``None`` when the input is empty or
        whitespace-only (Gradio renders no audio in that case).
    """
    if not text.strip():
        return None
    inputs = tokenizer(text, return_tensors="pt").to(device)
    # Inference only: no gradients needed.
    with torch.no_grad():
        output = model(**inputs)
    # First (and only) waveform in the batch, moved to CPU for Gradio.
    waveform = output.waveform[0].cpu().numpy()
    sample_rate = model.config.sampling_rate
    return (sample_rate, waveform)


# Gradio interface definition (user-facing strings intentionally in Spanish).
demo = gr.Interface(
    fn=generate_tts,
    inputs=gr.Textbox(label="Escribe en Maya Yucateco", placeholder="Bix yanilech..."),
    outputs=gr.Audio(label="Audio Generado"),
    title="Maya Yucateco TTS (Facebook MMS)",
    description="Servidor de síntesis de voz para el idioma Maya Yucateco.",
)

if __name__ == "__main__":
    demo.launch()