import gradio as gr
import numpy as np
import torch
from transformers import VitsModel, AutoTokenizer


# Maps the language label offered in the UI to the model identifier that is
# handed to ``from_pretrained`` when that language is requested.
LANG_MODEL_MAP = {
    "Malayalam": "trysem/Seema",
}

# Run inference on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loaded ``(tokenizer, model)`` pairs keyed by model identifier, so each model
# is loaded and moved to ``device`` only on its first use.
cache = {}


def load_model_and_tokenizer(language):
    """Return the cached ``(tokenizer, model)`` pair for *language*.

    The first request for a language loads its tokenizer and VITS model with
    ``from_pretrained`` and moves the model to ``device``; later requests
    reuse the entry stored in ``cache``.

    Args:
        language: A key of ``LANG_MODEL_MAP``.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Raises:
        KeyError: If *language* is not a key of ``LANG_MODEL_MAP`` (this
            includes ``None``, e.g. when no language was selected).
    """
    try:
        model_name = LANG_MODEL_MAP[language]
    except KeyError:
        # Same exception type as a plain dict lookup, but with a message that
        # says what was wrong and what would have been accepted.
        supported = ", ".join(sorted(LANG_MODEL_MAP))
        raise KeyError(
            f"Unsupported language {language!r}; expected one of: {supported}"
        ) from None

    if model_name not in cache:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = VitsModel.from_pretrained(model_name).to(device)
        # Stored only after both loads succeeded, so a failed load never
        # leaves a partial entry behind.
        cache[model_name] = (tokenizer, model)
    return cache[model_name]


def tts(language, text):
    """Synthesise speech for *text* using the model configured for *language*.

    Args:
        language: A key of ``LANG_MODEL_MAP`` selecting the model to use.
        text: The text to speak. ``None`` and whitespace-only text are
            treated as empty input.

    Returns:
        A ``(sample_rate, samples)`` tuple. For empty input this is 16000 and
        a single zero sample; otherwise it is the model's configured sampling
        rate (16000 if the config does not declare one) and the generated
        waveform as a NumPy array.

    Raises:
        KeyError: If *text* is non-empty and *language* is not a key of
            ``LANG_MODEL_MAP``.
    """
    default_rate = 16000

    # Guard against None as well as blank text: nothing to synthesise, so
    # return a one-sample placeholder without loading any model.
    if text is None or not text.strip():
        # int16 silence rather than float zeros.
        # NOTE(review): Gradio peak-normalises float audio when converting it
        # to 16-bit, which would be 0/0 for an all-zero buffer; confirm
        # against the installed Gradio version.
        return default_rate, np.zeros(1, dtype=np.int16)

    tokenizer, model = load_model_and_tokenizer(language)
    inputs = tokenizer(text, return_tensors="pt").to(device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(**inputs)

    # Report the rate the model actually generates at instead of assuming it.
    sample_rate = getattr(model.config, "sampling_rate", default_rate)
    waveform = output.waveform.squeeze().cpu().numpy()
    return sample_rate, waveform


# Gradio UI: a language dropdown and a text box feed ``tts``; the returned
# (sample_rate, samples) tuple is rendered by the audio component.
iface = gr.Interface(
    fn=tts,
    inputs=[
        gr.Dropdown(
            choices=list(LANG_MODEL_MAP),
            # Preselect the first language so that submitting without
            # touching the dropdown still passes a valid language.
            value=next(iter(LANG_MODEL_MAP), None),
            label="Select Language",
        ),
        gr.Textbox(label="Enter Text"),
    ],
    outputs=gr.Audio(label="Synthesized Speech", type="numpy"),
    title="Multilingual Text-to-Speech (MMS)",
    description="Generate speech from text using Meta's MMS models for Malayalam",
)


if __name__ == "__main__":
    # Start the web UI only when run as a script, not when imported.
    iface.launch()