# File size: 1,352 Bytes
import gradio as gr
import torch
import numpy as np
from transformers import VitsModel, AutoTokenizer
# Language name -> model identifier handed to ``from_pretrained``.
LANG_MODEL_MAP = {
    "Malayalam": "trysem/Seema",
}

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Model identifier -> (tokenizer, model); filled lazily on first use.
cache = {}
def load_model_and_tokenizer(language):
    """Return the ``(tokenizer, model)`` pair for ``language``.

    The pair is loaded with ``from_pretrained`` on first request, the model
    is moved to ``device``, and the result is stored in ``cache`` under the
    model identifier so later calls reuse it.

    Args:
        language: A key of ``LANG_MODEL_MAP``.

    Returns:
        The cached ``(tokenizer, model)`` tuple.

    Raises:
        KeyError: If ``language`` is not a key of ``LANG_MODEL_MAP``.
    """
    checkpoint = LANG_MODEL_MAP[language]

    # Fast path: this checkpoint has already been loaded.
    if checkpoint in cache:
        return cache[checkpoint]

    tok = AutoTokenizer.from_pretrained(checkpoint)
    net = VitsModel.from_pretrained(checkpoint).to(device)
    cache[checkpoint] = (tok, net)
    return cache[checkpoint]
def tts(language, text):
    """Synthesize speech for ``text`` with the model mapped to ``language``.

    Args:
        language: A key of ``LANG_MODEL_MAP`` selecting the model to use.
        text: The text to speak. ``None``, an empty string, or
            whitespace-only text returns a one-sample silent waveform
            without loading any model.

    Returns:
        A ``(sampling_rate, waveform)`` tuple where ``waveform`` is a NumPy
        array. The rate is read from the loaded model's config, and is
        16000 for the silent placeholder.

    Raises:
        KeyError: If ``language`` is not a key of ``LANG_MODEL_MAP``
            (propagated from ``load_model_and_tokenizer``).
    """
    # Guard against None as well as blank input; ``None.strip()`` would raise.
    if text is None or not text.strip():
        return 16000, np.zeros(1)  # empty waveform if no text

    tokenizer, model = load_model_and_tokenizer(language)
    inputs = tokenizer(text, return_tensors="pt").to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        output = model(**inputs)

    # Drop size-1 dimensions and move to host memory for NumPy conversion.
    waveform = output.waveform.squeeze().cpu().numpy()

    # Report the rate the model was configured with rather than assuming
    # 16 kHz; fall back to 16000 if the config does not expose one.
    sampling_rate = getattr(model.config, "sampling_rate", 16000)
    return sampling_rate, waveform
# Gradio UI: a language dropdown and a text box feed ``tts``; the returned
# (sampling_rate, waveform) tuple is rendered by the audio component.
iface = gr.Interface(
    fn=tts,
    inputs=[
        gr.Dropdown(
            choices=list(LANG_MODEL_MAP),
            # Preselect the first language so ``tts`` is not called with
            # ``None`` when the user submits without touching the dropdown.
            value=next(iter(LANG_MODEL_MAP), None),
            label="Select Language",
        ),
        gr.Textbox(label="Enter Text"),
    ],
    outputs=gr.Audio(label="Synthesized Speech", type="numpy"),
    title="Multilingual Text-to-Speech (MMS)",
    description="Generate speech from text using Meta's MMS models for Malayalam",
)
if __name__ == "__main__":
    # Start the Gradio server only when this file is run as a script.
    iface.launch()