File size: 1,352 Bytes
5e6c5bb
c573494
5a1a7ec
9ce846a
c573494
 
fc06a3d
c573494
 
c953361
c573494
 
9ce846a
c573494
 
9ce846a
 
 
c573494
 
9ce846a
5a1a7ec
 
9ce846a
5a1a7ec
 
c573494
5a1a7ec
 
c953361
5a1a7ec
 
66d0bf1
 
9ce846a
66d0bf1
5a1a7ec
c953361
 
5a1a7ec
 
18b11fa
66d0bf1
 
c573494
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import gradio as gr
import torch
import numpy as np
from transformers import VitsModel, AutoTokenizer

# UI language name -> model identifier handed to `from_pretrained`.
LANG_MODEL_MAP = {"Malayalam": "trysem/Seema"}

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# model identifier -> (tokenizer, model); filled on first use so each model
# is only loaded once per process.
cache = {}

def load_model_and_tokenizer(language):
    """Return the ``(tokenizer, model)`` pair for *language*, loading it once.

    Args:
        language: A key of ``LANG_MODEL_MAP``.

    Returns:
        Tuple ``(tokenizer, model)``; the model has been moved to ``device``.

    Raises:
        KeyError: If *language* is not a key of ``LANG_MODEL_MAP``.
    """
    repo_id = LANG_MODEL_MAP[language]

    # Fast path: this model was already loaded by an earlier call.
    if repo_id in cache:
        return cache[repo_id]

    # First request for this model: load both parts and remember them.
    loaded_tokenizer = AutoTokenizer.from_pretrained(repo_id)
    loaded_model = VitsModel.from_pretrained(repo_id).to(device)
    cache[repo_id] = (loaded_tokenizer, loaded_model)
    return cache[repo_id]

def tts(language, text):
    """Synthesize speech for *text* with the model registered for *language*.

    Args:
        language: Key of ``LANG_MODEL_MAP`` selecting which model to use.
        text: Text to speak. ``None``, empty or whitespace-only text yields a
            one-sample silent clip and the model is not loaded or run.

    Returns:
        Tuple ``(sampling_rate, waveform)`` where ``waveform`` is a NumPy
        array, as passed to the ``gr.Audio(type="numpy")`` output.

    Raises:
        gr.Error: If text was supplied but *language* is not a key of
            ``LANG_MODEL_MAP`` (for example, nothing was selected).
    """
    # Rate used for the silent placeholder, and as a fallback if the model
    # config does not expose its own rate.
    default_rate = 16000

    # `not text` also covers None, which would otherwise crash on .strip().
    if not text or not text.strip():
        return default_rate, np.zeros(1)  # empty waveform if no text

    if language not in LANG_MODEL_MAP:
        # Show a readable message in the UI instead of a bare KeyError.
        raise gr.Error(f"Unsupported language: {language!r}")

    tokenizer, model = load_model_and_tokenizer(language)
    inputs = tokenizer(text, return_tensors="pt").to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        output = model(**inputs)

    # One text per call, so squeeze() drops the singleton batch dimension.
    waveform = output.waveform.squeeze().cpu().numpy()

    # Report the rate the checkpoint was actually trained at rather than
    # assuming every model uses 16 kHz.
    sampling_rate = getattr(model.config, "sampling_rate", default_rate)
    return sampling_rate, waveform

# Gradio UI: a language dropdown and a text box feed `tts`; the returned
# (sampling_rate, waveform) tuple is rendered by the numpy Audio output.
iface = gr.Interface(
    fn=tts,
    inputs=[
        gr.Dropdown(
            choices=list(LANG_MODEL_MAP),
            # Preselect the first configured language so a submit without
            # touching the dropdown does not pass None to `tts`.
            value=next(iter(LANG_MODEL_MAP), None),
            label="Select Language",
        ),
        gr.Textbox(label="Enter Text"),
    ],
    outputs=gr.Audio(label="Synthesized Speech", type="numpy"),
    title="Multilingual Text-to-Speech (MMS)",
    description="Generate speech from text using Meta's MMS models for Malayalam",
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()