# tts / app.py
# Last commit a96487d by ramanan-techlover: "added new language (Tamil)"
import torch
from transformers import AutoModel, AutoTokenizer
import gradio as gr
import soundfile as sf
import numpy as np
import tempfile
# Checkpoint identifier shared by the model and its tokenizer.
_MODEL_ID = "ai4bharat/vits_rasa_13"

# Inference device. Change to "cuda" if you have a GPU.
device = "cpu"

# Both loaders are called with trust_remote_code=True for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(_MODEL_ID, trust_remote_code=True)
model = model.to(device)
# Speaker IDs for languages, keyed by lowercase language code.
LANG_SPEAKER_MAP = {
    "mar": 13,  # Marathi Male
    "hin": 13,  # Reuse Marathi Male for Hindi
    "san": 17,  # Sanskrit Male
    "tam": 18,  # Tamil speaker
}

# Style/emotion ID passed to the model on every request.
DEFAULT_STYLE_ID = 0  # ALEXA
def generate_audio(text, language):
    """Synthesize speech for ``text`` using the speaker mapped to ``language``.

    Args:
        text: Text to synthesize. Must contain at least one non-whitespace
            character.
        language: Language code, matched case-insensitively against the keys
            of ``LANG_SPEAKER_MAP``.

    Returns:
        A ``(sample_rate, waveform)`` tuple: ``sample_rate`` is read from
        ``model.config.sampling_rate`` and ``waveform`` is the model's output
        waveform, squeezed and converted to a NumPy array on the CPU.

    Raises:
        gr.Error: If ``text`` is empty/blank or ``language`` has no entry in
            ``LANG_SPEAKER_MAP``.
    """
    # Validation failures are raised as gr.Error: the output component is a
    # gr.Audio, so returning a plain message string would be interpreted as
    # an audio file path rather than shown to the user.
    if not text or not text.strip():
        raise gr.Error("Text cannot be empty.")

    # Guard against a missing selection before calling .lower().
    speaker_id = LANG_SPEAKER_MAP.get(language.lower()) if language else None
    if speaker_id is None:
        raise gr.Error(f"Unsupported language: {language}")

    inputs = tokenizer(text=text, return_tensors="pt").to(device)

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        outputs = model(
            inputs["input_ids"],
            speaker_id=speaker_id,
            emotion_id=DEFAULT_STYLE_ID,
        )

    waveform = outputs.waveform.squeeze().cpu().numpy()
    sample_rate = model.config.sampling_rate

    # The audio is returned in memory. The previous delete=False temporary
    # WAV file was never read or removed, leaking one file per request.
    return sample_rate, waveform
# Gradio interface: a text box and a language selector feeding generate_audio,
# with the synthesized speech shown in an audio player.
iface = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(label="Enter Text"),
        # Choices are taken from LANG_SPEAKER_MAP so every language that has
        # a speaker ID (including Tamil) is selectable and the two stay in sync.
        gr.Dropdown(list(LANG_SPEAKER_MAP), label="Select Language"),
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="VITS TTS for Indian Languages (Marathi, Hindi, Sanskrit, Tamil)",
    description="Uses ai4bharat/vits_rasa_13. Enter text and select a language.",
)
iface.launch()