# TTS2 / app.py — Kannada Text-to-Speech Gradio app (Hugging Face Space)
import gradio as gr
import torch
import soundfile as sf
import tempfile
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import os
from huggingface_hub import login
# =========================================================
# MODEL SETUP (runs once at import time)
# =========================================================
MODEL_NAME = "ai4bharat/indic-parler-tts"

# Authenticate with the Hugging Face Hub only when a token is configured.
# Calling login(token=None) falls back to interactive prompting / raises,
# which breaks in a headless Space environment.
_hf_token = os.getenv("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("🚀 Using device:", device)
print("⏳ Loading Kannada TTS model...")

# Parler-TTS conditions generation on two text streams, each with its own
# tokenizer:
#  - the prompt (text to be spoken) -> model's own tokenizer
#  - the voice/style description   -> the text encoder's tokenizer
model = ParlerTTSForConditionalGeneration.from_pretrained(
    MODEL_NAME
).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
description_tokenizer = AutoTokenizer.from_pretrained(
    model.config.text_encoder._name_or_path
)
print("✅ Model loaded successfully")
# =========================================================
# TTS FUNCTION
# =========================================================
def generate_kannada_tts(prompt_text):
    """Synthesize Kannada speech for *prompt_text* and return a WAV path.

    Parameters
    ----------
    prompt_text : str
        Kannada text to speak. Empty / whitespace-only input is rejected.

    Returns
    -------
    str | None
        Path to a temporary ``.wav`` file containing the generated audio,
        or ``None`` when there is nothing to synthesize.
    """
    prompt_text = str(prompt_text).strip()
    if not prompt_text:
        return None

    # Fixed natural-language voice/style prompt: Parler-TTS uses this
    # description to control speaker identity and audio quality.
    description = (
        "A calm Kannada male speaker with natural pronunciation, "
        "clear studio quality audio, smooth narration, "
        "and no background noise."
    )

    description_inputs = description_tokenizer(
        description,
        return_tensors="pt"
    ).to(device)
    prompt_inputs = tokenizer(
        prompt_text,
        return_tensors="pt"
    ).to(device)

    # Pass the attention masks explicitly so any padding tokens are
    # ignored, matching the official indic-parler-tts usage example.
    with torch.no_grad():
        generation = model.generate(
            input_ids=description_inputs.input_ids,
            attention_mask=description_inputs.attention_mask,
            prompt_input_ids=prompt_inputs.input_ids,
            prompt_attention_mask=prompt_inputs.attention_mask,
        )

    audio = generation.cpu().numpy().squeeze()

    # delete=False keeps the file on disk for Gradio to serve; close the
    # handle immediately because sf.write reopens the path itself —
    # otherwise each request leaks one open file descriptor.
    temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    temp_wav.close()
    sf.write(temp_wav.name, audio, model.config.sampling_rate)
    return temp_wav.name
# =========================================================
# GRADIO UI
# =========================================================
# Single-textbox interface: type Kannada text, receive synthesized audio.
kannada_textbox = gr.Textbox(
    label="Enter Kannada Text",
    placeholder="ನಮಸ್ಕಾರ, ನನ್ನ ಹೆಸರು ಅಥ್ಮಿಕ"
)
speech_output = gr.Audio(label="Generated Kannada Speech")

demo = gr.Interface(
    fn=generate_kannada_tts,
    inputs=kannada_textbox,
    outputs=speech_output,
    title="Kannada Text To Speech using AI4Bharat",
    description="Deep Learning based Kannada TTS model for project presentation",
)

demo.launch()