# Gradio demo app for AI4Bharat Indic Parler-TTS (text-to-speech).
import gradio as gr
import torch
import soundfile as sf
import tempfile
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
# Load model
# Prefer GPU when available; inference also works (slowly) on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = ParlerTTSForConditionalGeneration.from_pretrained(
    "ai4bharat/indic-parler-tts"
).to(device)
# Tokenizer for the text that will be spoken.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
# Parler-TTS conditions on a natural-language voice-style description,
# which is tokenized with the model's own text-encoder tokenizer.
description_tokenizer = AutoTokenizer.from_pretrained(
    model.config.text_encoder._name_or_path
)
# Default style prompt (you can change this in UI later)
DEFAULT_DESCRIPTION = (
    "A clear and natural speaker with expressive tone, "
    "high quality recording, close microphone."
)
def generate_speech(text, description):
    """Synthesize speech for *text* styled by *description*.

    Returns the path of a temporary .wav file containing the generated
    audio, or None when *text* is empty. An empty *description* falls
    back to DEFAULT_DESCRIPTION.
    """
    if not text:
        return None
    if not description:
        description = DEFAULT_DESCRIPTION
    # Parler-TTS conditions on two inputs, tokenized separately:
    # the voice-style description and the text to be spoken.
    desc_inputs = description_tokenizer(description, return_tensors="pt").to(device)
    text_inputs = tokenizer(text, return_tensors="pt").to(device)
    # Generate audio — inference only, so disable gradient tracking.
    with torch.no_grad():
        audio = model.generate(
            input_ids=desc_inputs.input_ids,
            attention_mask=desc_inputs.attention_mask,
            prompt_input_ids=text_inputs.input_ids,
            prompt_attention_mask=text_inputs.attention_mask,
        )
    audio = audio.cpu().numpy().squeeze()
    # tempfile.mktemp is deprecated and race-prone (the name can be taken
    # between creation and use); NamedTemporaryFile(delete=False) atomically
    # creates the file and leaves it on disk for Gradio to serve.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, audio, model.config.sampling_rate)
    return out_path
# UI: input text box, optional voice-style box, and a button that runs
# generate_speech and plays the result in an audio widget.
with gr.Blocks() as demo:
    # NOTE(review): the original heading/button emoji were mojibake
    # ("๐๏ธ", "๐"); restored as mic/speaker emoji — confirm intent.
    gr.Markdown("# 🎙️ Indic Parler TTS (AI4Bharat)")
    text_input = gr.Textbox(
        label="Text to speak",
        placeholder="Enter Hindi, English, or any Indic language text..."
    )
    description_input = gr.Textbox(
        label="Voice Style Description (optional)",
        value=DEFAULT_DESCRIPTION
    )
    btn = gr.Button("Generate Speech 🔊")
    audio_output = gr.Audio(label="Generated Audio")
    btn.click(
        fn=generate_speech,
        inputs=[text_input, description_input],
        outputs=audio_output
    )

demo.launch()