File size: 2,118 Bytes
ba69d3d
4faea06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba69d3d
4faea06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ba69d3d
4faea06
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import gradio as gr
import torch
import soundfile as sf
import tempfile

from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

# Model setup: choose the compute device once, then load the TTS model and
# the two tokenizers used at generation time.
_MODEL_REPO = "ai4bharat/indic-parler-tts"

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the checkpoint, then move it to the selected device.
model = ParlerTTSForConditionalGeneration.from_pretrained(_MODEL_REPO)
model = model.to(device)

# Tokenizer shipped with the checkpoint itself.
tokenizer = AutoTokenizer.from_pretrained(_MODEL_REPO)

# Second tokenizer, resolved from the name recorded in the model's
# text-encoder config.
_text_encoder_name = model.config.text_encoder._name_or_path
description_tokenizer = AutoTokenizer.from_pretrained(_text_encoder_name)

# Voice-style description used as the initial/fallback value.
DEFAULT_DESCRIPTION = (
    "A clear and natural speaker with expressive tone, high quality "
    "recording, close microphone."
)

def generate_speech(text, description):
    """Synthesize speech for *text* and return the path of a WAV file.

    Args:
        text: Text to be spoken (str). ``None``, empty or whitespace-only
            input produces no audio.
        description: Free-form voice-style description (str). When it is
            ``None``, empty or whitespace-only, ``DEFAULT_DESCRIPTION`` is
            used instead.

    Returns:
        Path (str) of a newly created temporary ``.wav`` file holding the
        generated audio, or ``None`` when there is nothing to speak.
    """
    # Nothing to synthesize: bail out before touching the model.
    if not text or not text.strip():
        return None

    if not description or not description.strip():
        description = DEFAULT_DESCRIPTION

    # Tokenize both inputs and move the tensors to the model's device.
    desc_inputs = description_tokenizer(description, return_tensors="pt").to(device)
    text_inputs = tokenizer(text, return_tensors="pt").to(device)

    # Generate audio: the description goes in as the main input, the text to
    # speak goes in as the prompt. Gradients are not needed for inference.
    with torch.no_grad():
        audio = model.generate(
            input_ids=desc_inputs.input_ids,
            attention_mask=desc_inputs.attention_mask,
            prompt_input_ids=text_inputs.input_ids,
            prompt_attention_mask=text_inputs.attention_mask,
        )

    audio = audio.cpu().numpy().squeeze()

    # Create the output file atomically instead of using the deprecated,
    # race-prone tempfile.mktemp(). delete=False keeps the file on disk after
    # the handle is closed so it can be handed back by path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, audio, model.config.sampling_rate)

    return out_path


# UI
# Gradio interface: a text box for the text to speak, a text box for the
# voice-style description, a button, and an audio player for the result.
with gr.Blocks() as demo:
    # The emoji here and on the button were stored as mojibake (UTF-8 bytes
    # decoded with a legacy code page); they are restored to the real glyphs.
    gr.Markdown("# 🎙️ Indic Parler TTS (AI4Bharat)")

    text_input = gr.Textbox(
        label="Text to speak",
        placeholder="Enter Hindi, English, or any Indic language text...",
    )

    # Pre-filled with the default description so the user can edit it.
    description_input = gr.Textbox(
        label="Voice Style Description (optional)",
        value=DEFAULT_DESCRIPTION,
    )

    btn = gr.Button("Generate Speech 🔊")

    audio_output = gr.Audio(label="Generated Audio")

    # generate_speech returns a file path (or None), which feeds the player.
    btn.click(
        fn=generate_speech,
        inputs=[text_input, description_input],
        outputs=audio_output,
    )

# Start the server only when executed as a script, so importing this module
# does not launch it as a side effect.
if __name__ == "__main__":
    demo.launch()