File size: 3,889 Bytes
f239f3d
 
 
f0e6eda
f239f3d
928010a
f239f3d
 
 
ed16b98
f239f3d
 
ed16b98
f239f3d
 
60591b3
f239f3d
 
cefdd32
f239f3d
928010a
f239f3d
f0e6eda
f239f3d
f0e6eda
 
 
 
 
 
 
 
 
 
f239f3d
ed16b98
f0e6eda
ed16b98
f0e6eda
 
 
 
 
 
 
 
ed16b98
f239f3d
f0e6eda
 
 
 
 
 
f239f3d
f0e6eda
 
 
2a9225b
60591b3
f0e6eda
ed16b98
 
f239f3d
f0e6eda
f239f3d
f0e6eda
ed16b98
928010a
f0e6eda
ed16b98
 
f0e6eda
 
ed16b98
 
f0e6eda
f239f3d
 
f0e6eda
 
 
 
 
 
 
 
 
f239f3d
f0e6eda
 
 
 
 
 
 
 
 
 
 
 
 
 
928010a
f0e6eda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
928010a
f0e6eda
f239f3d
928010a
ed16b98
f239f3d
928010a
f0e6eda
 
 
 
 
 
 
 
 
 
 
 
928010a
f239f3d
f0e6eda
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import os
import sys
import subprocess
import time

# 1. Clone the repo if it doesn't exist.
# check=True: without it a failed clone is silently ignored and the script
# carries on to the pip install and the `zipvoice` import below with no
# checkout present. Fail here, with git's own exit status, instead.
if not os.path.exists("LuxTTS"):
    subprocess.run(
        ["git", "clone", "https://github.com/ysharma3501/LuxTTS.git"],
        check=True,
    )

# 2. Install requirements.
# Installation stays best-effort (a non-zero exit does not abort the script),
# but the failure is reported on stderr instead of being dropped.
_pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-r", "LuxTTS/requirements.txt"]
)
if _pip_result.returncode != 0:
    print(
        f"WARNING: pip install exited with status {_pip_result.returncode}; "
        "later imports may fail.",
        file=sys.stderr,
    )

# 3. Add the checkout to the import path so its packages can be imported.
sys.path.append(os.path.abspath("LuxTTS"))

import numpy as np
import gradio as gr
import torch
from zipvoice.luxvoice import LuxTTS

# Model setup: use CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Single shared model instance, created once at import time.
lux_tts = LuxTTS("YatharthS/LuxTTS", device=device, threads=2)

def infer(
    text,
    audio_prompt,
    rms,
    ref_duration,
    t_shift,
    num_steps,
    speed,
    return_smooth,
):
    """Synthesize ``text`` using the voice from a reference audio prompt.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only text
            is rejected without calling the model.
        audio_prompt: Reference audio passed to ``lux_tts.encode_prompt``;
            ``None`` means no reference was supplied.
        rms: Forwarded to ``encode_prompt`` as ``rms``.
        ref_duration: Forwarded to ``encode_prompt`` as ``duration``.
        t_shift: Forwarded to ``generate_speech`` as ``t_shift``.
        num_steps: Forwarded to ``generate_speech`` as ``num_steps`` after
            being cast to ``int``.
        speed: Forwarded to ``generate_speech`` as ``speed``.
        return_smooth: Forwarded to ``generate_speech`` as ``return_smooth``.

    Returns:
        A ``(audio, message)`` pair. On success ``audio`` is
        ``(48000, samples)`` with ``samples`` an ``int16`` NumPy array and
        ``message`` reports the generation time. When the text or the
        reference audio is missing, ``audio`` is ``None`` and ``message``
        asks for the missing input.
    """
    # Whitespace-only text is as unusable as empty text; reject both before
    # spending any time on the model.
    if audio_prompt is None or not text or not text.strip():
        return None, "Please provide text and reference audio."

    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    # Encode reference (WITH duration)
    encoded_prompt = lux_tts.encode_prompt(
        audio_prompt,
        duration=ref_duration,
        rms=rms,
    )

    # Generate speech
    final_wav = lux_tts.generate_speech(
        text,
        encoded_prompt,
        num_steps=int(num_steps),
        t_shift=t_shift,
        speed=speed,
        return_smooth=return_smooth,
    )

    # Timing covers prompt encoding and generation only, not the conversion
    # to PCM below.
    duration = round(time.perf_counter() - start_time, 2)

    # Move to host memory, drop the leading dimension, then clamp to
    # [-1.0, 1.0] and scale to 16-bit PCM.
    final_wav = final_wav.cpu().squeeze(0).numpy()
    final_wav = (np.clip(final_wav, -1.0, 1.0) * 32767).astype(np.int16)

    stats_msg = f"✨ Generation complete in **{duration}s**."
    # NOTE(review): the 48 kHz sample rate is hard-coded; confirm the model
    # also emits 48 kHz when return_smooth is enabled.
    return (48000, final_wav), stats_msg

# =======================
# Gradio UI
# =======================
# Two-column layout: synthesis controls on the left, generated audio and a
# status line on the right. The button passes every control to `infer`.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ LuxTTS Voice Cloning")

    # Written as explicit fragments so the two trailing spaces after the
    # first note line (a Markdown line break) cannot be lost to
    # trailing-whitespace stripping.
    gr.Markdown(
        "\n"
        "        > **Note:** This demo runs on a **2-core CPU**, so expect slower inference.  \n"
        "        > **Tip:** If words get cut off, lower **Speed** or increase **Ref Duration**.\n"
        "        "
    )

    with gr.Row():
        # Left column: text, reference clip, and tuning controls.
        with gr.Column():
            text_box = gr.Textbox(
                label="Text to Synthesize",
                value="Hey, what's up? I'm feeling really great!",
            )
            reference_clip = gr.Audio(label="Reference Audio (.wav)", type="filepath")

            # Prompt-encoding parameters plus the t-shift value.
            with gr.Row():
                rms_box = gr.Number(value=0.01, label="RMS (Loudness)")
                ref_duration_box = gr.Number(
                    value=5,
                    label="Reference Duration (sec)",
                    info="Lower = faster. Set ~1000 if you hear artifacts.",
                )
                t_shift_box = gr.Number(value=0.9, label="T-Shift")

            # Generation parameters.
            with gr.Row():
                steps_slider = gr.Slider(
                    minimum=1, maximum=10, value=4, step=1, label="Num Steps"
                )
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=0.8,
                    step=0.1,
                    label="Speed (Lower = Longer / Clearer)",
                )
                smooth_toggle = gr.Checkbox(label="Return Smooth", value=False)

            generate_button = gr.Button("Generate Speech", variant="primary")

        # Right column: synthesized audio and status message.
        with gr.Column():
            result_audio = gr.Audio(label="Result")
            status_md = gr.Markdown("Ready to generate...")

    # Order must match the positional parameters of `infer`.
    infer_inputs = [
        text_box,
        reference_clip,
        rms_box,
        ref_duration_box,
        t_shift_box,
        steps_slider,
        speed_slider,
        smooth_toggle,
    ]
    generate_button.click(
        fn=infer,
        inputs=infer_inputs,
        outputs=[result_audio, status_md],
    )

demo.launch()