LuxTTS / app.py
sinhprous's picture
Update app.py
0053af4 verified
import os
import sys
import subprocess
import time
# Bootstrap: fetch the LuxTTS sources at start-up and make the checkout
# importable (presumably it provides the `zipvoice` package imported below).

# 1. Clone the repo if it doesn't exist.
if not os.path.exists("LuxTTS"):
    # check=True: every later step needs the checkout, so a failed clone should
    # stop here with a clear CalledProcessError instead of surfacing later as
    # an unrelated import failure.
    subprocess.run(
        ["git", "clone", "https://github.com/ysharma3501/LuxTTS.git"],
        check=True,
    )

# 2. Install requirements.
_pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-r", "LuxTTS/requirements.txt"]
)
if _pip_result.returncode != 0:
    # Kept best-effort (the packages may already be installed), but no longer
    # silent: report the failure so a later ImportError can be traced back here.
    print(
        "Warning: 'pip install -r LuxTTS/requirements.txt' exited with code "
        f"{_pip_result.returncode}; continuing with the packages already installed.",
        file=sys.stderr,
    )

# 3. Add to path so modules inside the checkout can be imported.
sys.path.append(os.path.abspath("LuxTTS"))
import numpy as np
import gradio as gr
import torch
from zipvoice.luxvoice import LuxTTS
# Init Model
# Run on the GPU when torch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Single shared model instance, created once at start-up and reused by every
# request handled by `infer`.
lux_tts = LuxTTS(
    "YatharthS/LuxTTS",
    device=device,
    threads=2,
)
def infer(
    text,
    audio_prompt,
    rms,
    ref_duration,
    t_shift,
    num_steps,
    speed,
    return_smooth,
):
    """Synthesize ``text`` using the voice from ``audio_prompt``.

    Args:
        text: Text to synthesize.
        audio_prompt: Reference audio passed to ``lux_tts.encode_prompt``
            (the UI supplies a file path).
        rms: Forwarded to ``encode_prompt`` as ``rms``.
        ref_duration: Forwarded to ``encode_prompt`` as ``duration``.
        t_shift: Forwarded to ``generate_speech`` as ``t_shift``.
        num_steps: Forwarded to ``generate_speech`` as ``num_steps`` after
            conversion to ``int``.
        speed: Forwarded to ``generate_speech`` as ``speed``.
        return_smooth: Forwarded to ``generate_speech`` as ``return_smooth``.

    Returns:
        A ``(audio, message)`` pair. On success ``audio`` is
        ``(48000, samples)`` with ``samples`` an int16 NumPy array and
        ``message`` reports the generation time. When an input is missing,
        ``audio`` is ``None`` and ``message`` says what to provide.
    """
    # Whitespace-only text is treated like empty text: there is nothing to say.
    if audio_prompt is None or not text or not text.strip():
        return None, "Please provide text and reference audio."

    # A cleared numeric field reaches this handler as None; report it in the
    # status area instead of failing inside int() or the model call.
    numeric_fields = {
        "RMS": rms,
        "Reference Duration": ref_duration,
        "T-Shift": t_shift,
        "Num Steps": num_steps,
        "Speed": speed,
    }
    missing = [label for label, value in numeric_fields.items() if value is None]
    if missing:
        return None, f"Please set a value for: {', '.join(missing)}."

    # perf_counter is monotonic, so the reported time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    # Encode reference (WITH duration)
    encoded_prompt = lux_tts.encode_prompt(
        audio_prompt,
        duration=ref_duration,
        rms=rms,
    )

    # Generate speech
    final_wav = lux_tts.generate_speech(
        text,
        encoded_prompt,
        num_steps=int(num_steps),
        t_shift=t_shift,
        speed=speed,
        return_smooth=return_smooth,
    )

    # Timing covers prompt encoding + generation, not the conversion below.
    duration = round(time.perf_counter() - start_time, 2)

    # assumes generate_speech returns a torch tensor with a leading dimension
    # of size 1 — TODO confirm against the LuxTTS API.
    final_wav = final_wav.cpu().squeeze(0).numpy()
    # Clip to [-1, 1] before scaling so out-of-range samples saturate instead
    # of wrapping around when cast to 16-bit integers.
    final_wav = (np.clip(final_wav, -1.0, 1.0) * 32767).astype(np.int16)

    stats_msg = f"✨ Generation complete in **{duration}s**."
    # 48000 is the sample rate reported to the audio widget; it is hard-coded
    # here — presumably the model's output rate, verify if the model changes.
    return (48000, final_wav), stats_msg
# =======================
# Gradio UI
# =======================
# Intro banner shown under the title. Defined flush-left so the Markdown
# block-quote markers sit at the start of each line of the string.
_INTRO_NOTE = """
> **Note:** This demo runs on a **2-core CPU**, so expect slower inference.
> **Tip:** If words get cut off, lower **Speed** or increase **Ref Duration**.
"""

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ LuxTTS Voice Cloning")
    gr.Markdown(_INTRO_NOTE)

    with gr.Row():
        # Left column: everything the user provides or tunes.
        with gr.Column():
            input_text = gr.Textbox(
                value="Hey, what's up? I'm feeling really great!",
                label="Text to Synthesize",
            )
            input_audio = gr.Audio(
                type="filepath",
                label="Reference Audio (.wav)",
            )

            # Prompt-encoding and sampling parameters.
            with gr.Row():
                rms_val = gr.Number(label="RMS (Loudness)", value=0.01)
                ref_duration_val = gr.Number(
                    label="Reference Duration (sec)",
                    info="Lower = faster. Set ~1000 if you hear artifacts.",
                    value=5,
                )
                t_shift_val = gr.Number(label="T-Shift", value=0.9)

            # Generation controls.
            with gr.Row():
                steps_val = gr.Slider(
                    minimum=1,
                    maximum=64,
                    step=1,
                    value=4,
                    label="Num Steps",
                )
                speed_val = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    step=0.1,
                    value=0.8,
                    label="Speed (Lower = Longer / Clearer)",
                )
                smooth_val = gr.Checkbox(value=False, label="Return Smooth")

            btn = gr.Button("Generate Speech", variant="primary")

        # Right column: synthesized audio plus a status line.
        with gr.Column():
            audio_out = gr.Audio(label="Result")
            status_text = gr.Markdown("Ready to generate...")

    # Component order here must match the positional parameters of `infer`.
    _infer_inputs = [
        input_text,
        input_audio,
        rms_val,
        ref_duration_val,
        t_shift_val,
        steps_val,
        speed_val,
        smooth_val,
    ]
    btn.click(
        fn=infer,
        inputs=_infer_inputs,
        outputs=[audio_out, status_text],
    )

demo.launch()