"""Gradio demo for LuxTTS voice cloning.

On start-up this script clones the LuxTTS repository (unless a ``LuxTTS``
directory already exists), installs its requirements, loads the model, and
serves a small web UI that synthesizes speech from text plus a reference
audio clip.
"""

import os
import subprocess
import sys
import time

REPO_DIR = "LuxTTS"
REPO_URL = "https://github.com/ysharma3501/LuxTTS.git"
# Sample rate (Hz) handed to Gradio together with the generated waveform.
SAMPLE_RATE = 48000

# 1. Clone the repo if it doesn't exist.
# check=True makes a failed clone/install stop here with the real cause
# instead of surfacing later as an unrelated ImportError.
if not os.path.exists(REPO_DIR):
    subprocess.run(["git", "clone", REPO_URL], check=True)

# 2. Install requirements
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-r", f"{REPO_DIR}/requirements.txt"],
    check=True,
)

# 3. Add to path
sys.path.append(os.path.abspath(REPO_DIR))

# These imports must come after the bootstrap above: they may only be
# importable once the requirements are installed and the repo is on sys.path.
import numpy as np
import gradio as gr
import torch
from zipvoice.luxvoice import LuxTTS

# Init Model
device = "cuda" if torch.cuda.is_available() else "cpu"
lux_tts = LuxTTS("YatharthS/LuxTTS", device=device, threads=2)

NOTES_MD = """
> **Note:** This demo runs on a **2-core CPU**, so expect slower inference.
> **Tip:** If words get cut off, lower **Speed** or increase **Ref Duration**.
"""


def infer(
    text,
    audio_prompt,
    rms,
    ref_duration,
    t_shift,
    num_steps,
    speed,
    return_smooth,
):
    """Synthesize ``text`` using ``audio_prompt`` as the reference audio.

    Args:
        text: Text to synthesize. Empty or whitespace-only text is rejected.
        audio_prompt: Reference audio as delivered by the Gradio audio input
            (configured with ``type="filepath"``); ``None`` when nothing was
            uploaded.
        rms: Forwarded to ``lux_tts.encode_prompt`` as ``rms``.
        ref_duration: Forwarded to ``lux_tts.encode_prompt`` as ``duration``.
        t_shift: Forwarded to ``lux_tts.generate_speech`` as ``t_shift``.
        num_steps: Converted to ``int`` and forwarded as ``num_steps``.
        speed: Forwarded to ``lux_tts.generate_speech`` as ``speed``.
        return_smooth: Forwarded to ``lux_tts.generate_speech`` as
            ``return_smooth``.

    Returns:
        A pair ``(audio, status)``. ``audio`` is ``(SAMPLE_RATE, int16 array)``
        on success, or ``None`` when an input is missing. ``status`` is a
        Markdown message for the UI.
    """
    if audio_prompt is None or not text or not text.strip():
        return None, "Please provide text and reference audio."

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    # Encode reference (WITH duration)
    encoded_prompt = lux_tts.encode_prompt(
        audio_prompt,
        duration=ref_duration,
        rms=rms,
    )

    # Generate speech
    final_wav = lux_tts.generate_speech(
        text,
        encoded_prompt,
        num_steps=int(num_steps),
        t_shift=t_shift,
        speed=speed,
        return_smooth=return_smooth,
    )

    duration = round(time.perf_counter() - start_time, 2)

    # detach() is a no-op for tensors without grad, but keeps .numpy() from
    # raising if the model returns one that still tracks gradients.
    final_wav = final_wav.detach().cpu().squeeze(0).numpy()
    # Clip to [-1, 1] before scaling so out-of-range samples saturate rather
    # than wrap around when cast to 16-bit integers.
    final_wav = (np.clip(final_wav, -1.0, 1.0) * 32767).astype(np.int16)

    stats_msg = f"✨ Generation complete in **{duration}s**."
    return (SAMPLE_RATE, final_wav), stats_msg


# =======================
# Gradio UI
# =======================
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ LuxTTS Voice Cloning")
    gr.Markdown(NOTES_MD)

    with gr.Row():
        # Left column: inputs and generation settings.
        with gr.Column():
            input_text = gr.Textbox(
                label="Text to Synthesize",
                value="Hey, what's up? I'm feeling really great!",
            )
            input_audio = gr.Audio(
                label="Reference Audio (.wav)",
                type="filepath",
            )

            with gr.Row():
                rms_val = gr.Number(
                    value=0.01,
                    label="RMS (Loudness)",
                )
                ref_duration_val = gr.Number(
                    value=5,
                    label="Reference Duration (sec)",
                    info="Lower = faster. Set ~1000 if you hear artifacts.",
                )
                t_shift_val = gr.Number(
                    value=0.9,
                    label="T-Shift",
                )

            with gr.Row():
                steps_val = gr.Slider(
                    1,
                    64,
                    value=4,
                    step=1,
                    label="Num Steps",
                )
                speed_val = gr.Slider(
                    0.5,
                    2.0,
                    value=0.8,
                    step=0.1,
                    label="Speed (Lower = Longer / Clearer)",
                )
                smooth_val = gr.Checkbox(
                    label="Return Smooth",
                    value=False,
                )

            btn = gr.Button("Generate Speech", variant="primary")

        # Right column: generated audio and status message.
        with gr.Column():
            audio_out = gr.Audio(label="Result")
            status_text = gr.Markdown("Ready to generate...")

    # The input order must match the parameter order of infer().
    btn.click(
        fn=infer,
        inputs=[
            input_text,
            input_audio,
            rms_val,
            ref_duration_val,
            t_shift_val,
            steps_val,
            speed_val,
            smooth_val,
        ],
        outputs=[audio_out, status_text],
    )

demo.launch()