| import os |
| import sys |
| import subprocess |
| import time |
|
|
| |
# Bootstrap: fetch the LuxTTS repo on first run, install its requirements,
# and make the local checkout importable.
if not os.path.exists("LuxTTS"):
    # check=True: fail loudly here if the clone fails, instead of crashing
    # later with an unrelated ImportError.
    subprocess.run(
        ["git", "clone", "https://github.com/ysharma3501/LuxTTS.git"],
        check=True,
    )

# Install the repo's dependencies into the current interpreter's environment.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-r", "LuxTTS/requirements.txt"],
    check=True,
)

# Allow `from zipvoice...` imports to resolve against the checkout.
sys.path.append(os.path.abspath("LuxTTS"))
|
|
| import numpy as np |
| import gradio as gr |
| import torch |
| from zipvoice.luxvoice import LuxTTS |
|
|
| |
# Prefer GPU when available; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the pretrained model once at startup and share it across requests.
# threads=2 presumably matches the 2-core demo host mentioned in the UI note.
lux_tts = LuxTTS("YatharthS/LuxTTS", device=device, threads=2)
|
|
def infer(
    text,
    audio_prompt,
    rms,
    ref_duration,
    t_shift,
    num_steps,
    speed,
    return_smooth,
):
    """Synthesize *text* in the voice of the reference clip.

    Args:
        text: Text to speak.
        audio_prompt: Path to the reference audio file (Gradio filepath).
        rms: Target loudness passed to prompt encoding.
        ref_duration: Seconds of the reference clip to encode.
        t_shift: Sampling schedule shift passed to generation.
        num_steps: Number of sampling steps (coerced to int for the model).
        speed: Speaking-rate multiplier.
        return_smooth: Whether to request the model's smoothed output.

    Returns:
        ((sample_rate, int16 waveform), status markdown) on success, or
        (None, error message) when inputs are missing.
    """
    # Reject a missing clip and empty *or whitespace-only* text; the original
    # `not text` guard let strings like "   " through to the model.
    if audio_prompt is None or not text or not text.strip():
        return None, "Please provide text and reference audio."

    start_time = time.time()

    # Encode the reference clip once; generation consumes the encoded prompt.
    encoded_prompt = lux_tts.encode_prompt(
        audio_prompt,
        duration=ref_duration,
        rms=rms,
    )

    final_wav = lux_tts.generate_speech(
        text,
        encoded_prompt,
        num_steps=int(num_steps),
        t_shift=t_shift,
        speed=speed,
        return_smooth=return_smooth,
    )

    duration = round(time.time() - start_time, 2)

    # Tensor -> 16-bit PCM for Gradio: move to host, drop the batch dim,
    # clip to [-1, 1], then scale to the int16 range.
    final_wav = final_wav.cpu().squeeze(0).numpy()
    final_wav = (np.clip(final_wav, -1.0, 1.0) * 32767).astype(np.int16)

    stats_msg = f"✨ Generation complete in **{duration}s**."
    # NOTE(review): 48000 assumes the model emits 48 kHz audio — confirm
    # against the LuxTTS output sample rate.
    return (48000, final_wav), stats_msg
|
|
| |
| |
| |
# --- Gradio UI layout ------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ LuxTTS Voice Cloning")

    gr.Markdown(
        """
        > **Note:** This demo runs on a **2-core CPU**, so expect slower inference.
        > **Tip:** If words get cut off, lower **Speed** or increase **Ref Duration**.
        """
    )

    with gr.Row():
        with gr.Column():
            # Inputs: text to speak plus the voice-reference clip.
            input_text = gr.Textbox(
                label="Text to Synthesize",
                value="Hey, what's up? I'm feeling really great!",
            )
            input_audio = gr.Audio(
                label="Reference Audio (.wav)",
                type="filepath",
            )

            with gr.Row():
                # Prompt-encoding knobs (forwarded to encode_prompt in infer).
                rms_val = gr.Number(
                    value=0.01,
                    label="RMS (Loudness)",
                )
                ref_duration_val = gr.Number(
                    value=5,
                    label="Reference Duration (sec)",
                    info="Lower = faster. Set ~1000 if you hear artifacts.",
                )
                t_shift_val = gr.Number(
                    value=0.9,
                    label="T-Shift",
                )

            with gr.Row():
                # Generation knobs (forwarded to generate_speech in infer).
                steps_val = gr.Slider(
                    1,
                    64,
                    value=4,
                    step=1,
                    label="Num Steps",
                )
                speed_val = gr.Slider(
                    0.5,
                    2.0,
                    value=0.8,
                    step=0.1,
                    label="Speed (Lower = Longer / Clearer)",
                )
                smooth_val = gr.Checkbox(
                    label="Return Smooth",
                    value=False,
                )

            btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            # Outputs: synthesized audio plus a status line.
            audio_out = gr.Audio(label="Result")
            status_text = gr.Markdown("Ready to generate...")

    # Wire the button to infer(); the inputs list must stay in the same
    # order as infer()'s parameters.
    btn.click(
        fn=infer,
        inputs=[
            input_text,
            input_audio,
            rms_val,
            ref_duration_val,
            t_shift_val,
            steps_val,
            speed_val,
            smooth_val,
        ],
        outputs=[audio_out, status_text],
    )


demo.launch()
|
|