File size: 3,180 Bytes
f239f3d
 
 
ed16b98
f239f3d
928010a
f239f3d
 
 
ed16b98
f239f3d
 
ed16b98
f239f3d
 
60591b3
f239f3d
 
cefdd32
f239f3d
928010a
f239f3d
ed16b98
f239f3d
 
 
 
ed16b98
 
 
f239f3d
928010a
f239f3d
928010a
ed16b98
f239f3d
 
 
 
 
 
 
 
ed16b98
 
 
 
2a9225b
60591b3
ed16b98
 
 
f239f3d
 
ed16b98
928010a
 
ed16b98
 
 
 
 
 
 
 
 
f239f3d
 
928010a
 
 
f239f3d
928010a
f239f3d
928010a
 
 
ed16b98
 
928010a
 
 
 
f239f3d
928010a
ed16b98
f239f3d
928010a
 
 
ed16b98
928010a
f239f3d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
import sys
import subprocess
import time  # Added for tracking duration

# Bootstrap: fetch the LuxTTS sources, install their requirements, and make the
# checkout importable. Each step stays best-effort (no exception is raised),
# but a non-zero exit status is reported on stderr instead of being dropped.

# 1. Clone the repo if it doesn't exist
if not os.path.exists("LuxTTS"):
    _clone_result = subprocess.run(
        ["git", "clone", "https://github.com/ysharma3501/LuxTTS.git"]
    )
    if _clone_result.returncode != 0:
        # Report it here; otherwise the first visible symptom is a later
        # import failing against a missing checkout.
        print(
            f"warning: git clone exited with status {_clone_result.returncode}",
            file=sys.stderr,
        )

# 2. Install requirements
# sys.executable is used so that pip targets the interpreter running this script.
_pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-r", "LuxTTS/requirements.txt"]
)
if _pip_result.returncode != 0:
    print(
        f"warning: pip install exited with status {_pip_result.returncode}",
        file=sys.stderr,
    )

# 3. Add to path
# The absolute path keeps the entry valid even if the working directory changes.
sys.path.append(os.path.abspath("LuxTTS"))

import numpy as np
import gradio as gr
import torch
from zipvoice.luxvoice import LuxTTS

# Init Model
# Use the GPU when torch reports CUDA support; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Note: 2 threads on a 2-core CPU is the bottleneck
lux_tts = LuxTTS(
    'YatharthS/LuxTTS',
    device=device,
    threads=2,
)

def infer(text, audio_prompt, rms, t_shift, num_steps, speed, return_smooth):
    """Synthesize ``text`` using the voice from ``audio_prompt``.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only text is
            rejected before the model is called.
        audio_prompt: Reference audio passed to ``lux_tts.encode_prompt``
            (the UI supplies a file path). ``None`` is rejected.
        rms: Forwarded to ``encode_prompt`` as ``rms``.
        t_shift: Forwarded to ``generate_speech`` as ``t_shift``.
        num_steps: Forwarded to ``generate_speech`` as ``num_steps`` after
            being coerced to ``int``.
        speed: Forwarded to ``generate_speech`` as ``speed``.
        return_smooth: Forwarded to ``generate_speech`` as ``return_smooth``.

    Returns:
        A ``(audio, status)`` pair. On success ``audio`` is
        ``(48000, samples)`` with ``samples`` an ``int16`` NumPy array and
        ``status`` is a Markdown message with the elapsed time. When an input
        is missing, ``audio`` is ``None`` and ``status`` asks for the inputs.
    """
    # Guard clause: a blank string would otherwise reach the model unchanged.
    if audio_prompt is None or not text or not text.strip():
        return None, "Please provide text and reference audio."

    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    started = time.perf_counter()

    # Encode reference
    encoded_prompt = lux_tts.encode_prompt(audio_prompt, rms=rms)

    # Generate speech
    final_wav = lux_tts.generate_speech(
        text,
        encoded_prompt,
        num_steps=int(num_steps),
        t_shift=t_shift,
        speed=speed,
        return_smooth=return_smooth,
    )

    # The timing covers prompt encoding and generation, not the conversion below.
    duration = round(time.perf_counter() - started, 2)

    # Move to host memory and drop the leading dimension
    # (presumably a batch/channel axis of size 1 - TODO confirm).
    final_wav = final_wav.cpu().squeeze(0).numpy()
    # Clamp to [-1, 1] and scale to 16-bit integer samples.
    final_wav = (np.clip(final_wav, -1.0, 1.0) * 32767).astype(np.int16)

    stats_msg = f"✨ Generation complete in **{duration}s**."
    # 48000 is the sample rate reported alongside the samples
    # (presumably the model's output rate - verify against LuxTTS).
    return (48000, final_wav), stats_msg

# Gradio UI
# Single-page interface: the first column holds the text, reference audio and
# generation controls; the second column shows the synthesized audio and a
# status line. Components are laid out in the order they are created here.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ LuxTTS Voice Cloning")
    
    # Info Panel
    # Static usage notes shown above the controls.
    gr.Markdown(
        """
        > **Note:** Processing may feel slow as this instance uses a **2-core CPU** (lower specs than most modern phones). 
        > 
        > **Tip:** If you notice words are being **cut off** at the end, try **lowering the speed** further.
        """
    )
    
    with gr.Row():
        # Input column.
        with gr.Column():
            input_text = gr.Textbox(label="Text to Synthesize", value="Hey, what's up? I'm feeling really great!")
            # type="filepath" hands the callback a path to the audio file
            # rather than decoded samples.
            input_audio = gr.Audio(label="Reference Audio (.wav)", type="filepath")
            
            # Generation parameters; each value is passed through to infer().
            with gr.Row():
                rms_val = gr.Number(value=0.01, label="RMS (Loudness)")
                t_shift_val = gr.Number(value=0.9, label="T-Shift")
                # Whole-number steps from 1 to 10, defaulting to 4.
                steps_val = gr.Slider(1, 10, value=4, step=1, label="Num Steps")
            
            with gr.Row():
                # Default speed set to 0.8
                speed_val = gr.Slider(0.5, 2.0, value=0.8, step=0.1, label="Speed (Lower = Longer/Clearer)")
                smooth_val = gr.Checkbox(label="Return Smooth", value=False)
            
            btn = gr.Button("Generate Speech", variant="primary")
            
        # Output column.
        with gr.Column():
            audio_out = gr.Audio(label="Result")
            # Replaced by the message infer() returns after each run.
            status_text = gr.Markdown("Ready to generate...")

    # Wire the button to infer(). The inputs list must stay in the same order
    # as infer()'s positional parameters, and the outputs list in the same
    # order as its two return values.
    btn.click(
        fn=infer, 
        inputs=[input_text, input_audio, rms_val, t_shift_val, steps_val, speed_val, smooth_val], 
        outputs=[audio_out, status_text]
    )

# Start the app. There is no __main__ guard, so this runs whenever the module
# is executed or imported.
demo.launch()