# LuxTTS voice-cloning Gradio demo (self-bootstrapping script)
import os
import sys
import subprocess
import time

# --- Bootstrap: fetch LuxTTS and its dependencies on first run ---------------
# 1. Clone the repo if it doesn't exist.
#    check=True: fail loudly here instead of surfacing later as a confusing
#    ImportError when `zipvoice` cannot be found.
if not os.path.exists("LuxTTS"):
    subprocess.run(
        ["git", "clone", "https://github.com/ysharma3501/LuxTTS.git"],
        check=True,
    )
# 2. Install requirements into the current interpreter (sys.executable keeps
#    the install tied to the venv/interpreter actually running this script).
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-r", "LuxTTS/requirements.txt"],
    check=True,
)
# 3. Make the cloned package importable.
sys.path.append(os.path.abspath("LuxTTS"))

import numpy as np
import gradio as gr
import torch
from zipvoice.luxvoice import LuxTTS

# Init model: prefer GPU when present; threads=2 matches the 2-core demo host
# mentioned in the UI note below.
device = "cuda" if torch.cuda.is_available() else "cpu"
lux_tts = LuxTTS("YatharthS/LuxTTS", device=device, threads=2)
def infer(
    text,
    audio_prompt,
    rms,
    ref_duration,
    t_shift,
    num_steps,
    speed,
    return_smooth,
):
    """Synthesize *text* in the voice of the reference audio.

    Parameters:
        text: Text to synthesize.
        audio_prompt: Filepath of the reference audio (or None).
        rms: Target loudness for the encoded reference.
        ref_duration: Seconds of reference audio to use.
        t_shift: Sampling time-shift parameter.
        num_steps: Diffusion/flow step count (coerced to int for the model).
        speed: Speaking-rate multiplier.
        return_smooth: Whether the model returns the smoothed waveform.

    Returns:
        ((sample_rate, int16 numpy waveform), status markdown) on success,
        or (None, error message) when inputs are missing.
    """
    # Guard: also reject whitespace-only text, which previously slipped
    # through `not text` and reached the model.
    if audio_prompt is None or not text or not text.strip():
        return None, "Please provide text and reference audio."
    start_time = time.time()
    # Encode reference (WITH duration)
    encoded_prompt = lux_tts.encode_prompt(
        audio_prompt,
        duration=ref_duration,
        rms=rms,
    )
    # Generate speech
    final_wav = lux_tts.generate_speech(
        text,
        encoded_prompt,
        num_steps=int(num_steps),
        t_shift=t_shift,
        speed=speed,
        return_smooth=return_smooth,
    )
    duration = round(time.time() - start_time, 2)
    # Drop the batch dim and convert to 16-bit PCM for Gradio playback.
    final_wav = final_wav.cpu().squeeze(0).numpy()
    final_wav = (np.clip(final_wav, -1.0, 1.0) * 32767).astype(np.int16)
    stats_msg = f"✨ Generation complete in **{duration}s**."
    # 48000 Hz: LuxTTS output rate assumed by the original script — TODO confirm.
    return (48000, final_wav), stats_msg
# =======================
# Gradio UI
# =======================
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Header and usage notes.
    gr.Markdown("# 🎙️ LuxTTS Voice Cloning")
    gr.Markdown(
        """
        > **Note:** This demo runs on a **2-core CPU**, so expect slower inference.
        > **Tip:** If words get cut off, lower **Speed** or increase **Ref Duration**.
        """
    )

    with gr.Row():
        # Left column: all inputs and tuning knobs.
        with gr.Column():
            input_text = gr.Textbox(
                label="Text to Synthesize",
                value="Hey, what's up? I'm feeling really great!",
            )
            input_audio = gr.Audio(
                label="Reference Audio (.wav)",
                type="filepath",
            )
            # Numeric controls for reference encoding.
            with gr.Row():
                rms_val = gr.Number(value=0.01, label="RMS (Loudness)")
                ref_duration_val = gr.Number(
                    value=5,
                    label="Reference Duration (sec)",
                    info="Lower = faster. Set ~1000 if you hear artifacts.",
                )
                t_shift_val = gr.Number(value=0.9, label="T-Shift")
            # Sampling controls.
            with gr.Row():
                steps_val = gr.Slider(1, 10, value=4, step=1, label="Num Steps")
                speed_val = gr.Slider(
                    0.5,
                    2.0,
                    value=0.8,
                    step=0.1,
                    label="Speed (Lower = Longer / Clearer)",
                )
            smooth_val = gr.Checkbox(label="Return Smooth", value=False)
            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: generated audio and status line.
        with gr.Column():
            audio_out = gr.Audio(label="Result")
            status_text = gr.Markdown("Ready to generate...")

    # Wire the button to the inference function; argument order must match
    # infer()'s signature.
    generate_btn.click(
        fn=infer,
        inputs=[
            input_text,
            input_audio,
            rms_val,
            ref_duration_val,
            t_shift_val,
            steps_val,
            speed_val,
            smooth_val,
        ],
        outputs=[audio_out, status_text],
    )

demo.launch()