# qweb3-tts-cpu / app.py
# Hugging Face Space by broadfield-dev (commit 72a6ef9, "Create app.py")
import gradio as gr
import torch
import numpy as np
import soundfile as sf
from pathlib import Path
import os
import warnings
import subprocess
import tempfile
import math
warnings.filterwarnings("ignore", category=UserWarning)
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# Lazy import for TTS model (not required for video tab)
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# qwen_tts is an optional dependency: the video tab works without it, so a
# missing package only disables the TTS tabs instead of crashing at import.
try:
    from qwen_tts import Qwen3TTSModel
except ImportError:
    TTS_AVAILABLE = False
else:
    TTS_AVAILABLE = True
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# Globals & Model Loader
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# UI label -> Hugging Face repo id. All released variants share the same
# "Qwen/Qwen3-TTS-12Hz-<variant>" naming scheme, so the ids are derived.
_HF_REPO_PREFIX = "Qwen/Qwen3-TTS-12Hz-"
MODELS = {
    variant: _HF_REPO_PREFIX + variant
    for variant in (
        "1.7B-CustomVoice",
        "0.6B-CustomVoice",
        "1.7B-VoiceDesign",
        "1.7B-Base",
        "0.6B-Base",
    )
}
# Process-wide cache of loaded models, keyed by "<variant>_<dtype>".
loaded_models = {}
def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()):
    """Return a Qwen3-TTS model for the given variant and precision.

    Models are cached in the module-level ``loaded_models`` dict keyed by
    "<variant>_<dtype>", so each (model, precision) pair is downloaded and
    loaded at most once per process. Raises gr.Error when qwen_tts is not
    installed or when loading fails.
    """
    if not TTS_AVAILABLE:
        raise gr.Error("qwen_tts is not installed. TTS tabs unavailable.")
    cache_key = f"{model_key}_{dtype_str}"
    if cache_key in loaded_models:
        return loaded_models[cache_key]
    progress(0.1, desc=f"Loading {model_key} ({dtype_str}) โ€ฆ")
    repo_id = MODELS[model_key]
    dtype = torch.float16 if dtype_str != "float32" else torch.float32
    try:
        # NOTE(review): both `dtype` and `torch_dtype` are forwarded —
        # presumably to cover transformers versions that accept either
        # spelling of the argument; confirm against qwen_tts.
        model = Qwen3TTSModel.from_pretrained(
            repo_id,
            device_map="cpu",
            dtype=dtype,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
        )
    except Exception as e:
        raise gr.Error(f"Model loading failed:\n{str(e)}\n\nTry float32 or smaller variant.")
    loaded_models[cache_key] = model
    progress(0.9, desc="Model ready.")
    return model
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# TTS Inference (unchanged)
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def infer_custom_voice(text, lang, speaker, instruct, model_key, precision, progress=gr.Progress()):
    """Synthesize speech with a preset speaker (CustomVoice checkpoints).

    Returns (wav_path, markdown_info) on success, or (None, error_message)
    on empty input / generation failure.
    """
    if not text.strip():
        return None, "Please enter some text."
    model = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating โ€ฆ")
    try:
        wavs, sr = model.generate_custom_voice(
            text=text,
            language=lang if lang != "Auto" else None,
            speaker=speaker,
            instruct=instruct.strip() or None,
            max_new_tokens=1500,
        )
        # Unique file per request: the previous fixed /tmp path let
        # concurrent users overwrite each other's output.
        fd, path = tempfile.mkstemp(prefix="output_custom_", suffix=".wav")
        os.close(fd)
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        info = f"**Generated with {model_key}** \nlang: {lang} \nspeaker: {speaker} \ninstruct: {instruct or '(none)'}"
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"
def infer_voice_design(text, lang, instruct, model_key, precision, progress=gr.Progress()):
    """Synthesize speech for a voice described in natural language.

    Requires both the text and a voice-description instruction. Returns
    (wav_path, markdown_info) or (None, error_message).
    """
    if not text.strip() or not instruct.strip():
        return None, "Text and voice instruction required."
    model = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating โ€ฆ")
    try:
        wavs, sr = model.generate_voice_design(
            text=text,
            language=lang if lang != "Auto" else None,
            instruct=instruct,
            max_new_tokens=1500,
        )
        # Unique file per request: the previous fixed /tmp path let
        # concurrent users overwrite each other's output.
        fd, path = tempfile.mkstemp(prefix="output_design_", suffix=".wav")
        os.close(fd)
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        info = f"**Voice Design โ€“ {model_key}** \nlang: {lang} \ninstruct: {instruct}"
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"
def infer_voice_clone(text, lang, ref_audio, ref_text, x_vector_only, model_key, precision, progress=gr.Progress()):
    """Clone a voice from a short reference clip (Base checkpoints).

    `ref_text` is an optional transcript of the reference audio;
    `x_vector_only` trades quality for speed. Returns (wav_path,
    markdown_info) or (None, error_message).
    """
    if not text.strip():
        return None, "Enter text to synthesize."
    if not ref_audio:
        return None, "Upload reference audio."
    model = get_model(model_key, precision, progress)
    progress(0.3, desc="Processing reference โ€ฆ")
    try:
        wavs, sr = model.generate_voice_clone(
            text=text,
            language=lang if lang != "Auto" else None,
            ref_audio=ref_audio,
            ref_text=ref_text.strip() or None,
            x_vector_only_mode=x_vector_only,
            max_new_tokens=1500,
        )
        # Unique file per request: the previous fixed /tmp path let
        # concurrent users overwrite each other's output.
        fd, path = tempfile.mkstemp(prefix="output_clone_", suffix=".wav")
        os.close(fd)
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        info = f"**Voice Clone โ€“ {model_key}** \nlang: {lang} \nx-vector-only: {x_vector_only}"
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# Video Generation Helpers
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# Output geometry presets: UI dropdown label -> (width, height) in pixels.
RESOLUTIONS = {
    "1080ร—1920 (TikTok/Reels 9:16)": (1080, 1920),
    "1080ร—1080 (Instagram Square)": (1080, 1080),
    "1920ร—1080 (YouTube Landscape)": (1920, 1080),
    "1280ร—720 (YouTube 720p)": (1280, 720),
}
# Visual-style labels shown in the UI radio. Each entry must have a
# matching key in RENDERERS, which maps these labels to frame renderers.
VISUAL_STYLES = [
    "๐ŸŽ™ Solid + Waveform",
    "๐ŸŒŠ Animated Spectrum Bars",
    "โšก Oscilloscope Line",
    "๐ŸŒˆ Gradient Pulse",
    "๐Ÿ”ฒ Minimal Dark + Title",
]
def hex_to_rgb(h: str):
    """Convert a CSS hex colour ("#rrggbb" or shorthand "#rgb") to (r, g, b).

    The leading "#" is optional. Shorthand 3-digit codes are expanded the
    CSS way ("f0a" -> "ff00aa") so values pasted from design tools work too.
    """
    h = h.lstrip("#")
    if len(h) == 3:
        # Expand CSS shorthand: each nibble is doubled.
        h = "".join(c * 2 for c in h)
    return tuple(int(h[i:i + 2], 16) for i in (0, 2, 4))
def render_frame_solid_waveform(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Solid-colour background with a mirrored waveform across the middle.

    Each sample becomes a vertical bar centred on the horizontal midline;
    bar height scales with |amplitude| up to 40% of the frame height.
    An empty chunk yields a plain background (no title drawn either).
    """
    from PIL import Image, ImageDraw, ImageFont
    img = Image.new("RGB", (w, h), bg_color)
    draw = ImageDraw.Draw(img)
    n = len(audio_chunk)
    if n == 0:
        return img
    mid_y = h // 2
    bar_w = max(1, w // max(n, 1))
    for idx, amp in enumerate(audio_chunk):
        px = int(idx * w / n)
        half = int(abs(amp) * h * 0.4)
        draw.rectangle([px, mid_y - half, px + bar_w - 1, mid_y + half], fill=accent_color)
    if title:
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(20, h // 25))
        except Exception:
            # Font file missing on this system — PIL falls back to default.
            font = None
        draw.text((w // 2, h // 10), title, fill=(255, 255, 255), anchor="mm", font=font)
    return img
def render_frame_spectrum(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Vertical frequency bars driven by an FFT of the current chunk.

    The chunk is zero-padded/truncated to 512 samples, the lowest 64
    magnitude bins are kept and normalised to [0, 1], and each bin is
    drawn as a bottom-anchored bar up to 80% of the frame height.
    """
    from PIL import Image, ImageDraw
    img = Image.new("RGB", (w, h), bg_color)
    draw = ImageDraw.Draw(img)
    N_BARS = 64
    if len(audio_chunk) > 0:
        mags = np.abs(np.fft.rfft(audio_chunk, n=512))[:N_BARS]
        mags = mags / (mags.max() + 1e-9)  # normalise; epsilon avoids /0 on silence
    else:
        mags = np.zeros(N_BARS)
    bar_w = w // N_BARS
    for idx, level in enumerate(mags):
        bar_h = int(level * h * 0.8)
        left = idx * bar_w
        right = left + bar_w - 2  # 2px gap between bars
        draw.rectangle([left, h - bar_h, right, h], fill=tuple(accent_color))
    if title:
        try:
            from PIL import ImageFont
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(20, h // 25))
        except Exception:
            font = None
        draw.text((w // 2, h // 10), title, fill=(255, 255, 255), anchor="mm", font=font)
    return img
def render_frame_oscilloscope(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Single polyline tracing the chunk, old-school phosphor-scope style.

    Deliberately ignores bg_color: the scope look uses a fixed near-black
    background. Sample values swing the line up to 40% of frame height
    around the midline, clamped to the frame.
    """
    from PIL import Image, ImageDraw
    img = Image.new("RGB", (w, h), (10, 10, 10))
    draw = ImageDraw.Draw(img)
    mid_y = h // 2
    n = len(audio_chunk)
    points = []
    for idx in range(n):
        px = int(idx * w / n)
        py = int(mid_y - audio_chunk[idx] * h * 0.4)
        points.append((px, max(0, min(h - 1, py))))
    if len(points) > 1:
        draw.line(points, fill=accent_color, width=max(2, h // 200))
    if title:
        try:
            from PIL import ImageFont
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(20, h // 25))
        except Exception:
            font = None
        # Pale-green title to match the phosphor aesthetic.
        draw.text((w // 2, h // 12), title, fill=(200, 255, 200), anchor="mm", font=font)
    return img
def render_frame_gradient_pulse(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Radial gradient that pulses with the chunk's RMS energy.

    Colours are linearly interpolated from bg_color (edges) to accent_color
    (centre); the interpolation radius grows with RMS loudness. Vectorized
    with numpy: the previous per-pixel Python loop was O(w*h) interpreter
    work per frame and dominated total render time. Integer truncation
    matches the original int() behaviour (values are non-negative).
    """
    from PIL import Image, ImageDraw
    rms = float(np.sqrt(np.mean(audio_chunk ** 2))) if len(audio_chunk) > 0 else 0
    cx, cy = w // 2, h // 2
    max_r = math.sqrt(cx ** 2 + cy ** 2)
    pulse = 0.3 + rms * 2.5  # base glow plus loudness-driven expansion
    # Normalised distance of every pixel from the centre, computed on a grid.
    ys, xs = np.ogrid[0:h, 0:w]
    dist = np.sqrt((xs - cx) ** 2 + (ys - cy) ** 2) / max_r
    t = np.clip(1.0 - dist / pulse, 0.0, 1.0)  # 1 at centre, 0 past the pulse radius
    frame = np.empty((h, w, 3), dtype=np.uint8)
    for ch, (c0, c1) in enumerate(zip(bg_color, accent_color)):
        frame[:, :, ch] = (c0 + t * (c1 - c0)).astype(np.uint8)
    img = Image.fromarray(frame, "RGB")
    draw = ImageDraw.Draw(img)
    if title:
        try:
            from PIL import ImageFont
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(20, h // 25))
        except Exception:
            font = None
        draw.text((cx, h // 10), title, fill=(255, 255, 255), anchor="mm", font=font)
    return img
def render_frame_minimal_dark(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Dark-slate frame: thin centred waveform strip, title, bottom bar.

    Ignores bg_color (fixed dark background). NOTE(review): the bottom
    "progress" bar width is frame_idx * w / fps, so it saturates after
    roughly one second of video instead of tracking the clip's real
    duration — the original comment said the duration would be injected
    later, but it never is.
    """
    from PIL import Image, ImageDraw
    img = Image.new("RGB", (w, h), (18, 18, 22))
    draw = ImageDraw.Draw(img)
    strip_h = max(4, h // 15)
    mid_y = h // 2
    n = len(audio_chunk)
    for idx in range(n):
        px = int(idx * w / n)
        amp = abs(int(audio_chunk[idx] * strip_h))
        draw.rectangle([px, mid_y - amp, px, mid_y + amp], fill=accent_color)
    # Bottom time indicator (see NOTE above about its time base).
    bar_right = int(frame_idx * w / max(fps * 1, 1))
    draw.rectangle([0, h - 4, bar_right, h], fill=accent_color)
    if title:
        try:
            from PIL import ImageFont
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(24, h // 18))
        except Exception:
            font = None
        draw.text((w // 2, h * 2 // 5), title, fill=(240, 240, 245), anchor="mm", font=font)
    return img
# Maps each visual-style label (as shown in the UI radio) to its frame
# renderer. Keys must match VISUAL_STYLES entries exactly; every renderer
# shares the signature (w, h, audio_chunk, bg_color, accent_color, title,
# frame_idx, fps) -> PIL.Image.
RENDERERS = {
    "๐ŸŽ™ Solid + Waveform": render_frame_solid_waveform,
    "๐ŸŒŠ Animated Spectrum Bars": render_frame_spectrum,
    "โšก Oscilloscope Line": render_frame_oscilloscope,
    "๐ŸŒˆ Gradient Pulse": render_frame_gradient_pulse,
    "๐Ÿ”ฒ Minimal Dark + Title": render_frame_minimal_dark,
}
def audio_to_video(
    audio_path, style, resolution_label,
    bg_hex, accent_hex, title_text, fps_str,
    progress=gr.Progress()
):
    """Render an audio file into an MP4 with an animated visual style.

    Frames are drawn per audio chunk with the selected renderer, written
    to a temp dir, then muxed with the audio via ffmpeg. Returns
    (video_path, markdown_info) or (None, error_message).
    """
    if not audio_path:
        return None, "โŒ No audio file provided. Generate or upload audio first."
    fps = int(fps_str)
    w, h = RESOLUTIONS[resolution_label]
    bg_color = hex_to_rgb(bg_hex)
    accent_color = hex_to_rgb(accent_hex)
    render_fn = RENDERERS[style]
    # ---- Load audio ----
    progress(0.05, desc="Reading audio โ€ฆ")
    try:
        audio_data, sr = sf.read(audio_path, dtype="float32")
    except Exception as e:
        return None, f"โŒ Could not read audio: {e}"
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)  # downmix to mono
    duration = len(audio_data) / sr
    # At least one frame, so ffmpeg always has input even for clips
    # shorter than a single frame interval (int(duration*fps) could be 0).
    n_frames = max(1, int(duration * fps))
    samples_per_frame = max(1, len(audio_data) // n_frames)
    # ---- Write frames to temp dir ----
    progress(0.10, desc="Rendering frames โ€ฆ")
    with tempfile.TemporaryDirectory() as tmpdir:
        frame_dir = Path(tmpdir) / "frames"
        frame_dir.mkdir()
        # gradient_pulse is the slowest renderer; progress updates ~20 times.
        for fi in range(n_frames):
            if fi % max(1, n_frames // 20) == 0:
                progress(0.10 + 0.65 * fi / n_frames, desc=f"Frame {fi}/{n_frames} โ€ฆ")
            start = fi * samples_per_frame
            end = min(start + samples_per_frame, len(audio_data))
            chunk = audio_data[start:end] if end > start else np.zeros(64)
            img = render_fn(w, h, chunk, bg_color, accent_color, title_text, fi, fps)
            img.save(str(frame_dir / f"frame_{fi:06d}.png"))
        # ---- Assemble with ffmpeg ----
        progress(0.80, desc="Encoding video โ€ฆ")
        # Unique output path per request; a fixed /tmp file would be
        # clobbered by concurrent users.
        fd, out_path = tempfile.mkstemp(prefix="tts_video_", suffix=".mp4")
        os.close(fd)
        ffmpeg_cmd = [
            "ffmpeg", "-y",
            "-framerate", str(fps),
            "-i", str(frame_dir / "frame_%06d.png"),
            "-i", audio_path,
            "-c:v", "libx264",
            "-preset", "fast",
            "-crf", "23",
            "-pix_fmt", "yuv420p",
            "-c:a", "aac",
            "-b:a", "192k",
            "-shortest",
            "-movflags", "+faststart",
            out_path,
        ]
        try:
            result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
        except FileNotFoundError:
            # ffmpeg binary missing entirely — surface a clear message
            # instead of an uncaught exception in the UI.
            os.remove(out_path)
            return None, "โŒ ffmpeg not found. Install ffmpeg and ensure it is on PATH."
        if result.returncode != 0:
            os.remove(out_path)
            return None, f"โŒ ffmpeg error:\n```\n{result.stderr[-1500:]}\n```"
    progress(1.0, desc="Done!")
    info = (
        f"โœ… **Video ready!** \n"
        f"Style: `{style}` ยท Resolution: `{w}ร—{h}` ยท FPS: `{fps}` ยท Duration: `{duration:.1f}s`"
    )
    return out_path, info
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# UI
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
css = """
.radio-row { display: flex; flex-wrap: wrap; gap: 1.2rem; align-items: center; }
.radio-row > div { min-width: 140px; }
"""
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
gr.Markdown("# Qwen3-TTS Full Demo\nAll released variants โ€ข CPU-friendly โ€ข No streaming")
# โ”€โ”€ Tab 1: Custom Voice โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
with gr.Tab("CustomVoice โ€“ Preset speakers + instruct"):
gr.Markdown("Uses 9 built-in premium voices + optional style instruction")
with gr.Row(elem_classes="radio-row"):
cv_model = gr.Radio(["1.7B-CustomVoice", "0.6B-CustomVoice"], value="1.7B-CustomVoice", label="Model")
cv_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
with gr.Row():
cv_text = gr.Textbox(label="Text to speak", lines=4, value="่ฟ™ๆ˜ฏไธ€ไธชๆต‹่ฏ•ใ€‚ๅธŒๆœ›ๅฃฐ้Ÿณๅฌ่ตทๆฅ่‡ช็„ถไธ€ไบ›ใ€‚")
cv_lang = gr.Dropdown(["Auto","Chinese","English","Japanese","Korean"], value="Auto", label="Language")
cv_speaker = gr.Dropdown(
["Vivian","Serena","Uncle_Fu","Dylan","Eric","Ryan","Aiden","Ono_Anna","Sohee"],
value="Vivian", label="Speaker"
)
cv_instruct = gr.Textbox(label="Style instruction (optional)", lines=2, placeholder="็”จ็‰นๅˆซๆ„คๆ€’็š„่ฏญๆฐ”่ฏด")
cv_btn = gr.Button("Generate", variant="primary")
cv_audio = gr.Audio(label="Generated Speech", type="filepath")
cv_info = gr.Markdown()
cv_btn.click(infer_custom_voice,
inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision],
outputs=[cv_audio, cv_info])
# โ”€โ”€ Tab 2: Voice Design โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
with gr.Tab("Voice Design โ€“ Describe any voice"):
gr.Markdown("Create arbitrary voices from natural language description (only 1.7B variant)")
with gr.Row(elem_classes="radio-row"):
vd_model = gr.Radio(["1.7B-VoiceDesign"], value="1.7B-VoiceDesign", label="Model")
vd_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
vd_text = gr.Textbox(label="Text to speak", lines=4, value="ๅ“ฅๅ“ฅ๏ผŒไฝ ๅ›žๆฅๅ•ฆ๏ผŒไบบๅฎถ็ญ‰ไบ†ๅฅฝไน…๏ผŒ่ฆๆŠฑๆŠฑ๏ผ")
vd_lang = gr.Dropdown(["Auto","Chinese","English"], value="Chinese", label="Language")
vd_instruct = gr.Textbox(label="Voice description / instruction", lines=4,
value="ไฝ“็Žฐๆ’’ๅจ‡็จšๅซฉ็š„่่މๅฅณๅฃฐ๏ผŒ้Ÿณ่ฐƒๅ้ซ˜ไธ”่ตทไผๆ˜Žๆ˜พ๏ผŒ้ปไบบใ€ๅšไฝœๅˆๅˆปๆ„ๅ–่Œ็š„ๆ„Ÿ่ง‰")
vd_btn = gr.Button("Generate", variant="primary")
vd_audio = gr.Audio(label="Generated Speech", type="filepath")
vd_info = gr.Markdown()
vd_btn.click(infer_voice_design,
inputs=[vd_text, vd_lang, vd_instruct, vd_model, vd_precision],
outputs=[vd_audio, vd_info])
# โ”€โ”€ Tab 3: Voice Clone โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
with gr.Tab("Base โ€“ Voice Clone from reference audio"):
gr.Markdown("3-second rapid voice cloning using reference clip (Base models only)")
with gr.Row(elem_classes="radio-row"):
cl_model = gr.Radio(["1.7B-Base","0.6B-Base"], value="1.7B-Base", label="Model")
cl_precision = gr.Radio(["float32","float16"], value="float32", label="Precision")
cl_text = gr.Textbox(label="Text to synthesize", lines=4, value="This is my cloned voice now speaking normally.")
cl_lang = gr.Dropdown(["Auto","English","Chinese"], value="Auto", label="Language")
with gr.Row():
cl_ref_audio = gr.Audio(label="Reference audio clip", type="filepath", sources=["upload","microphone"])
cl_ref_text = gr.Textbox(label="Transcript of reference (optional)", lines=2)
cl_xvec_only = gr.Checkbox(label="x-vector only mode (faster, lower quality)", value=False)
cl_btn = gr.Button("Clone & Generate", variant="primary")
cl_audio = gr.Audio(label="Cloned Speech", type="filepath")
cl_info = gr.Markdown()
cl_btn.click(infer_voice_clone,
inputs=[cl_text, cl_lang, cl_ref_audio, cl_ref_text, cl_xvec_only, cl_model, cl_precision],
outputs=[cl_audio, cl_info])
# โ”€โ”€ Tab 4: Audio โ†’ Video โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
with gr.Tab("๐ŸŽฌ Audio โ†’ Video"):
gr.Markdown(
"## Audio โ†’ Social Media Video\n"
"Upload **any WAV/MP3** (or paste the path from a generated clip above) "
"and render it into a shareable MP4 with a visual style.\n\n"
"> โš ๏ธ **Gradient Pulse** renders per-pixel and is slow for long audio โ€” prefer other styles for > 30 s clips."
)
with gr.Row():
with gr.Column(scale=3):
vid_audio = gr.Audio(
label="Input audio (upload or record)",
type="filepath",
sources=["upload", "microphone"],
)
vid_title = gr.Textbox(
label="Title / caption text (shown on video)",
placeholder="My AI Voice ยท Qwen3-TTS",
value=""
)
with gr.Column(scale=2):
vid_style = gr.Radio(
VISUAL_STYLES,
value="๐ŸŒŠ Animated Spectrum Bars",
label="Visual style",
)
vid_res = gr.Dropdown(
list(RESOLUTIONS.keys()),
value="1080ร—1920 (TikTok/Reels 9:16)",
label="Resolution / aspect ratio",
)
vid_fps = gr.Radio(["24", "30"], value="24", label="FPS")
with gr.Row():
vid_bg = gr.ColorPicker(value="#0d0d1a", label="Background colour")
vid_accent = gr.ColorPicker(value="#7c3aed", label="Accent / waveform colour")
vid_btn = gr.Button("๐ŸŽฌ Render Video", variant="primary", size="lg")
vid_out = gr.Video(label="Output video")
vid_info = gr.Markdown()
vid_btn.click(
audio_to_video,
inputs=[vid_audio, vid_style, vid_res, vid_bg, vid_accent, vid_title, vid_fps],
outputs=[vid_out, vid_info],
)
gr.Markdown("""
**Style guide:**
| Style | Best for | Notes |
|---|---|---|
| ๐ŸŽ™ Solid + Waveform | Podcasts, quotes | Fast, clean |
| ๐ŸŒŠ Animated Spectrum Bars | Music / speech highlights | FFT-based, energetic |
| โšก Oscilloscope Line | Dark/techy aesthetic | Classic green-on-black |
| ๐ŸŒˆ Gradient Pulse | Ambient / ASMR | Slow render โ€” use short clips |
| ๐Ÿ”ฒ Minimal Dark + Title | Branded content | Great with a title caption |
""")
# โ”€โ”€ Footer โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
gr.Markdown("""
**Notes**
โ€ข First generation per model loads weights (may take 1โ€“5 min).
โ€ข Use **float32** if **float16** causes crashes (common on CPU).
โ€ข **0.6B** models are faster / lighter on CPU.
โ€ข Video tab requires `ffmpeg` and `Pillow` (both standard on most systems).
โ€ข Repo & docs: https://github.com/QwenLM/Qwen3-TTS
""")
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", server_port=7860)