"""Qwen3-TTS Gradio demo: custom-voice / voice-design / voice-clone TTS tabs
plus an audio-to-MP4 "social media video" renderer (Pillow frames + ffmpeg)."""

import gradio as gr
import torch
import numpy as np
import soundfile as sf
from pathlib import Path
import os
import warnings
import subprocess
import shutil
import tempfile
import math

warnings.filterwarnings("ignore", category=UserWarning)

# ────────────────────────────────────────────────
# Lazy import for TTS model (not required for video tab)
# ────────────────────────────────────────────────
try:
    from qwen_tts import Qwen3TTSModel
    TTS_AVAILABLE = True
except ImportError:
    TTS_AVAILABLE = False

# ────────────────────────────────────────────────
# Globals & Model Loader
# ────────────────────────────────────────────────
MODELS = {
    "1.7B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    "0.6B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
    "1.7B-VoiceDesign": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
    "1.7B-Base": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    "0.6B-Base": "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
}

# Cache of already-loaded models keyed by "<model_key>_<dtype>".
loaded_models = {}


def _tmp_path(filename: str) -> str:
    """Return a writable path in the system temp dir (portable '/tmp/...')."""
    return os.path.join(tempfile.gettempdir(), filename)


def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()):
    """Load (or fetch from cache) the requested Qwen3-TTS variant on CPU.

    Args:
        model_key: key into ``MODELS``.
        dtype_str: "float32" or "float16".
        progress: Gradio progress reporter.

    Raises:
        gr.Error: if ``qwen_tts`` is unavailable or loading fails.
    """
    if not TTS_AVAILABLE:
        raise gr.Error("qwen_tts is not installed. TTS tabs unavailable.")
    key = f"{model_key}_{dtype_str}"
    if key in loaded_models:
        return loaded_models[key]
    progress(0.1, desc=f"Loading {model_key} ({dtype_str}) …")
    repo_id = MODELS[model_key]
    dtype = torch.float32 if dtype_str == "float32" else torch.float16
    try:
        # NOTE(review): both `dtype` and `torch_dtype` are passed, presumably
        # to cover multiple qwen_tts versions — confirm which one the installed
        # release actually accepts; an extra kwarg may be silently ignored.
        model = Qwen3TTSModel.from_pretrained(
            repo_id,
            device_map="cpu",
            dtype=dtype,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
        )
    except Exception as e:
        raise gr.Error(f"Model loading failed:\n{str(e)}\n\nTry float32 or smaller variant.")
    loaded_models[key] = model
    progress(0.9, desc="Model ready.")
    return model


# ────────────────────────────────────────────────
# TTS Inference
# ────────────────────────────────────────────────
def infer_custom_voice(text, lang, speaker, instruct, model_key, precision,
                       progress=gr.Progress()):
    """Synthesize speech with a preset speaker + optional style instruction.

    Returns (audio_path | None, markdown_info_or_error).
    """
    if not text.strip():
        return None, "Please enter some text."
    model = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating …")
    try:
        wavs, sr = model.generate_custom_voice(
            text=text,
            language=lang if lang != "Auto" else None,
            speaker=speaker,
            instruct=instruct.strip() or None,
            max_new_tokens=1500,
        )
        path = _tmp_path("output_custom.wav")
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        info = (f"**Generated with {model_key}** \nlang: {lang} \n"
                f"speaker: {speaker} \ninstruct: {instruct or '(none)'}")
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"


def infer_voice_design(text, lang, instruct, model_key, precision,
                       progress=gr.Progress()):
    """Synthesize speech for a voice described in natural language.

    Returns (audio_path | None, markdown_info_or_error).
    """
    if not text.strip() or not instruct.strip():
        return None, "Text and voice instruction required."
    model = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating …")
    try:
        wavs, sr = model.generate_voice_design(
            text=text,
            language=lang if lang != "Auto" else None,
            instruct=instruct,
            max_new_tokens=1500,
        )
        path = _tmp_path("output_design.wav")
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        info = f"**Voice Design – {model_key}** \nlang: {lang} \ninstruct: {instruct}"
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"


def infer_voice_clone(text, lang, ref_audio, ref_text, x_vector_only,
                      model_key, precision, progress=gr.Progress()):
    """Clone a voice from a short reference clip and speak ``text`` with it.

    Returns (audio_path | None, markdown_info_or_error).
    """
    if not text.strip():
        return None, "Enter text to synthesize."
    if not ref_audio:
        return None, "Upload reference audio."
    model = get_model(model_key, precision, progress)
    progress(0.3, desc="Processing reference …")
    try:
        wavs, sr = model.generate_voice_clone(
            text=text,
            language=lang if lang != "Auto" else None,
            ref_audio=ref_audio,
            ref_text=ref_text.strip() or None,
            x_vector_only_mode=x_vector_only,
            max_new_tokens=1500,
        )
        path = _tmp_path("output_clone.wav")
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        info = (f"**Voice Clone – {model_key}** \nlang: {lang} \n"
                f"x-vector-only: {x_vector_only}")
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"


# ────────────────────────────────────────────────
# Video Generation Helpers
# ────────────────────────────────────────────────
RESOLUTIONS = {
    "1080×1920 (TikTok/Reels 9:16)": (1080, 1920),
    "1080×1080 (Instagram Square)": (1080, 1080),
    "1920×1080 (YouTube Landscape)": (1920, 1080),
    "1280×720 (YouTube 720p)": (1280, 720),
}

VISUAL_STYLES = [
    "🎙 Solid + Waveform",
    "🌊 Animated Spectrum Bars",
    "⚡ Oscilloscope Line",
    "🌈 Gradient Pulse",
    "🔲 Minimal Dark + Title",
]


def hex_to_rgb(h: str):
    """Convert '#rrggbb' (or shorthand '#rgb') to an (r, g, b) int tuple."""
    h = h.strip().lstrip("#")
    if len(h) == 3:  # expand shorthand, e.g. "f0a" -> "ff00aa"
        h = "".join(c * 2 for c in h)
    return tuple(int(h[i:i + 2], 16) for i in (0, 2, 4))


def _load_font(size: int):
    """Best-effort TrueType font; None falls back to PIL's built-in default."""
    from PIL import ImageFont
    try:
        return ImageFont.truetype(
            "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", size)
    except Exception:
        return None


def render_frame_solid_waveform(w, h, audio_chunk, bg_color, accent_color,
                                title, frame_idx, fps, total_frames=None):
    """Solid background with a centered waveform line."""
    from PIL import Image, ImageDraw
    img = Image.new("RGB", (w, h), bg_color)
    draw = ImageDraw.Draw(img)
    n = len(audio_chunk)
    if n == 0:
        return img
    cx = h // 2
    bar_w = max(1, w // max(n, 1))
    for i, amp in enumerate(audio_chunk):
        x = int(i * w / n)
        bar_h = int(abs(amp) * h * 0.4)
        draw.rectangle([x, cx - bar_h, x + bar_w - 1, cx + bar_h], fill=accent_color)
    if title:
        draw.text((w // 2, h // 10), title, fill=(255, 255, 255),
                  anchor="mm", font=_load_font(max(20, h // 25)))
    return img


def render_frame_spectrum(w, h, audio_chunk, bg_color, accent_color,
                          title, frame_idx, fps, total_frames=None):
    """Animated spectrum-like bars using FFT."""
    from PIL import Image, ImageDraw
    img = Image.new("RGB", (w, h), bg_color)
    draw = ImageDraw.Draw(img)
    N_BARS = 64
    n = len(audio_chunk)
    if n > 0:
        spectrum = np.abs(np.fft.rfft(audio_chunk, n=512))[:N_BARS]
        spectrum = spectrum / (spectrum.max() + 1e-9)
    else:
        spectrum = np.zeros(N_BARS)
    bar_w = w // N_BARS
    r, g, b = accent_color
    for i, val in enumerate(spectrum):
        bar_h = int(val * h * 0.8)
        x0 = i * bar_w
        x1 = x0 + bar_w - 2
        # gradient colour: blend from accent toward white as the bar grows
        cr = int(r + val * (255 - r))
        cg = int(g + val * (255 - g))
        cb = int(b + val * (255 - b))
        draw.rectangle([x0, h - bar_h, x1, h], fill=(cr, cg, cb))
    if title:
        draw.text((w // 2, h // 10), title, fill=(255, 255, 255),
                  anchor="mm", font=_load_font(max(20, h // 25)))
    return img


def render_frame_oscilloscope(w, h, audio_chunk, bg_color, accent_color,
                              title, frame_idx, fps, total_frames=None):
    """Classic green-phosphor oscilloscope line (always near-black backdrop)."""
    from PIL import Image, ImageDraw
    img = Image.new("RGB", (w, h), (10, 10, 10))
    draw = ImageDraw.Draw(img)
    n = len(audio_chunk)
    cx = h // 2
    pts = []
    for i in range(n):
        x = int(i * w / n)
        y = int(cx - audio_chunk[i] * h * 0.4)
        y = max(0, min(h - 1, y))
        pts.append((x, y))
    if len(pts) > 1:
        draw.line(pts, fill=accent_color, width=max(2, h // 200))
    if title:
        draw.text((w // 2, h // 12), title, fill=(200, 255, 200),
                  anchor="mm", font=_load_font(max(20, h // 25)))
    return img


def render_frame_gradient_pulse(w, h, audio_chunk, bg_color, accent_color,
                                title, frame_idx, fps, total_frames=None):
    """Radial gradient that pulses with RMS energy (vectorized with NumPy)."""
    from PIL import Image, ImageDraw
    rms = float(np.sqrt(np.mean(audio_chunk ** 2))) if len(audio_chunk) > 0 else 0
    r0, g0, b0 = bg_color
    r1, g1, b1 = accent_color
    cx, cy = w // 2, h // 2
    max_r = math.sqrt(cx ** 2 + cy ** 2)
    pulse = 0.3 + rms * 2.5
    # Vectorized replacement of the former per-pixel Python loop (same math:
    # channel = int(c0 + t * (c1 - c0)) with t clipped to [0, 1]).
    ys, xs = np.mgrid[0:h, 0:w]
    dist = np.sqrt((xs - cx) ** 2 + (ys - cy) ** 2) / max_r
    t = np.clip(1.0 - dist / pulse, 0.0, 1.0)
    arr = np.empty((h, w, 3), dtype=np.uint8)
    arr[..., 0] = (r0 + t * (r1 - r0)).astype(np.uint8)
    arr[..., 1] = (g0 + t * (g1 - g0)).astype(np.uint8)
    arr[..., 2] = (b0 + t * (b1 - b0)).astype(np.uint8)
    img = Image.fromarray(arr, "RGB")
    draw = ImageDraw.Draw(img)
    if title:
        draw.text((cx, h // 10), title, fill=(255, 255, 255),
                  anchor="mm", font=_load_font(max(20, h // 25)))
    return img


def render_frame_minimal_dark(w, h, audio_chunk, bg_color, accent_color,
                              title, frame_idx, fps, total_frames=None):
    """Minimal dark with thin waveform strip + duration-scaled progress bar."""
    from PIL import Image, ImageDraw
    img = Image.new("RGB", (w, h), (18, 18, 22))
    draw = ImageDraw.Draw(img)
    # thin horizontal waveform strip
    strip_h = max(4, h // 15)
    cy = h // 2
    n = len(audio_chunk)
    for i in range(n):
        x = int(i * w / n)
        amp = int(audio_chunk[i] * strip_h)
        draw.rectangle([x, cy - abs(amp), x, cy + abs(amp)], fill=accent_color)
    # bottom progress indicator scaled over the whole clip duration
    if total_frames:
        prog_w = int(w * frame_idx / max(total_frames - 1, 1))
    else:
        # legacy fallback (fills within ~1 s) when total length is unknown
        prog_w = int(frame_idx * w / max(fps * 1, 1))
    draw.rectangle([0, h - 4, prog_w, h], fill=accent_color)
    if title:
        draw.text((w // 2, h * 2 // 5), title, fill=(240, 240, 245),
                  anchor="mm", font=_load_font(max(24, h // 18)))
    return img


RENDERERS = {
    "🎙 Solid + Waveform": render_frame_solid_waveform,
    "🌊 Animated Spectrum Bars": render_frame_spectrum,
    "⚡ Oscilloscope Line": render_frame_oscilloscope,
    "🌈 Gradient Pulse": render_frame_gradient_pulse,
    "🔲 Minimal Dark + Title": render_frame_minimal_dark,
}


def audio_to_video(
    audio_path, style, resolution_label, bg_hex, accent_hex,
    title_text, fps_str, progress=gr.Progress()
):
    """Render an audio file into an MP4 with the chosen visual style.

    Frames are drawn with Pillow into a temp dir, then assembled with ffmpeg
    (libx264 + AAC). Returns (video_path | None, markdown_info_or_error).
    """
    if not audio_path:
        return None, "❌ No audio file provided. Generate or upload audio first."
    if shutil.which("ffmpeg") is None:
        return None, "❌ ffmpeg not found on PATH — install it to enable video export."
    fps = int(fps_str)
    w, h = RESOLUTIONS[resolution_label]
    bg_color = hex_to_rgb(bg_hex)
    accent_color = hex_to_rgb(accent_hex)
    render_fn = RENDERERS[style]

    # ---- Load audio ----
    progress(0.05, desc="Reading audio …")
    try:
        audio_data, sr = sf.read(audio_path, dtype="float32")
    except Exception as e:
        return None, f"❌ Could not read audio: {e}"
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)  # mono
    duration = len(audio_data) / sr
    n_frames = max(1, int(duration * fps))
    n_samples = len(audio_data)

    # ---- Write frames to temp dir ----
    progress(0.10, desc="Rendering frames …")
    with tempfile.TemporaryDirectory() as tmpdir:
        frame_dir = Path(tmpdir) / "frames"
        frame_dir.mkdir()
        # gradient_pulse used to be very slow (per-pixel); now vectorized
        for fi in range(n_frames):
            if fi % max(1, n_frames // 20) == 0:
                progress(0.10 + 0.65 * fi / n_frames, desc=f"Frame {fi}/{n_frames} …")
            # proportional slicing covers every sample (no truncated tail)
            start = fi * n_samples // n_frames
            end = (fi + 1) * n_samples // n_frames
            chunk = audio_data[start:end] if end > start else np.zeros(64)
            img = render_fn(w, h, chunk, bg_color, accent_color, title_text,
                            fi, fps, total_frames=n_frames)
            img.save(str(frame_dir / f"frame_{fi:06d}.png"))

        # ---- Assemble with ffmpeg ----
        progress(0.80, desc="Encoding video …")
        out_path = _tmp_path("tts_video.mp4")
        ffmpeg_cmd = [
            "ffmpeg", "-y",
            "-framerate", str(fps),
            "-i", str(frame_dir / "frame_%06d.png"),
            "-i", audio_path,
            "-c:v", "libx264", "-preset", "fast", "-crf", "23",
            "-pix_fmt", "yuv420p",
            "-c:a", "aac", "-b:a", "192k",
            "-shortest",
            "-movflags", "+faststart",
            out_path,
        ]
        result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
        if result.returncode != 0:
            return None, f"❌ ffmpeg error:\n```\n{result.stderr[-1500:]}\n```"

    progress(1.0, desc="Done!")
    info = (
        f"✅ **Video ready!** \n"
        f"Style: `{style}` · Resolution: `{w}×{h}` · FPS: `{fps}` · Duration: `{duration:.1f}s`"
    )
    return out_path, info


# ────────────────────────────────────────────────
# UI
# ────────────────────────────────────────────────
css = """
.radio-row { display: flex; flex-wrap: wrap; gap: 1.2rem; align-items: center; }
.radio-row > div { min-width: 140px; }
"""

with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Qwen3-TTS Full Demo\nAll released variants • CPU-friendly • No streaming")

    # ── Tab 1: Custom Voice ────────────────────────────────────────────────
    with gr.Tab("CustomVoice – Preset speakers + instruct"):
        gr.Markdown("Uses 9 built-in premium voices + optional style instruction")
        with gr.Row(elem_classes="radio-row"):
            cv_model = gr.Radio(["1.7B-CustomVoice", "0.6B-CustomVoice"],
                                value="1.7B-CustomVoice", label="Model")
            cv_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        with gr.Row():
            cv_text = gr.Textbox(label="Text to speak", lines=4,
                                 value="这是一个测试。希望声音听起来自然一些。")
            cv_lang = gr.Dropdown(["Auto", "Chinese", "English", "Japanese", "Korean"],
                                  value="Auto", label="Language")
        cv_speaker = gr.Dropdown(
            ["Vivian", "Serena", "Uncle_Fu", "Dylan", "Eric", "Ryan", "Aiden", "Ono_Anna", "Sohee"],
            value="Vivian", label="Speaker"
        )
        cv_instruct = gr.Textbox(label="Style instruction (optional)", lines=2,
                                 placeholder="用特别愤怒的语气说")
        cv_btn = gr.Button("Generate", variant="primary")
        cv_audio = gr.Audio(label="Generated Speech", type="filepath")
        cv_info = gr.Markdown()
        cv_btn.click(infer_custom_voice,
                     inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision],
                     outputs=[cv_audio, cv_info])

    # ── Tab 2: Voice Design ───────────────────────────────────────────────
    with gr.Tab("Voice Design – Describe any voice"):
        gr.Markdown("Create arbitrary voices from natural language description (only 1.7B variant)")
        with gr.Row(elem_classes="radio-row"):
            vd_model = gr.Radio(["1.7B-VoiceDesign"], value="1.7B-VoiceDesign", label="Model")
            vd_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        vd_text = gr.Textbox(label="Text to speak", lines=4,
                             value="哥哥,你回来啦,人家等了好久,要抱抱!")
        vd_lang = gr.Dropdown(["Auto", "Chinese", "English"], value="Chinese", label="Language")
        vd_instruct = gr.Textbox(label="Voice description / instruction", lines=4,
                                 value="体现撒娇稚嫩的萝莉女声,音调偏高且起伏明显,黏人、做作又刻意卖萌的感觉")
        vd_btn = gr.Button("Generate", variant="primary")
        vd_audio = gr.Audio(label="Generated Speech", type="filepath")
        vd_info = gr.Markdown()
        vd_btn.click(infer_voice_design,
                     inputs=[vd_text, vd_lang, vd_instruct, vd_model, vd_precision],
                     outputs=[vd_audio, vd_info])

    # ── Tab 3: Voice Clone ────────────────────────────────────────────────
    with gr.Tab("Base – Voice Clone from reference audio"):
        gr.Markdown("3-second rapid voice cloning using reference clip (Base models only)")
        with gr.Row(elem_classes="radio-row"):
            cl_model = gr.Radio(["1.7B-Base", "0.6B-Base"], value="1.7B-Base", label="Model")
            cl_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        cl_text = gr.Textbox(label="Text to synthesize", lines=4,
                             value="This is my cloned voice now speaking normally.")
        cl_lang = gr.Dropdown(["Auto", "English", "Chinese"], value="Auto", label="Language")
        with gr.Row():
            cl_ref_audio = gr.Audio(label="Reference audio clip", type="filepath",
                                    sources=["upload", "microphone"])
            cl_ref_text = gr.Textbox(label="Transcript of reference (optional)", lines=2)
        cl_xvec_only = gr.Checkbox(label="x-vector only mode (faster, lower quality)", value=False)
        cl_btn = gr.Button("Clone & Generate", variant="primary")
        cl_audio = gr.Audio(label="Cloned Speech", type="filepath")
        cl_info = gr.Markdown()
        cl_btn.click(infer_voice_clone,
                     inputs=[cl_text, cl_lang, cl_ref_audio, cl_ref_text, cl_xvec_only,
                             cl_model, cl_precision],
                     outputs=[cl_audio, cl_info])

    # ── Tab 4: Audio → Video ──────────────────────────────────────────────
    with gr.Tab("🎬 Audio → Video"):
        gr.Markdown(
            "## Audio → Social Media Video\n"
            "Upload **any WAV/MP3** (or paste the path from a generated clip above) "
            "and render it into a shareable MP4 with a visual style.\n\n"
            "> ⚠️ **Gradient Pulse** renders per-pixel and is slow for long audio — "
            "prefer other styles for > 30 s clips."
        )
        with gr.Row():
            with gr.Column(scale=3):
                vid_audio = gr.Audio(
                    label="Input audio (upload or record)",
                    type="filepath",
                    sources=["upload", "microphone"],
                )
                vid_title = gr.Textbox(
                    label="Title / caption text (shown on video)",
                    placeholder="My AI Voice · Qwen3-TTS", value=""
                )
            with gr.Column(scale=2):
                vid_style = gr.Radio(
                    VISUAL_STYLES,
                    value="🌊 Animated Spectrum Bars",
                    label="Visual style",
                )
                vid_res = gr.Dropdown(
                    list(RESOLUTIONS.keys()),
                    value="1080×1920 (TikTok/Reels 9:16)",
                    label="Resolution / aspect ratio",
                )
                vid_fps = gr.Radio(["24", "30"], value="24", label="FPS")
                with gr.Row():
                    vid_bg = gr.ColorPicker(value="#0d0d1a", label="Background colour")
                    vid_accent = gr.ColorPicker(value="#7c3aed", label="Accent / waveform colour")
        vid_btn = gr.Button("🎬 Render Video", variant="primary", size="lg")
        vid_out = gr.Video(label="Output video")
        vid_info = gr.Markdown()
        vid_btn.click(
            audio_to_video,
            inputs=[vid_audio, vid_style, vid_res, vid_bg, vid_accent, vid_title, vid_fps],
            outputs=[vid_out, vid_info],
        )
        gr.Markdown("""
**Style guide:**

| Style | Best for | Notes |
|---|---|---|
| 🎙 Solid + Waveform | Podcasts, quotes | Fast, clean |
| 🌊 Animated Spectrum Bars | Music / speech highlights | FFT-based, energetic |
| ⚡ Oscilloscope Line | Dark/techy aesthetic | Classic green-on-black |
| 🌈 Gradient Pulse | Ambient / ASMR | Slow render — use short clips |
| 🔲 Minimal Dark + Title | Branded content | Great with a title caption |
""")

    # ── Footer ────────────────────────────────────────────────────────────
    gr.Markdown("""
**Notes**
• First generation per model loads weights (may take 1–5 min).
• Use **float32** if **float16** causes crashes (common on CPU).
• **0.6B** models are faster / lighter on CPU.
• Video tab requires `ffmpeg` and `Pillow` (both standard on most systems).
• Repo & docs: https://github.com/QwenLM/Qwen3-TTS
""")

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)