| | import gradio as gr |
| | import torch |
| | import numpy as np |
| | import soundfile as sf |
| | from pathlib import Path |
| | import os |
| | import warnings |
| | import subprocess |
| | import tempfile |
| | import math |
| |
|
# Silence noisy UserWarnings emitted by the audio/ML libraries on CPU runs.
warnings.filterwarnings("ignore", category=UserWarning)

# qwen_tts is optional: the audio->video tab still works without it, so record
# availability instead of failing at import time. get_model() checks this flag.
try:
    from qwen_tts import Qwen3TTSModel
    TTS_AVAILABLE = True
except ImportError:
    TTS_AVAILABLE = False
| |
|
| | |
| | |
| | |
| |
|
# Human-friendly model key -> Hugging Face repo id for the released
# Qwen3-TTS 12Hz variants:
#   CustomVoice — preset premium speakers (+ optional style instruction)
#   VoiceDesign — arbitrary voice from a natural-language description
#   Base        — rapid voice cloning from a short reference clip
MODELS = {
    "1.7B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    "0.6B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
    "1.7B-VoiceDesign": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
    "1.7B-Base": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    "0.6B-Base": "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
}

# Cache of loaded models keyed by "<model_key>_<dtype_str>" (see get_model).
loaded_models = {}
| |
|
def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()):
    """Return a Qwen3-TTS model for *model_key*, loading and caching on first use.

    Models are cached in the module-level ``loaded_models`` dict, keyed by
    ``"<model_key>_<dtype_str>"`` so float32 and float16 copies coexist.

    Raises:
        gr.Error: if qwen_tts is not installed or weight loading fails.
    """
    if not TTS_AVAILABLE:
        raise gr.Error("qwen_tts is not installed. TTS tabs unavailable.")
    cache_key = f"{model_key}_{dtype_str}"
    if cache_key in loaded_models:
        return loaded_models[cache_key]
    progress(0.1, desc=f"Loading {model_key} ({dtype_str}) โฆ")
    chosen_dtype = torch.float32 if dtype_str == "float32" else torch.float16
    try:
        # NOTE(review): both `dtype` and `torch_dtype` are passed with the same
        # value; depending on the installed qwen_tts/transformers version one of
        # the two is redundant (or rejected) — confirm which kwarg is expected.
        model = Qwen3TTSModel.from_pretrained(
            MODELS[model_key],
            device_map="cpu",
            dtype=chosen_dtype,
            torch_dtype=chosen_dtype,
            low_cpu_mem_usage=True,
        )
    except Exception as e:
        raise gr.Error(f"Model loading failed:\n{str(e)}\n\nTry float32 or smaller variant.")
    loaded_models[cache_key] = model
    progress(0.9, desc="Model ready.")
    return model
| |
|
| |
|
| | |
| | |
| | |
| |
|
def infer_custom_voice(text, lang, speaker, instruct, model_key, precision, progress=gr.Progress()):
    """Synthesize speech with a preset speaker (CustomVoice models).

    Returns (wav_path, markdown_info); on any failure returns (None, message)
    so the Gradio UI shows the error instead of crashing the request.
    """
    if not text.strip():
        return None, "Please enter some text."
    model = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating โฆ")
    try:
        wavs, sr = model.generate_custom_voice(
            text=text, language=lang if lang != "Auto" else None,
            speaker=speaker, instruct=instruct.strip() or None, max_new_tokens=1500,
        )
        # Fix: write to a unique temp file instead of a fixed "/tmp/..." path —
        # portable beyond Linux and safe for concurrent requests (no clobbering).
        with tempfile.NamedTemporaryFile(suffix=".wav", prefix="output_custom_", delete=False) as tmp:
            path = tmp.name
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        info = f"**Generated with {model_key}** \nlang: {lang} \nspeaker: {speaker} \ninstruct: {instruct or '(none)'}"
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"
| |
|
| |
|
def infer_voice_design(text, lang, instruct, model_key, precision, progress=gr.Progress()):
    """Synthesize speech for a voice described in natural language (VoiceDesign).

    Both the text and the voice description are required. Returns
    (wav_path, markdown_info) or (None, message) on failure.
    """
    if not text.strip() or not instruct.strip():
        return None, "Text and voice instruction required."
    model = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating โฆ")
    try:
        wavs, sr = model.generate_voice_design(
            text=text, language=lang if lang != "Auto" else None,
            instruct=instruct, max_new_tokens=1500,
        )
        # Fix: unique temp file instead of a fixed "/tmp/..." path — portable
        # and safe when several users generate at the same time.
        with tempfile.NamedTemporaryFile(suffix=".wav", prefix="output_design_", delete=False) as tmp:
            path = tmp.name
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        info = f"**Voice Design โ {model_key}** \nlang: {lang} \ninstruct: {instruct}"
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"
| |
|
| |
|
def infer_voice_clone(text, lang, ref_audio, ref_text, x_vector_only, model_key, precision, progress=gr.Progress()):
    """Clone the voice from *ref_audio* and speak *text* with it (Base models).

    `ref_text` (transcript of the reference clip) is optional; `x_vector_only`
    trades quality for speed. Returns (wav_path, markdown_info) or
    (None, message) on failure.
    """
    if not text.strip():
        return None, "Enter text to synthesize."
    if not ref_audio:
        return None, "Upload reference audio."
    model = get_model(model_key, precision, progress)
    progress(0.3, desc="Processing reference โฆ")
    try:
        wavs, sr = model.generate_voice_clone(
            text=text, language=lang if lang != "Auto" else None,
            ref_audio=ref_audio, ref_text=ref_text.strip() or None,
            x_vector_only_mode=x_vector_only, max_new_tokens=1500,
        )
        # Fix: unique temp file instead of a fixed "/tmp/..." path — portable
        # and safe for concurrent requests.
        with tempfile.NamedTemporaryFile(suffix=".wav", prefix="output_clone_", delete=False) as tmp:
            path = tmp.name
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        info = f"**Voice Clone โ {model_key}** \nlang: {lang} \nx-vector-only: {x_vector_only}"
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"
| |
|
| |
|
| | |
| | |
| | |
| |
|
# Dropdown label -> (width, height) in pixels for the video renderer.
RESOLUTIONS = {
    "1080ร1920 (TikTok/Reels 9:16)": (1080, 1920),
    "1080ร1080 (Instagram Square)": (1080, 1080),
    "1920ร1080 (YouTube Landscape)": (1920, 1080),
    "1280ร720 (YouTube 720p)": (1280, 720),
}

# Visual style labels shown in the UI; each must have a matching key in
# RENDERERS below (the label string IS the dispatch key).
VISUAL_STYLES = [
    "๐ Solid + Waveform",
    "๐ Animated Spectrum Bars",
    "โก Oscilloscope Line",
    "๐ Gradient Pulse",
    "๐ฒ Minimal Dark + Title",
]
| |
|
| |
|
def hex_to_rgb(h: str):
    """Convert a hex colour string to an ``(r, g, b)`` tuple of ints.

    Accepts both 6-digit ("#7c3aed") and 3-digit shorthand ("#fff") forms,
    with or without the leading "#", and tolerates surrounding whitespace.
    Gradio's ColorPicker normally emits the 6-digit form, but browser/theme
    variations can produce shorthand.
    """
    h = h.strip().lstrip("#")
    if len(h) == 3:
        # Expand CSS shorthand: "fab" -> "ffaabb".
        h = "".join(c * 2 for c in h)
    return tuple(int(h[i:i + 2], 16) for i in (0, 2, 4))
| |
|
| |
|
def render_frame_solid_waveform(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Solid background with a centered waveform made of vertical bars.

    Each sample in `audio_chunk` becomes one bar; bar height is proportional
    to |amplitude| and capped at 40% of the frame height. `frame_idx`/`fps`
    are unused here but kept for the common renderer signature.
    """
    from PIL import Image, ImageDraw, ImageFont
    img = Image.new("RGB", (w, h), bg_color)
    draw = ImageDraw.Draw(img)

    n = len(audio_chunk)
    if n > 0:
        cx = h // 2  # vertical midline of the waveform
        bar_w = max(1, w // n)
        for i, amp in enumerate(audio_chunk):
            x = int(i * w / n)
            bar_h = int(abs(amp) * h * 0.4)
            draw.rectangle([x, cx - bar_h, x + bar_w - 1, cx + bar_h], fill=accent_color)

    # Fix: draw the title even when the chunk is empty — the original returned
    # early on n == 0, making the caption flicker off on silent frames.
    if title:
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(20, h // 25))
        except Exception:
            font = None  # fall back to PIL's built-in bitmap font
        draw.text((w // 2, h // 10), title, fill=(255, 255, 255), anchor="mm", font=font)
    return img
| |
|
| |
|
def render_frame_spectrum(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Animated spectrum-style bars driven by an FFT of the frame's samples."""
    from PIL import Image, ImageDraw
    img = Image.new("RGB", (w, h), bg_color)
    draw = ImageDraw.Draw(img)

    N_BARS = 64
    if len(audio_chunk) > 0:
        # Magnitude spectrum (chunk zero-padded/truncated to 512 samples),
        # normalized to [0, 1]; only the lowest N_BARS bins are shown.
        levels = np.abs(np.fft.rfft(audio_chunk, n=512))[:N_BARS]
        levels = levels / (levels.max() + 1e-9)
    else:
        levels = np.zeros(N_BARS)

    bar_w = w // N_BARS
    for idx, level in enumerate(levels):
        left = idx * bar_w
        right = left + bar_w - 2  # leave a 2-px gap between bars
        height = int(level * h * 0.8)
        r, g, b = accent_color
        draw.rectangle([left, h - height, right, h], fill=(r, g, b))

    if title:
        try:
            from PIL import ImageFont
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(20, h // 25))
        except Exception:
            font = None
        draw.text((w // 2, h // 10), title, fill=(255, 255, 255), anchor="mm", font=font)
    return img
| |
|
| |
|
def render_frame_oscilloscope(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Classic green-phosphor oscilloscope trace.

    Note: `bg_color` is intentionally ignored — the background is a fixed
    near-black (10, 10, 10) to keep the scope aesthetic.
    """
    from PIL import Image, ImageDraw
    img = Image.new("RGB", (w, h), (10, 10, 10))
    draw = ImageDraw.Draw(img)

    count = len(audio_chunk)
    mid = h // 2
    # One (x, y) point per sample, amplitude scaled to 40% of frame height
    # and clamped to the visible area.
    points = [
        (int(i * w / count),
         max(0, min(h - 1, int(mid - audio_chunk[i] * h * 0.4))))
        for i in range(count)
    ]
    if len(points) > 1:
        draw.line(points, fill=accent_color, width=max(2, h // 200))

    if title:
        try:
            from PIL import ImageFont
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(20, h // 25))
        except Exception:
            font = None
        draw.text((w // 2, h // 12), title, fill=(200, 255, 200), anchor="mm", font=font)
    return img
| |
|
| |
|
def render_frame_gradient_pulse(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Radial gradient that pulses with the chunk's RMS energy.

    Performance fix: the original filled the frame with a per-pixel Python
    double loop (the UI even warns this style is slow). The gradient is now
    computed as whole-frame NumPy arrays and converted via Image.fromarray —
    same truncation semantics, orders of magnitude faster.
    """
    from PIL import Image, ImageDraw
    rms = float(np.sqrt(np.mean(audio_chunk ** 2))) if len(audio_chunk) > 0 else 0.0
    r0, g0, b0 = bg_color
    r1, g1, b1 = accent_color
    cx, cy = w // 2, h // 2
    max_r = math.sqrt(cx ** 2 + cy ** 2)
    pulse = 0.3 + rms * 2.5  # louder audio -> wider bright core

    # Normalized distance of every pixel from the centre (broadcasted grid),
    # then blend factor t in [0, 1]: 1 at the centre, 0 at/after `pulse`.
    ys = np.arange(h, dtype=np.float64)[:, None]
    xs = np.arange(w, dtype=np.float64)[None, :]
    dist = np.sqrt((xs - cx) ** 2 + (ys - cy) ** 2) / max_r
    t = np.clip(1.0 - dist / pulse, 0.0, 1.0)

    rgb = np.empty((h, w, 3), dtype=np.uint8)
    rgb[..., 0] = (r0 + t * (r1 - r0)).astype(np.uint8)
    rgb[..., 1] = (g0 + t * (g1 - g0)).astype(np.uint8)
    rgb[..., 2] = (b0 + t * (b1 - b0)).astype(np.uint8)
    img = Image.fromarray(rgb, "RGB")

    draw = ImageDraw.Draw(img)
    if title:
        try:
            from PIL import ImageFont
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(20, h // 25))
        except Exception:
            font = None
        draw.text((cx, h // 10), title, fill=(255, 255, 255), anchor="mm", font=font)
    return img
| |
|
| |
|
def render_frame_minimal_dark(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Minimal dark frame: thin waveform strip, bottom progress bar, title.

    Note: `bg_color` is ignored — the background is a fixed dark grey.
    """
    # Fix: ImageFont was never imported here, so ImageFont.truetype below
    # always raised NameError, which the `except` silently swallowed — the
    # title was drawn with the default bitmap font regardless of availability.
    from PIL import Image, ImageDraw, ImageFont
    img = Image.new("RGB", (w, h), (18, 18, 22))
    draw = ImageDraw.Draw(img)

    strip_h = max(4, h // 15)
    cy = h // 2
    n = len(audio_chunk)
    for i in range(n):
        x = int(i * w / n)
        amp = int(audio_chunk[i] * strip_h)
        draw.rectangle([x, cy - abs(amp), x, cy + abs(amp)], fill=accent_color)

    # NOTE(review): this scales frame_idx by fps only, so the bar fills over
    # the first ~1 second of video, not the whole clip — the total frame
    # count is not available through the renderer signature. Confirm intent.
    prog_w = int(frame_idx * w / max(fps * 1, 1))
    draw.rectangle([0, h - 4, prog_w, h], fill=accent_color)
    if title:
        try:
            font_size = max(24, h // 18)
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size)
        except Exception:
            font = None  # fall back to PIL's built-in bitmap font
        draw.text((w // 2, h * 2 // 5), title, fill=(240, 240, 245), anchor="mm", font=font)
    return img
| |
|
| |
|
# Dispatch table: UI style label -> frame renderer. Keys must stay exactly
# in sync with VISUAL_STYLES (the label string itself is the lookup key).
RENDERERS = {
    "๐ Solid + Waveform": render_frame_solid_waveform,
    "๐ Animated Spectrum Bars": render_frame_spectrum,
    "โก Oscilloscope Line": render_frame_oscilloscope,
    "๐ Gradient Pulse": render_frame_gradient_pulse,
    "๐ฒ Minimal Dark + Title": render_frame_minimal_dark,
}
| |
|
| |
|
def audio_to_video(
    audio_path, style, resolution_label,
    bg_hex, accent_hex, title_text, fps_str,
    progress=gr.Progress()
):
    """Render an audio file into an MP4 with the selected visual style.

    Frames are drawn with PIL into a temporary directory, then muxed with the
    audio track by ffmpeg (libx264 + aac). Returns (video_path, markdown_info)
    on success or (None, error_markdown) on failure.
    """
    if not audio_path:
        return None, "โ No audio file provided. Generate or upload audio first."

    fps = int(fps_str)  # Radio choices are "24"/"30", so int() is safe
    w, h = RESOLUTIONS[resolution_label]
    bg_color = hex_to_rgb(bg_hex)
    accent_color = hex_to_rgb(accent_hex)
    render_fn = RENDERERS[style]

    progress(0.05, desc="Reading audio โฆ")
    try:
        audio_data, sr = sf.read(audio_path, dtype="float32")
    except Exception as e:
        return None, f"โ Could not read audio: {e}"

    if audio_data.ndim > 1:  # downmix stereo/multichannel to mono
        audio_data = audio_data.mean(axis=1)
    if len(audio_data) == 0:
        return None, "โ Audio file contains no samples."

    duration = len(audio_data) / sr
    # Fix: guarantee at least one frame — for clips shorter than 1/fps s the
    # original computed n_frames == 0, rendered nothing and ffmpeg failed.
    n_frames = max(1, int(duration * fps))
    samples_per_frame = max(1, len(audio_data) // n_frames)

    progress(0.10, desc="Rendering frames โฆ")
    with tempfile.TemporaryDirectory() as tmpdir:
        frame_dir = Path(tmpdir) / "frames"
        frame_dir.mkdir()

        for fi in range(n_frames):
            # Throttle progress updates to ~20 over the whole render.
            if fi % max(1, n_frames // 20) == 0:
                progress(0.10 + 0.65 * fi / n_frames, desc=f"Frame {fi}/{n_frames} โฆ")

            start = fi * samples_per_frame
            end = min(start + samples_per_frame, len(audio_data))
            chunk = audio_data[start:end] if end > start else np.zeros(64)

            img = render_fn(w, h, chunk, bg_color, accent_color, title_text, fi, fps)
            img.save(str(frame_dir / f"frame_{fi:06d}.png"))

        progress(0.80, desc="Encoding video โฆ")
        # Fix: unique output in the system temp dir instead of a hard-coded
        # /tmp path (portable; concurrent renders don't overwrite each other).
        out_fd, out_path = tempfile.mkstemp(suffix=".mp4", prefix="tts_video_")
        os.close(out_fd)
        ffmpeg_cmd = [
            "ffmpeg", "-y",
            "-framerate", str(fps),
            "-i", str(frame_dir / "frame_%06d.png"),
            "-i", audio_path,
            "-c:v", "libx264",
            "-preset", "fast",
            "-crf", "23",
            "-pix_fmt", "yuv420p",
            "-c:a", "aac",
            "-b:a", "192k",
            "-shortest",
            "-movflags", "+faststart",
            out_path,
        ]
        try:
            result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
        except FileNotFoundError:
            # Fix: a missing ffmpeg binary previously escaped as an uncaught
            # exception instead of a readable UI message.
            return None, "โ ffmpeg executable not found. Install ffmpeg and retry."
        if result.returncode != 0:
            return None, f"โ ffmpeg error:\n```\n{result.stderr[-1500:]}\n```"

    progress(1.0, desc="Done!")
    info = (
        f"โ **Video ready!** \n"
        f"Style: `{style}` ยท Resolution: `{w}ร{h}` ยท FPS: `{fps}` ยท Duration: `{duration:.1f}s`"
    )
    return out_path, info
| |
|
| |
|
| | |
| | |
| | |
| |
|
| | css = """ |
| | .radio-row { display: flex; flex-wrap: wrap; gap: 1.2rem; align-items: center; } |
| | .radio-row > div { min-width: 140px; } |
| | """ |
| |
|
# ---------------------------------------------------------------------------
# Gradio UI: one tab per TTS model family plus the audio-to-video renderer.
# Component variables are prefixed per tab: cv_ / vd_ / cl_ / vid_.
# ---------------------------------------------------------------------------
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Qwen3-TTS Full Demo\nAll released variants โข CPU-friendly โข No streaming")

    # --- CustomVoice: 9 preset premium speakers + optional style instruct ---
    with gr.Tab("CustomVoice โ Preset speakers + instruct"):
        gr.Markdown("Uses 9 built-in premium voices + optional style instruction")
        with gr.Row(elem_classes="radio-row"):
            cv_model = gr.Radio(["1.7B-CustomVoice", "0.6B-CustomVoice"], value="1.7B-CustomVoice", label="Model")
            cv_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        with gr.Row():
            cv_text = gr.Textbox(label="Text to speak", lines=4, value="่ฟๆฏไธไธชๆต่ฏใๅธๆๅฃฐ้ณๅฌ่ตทๆฅ่ช็ถไธไบใ")
            cv_lang = gr.Dropdown(["Auto","Chinese","English","Japanese","Korean"], value="Auto", label="Language")
        cv_speaker = gr.Dropdown(
            ["Vivian","Serena","Uncle_Fu","Dylan","Eric","Ryan","Aiden","Ono_Anna","Sohee"],
            value="Vivian", label="Speaker"
        )
        cv_instruct = gr.Textbox(label="Style instruction (optional)", lines=2, placeholder="็จ็นๅซๆคๆ็่ฏญๆฐ่ฏด")
        cv_btn = gr.Button("Generate", variant="primary")
        cv_audio = gr.Audio(label="Generated Speech", type="filepath")
        cv_info = gr.Markdown()
        cv_btn.click(infer_custom_voice,
                     inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision],
                     outputs=[cv_audio, cv_info])

    # --- VoiceDesign: synthesize an arbitrary voice from a text description ---
    with gr.Tab("Voice Design โ Describe any voice"):
        gr.Markdown("Create arbitrary voices from natural language description (only 1.7B variant)")
        with gr.Row(elem_classes="radio-row"):
            vd_model = gr.Radio(["1.7B-VoiceDesign"], value="1.7B-VoiceDesign", label="Model")
            vd_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        vd_text = gr.Textbox(label="Text to speak", lines=4, value="ๅฅๅฅ๏ผไฝ ๅๆฅๅฆ๏ผไบบๅฎถ็ญไบๅฅฝไน๏ผ่ฆๆฑๆฑ๏ผ")
        vd_lang = gr.Dropdown(["Auto","Chinese","English"], value="Chinese", label="Language")
        vd_instruct = gr.Textbox(label="Voice description / instruction", lines=4,
                                 value="ไฝ็ฐๆๅจ็จๅซฉ็่่ๅฅณๅฃฐ๏ผ้ณ่ฐๅ้ซไธ่ตทไผๆๆพ๏ผ้ปไบบใๅไฝๅๅปๆๅ่็ๆ่ง")
        vd_btn = gr.Button("Generate", variant="primary")
        vd_audio = gr.Audio(label="Generated Speech", type="filepath")
        vd_info = gr.Markdown()
        vd_btn.click(infer_voice_design,
                     inputs=[vd_text, vd_lang, vd_instruct, vd_model, vd_precision],
                     outputs=[vd_audio, vd_info])

    # --- Base models: rapid voice cloning from a short reference clip ---
    with gr.Tab("Base โ Voice Clone from reference audio"):
        gr.Markdown("3-second rapid voice cloning using reference clip (Base models only)")
        with gr.Row(elem_classes="radio-row"):
            cl_model = gr.Radio(["1.7B-Base","0.6B-Base"], value="1.7B-Base", label="Model")
            cl_precision = gr.Radio(["float32","float16"], value="float32", label="Precision")
        cl_text = gr.Textbox(label="Text to synthesize", lines=4, value="This is my cloned voice now speaking normally.")
        cl_lang = gr.Dropdown(["Auto","English","Chinese"], value="Auto", label="Language")
        with gr.Row():
            cl_ref_audio = gr.Audio(label="Reference audio clip", type="filepath", sources=["upload","microphone"])
            cl_ref_text = gr.Textbox(label="Transcript of reference (optional)", lines=2)
        cl_xvec_only = gr.Checkbox(label="x-vector only mode (faster, lower quality)", value=False)
        cl_btn = gr.Button("Clone & Generate", variant="primary")
        cl_audio = gr.Audio(label="Cloned Speech", type="filepath")
        cl_info = gr.Markdown()
        cl_btn.click(infer_voice_clone,
                     inputs=[cl_text, cl_lang, cl_ref_audio, cl_ref_text, cl_xvec_only, cl_model, cl_precision],
                     outputs=[cl_audio, cl_info])

    # --- Audio -> Video: render any clip into a styled MP4 via ffmpeg ---
    with gr.Tab("๐ฌ Audio โ Video"):
        gr.Markdown(
            "## Audio โ Social Media Video\n"
            "Upload **any WAV/MP3** (or paste the path from a generated clip above) "
            "and render it into a shareable MP4 with a visual style.\n\n"
            "> โ ๏ธ **Gradient Pulse** renders per-pixel and is slow for long audio โ prefer other styles for > 30 s clips."
        )

        with gr.Row():
            with gr.Column(scale=3):
                vid_audio = gr.Audio(
                    label="Input audio (upload or record)",
                    type="filepath",
                    sources=["upload", "microphone"],
                )
                vid_title = gr.Textbox(
                    label="Title / caption text (shown on video)",
                    placeholder="My AI Voice ยท Qwen3-TTS",
                    value=""
                )
            with gr.Column(scale=2):
                vid_style = gr.Radio(
                    VISUAL_STYLES,
                    value="๐ Animated Spectrum Bars",
                    label="Visual style",
                )
                vid_res = gr.Dropdown(
                    list(RESOLUTIONS.keys()),
                    value="1080ร1920 (TikTok/Reels 9:16)",
                    label="Resolution / aspect ratio",
                )
                vid_fps = gr.Radio(["24", "30"], value="24", label="FPS")

        with gr.Row():
            vid_bg = gr.ColorPicker(value="#0d0d1a", label="Background colour")
            vid_accent = gr.ColorPicker(value="#7c3aed", label="Accent / waveform colour")

        vid_btn = gr.Button("๐ฌ Render Video", variant="primary", size="lg")
        vid_out = gr.Video(label="Output video")
        vid_info = gr.Markdown()

        vid_btn.click(
            audio_to_video,
            inputs=[vid_audio, vid_style, vid_res, vid_bg, vid_accent, vid_title, vid_fps],
            outputs=[vid_out, vid_info],
        )

        gr.Markdown("""
**Style guide:**
| Style | Best for | Notes |
|---|---|---|
| ๐ Solid + Waveform | Podcasts, quotes | Fast, clean |
| ๐ Animated Spectrum Bars | Music / speech highlights | FFT-based, energetic |
| โก Oscilloscope Line | Dark/techy aesthetic | Classic green-on-black |
| ๐ Gradient Pulse | Ambient / ASMR | Slow render โ use short clips |
| ๐ฒ Minimal Dark + Title | Branded content | Great with a title caption |
""")

    # Footer notes shown below all tabs.
    gr.Markdown("""
**Notes**
โข First generation per model loads weights (may take 1โ5 min).
โข Use **float32** if **float16** causes crashes (common on CPU).
โข **0.6B** models are faster / lighter on CPU.
โข Video tab requires `ffmpeg` and `Pillow` (both standard on most systems).
โข Repo & docs: https://github.com/QwenLM/Qwen3-TTS
""")
| |
|
# Run the app directly: bind to all interfaces on the default Gradio port.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)