| | import gradio as gr |
| | import torch |
| | import numpy as np |
| | import soundfile as sf |
| | from pathlib import Path |
| | import os |
| | import warnings |
| | import subprocess |
| | import tempfile |
| | import math |
| |
|
# Silence noisy UserWarnings emitted by the audio/ML libraries on CPU runs.
warnings.filterwarnings("ignore", category=UserWarning)

# qwen_tts is optional: the audio->video tab still works without it, so record
# availability instead of failing at import time. get_model() checks this flag.
try:
    from qwen_tts import Qwen3TTSModel
    TTS_AVAILABLE = True
except ImportError:
    TTS_AVAILABLE = False
| |
|
| | |
| | |
| | |
| |
|
# Human-friendly model key -> Hugging Face repo id for the released
# Qwen3-TTS 12Hz variants:
#   CustomVoice — preset premium speakers (+ optional style instruction)
#   VoiceDesign — arbitrary voice from a natural-language description
#   Base        — rapid voice cloning from a short reference clip
MODELS = {
    "1.7B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
    "0.6B-CustomVoice": "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice",
    "1.7B-VoiceDesign": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
    "1.7B-Base": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
    "0.6B-Base": "Qwen/Qwen3-TTS-12Hz-0.6B-Base",
}

# Cache of loaded models keyed by "<model_key>_<dtype_str>" (see get_model).
loaded_models = {}
| |
|
def get_model(model_key: str, dtype_str: str = "float32", progress=gr.Progress()):
    """Return a Qwen3-TTS model for *model_key*, loading and caching on first use.

    Models are cached in the module-level ``loaded_models`` dict, keyed by
    ``"<model_key>_<dtype_str>"`` so float32 and float16 copies coexist.

    Raises:
        gr.Error: if qwen_tts is not installed or weight loading fails.
    """
    if not TTS_AVAILABLE:
        raise gr.Error("qwen_tts is not installed. TTS tabs unavailable.")
    cache_key = f"{model_key}_{dtype_str}"
    if cache_key in loaded_models:
        return loaded_models[cache_key]
    progress(0.1, desc=f"Loading {model_key} ({dtype_str}) โฆ")
    chosen_dtype = torch.float32 if dtype_str == "float32" else torch.float16
    try:
        # NOTE(review): both `dtype` and `torch_dtype` are passed with the same
        # value; depending on the installed qwen_tts/transformers version one of
        # the two is redundant (or rejected) — confirm which kwarg is expected.
        model = Qwen3TTSModel.from_pretrained(
            MODELS[model_key],
            device_map="cpu",
            dtype=chosen_dtype,
            torch_dtype=chosen_dtype,
            low_cpu_mem_usage=True,
        )
    except Exception as e:
        raise gr.Error(f"Model loading failed:\n{str(e)}\n\nTry float32 or smaller variant.")
    loaded_models[cache_key] = model
    progress(0.9, desc="Model ready.")
    return model
| |
|
| |
|
| | |
| | |
| | |
| |
|
def infer_custom_voice(text, lang, speaker, instruct, model_key, precision, progress=gr.Progress()):
    """Synthesize speech with a preset speaker (CustomVoice models).

    Returns (wav_path, markdown_info); on any failure returns (None, message)
    so the Gradio UI shows the error instead of crashing the request.
    """
    if not text.strip():
        return None, "Please enter some text."
    model = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating โฆ")
    try:
        wavs, sr = model.generate_custom_voice(
            text=text, language=lang if lang != "Auto" else None,
            speaker=speaker, instruct=instruct.strip() or None, max_new_tokens=1500,
        )
        # Fix: write to a unique temp file instead of a fixed "/tmp/..." path —
        # portable beyond Linux and safe for concurrent requests (no clobbering).
        with tempfile.NamedTemporaryFile(suffix=".wav", prefix="output_custom_", delete=False) as tmp:
            path = tmp.name
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        info = f"**Generated with {model_key}** \nlang: {lang} \nspeaker: {speaker} \ninstruct: {instruct or '(none)'}"
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"
| |
|
| |
|
def infer_voice_design(text, lang, instruct, model_key, precision, progress=gr.Progress()):
    """Synthesize speech for a voice described in natural language (VoiceDesign).

    Both the text and the voice description are required. Returns
    (wav_path, markdown_info) or (None, message) on failure.
    """
    if not text.strip() or not instruct.strip():
        return None, "Text and voice instruction required."
    model = get_model(model_key, precision, progress)
    progress(0.4, desc="Generating โฆ")
    try:
        wavs, sr = model.generate_voice_design(
            text=text, language=lang if lang != "Auto" else None,
            instruct=instruct, max_new_tokens=1500,
        )
        # Fix: unique temp file instead of a fixed "/tmp/..." path — portable
        # and safe when several users generate at the same time.
        with tempfile.NamedTemporaryFile(suffix=".wav", prefix="output_design_", delete=False) as tmp:
            path = tmp.name
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        info = f"**Voice Design โ {model_key}** \nlang: {lang} \ninstruct: {instruct}"
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"
| |
|
| |
|
def infer_voice_clone(text, lang, ref_audio, ref_text, x_vector_only, model_key, precision, progress=gr.Progress()):
    """Clone the voice from *ref_audio* and speak *text* with it (Base models).

    `ref_text` (transcript of the reference clip) is optional; `x_vector_only`
    trades quality for speed. Returns (wav_path, markdown_info) or
    (None, message) on failure.
    """
    if not text.strip():
        return None, "Enter text to synthesize."
    if not ref_audio:
        return None, "Upload reference audio."
    model = get_model(model_key, precision, progress)
    progress(0.3, desc="Processing reference โฆ")
    try:
        wavs, sr = model.generate_voice_clone(
            text=text, language=lang if lang != "Auto" else None,
            ref_audio=ref_audio, ref_text=ref_text.strip() or None,
            x_vector_only_mode=x_vector_only, max_new_tokens=1500,
        )
        # Fix: unique temp file instead of a fixed "/tmp/..." path — portable
        # and safe for concurrent requests.
        with tempfile.NamedTemporaryFile(suffix=".wav", prefix="output_clone_", delete=False) as tmp:
            path = tmp.name
        sf.write(path, wavs[0] if isinstance(wavs, list) else wavs, sr)
        info = f"**Voice Clone โ {model_key}** \nlang: {lang} \nx-vector-only: {x_vector_only}"
        return path, info
    except Exception as e:
        return None, f"**Error**: {str(e)}"
| |
|
| |
|
| | |
| | |
| | |
| |
|
# Dropdown label -> (width, height) in pixels for the video renderer.
RESOLUTIONS = {
    "1080ร1920 (TikTok/Reels 9:16)": (1080, 1920),
    "1080ร1080 (Instagram Square)": (1080, 1080),
    "1920ร1080 (YouTube Landscape)": (1920, 1080),
    "1280ร720 (YouTube 720p)": (1280, 720),
}

# Visual style labels shown in the UI; each must have a matching key in
# RENDERERS below (the label string IS the dispatch key).
VISUAL_STYLES = [
    "๐ Solid + Waveform",
    "๐ Animated Spectrum Bars",
    "โก Oscilloscope Line",
    "๐ Gradient Pulse",
    "๐ฒ Minimal Dark + Title",
]
| |
|
| |
|
def hex_to_rgb(h: str):
    """Convert a hex colour string to an ``(r, g, b)`` tuple of ints.

    Accepts both 6-digit ("#7c3aed") and 3-digit shorthand ("#fff") forms,
    with or without the leading "#", and tolerates surrounding whitespace.
    Gradio's ColorPicker normally emits the 6-digit form, but browser/theme
    variations can produce shorthand.
    """
    h = h.strip().lstrip("#")
    if len(h) == 3:
        # Expand CSS shorthand: "fab" -> "ffaabb".
        h = "".join(c * 2 for c in h)
    return tuple(int(h[i:i + 2], 16) for i in (0, 2, 4))
| |
|
| |
|
def render_frame_solid_waveform(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Solid background with a centered waveform made of vertical bars.

    Each sample in `audio_chunk` becomes one bar; bar height is proportional
    to |amplitude| and capped at 40% of the frame height. `frame_idx`/`fps`
    are unused here but kept for the common renderer signature.
    """
    from PIL import Image, ImageDraw, ImageFont
    img = Image.new("RGB", (w, h), bg_color)
    draw = ImageDraw.Draw(img)

    n = len(audio_chunk)
    if n > 0:
        cx = h // 2  # vertical midline of the waveform
        bar_w = max(1, w // n)
        for i, amp in enumerate(audio_chunk):
            x = int(i * w / n)
            bar_h = int(abs(amp) * h * 0.4)
            draw.rectangle([x, cx - bar_h, x + bar_w - 1, cx + bar_h], fill=accent_color)

    # Fix: draw the title even when the chunk is empty — the original returned
    # early on n == 0, making the caption flicker off on silent frames.
    if title:
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(20, h // 25))
        except Exception:
            font = None  # fall back to PIL's built-in bitmap font
        draw.text((w // 2, h // 10), title, fill=(255, 255, 255), anchor="mm", font=font)
    return img
| |
|
| |
|
def render_frame_spectrum(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Animated spectrum-style bars driven by an FFT of the frame's samples."""
    from PIL import Image, ImageDraw
    img = Image.new("RGB", (w, h), bg_color)
    draw = ImageDraw.Draw(img)

    N_BARS = 64
    if len(audio_chunk) > 0:
        # Magnitude spectrum (chunk zero-padded/truncated to 512 samples),
        # normalized to [0, 1]; only the lowest N_BARS bins are shown.
        levels = np.abs(np.fft.rfft(audio_chunk, n=512))[:N_BARS]
        levels = levels / (levels.max() + 1e-9)
    else:
        levels = np.zeros(N_BARS)

    bar_w = w // N_BARS
    for idx, level in enumerate(levels):
        left = idx * bar_w
        right = left + bar_w - 2  # leave a 2-px gap between bars
        height = int(level * h * 0.8)
        r, g, b = accent_color
        draw.rectangle([left, h - height, right, h], fill=(r, g, b))

    if title:
        try:
            from PIL import ImageFont
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(20, h // 25))
        except Exception:
            font = None
        draw.text((w // 2, h // 10), title, fill=(255, 255, 255), anchor="mm", font=font)
    return img
| |
|
| |
|
def render_frame_oscilloscope(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Classic green-phosphor oscilloscope trace.

    Note: `bg_color` is intentionally ignored — the background is a fixed
    near-black (10, 10, 10) to keep the scope aesthetic.
    """
    from PIL import Image, ImageDraw
    img = Image.new("RGB", (w, h), (10, 10, 10))
    draw = ImageDraw.Draw(img)

    count = len(audio_chunk)
    mid = h // 2
    # One (x, y) point per sample, amplitude scaled to 40% of frame height
    # and clamped to the visible area.
    points = [
        (int(i * w / count),
         max(0, min(h - 1, int(mid - audio_chunk[i] * h * 0.4))))
        for i in range(count)
    ]
    if len(points) > 1:
        draw.line(points, fill=accent_color, width=max(2, h // 200))

    if title:
        try:
            from PIL import ImageFont
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(20, h // 25))
        except Exception:
            font = None
        draw.text((w // 2, h // 12), title, fill=(200, 255, 200), anchor="mm", font=font)
    return img
| |
|
| |
|
def render_frame_gradient_pulse(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Radial gradient that pulses with the chunk's RMS energy.

    Performance fix: the original filled the frame with a per-pixel Python
    double loop (the UI even warns this style is slow). The gradient is now
    computed as whole-frame NumPy arrays and converted via Image.fromarray —
    same truncation semantics, orders of magnitude faster.
    """
    from PIL import Image, ImageDraw
    rms = float(np.sqrt(np.mean(audio_chunk ** 2))) if len(audio_chunk) > 0 else 0.0
    r0, g0, b0 = bg_color
    r1, g1, b1 = accent_color
    cx, cy = w // 2, h // 2
    max_r = math.sqrt(cx ** 2 + cy ** 2)
    pulse = 0.3 + rms * 2.5  # louder audio -> wider bright core

    # Normalized distance of every pixel from the centre (broadcasted grid),
    # then blend factor t in [0, 1]: 1 at the centre, 0 at/after `pulse`.
    ys = np.arange(h, dtype=np.float64)[:, None]
    xs = np.arange(w, dtype=np.float64)[None, :]
    dist = np.sqrt((xs - cx) ** 2 + (ys - cy) ** 2) / max_r
    t = np.clip(1.0 - dist / pulse, 0.0, 1.0)

    rgb = np.empty((h, w, 3), dtype=np.uint8)
    rgb[..., 0] = (r0 + t * (r1 - r0)).astype(np.uint8)
    rgb[..., 1] = (g0 + t * (g1 - g0)).astype(np.uint8)
    rgb[..., 2] = (b0 + t * (b1 - b0)).astype(np.uint8)
    img = Image.fromarray(rgb, "RGB")

    draw = ImageDraw.Draw(img)
    if title:
        try:
            from PIL import ImageFont
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", max(20, h // 25))
        except Exception:
            font = None
        draw.text((cx, h // 10), title, fill=(255, 255, 255), anchor="mm", font=font)
    return img
| |
|
| |
|
def render_frame_minimal_dark(w, h, audio_chunk, bg_color, accent_color, title, frame_idx, fps):
    """Minimal dark frame: thin waveform strip, bottom progress bar, title.

    Note: `bg_color` is ignored — the background is a fixed dark grey.
    """
    # Fix: ImageFont was never imported here, so ImageFont.truetype below
    # always raised NameError, which the `except` silently swallowed — the
    # title was drawn with the default bitmap font regardless of availability.
    from PIL import Image, ImageDraw, ImageFont
    img = Image.new("RGB", (w, h), (18, 18, 22))
    draw = ImageDraw.Draw(img)

    strip_h = max(4, h // 15)
    cy = h // 2
    n = len(audio_chunk)
    for i in range(n):
        x = int(i * w / n)
        amp = int(audio_chunk[i] * strip_h)
        draw.rectangle([x, cy - abs(amp), x, cy + abs(amp)], fill=accent_color)

    # NOTE(review): this scales frame_idx by fps only, so the bar fills over
    # the first ~1 second of video, not the whole clip — the total frame
    # count is not available through the renderer signature. Confirm intent.
    prog_w = int(frame_idx * w / max(fps * 1, 1))
    draw.rectangle([0, h - 4, prog_w, h], fill=accent_color)
    if title:
        try:
            font_size = max(24, h // 18)
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size)
        except Exception:
            font = None  # fall back to PIL's built-in bitmap font
        draw.text((w // 2, h * 2 // 5), title, fill=(240, 240, 245), anchor="mm", font=font)
    return img
| |
|
| |
|
# Dispatch table: UI style label -> frame renderer. Keys must stay exactly
# in sync with VISUAL_STYLES (the label string itself is the lookup key).
RENDERERS = {
    "๐ Solid + Waveform": render_frame_solid_waveform,
    "๐ Animated Spectrum Bars": render_frame_spectrum,
    "โก Oscilloscope Line": render_frame_oscilloscope,
    "๐ Gradient Pulse": render_frame_gradient_pulse,
    "๐ฒ Minimal Dark + Title": render_frame_minimal_dark,
}
| |
|
| |
|
def audio_to_video(
    audio_path, style, resolution_label,
    bg_hex, accent_hex, title_text, fps_str,
    progress=gr.Progress()
):
    """Render an audio file into an MP4 with the selected visual style.

    Frames are drawn with PIL into a temporary directory, then muxed with the
    audio track by ffmpeg (libx264 + aac). Returns (video_path, markdown_info)
    on success or (None, error_markdown) on failure.
    """
    if not audio_path:
        return None, "โ No audio file provided. Generate or upload audio first."

    fps = int(fps_str)  # Radio choices are "24"/"30", so int() is safe
    w, h = RESOLUTIONS[resolution_label]
    bg_color = hex_to_rgb(bg_hex)
    accent_color = hex_to_rgb(accent_hex)
    render_fn = RENDERERS[style]

    progress(0.05, desc="Reading audio โฆ")
    try:
        audio_data, sr = sf.read(audio_path, dtype="float32")
    except Exception as e:
        return None, f"โ Could not read audio: {e}"

    if audio_data.ndim > 1:  # downmix stereo/multichannel to mono
        audio_data = audio_data.mean(axis=1)
    if len(audio_data) == 0:
        return None, "โ Audio file contains no samples."

    duration = len(audio_data) / sr
    # Fix: guarantee at least one frame — for clips shorter than 1/fps s the
    # original computed n_frames == 0, rendered nothing and ffmpeg failed.
    n_frames = max(1, int(duration * fps))
    samples_per_frame = max(1, len(audio_data) // n_frames)

    progress(0.10, desc="Rendering frames โฆ")
    with tempfile.TemporaryDirectory() as tmpdir:
        frame_dir = Path(tmpdir) / "frames"
        frame_dir.mkdir()

        for fi in range(n_frames):
            # Throttle progress updates to ~20 over the whole render.
            if fi % max(1, n_frames // 20) == 0:
                progress(0.10 + 0.65 * fi / n_frames, desc=f"Frame {fi}/{n_frames} โฆ")

            start = fi * samples_per_frame
            end = min(start + samples_per_frame, len(audio_data))
            chunk = audio_data[start:end] if end > start else np.zeros(64)

            img = render_fn(w, h, chunk, bg_color, accent_color, title_text, fi, fps)
            img.save(str(frame_dir / f"frame_{fi:06d}.png"))

        progress(0.80, desc="Encoding video โฆ")
        # Fix: unique output in the system temp dir instead of a hard-coded
        # /tmp path (portable; concurrent renders don't overwrite each other).
        out_fd, out_path = tempfile.mkstemp(suffix=".mp4", prefix="tts_video_")
        os.close(out_fd)
        ffmpeg_cmd = [
            "ffmpeg", "-y",
            "-framerate", str(fps),
            "-i", str(frame_dir / "frame_%06d.png"),
            "-i", audio_path,
            "-c:v", "libx264",
            "-preset", "fast",
            "-crf", "23",
            "-pix_fmt", "yuv420p",
            "-c:a", "aac",
            "-b:a", "192k",
            "-shortest",
            "-movflags", "+faststart",
            out_path,
        ]
        try:
            result = subprocess.run(ffmpeg_cmd, capture_output=True, text=True)
        except FileNotFoundError:
            # Fix: a missing ffmpeg binary previously escaped as an uncaught
            # exception instead of a readable UI message.
            return None, "โ ffmpeg executable not found. Install ffmpeg and retry."
        if result.returncode != 0:
            return None, f"โ ffmpeg error:\n```\n{result.stderr[-1500:]}\n```"

    progress(1.0, desc="Done!")
    info = (
        f"โ **Video ready!** \n"
        f"Style: `{style}` ยท Resolution: `{w}ร{h}` ยท FPS: `{fps}` ยท Duration: `{duration:.1f}s`"
    )
    return out_path, info
| |
|
| |
|
| | |
| | |
| | |
| |
|
| | css = """ |
| | .radio-row { display: flex; flex-wrap: wrap; gap: 1.2rem; align-items: center; } |
| | .radio-row > div { min-width: 140px; } |
| | """ |
| |
|
# ---------------------------------------------------------------------------
# Gradio UI: one tab per TTS model family plus the audio-to-video renderer.
# Component variables are prefixed per tab: cv_ / vd_ / cl_ / vid_.
# ---------------------------------------------------------------------------
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Qwen3-TTS Full Demo\nAll released variants โข CPU-friendly โข No streaming")

    # --- CustomVoice: 9 preset premium speakers + optional style instruct ---
    with gr.Tab("CustomVoice โ Preset speakers + instruct"):
        gr.Markdown("Uses 9 built-in premium voices + optional style instruction")
        with gr.Row(elem_classes="radio-row"):
            cv_model = gr.Radio(["1.7B-CustomVoice", "0.6B-CustomVoice"], value="1.7B-CustomVoice", label="Model")
            cv_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        with gr.Row():
            cv_text = gr.Textbox(label="Text to speak", lines=4, value="่ฟๆฏไธไธชๆต่ฏใๅธๆๅฃฐ้ณๅฌ่ตทๆฅ่ช็ถไธไบใ")
            cv_lang = gr.Dropdown(["Auto","Chinese","English","Japanese","Korean"], value="Auto", label="Language")
        cv_speaker = gr.Dropdown(
            ["Vivian","Serena","Uncle_Fu","Dylan","Eric","Ryan","Aiden","Ono_Anna","Sohee"],
            value="Vivian", label="Speaker"
        )
        cv_instruct = gr.Textbox(label="Style instruction (optional)", lines=2, placeholder="็จ็นๅซๆคๆ็่ฏญๆฐ่ฏด")
        cv_btn = gr.Button("Generate", variant="primary")
        cv_audio = gr.Audio(label="Generated Speech", type="filepath")
        cv_info = gr.Markdown()
        cv_btn.click(infer_custom_voice,
                     inputs=[cv_text, cv_lang, cv_speaker, cv_instruct, cv_model, cv_precision],
                     outputs=[cv_audio, cv_info])

    # --- VoiceDesign: synthesize an arbitrary voice from a text description ---
    with gr.Tab("Voice Design โ Describe any voice"):
        gr.Markdown("Create arbitrary voices from natural language description (only 1.7B variant)")
        with gr.Row(elem_classes="radio-row"):
            vd_model = gr.Radio(["1.7B-VoiceDesign"], value="1.7B-VoiceDesign", label="Model")
            vd_precision = gr.Radio(["float32", "float16"], value="float32", label="Precision")
        vd_text = gr.Textbox(label="Text to speak", lines=4, value="ๅฅๅฅ๏ผไฝ ๅๆฅๅฆ๏ผไบบๅฎถ็ญไบๅฅฝไน๏ผ่ฆๆฑๆฑ๏ผ")
        vd_lang = gr.Dropdown(["Auto","Chinese","English"], value="Chinese", label="Language")
        vd_instruct = gr.Textbox(label="Voice description / instruction", lines=4,
                                 value="ไฝ็ฐๆๅจ็จๅซฉ็่่ๅฅณๅฃฐ๏ผ้ณ่ฐๅ้ซไธ่ตทไผๆๆพ๏ผ้ปไบบใๅไฝๅๅปๆๅ่็ๆ่ง")
        vd_btn = gr.Button("Generate", variant="primary")
        vd_audio = gr.Audio(label="Generated Speech", type="filepath")
        vd_info = gr.Markdown()
        vd_btn.click(infer_voice_design,
                     inputs=[vd_text, vd_lang, vd_instruct, vd_model, vd_precision],
                     outputs=[vd_audio, vd_info])

    # --- Base models: rapid voice cloning from a short reference clip ---
    with gr.Tab("Base โ Voice Clone from reference audio"):
        gr.Markdown("3-second rapid voice cloning using reference clip (Base models only)")
        with gr.Row(elem_classes="radio-row"):
            cl_model = gr.Radio(["1.7B-Base","0.6B-Base"], value="1.7B-Base", label="Model")
            cl_precision = gr.Radio(["float32","float16"], value="float32", label="Precision")
        cl_text = gr.Textbox(label="Text to synthesize", lines=4, value="This is my cloned voice now speaking normally.")
        cl_lang = gr.Dropdown(["Auto","English","Chinese"], value="Auto", label="Language")
        with gr.Row():
            cl_ref_audio = gr.Audio(label="Reference audio clip", type="filepath", sources=["upload","microphone"])
            cl_ref_text = gr.Textbox(label="Transcript of reference (optional)", lines=2)
        cl_xvec_only = gr.Checkbox(label="x-vector only mode (faster, lower quality)", value=False)
        cl_btn = gr.Button("Clone & Generate", variant="primary")
        cl_audio = gr.Audio(label="Cloned Speech", type="filepath")
        cl_info = gr.Markdown()
        cl_btn.click(infer_voice_clone,
                     inputs=[cl_text, cl_lang, cl_ref_audio, cl_ref_text, cl_xvec_only, cl_model, cl_precision],
                     outputs=[cl_audio, cl_info])

    # --- Audio -> Video: render any clip into a styled MP4 via ffmpeg ---
    with gr.Tab("๐ฌ Audio โ Video"):
        gr.Markdown(
            "## Audio โ Social Media Video\n"
            "Upload **any WAV/MP3** (or paste the path from a generated clip above) "
            "and render it into a shareable MP4 with a visual style.\n\n"
            "> โ ๏ธ **Gradient Pulse** renders per-pixel and is slow for long audio โ prefer other styles for > 30 s clips."
        )

        with gr.Row():
            with gr.Column(scale=3):
                vid_audio = gr.Audio(
                    label="Input audio (upload or record)",
                    type="filepath",
                    sources=["upload", "microphone"],
                )
                vid_title = gr.Textbox(
                    label="Title / caption text (shown on video)",
                    placeholder="My AI Voice ยท Qwen3-TTS",
                    value=""
                )
            with gr.Column(scale=2):
                vid_style = gr.Radio(
                    VISUAL_STYLES,
                    value="๐ Animated Spectrum Bars",
                    label="Visual style",
                )
                vid_res = gr.Dropdown(
                    list(RESOLUTIONS.keys()),
                    value="1080ร1920 (TikTok/Reels 9:16)",
                    label="Resolution / aspect ratio",
                )
                vid_fps = gr.Radio(["24", "30"], value="24", label="FPS")

        with gr.Row():
            vid_bg = gr.ColorPicker(value="#0d0d1a", label="Background colour")
            vid_accent = gr.ColorPicker(value="#7c3aed", label="Accent / waveform colour")

        vid_btn = gr.Button("๐ฌ Render Video", variant="primary", size="lg")
        vid_out = gr.Video(label="Output video")
        vid_info = gr.Markdown()

        vid_btn.click(
            audio_to_video,
            inputs=[vid_audio, vid_style, vid_res, vid_bg, vid_accent, vid_title, vid_fps],
            outputs=[vid_out, vid_info],
        )

        gr.Markdown("""
**Style guide:**
| Style | Best for | Notes |
|---|---|---|
| ๐ Solid + Waveform | Podcasts, quotes | Fast, clean |
| ๐ Animated Spectrum Bars | Music / speech highlights | FFT-based, energetic |
| โก Oscilloscope Line | Dark/techy aesthetic | Classic green-on-black |
| ๐ Gradient Pulse | Ambient / ASMR | Slow render โ use short clips |
| ๐ฒ Minimal Dark + Title | Branded content | Great with a title caption |
""")

    # Footer notes shown below all tabs.
    gr.Markdown("""
**Notes**
โข First generation per model loads weights (may take 1โ5 min).
โข Use **float32** if **float16** causes crashes (common on CPU).
โข **0.6B** models are faster / lighter on CPU.
โข Video tab requires `ffmpeg` and `Pillow` (both standard on most systems).
โข Repo & docs: https://github.com/QwenLM/Qwen3-TTS
""")
| |
|
# Run the app directly: bind to all interfaces on the default Gradio port.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)