Spaces:

Accordic
/

C____C

Sleeping

App Files Files Community

C____C / app.py

Accordic

Update app.py

38b750c verified 2 months ago

raw

history blame contribute delete

16.1 kB

	# coding=utf-8
	# Qwen3-TTS Gradio Demo - Giao diện Responsive
	import os
	import gradio as gr
	import numpy as np
	import torch
	from huggingface_hub import snapshot_download, login

	# Log in to the Hugging Face Hub, but only when a token is configured in
	# the environment; without one the app runs unauthenticated.
	HF_TOKEN = os.getenv("HF_TOKEN")
	if HF_TOKEN:
	    login(token=HF_TOKEN)

	# Cache of already-loaded models, keyed by (model_type, model_size).
	loaded_models = {}
	# Model sizes offered in the UI dropdowns.
	MODEL_SIZES = ["0.6B", "1.7B"]

	def get_model_path(model_type: str, model_size: str) -> str:
	    """Download the checkpoint for a model variant and return its local path.

	    Args:
	        model_type: Variant suffix of the repository name
	            (the callers in this file pass "VoiceDesign", "Base" or
	            "CustomVoice").
	        model_size: Size tag embedded in the repository name, e.g. "0.6B".

	    Returns:
	        Whatever ``snapshot_download`` returns for the repository
	        ``Qwen/Qwen3-TTS-12Hz-{model_size}-{model_type}``.
	    """
	    repo_id = f"Qwen/Qwen3-TTS-12Hz-{model_size}-{model_type}"
	    return snapshot_download(repo_id)

	def get_model(model_type: str, model_size: str):
	    """Return the TTS model for a variant, loading and caching it on first use.

	    Loaded models are kept in the module-level ``loaded_models`` dict under
	    the key ``(model_type, model_size)``, so each variant is loaded at most
	    once per successful load.

	    Args:
	        model_type: Variant name forwarded to ``get_model_path``.
	        model_size: Size tag forwarded to ``get_model_path``.

	    Returns:
	        The object produced by ``Qwen3TTSModel.from_pretrained``.
	    """
	    cache_key = (model_type, model_size)
	    try:
	        return loaded_models[cache_key]
	    except KeyError:
	        pass  # Not loaded yet; fall through and load it.

	    # Imported lazily so the dependency is only needed when a model is
	    # actually requested.
	    from qwen_tts import Qwen3TTSModel

	    checkpoint_dir = get_model_path(model_type, model_size)
	    # bfloat16 on GPU, full precision on CPU.
	    if torch.cuda.is_available():
	        device, dtype = "cuda", torch.bfloat16
	    else:
	        device, dtype = "cpu", torch.float32
	    print(f"Đang tải model {model_type} {model_size} trên {device}")

	    model = Qwen3TTSModel.from_pretrained(
	        checkpoint_dir,
	        token=HF_TOKEN,
	        dtype=dtype,
	        device_map=device,
	    )
	    loaded_models[cache_key] = model
	    return model

	def _normalize_audio(wav, eps=1e-12, clip=True):
	    """Convert audio samples to a float32 waveform scaled to about [-1, 1].

	    Integer input is scaled by the range of its dtype: signed types are
	    divided by the larger magnitude of the dtype bounds, unsigned types are
	    re-centred around the midpoint of the range. Floating-point input is
	    only rescaled when its peak magnitude exceeds 1 (with a 1e-6
	    tolerance). Arrays with more than one dimension are averaged over the
	    last axis (assumed to be the channel axis - TODO confirm).

	    Args:
	        wav: Array-like of integer or floating-point samples.
	        eps: Added to the peak before dividing floating-point input.
	        clip: When true, clamp the scaled samples to [-1, 1].

	    Returns:
	        A float32 ``numpy`` array.

	    Raises:
	        TypeError: If the dtype is neither integer nor floating point.
	    """
	    samples = np.asarray(wav)
	    source_dtype = samples.dtype
	    is_integer = np.issubdtype(source_dtype, np.integer)
	    if not is_integer and not np.issubdtype(source_dtype, np.floating):
	        raise TypeError(f"Kiểu dữ liệu không hỗ trợ: {source_dtype}")

	    scaled = samples.astype(np.float32)
	    if is_integer:
	        limits = np.iinfo(source_dtype)
	        if limits.min < 0:
	            # Signed: divide by the largest representable magnitude.
	            scaled = scaled / max(abs(limits.min), limits.max)
	        else:
	            # Unsigned: shift the midpoint of the range to zero first.
	            midpoint = (limits.max + 1) / 2.0
	            scaled = (scaled - midpoint) / midpoint
	    else:
	        peak = np.max(np.abs(scaled)) if scaled.size else 0.0
	        if peak > 1.0 + 1e-6:
	            scaled = scaled / (peak + eps)

	    if clip:
	        scaled = np.clip(scaled, -1.0, 1.0)
	    if scaled.ndim <= 1:
	        return scaled
	    # Downmix by averaging over the last axis.
	    return np.mean(scaled, axis=-1).astype(np.float32)

	def _audio_to_tuple(audio):
	    """Convert an audio input value into a ``(waveform, sample_rate)`` pair.

	    Two input shapes are recognised:

	    * a 2-tuple ``(sample_rate, samples)`` whose first item is an ``int``;
	    * a dict containing the keys ``"sampling_rate"`` and ``"data"``.

	    The samples are passed through ``_normalize_audio``. Note that the
	    returned pair puts the waveform first and the sample rate second.

	    Args:
	        audio: The raw audio value, or ``None``.

	    Returns:
	        ``(waveform, sample_rate)``, or ``None`` when ``audio`` is ``None``
	        or has an unrecognised shape.
	    """
	    if isinstance(audio, tuple):
	        if len(audio) == 2 and isinstance(audio[0], int):
	            rate, samples = audio
	            return _normalize_audio(samples), int(rate)
	        return None

	    if isinstance(audio, dict):
	        if "sampling_rate" in audio and "data" in audio:
	            rate = int(audio["sampling_rate"])
	            return _normalize_audio(audio["data"]), rate
	        return None

	    # None and every other type are not usable as reference audio.
	    return None

	# Preset speaker names offered in the speaker dropdown.
	SPEAKERS = [
	    "Aiden",
	    "Dylan",
	    "Eric",
	    "Ono_anna",
	    "Ryan",
	    "Serena",
	    "Sohee",
	    "Uncle_fu",
	    "Vivian",
	]

	# Vietnamese UI label -> language name handed to the model.
	LANGUAGE_MAP = {
	    "Tự động": "Auto",
	    "Tiếng Trung": "Chinese",
	    "Tiếng Anh": "English",
	    "Tiếng Nhật": "Japanese",
	    "Tiếng Hàn": "Korean",
	    "Tiếng Pháp": "French",
	    "Tiếng Đức": "German",
	    "Tiếng Tây Ban Nha": "Spanish",
	    "Tiếng Bồ Đào Nha": "Portuguese",
	    "Tiếng Nga": "Russian",
	}

	# Dropdown choices: the UI labels, in the order they are declared above.
	LANGUAGES = list(LANGUAGE_MAP)

	def generate_voice_design(text, language, voice_description):
	    """Synthesize ``text`` with a voice created from a free-form description.

	    Always uses the "VoiceDesign" model at size "1.7B".

	    Args:
	        text: Text to speak; must contain non-whitespace characters.
	        language: UI language label. It is looked up in ``LANGUAGE_MAP``
	            and falls back to ``"Auto"`` when the label is unknown.
	        voice_description: Description of the desired voice, forwarded to
	            the model as ``instruct``; must contain non-whitespace
	            characters.

	    Returns:
	        A pair ``(audio, status)``. ``audio`` is ``(sample_rate, waveform)``
	        on success and ``None`` otherwise; ``status`` is the user-facing
	        message. Exceptions are never propagated to the caller.
	    """
	    if not text or not text.strip():
	        return None, "❌ Vui lòng nhập văn bản"
	    if not voice_description or not voice_description.strip():
	        return None, "❌ Vui lòng nhập mô tả giọng nói"
	    try:
	        tts = get_model("VoiceDesign", "1.7B")
	        lang_code = LANGUAGE_MAP.get(language, "Auto")
	        wavs, sr = tts.generate_voice_design(
	            text=text.strip(), language=lang_code,
	            instruct=voice_description.strip(),
	            non_streaming_mode=True, max_new_tokens=2048
	        )
	        return (sr, wavs[0]), "✅ Hoàn thành!"
	    except Exception as e:
	        # The status string only carries str(e), which can even be empty.
	        # Print the full traceback to stderr so the failure can still be
	        # diagnosed from the server log.
	        import traceback
	        traceback.print_exc()
	        return None, f"❌ Lỗi: {e}"

	def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector_only, model_size):
	    """Synthesize ``target_text`` in the voice of a reference recording.

	    Uses the "Base" model at the requested size.

	    Args:
	        ref_audio: Reference audio in any form ``_audio_to_tuple`` accepts.
	        ref_text: Transcript of the reference audio. Required unless
	            ``use_xvector_only`` is true; ``None`` is sent to the model
	            when it is empty.
	        target_text: Text to speak; must contain non-whitespace characters.
	        language: UI language label. It is looked up in ``LANGUAGE_MAP``
	            and falls back to ``"Auto"`` when the label is unknown.
	        use_xvector_only: Forwarded to the model as ``x_vector_only_mode``;
	            when true the transcript is optional.
	        model_size: Size tag forwarded to ``get_model``.

	    Returns:
	        A pair ``(audio, status)``. ``audio`` is ``(sample_rate, waveform)``
	        on success and ``None`` otherwise; ``status`` is the user-facing
	        message. Errors raised while loading or running the model are
	        reported through ``status`` instead of being propagated.
	    """
	    if not target_text or not target_text.strip():
	        return None, "❌ Vui lòng nhập văn bản đích"
	    audio_tuple = _audio_to_tuple(ref_audio)
	    if audio_tuple is None:
	        return None, "❌ Vui lòng tải audio tham chiếu"
	    if not use_xvector_only and (not ref_text or not ref_text.strip()):
	        return None, "❌ Vui lòng nhập văn bản tham chiếu"
	    try:
	        tts = get_model("Base", model_size)
	        lang_code = LANGUAGE_MAP.get(language, "Auto")
	        wavs, sr = tts.generate_voice_clone(
	            text=target_text.strip(), language=lang_code,
	            ref_audio=audio_tuple,
	            ref_text=ref_text.strip() if ref_text else None,
	            x_vector_only_mode=use_xvector_only, max_new_tokens=2048
	        )
	        return (sr, wavs[0]), "✅ Hoàn thành!"
	    except Exception as e:
	        # The status string only carries str(e), which can even be empty.
	        # Print the full traceback to stderr so the failure can still be
	        # diagnosed from the server log.
	        import traceback
	        traceback.print_exc()
	        return None, f"❌ Lỗi: {e}"

	def generate_custom_voice(text, language, speaker, instruct, model_size):
	    """Synthesize ``text`` with one of the preset speakers.

	    Uses the "CustomVoice" model at the requested size.

	    Args:
	        text: Text to speak; must contain non-whitespace characters.
	        language: UI language label. It is looked up in ``LANGUAGE_MAP``
	            and falls back to ``"Auto"`` when the label is unknown.
	        speaker: Speaker name; it is lower-cased and spaces are replaced
	            with underscores before being sent to the model.
	        instruct: Optional style instruction; ``None`` is sent when empty.
	        model_size: Size tag forwarded to ``get_model``.

	    Returns:
	        A pair ``(audio, status)``. ``audio`` is ``(sample_rate, waveform)``
	        on success and ``None`` otherwise; ``status`` is the user-facing
	        message. Exceptions are never propagated to the caller.
	    """
	    if not text or not text.strip():
	        return None, "❌ Vui lòng nhập văn bản"
	    if not speaker:
	        return None, "❌ Vui lòng chọn giọng nói"
	    try:
	        tts = get_model("CustomVoice", model_size)
	        lang_code = LANGUAGE_MAP.get(language, "Auto")
	        wavs, sr = tts.generate_custom_voice(
	            text=text.strip(), language=lang_code,
	            speaker=speaker.lower().replace(" ", "_"),
	            instruct=instruct.strip() if instruct else None,
	            non_streaming_mode=True, max_new_tokens=2048
	        )
	        return (sr, wavs[0]), "✅ Hoàn thành!"
	    except Exception as e:
	        # The status string only carries str(e), which can even be empty.
	        # Print the full traceback to stderr so the failure can still be
	        # diagnosed from the server log.
	        import traceback
	        traceback.print_exc()
	        return None, f"❌ Lỗi: {e}"

	def build_ui():
	    """Build the Gradio interface and return it without launching it.

	    The page has a header, three tabs and a footer:

	    * voice design  -> ``generate_voice_design``
	    * voice cloning -> ``generate_voice_clone``
	    * preset voices -> ``generate_custom_voice``

	    Each tab has its own button, audio output and status box; the button's
	    ``click`` handler sends the tab's inputs to the matching generator.

	    Returns:
	        The ``gr.Blocks`` instance.
	    """
	    theme = gr.themes.Soft(
	        font=[gr.themes.GoogleFont("Inter"), "sans-serif"],
	        primary_hue="blue",
	        radius_size="md",
	    )

	    # Custom stylesheet handed to gr.Blocks. Everything between the triple
	    # quotes (including leading whitespace) is part of the string.
	    css = """
	/* Container chính */
	.gradio-container {
	max-width: 100% !important;
	padding: 10px !important;
	}

	/* Tab style */
	.tab-nav button {
	font-size: 14px !important;
	padding: 10px 15px !important;
	}

	/* Button style */
	button.primary {
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
	border: none !important;
	font-weight: 600 !important;
	padding: 12px 24px !important;
	border-radius: 8px !important;
	transition: all 0.3s ease !important;
	}

	button.primary:hover {
	transform: translateY(-2px) !important;
	box-shadow: 0 8px 16px rgba(102, 126, 234, 0.3) !important;
	}

	/* Input fields */
	textarea, input, select {
	border-radius: 8px !important;
	border: 1.5px solid #e0e0e0 !important;
	font-size: 14px !important;
	}

	/* Labels */
	label {
	font-weight: 600 !important;
	color: #374151 !important;
	margin-bottom: 8px !important;
	}

	/* Header */
	.app-header {
	text-align: center;
	padding: 20px 10px;
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	color: white;
	border-radius: 12px;
	margin-bottom: 20px;
	}

	.app-header h1 {
	margin: 0;
	font-size: clamp(24px, 5vw, 36px);
	font-weight: 700;
	}

	.app-header p {
	margin: 8px 0 0 0;
	font-size: clamp(12px, 3vw, 16px);
	opacity: 0.95;
	}

	/* Card style cho sections */
	.input-card {
	background: white;
	padding: 20px;
	border-radius: 12px;
	box-shadow: 0 2px 8px rgba(0,0,0,0.08);
	margin-bottom: 15px;
	}

	/* Status message */
	.status-box {
	padding: 12px;
	border-radius: 8px;
	margin-top: 10px;
	font-size: 13px;
	}

	/* Info boxes */
	.info-box {
	background: #f0f9ff;
	border-left: 4px solid #3b82f6;
	padding: 12px 15px;
	border-radius: 6px;
	margin: 10px 0;
	font-size: 13px;
	}

	.warning-box {
	background: #fef3c7;
	border-left: 4px solid #f59e0b;
	padding: 12px 15px;
	border-radius: 6px;
	margin: 10px 0;
	font-size: 13px;
	}

	/* Responsive adjustments */
	@media (max-width: 768px) {
	.gradio-container {
	padding: 5px !important;
	}

	.app-header {
	padding: 15px 10px;
	margin-bottom: 15px;
	}

	.input-card {
	padding: 15px;
	}

	button.primary {
	width: 100%;
	padding: 14px 20px !important;
	}

	.tab-nav button {
	font-size: 12px !important;
	padding: 8px 10px !important;
	}
	}

	/* Compact spacing for mobile */
	@media (max-width: 480px) {
	.block {
	margin: 8px 0 !important;
	}

	textarea {
	font-size: 14px !important;
	}
	}
	"""

	    # Components are laid out in the order they are created inside each
	    # context manager, so the statement order below is the page layout.
	    with gr.Blocks(theme=theme, css=css, title="Qwen3-TTS") as demo:
	        # Header
	        gr.HTML("""
	<div class="app-header">
	<h1>🎙️ Qwen3-TTS</h1>
	<p>Chuyển đổi Văn bản thành Giọng nói bằng AI</p>
	</div>
	""")

	        with gr.Tabs():
	            # Tab 1: voice design (voice created from a text description)
	            with gr.Tab("🎨 Thiết kế Giọng"):
	                gr.Markdown("Tạo giọng nói tùy chỉnh bằng mô tả (Model 1.7B)")

	                design_text = gr.Textbox(
	                    label="📝 Văn bản",
	                    lines=4,
	                    placeholder="Nhập nội dung cần đọc...",
	                    value="Xin chào! Đây là giọng nói được tạo bởi AI."
	                )

	                design_language = gr.Dropdown(
	                    label="🌍 Ngôn ngữ",
	                    choices=LANGUAGES,
	                    value="Tự động"
	                )

	                design_instruct = gr.Textbox(
	                    label="🎭 Mô tả giọng nói",
	                    lines=3,
	                    placeholder="VD: Giọng vui vẻ, tràn đầy năng lượng...",
	                    value="Nói với giọng thân thiện và nhiệt tình"
	                )

	                design_btn = gr.Button("🚀 Tạo giọng nói", variant="primary")
	                design_audio_out = gr.Audio(label="🔊 Kết quả")
	                design_status = gr.Textbox(label="Trạng thái", lines=2, interactive=False)

	                gr.HTML("""
	<div class="info-box">
	<strong>💡 Mẹo:</strong> Mô tả chi tiết cảm xúc, tốc độ, phong cách để có kết quả tốt nhất
	</div>
	""")

	                # Input order matches
	                # generate_voice_design(text, language, voice_description).
	                design_btn.click(
	                    generate_voice_design,
	                    inputs=[design_text, design_language, design_instruct],
	                    outputs=[design_audio_out, design_status]
	                )

	            # Tab 2: voice cloning (voice copied from a reference recording)
	            with gr.Tab("🎤 Nhân bản Giọng"):
	                gr.Markdown("Sao chép giọng nói từ audio mẫu")

	                # NOTE(review): with type="numpy" the value is expected to
	                # arrive as a (sample_rate, samples) tuple, which is one of
	                # the shapes _audio_to_tuple accepts - confirm against the
	                # installed Gradio version.
	                clone_ref_audio = gr.Audio(
	                    label="🎵 Audio mẫu",
	                    type="numpy"
	                )

	                clone_ref_text = gr.Textbox(
	                    label="📄 Nội dung audio mẫu",
	                    lines=2,
	                    placeholder="Nhập chính xác nội dung trong audio..."
	                )

	                # When checked, generate_voice_clone does not require the
	                # reference transcript.
	                clone_xvector = gr.Checkbox(
	                    label="⚡ Chế độ nhanh (không cần nội dung audio)",
	                    value=False
	                )

	                clone_target_text = gr.Textbox(
	                    label="✍️ Văn bản cần đọc",
	                    lines=3,
	                    placeholder="Nhập nội dung muốn giọng nhân bản đọc..."
	                )

	                # Language and model size share one row.
	                with gr.Row():
	                    clone_language = gr.Dropdown(
	                        label="🌍 Ngôn ngữ",
	                        choices=LANGUAGES,
	                        value="Tự động",
	                        scale=1
	                    )
	                    clone_model_size = gr.Dropdown(
	                        label="⚙️ Model",
	                        choices=MODEL_SIZES,
	                        value="0.6B",
	                        scale=1
	                    )

	                clone_btn = gr.Button("🎬 Nhân bản giọng", variant="primary")
	                clone_audio_out = gr.Audio(label="🔊 Kết quả")
	                clone_status = gr.Textbox(label="Trạng thái", lines=2, interactive=False)

	                gr.HTML("""
	<div class="info-box">
	<strong>💡 Lưu ý:</strong> Audio mẫu nên rõ ràng, ít nhiễu và độ dài 3-10 giây
	</div>
	""")

	                # Input order matches generate_voice_clone(ref_audio,
	                # ref_text, target_text, language, use_xvector_only,
	                # model_size).
	                clone_btn.click(
	                    generate_voice_clone,
	                    inputs=[clone_ref_audio, clone_ref_text, clone_target_text,
	                            clone_language, clone_xvector, clone_model_size],
	                    outputs=[clone_audio_out, clone_status]
	                )

	            # Tab 3: preset voices
	            with gr.Tab("🗣️ Giọng có sẵn"):
	                gr.Markdown("Sử dụng giọng đọc được huấn luyện sẵn")

	                tts_text = gr.Textbox(
	                    label="📝 Văn bản",
	                    lines=4,
	                    placeholder="Nhập nội dung cần đọc...",
	                    value="Xin chào! Chào mừng bạn đến với hệ thống TTS."
	                )

	                # Language and speaker share one row.
	                with gr.Row():
	                    tts_language = gr.Dropdown(
	                        label="🌍 Ngôn ngữ",
	                        choices=LANGUAGES,
	                        value="Tiếng Anh",
	                        scale=1
	                    )
	                    tts_speaker = gr.Dropdown(
	                        label="👤 Giọng đọc",
	                        choices=SPEAKERS,
	                        value="Ryan",
	                        scale=1
	                    )

	                tts_instruct = gr.Textbox(
	                    label="🎨 Phong cách (tùy chọn)",
	                    lines=2,
	                    placeholder="VD: Nói chậm rãi và rõ ràng"
	                )

	                tts_model_size = gr.Dropdown(
	                    label="⚙️ Kích thước Model",
	                    choices=MODEL_SIZES,
	                    value="0.6B"
	                )

	                tts_btn = gr.Button("🎵 Tạo giọng nói", variant="primary")
	                tts_audio_out = gr.Audio(label="🔊 Kết quả")
	                tts_status = gr.Textbox(label="Trạng thái", lines=2, interactive=False)

	                gr.HTML("""
	<div class="info-box">
	<strong>👥 Giọng:</strong> Aiden, Dylan, Eric, Ryan (nam) • Serena, Vivian (nữ) • Ono_anna, Sohee (châu Á)
	</div>
	""")

	                # Input order matches generate_custom_voice(text, language,
	                # speaker, instruct, model_size).
	                tts_btn.click(
	                    generate_custom_voice,
	                    inputs=[tts_text, tts_language, tts_speaker, tts_instruct, tts_model_size],
	                    outputs=[tts_audio_out, tts_status]
	                )

	        # Footer
	        gr.HTML("""
	<div class="warning-box">
	<strong>⚠️ Lưu ý CPU:</strong> Thời gian xử lý: 30s - vài phút. Dùng model 0.6B để nhanh hơn. Văn bản ngắn tốt hơn.
	</div>
	""")

	        gr.Markdown("""
	---
	<div style="text-align: center; color: #888; font-size: 13px;">
	Powered by <a href="https://github.com/QwenLM/Qwen3-TTS" target="_blank">Qwen3-TTS</a> • Alibaba Qwen Team
	</div>
	""")

	    return demo

	if __name__ == "__main__":
	    # Script entry point: build the interface and serve it on port 7860
	    # with host "0.0.0.0", without requesting a public share link.
	    demo = build_ui()
	    demo.launch(share=False, server_port=7860, server_name="0.0.0.0")