Spaces:

Accordic
/

C____C

Sleeping

App Files Files Community

Accordic committed on Jan 24

Commit

38b750c

verified ·

1 Parent(s): c7448a0

Update app.py

Browse files

Files changed (1) hide show

app.py +304 -323

app.py CHANGED Viewed

@@ -1,55 +1,38 @@
 # coding=utf-8
-# Qwen3-TTS Gradio Demo - Phiên bản CPU
 import os
 import gradio as gr
 import numpy as np
 import torch
 from huggingface_hub import snapshot_download, login
-# Đăng nhập HuggingFace (tùy chọn, chỉ cần nếu model yêu cầu quyền truy cập)
 HF_TOKEN = os.environ.get('HF_TOKEN')
 if HF_TOKEN:
     login(token=HF_TOKEN)
-# Lưu trữ các model đã tải - theo cặp (model_type, model_size)
 loaded_models = {}
-# Tùy chọn kích thước model
 MODEL_SIZES = ["0.6B", "1.7B"]
 def get_model_path(model_type: str, model_size: str) -> str:
-    """Lấy đường dẫn model dựa trên loại và kích thước."""
     return snapshot_download(f"Qwen/Qwen3-TTS-12Hz-{model_size}-{model_type}")
 def get_model(model_type: str, model_size: str):
-    """Lấy hoặc tải model theo loại và kích thước."""
     global loaded_models
     key = (model_type, model_size)
     if key not in loaded_models:
         from qwen_tts import Qwen3TTSModel
         model_path = get_model_path(model_type, model_size)
-        # Tự động phát hiện thiết bị
         device = "cuda" if torch.cuda.is_available() else "cpu"
         dtype = torch.bfloat16 if device == "cuda" else torch.float32
-        print(f"Đang tải model {model_type} {model_size} trên {device} với dtype {dtype}")
         loaded_models[key] = Qwen3TTSModel.from_pretrained(
-            model_path,
-            device_map=device,
-            dtype=dtype,
-            token=HF_TOKEN,
         )
     return loaded_models[key]
 def _normalize_audio(wav, eps=1e-12, clip=True):
-    """Chuẩn hóa audio về float32 trong khoảng [-1, 1]."""
     x = np.asarray(wav)
     if np.issubdtype(x.dtype, np.integer):
         info = np.iinfo(x.dtype)
         if info.min < 0:
@@ -63,418 +46,416 @@ def _normalize_audio(wav, eps=1e-12, clip=True):
         if m > 1.0 + 1e-6:
             y = y / (m + eps)
     else:
-        raise TypeError(f"Kiểu dữ liệu không được hỗ trợ: {x.dtype}")
     if clip:
         y = np.clip(y, -1.0, 1.0)
     if y.ndim > 1:
         y = np.mean(y, axis=-1).astype(np.float32)
     return y
 def _audio_to_tuple(audio):
-    """Chuyển đổi đầu vào audio của Gradio thành tuple (wav, sr)."""
     if audio is None:
         return None
     if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[0], int):
         sr, wav = audio
         wav = _normalize_audio(wav)
         return wav, int(sr)
     if isinstance(audio, dict) and "sampling_rate" in audio and "data" in audio:
         sr = int(audio["sampling_rate"])
         wav = _normalize_audio(audio["data"])
         return wav, sr
     return None
-# Danh sách giọng nói và ngôn ngữ cho model CustomVoice
-SPEAKERS = [
-    "Aiden", "Dylan", "Eric", "Ono_anna", "Ryan", "Serena", "Sohee", "Uncle_fu", "Vivian"
-]
 LANGUAGES = ["Tự động", "Tiếng Trung", "Tiếng Anh", "Tiếng Nhật", "Tiếng Hàn", "Tiếng Pháp", "Tiếng Đức", "Tiếng Tây Ban Nha", "Tiếng Bồ Đào Nha", "Tiếng Nga"]
 LANGUAGE_MAP = {
-    "Tự động": "Auto",
-    "Tiếng Trung": "Chinese",
-    "Tiếng Anh": "English",
-    "Tiếng Nhật": "Japanese",
-    "Tiếng Hàn": "Korean",
-    "Tiếng Pháp": "French",
-    "Tiếng Đức": "German",
-    "Tiếng Tây Ban Nha": "Spanish",
-    "Tiếng Bồ Đào Nha": "Portuguese",
-    "Tiếng Nga": "Russian"
 }
 def generate_voice_design(text, language, voice_description):
-    """Tạo giọng nói bằng model Voice Design (chỉ 1.7B)."""
     if not text or not text.strip():
-        return None, "❌ Lỗi: Vui lòng nhập văn bản."
     if not voice_description or not voice_description.strip():
-        return None, "❌ Lỗi: Vui lòng nhập mô tả giọng nói."
     try:
-        print("Bắt đầu tạo giọng nói tùy chỉnh...")
         tts = get_model("VoiceDesign", "1.7B")
         lang_code = LANGUAGE_MAP.get(language, "Auto")
         wavs, sr = tts.generate_voice_design(
-            text=text.strip(),
-            language=lang_code,
             instruct=voice_description.strip(),
-            non_streaming_mode=True,
-            max_new_tokens=2048,
         )
-        print("Hoàn thành tạo giọng nói!")
-        return (sr, wavs[0]), "✅ Tạo giọng nói tùy chỉnh thành công!"
     except Exception as e:
-        import traceback
-        error_msg = f"❌ Lỗi: {type(e).__name__}: {e}\n{traceback.format_exc()}"
-        print(error_msg)
-        return None, error_msg
 def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector_only, model_size):
-    """Tạo giọng nói bằng model Base (Nhân bản giọng nói)."""
     if not target_text or not target_text.strip():
-        return None, "❌ Lỗi: Vui lòng nhập văn bản đích."
     audio_tuple = _audio_to_tuple(ref_audio)
     if audio_tuple is None:
-        return None, "❌ Lỗi: Vui lòng tải lên audio tham chiếu."
     if not use_xvector_only and (not ref_text or not ref_text.strip()):
-        return None, "❌ Lỗi: Vui lòng nhập văn bản tham chiếu khi không sử dụng chế độ 'Chỉ dùng x-vector'."
     try:
-        print("Bắt đầu nhân bản giọng nói...")
         tts = get_model("Base", model_size)
         lang_code = LANGUAGE_MAP.get(language, "Auto")
         wavs, sr = tts.generate_voice_clone(
-            text=target_text.strip(),
-            language=lang_code,
             ref_audio=audio_tuple,
             ref_text=ref_text.strip() if ref_text else None,
-            x_vector_only_mode=use_xvector_only,
-            max_new_tokens=2048,
         )
-        print("Hoàn thành nhân bản giọng nói!")
-        return (sr, wavs[0]), "✅ Nhân bản giọng nói thành công!"
     except Exception as e:
-        import traceback
-        error_msg = f"❌ Lỗi: {type(e).__name__}: {e}\n{traceback.format_exc()}"
-        print(error_msg)
-        return None, error_msg
 def generate_custom_voice(text, language, speaker, instruct, model_size):
-    """Tạo giọng nói bằng model CustomVoice."""
     if not text or not text.strip():
-        return None, "❌ Lỗi: Vui lòng nhập văn bản."
     if not speaker:
-        return None, "❌ Lỗi: Vui lòng chọn giọng nói."
     try:
-        print("Bắt đầu tạo giọng nói...")
         tts = get_model("CustomVoice", model_size)
         lang_code = LANGUAGE_MAP.get(language, "Auto")
         wavs, sr = tts.generate_custom_voice(
-            text=text.strip(),
-            language=lang_code,
             speaker=speaker.lower().replace(" ", "_"),
             instruct=instruct.strip() if instruct else None,
-            non_streaming_mode=True,
-            max_new_tokens=2048,
         )
-        print("Hoàn thành tạo giọng nói!")
-        return (sr, wavs[0]), "✅ Tạo giọng nói thành công!"
     except Exception as e:
-        import traceback
-        error_msg = f"❌ Lỗi: {type(e).__name__}: {e}\n{traceback.format_exc()}"
-        print(error_msg)
-        return None, error_msg
-# Xây dựng giao diện Gradio
 def build_ui():
     theme = gr.themes.Soft(
-        font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"],
         primary_hue="blue",
-        secondary_hue="slate",
     )
     css = """
     .gradio-container {
-        max-width: 1400px !important;
-        margin: auto;
     }
-    .tab-content {
-        padding: 25px;
-        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
-        border-radius: 15px;
     }
-    .input-box {
-        border-radius: 10px;
-        border: 2px solid #e0e0e0;
     }
-    .generate-btn {
-        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-        border: none;
-        font-size: 16px;
-        font-weight: bold;
-        padding: 12px 24px;
-        border-radius: 10px;
-        transition: all 0.3s ease;
     }
-    .generate-btn:hover {
-        transform: translateY(-2px);
-        box-shadow: 0 10px 20px rgba(0,0,0,0.2);
     }
-    h1 {
         text-align: center;
         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-        -webkit-background-clip: text;
-        -webkit-text-fill-color: transparent;
-        font-size: 3em;
-        margin-bottom: 10px;
     }
-    .subtitle {
-        text-align: center;
-        color: #666;
-        font-size: 1.2em;
-        margin-bottom: 30px;
     }
     """
-    with gr.Blocks(theme=theme, css=css, title="Qwen3-TTS - Chuyển đổi Văn bản thành Giọng nói") as demo:
-        gr.Markdown(
-            """
-# 🎙️ Qwen3-TTS - Chuyển đổi Văn bản thành Giọng nói
-<p class="subtitle">Hệ thống tổng hợp giọng nói AI tiên tiến với 3 chế độ mạnh mẽ</p>
-"""
-        )
         with gr.Tabs():
             # Tab 1: Thiết kế Giọng nói
-            with gr.Tab("🎨 Thiết kế Giọng nói"):
-                gr.Markdown(
-                    """
-### Tạo giọng nói tùy chỉnh bằng mô tả ngôn ngữ tự nhiên
-Sử dụng mô tả bằng lời để tạo giọng nói với phong cách và cảm xúc riêng biệt (Model 1.7B)
-"""
                 )
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        design_text = gr.Textbox(
-                            label="📝 Văn bản cần chuyển đổi",
-                            lines=5,
-                            placeholder="Nhập văn bản bạn muốn chuyển thành giọng nói...",
-                            value="Nó ở trong ngăn kéo trên cùng... chờ đã, trống rỗng ư? Không thể nào! Tôi chắc chắn đã để nó ở đó mà!",
-                            elem_classes="input-box"
-                        )
-                        design_language = gr.Dropdown(
-                            label="🌍 Ngôn ngữ",
-                            choices=LANGUAGES,
-                            value="Tự động",
-                            interactive=True,
-                        )
-                        design_instruct = gr.Textbox(
-                            label="🎭 Mô tả giọng nói",
-                            lines=4,
-                            placeholder="Mô tả đặc điểm giọng nói bạn muốn: cảm xúc, phong cách, tốc độ...",
-                            value="Nói với giọng điệu hoài nghi, nhưng với chút hoảng loạn bắt đầu len lỏi vào giọng nói.",
-                            elem_classes="input-box"
-                        )
-                        design_btn = gr.Button("🚀 Tạo giọng nói", variant="primary", elem_classes="generate-btn")
-                    with gr.Column(scale=1):
-                        design_audio_out = gr.Audio(label="🔊 Audio đã tạo", type="numpy")
-                        design_status = gr.Textbox(label="📊 Trạng thái", lines=3, interactive=False)
-                        gr.Markdown(
-                            """
-**💡 Mẹo:**
-- Mô tả chi tiết cảm xúc và phong cách giọng nói
-- Thử nghiệm với các yêu cầu sáng tạo
-- Model 1.7B cho kết quả tốt nhất
-"""
-                        )
                 design_btn.click(
                     generate_voice_design,
                     inputs=[design_text, design_language, design_instruct],
-                    outputs=[design_audio_out, design_status],
                 )
-            # Tab 2: Nhân bản Giọng nói
-            with gr.Tab("🎤 Nhân bản Giọng nói"):
-                gr.Markdown(
-                    """
-### Sao chép giọng nói từ mẫu audio tham chiếu
-Tải lên một đoạn audio mẫu để nhân bản giọng nói cho văn bản mới
-"""
                 )
                 with gr.Row():
-                    with gr.Column(scale=1):
-                        gr.Markdown("#### 📂 Audio & Văn bản Tham chiếu")
-                        clone_ref_audio = gr.Audio(
-                            label="🎵 Audio tham chiếu (Tải lên mẫu giọng cần nhân bản)",
-                            type="numpy",
-                        )
-                        clone_ref_text = gr.Textbox(
-                            label="📄 Văn bản tham chiếu (Nội dung trong audio tham chiếu)",
-                            lines=3,
-                            placeholder="Nhập chính xác nội dung được nói trong audio tham chiếu...",
-                            elem_classes="input-box"
-                        )
-                        clone_xvector = gr.Checkbox(
-                            label="⚡ Chỉ dùng x-vector (Không cần văn bản tham chiếu, chất lượng thấp hơn)",
-                            value=False,
-                        )
-                    with gr.Column(scale=1):
-                        gr.Markdown("#### 🎯 Văn bản Đích & Cài đặt")
-                        clone_target_text = gr.Textbox(
-                            label="✍️ Văn bản đích (Nội dung muốn giọng nhân bản nói)",
-                            lines=5,
-                            placeholder="Nhập văn bản bạn muốn giọng nhân bản đọc...",
-                            elem_classes="input-box"
-                        )
-                        with gr.Row():
-                            clone_language = gr.Dropdown(
-                                label="🌍 Ngôn ngữ",
-                                choices=LANGUAGES,
-                                value="Tự động",
-                                interactive=True,
-                            )
-                            clone_model_size = gr.Dropdown(
-                                label="⚙️ Kích thước Model",
-                                choices=MODEL_SIZES,
-                                value="0.6B",
-                                interactive=True,
-                            )
-                        clone_btn = gr.Button("🎬 Nhân bản & Tạo giọng", variant="primary", elem_classes="generate-btn")
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        clone_audio_out = gr.Audio(label="🔊 Audio đã tạo", type="numpy")
-                    with gr.Column(scale=1):
-                        clone_status = gr.Textbox(label="📊 Trạng thái", lines=3, interactive=False)
                 clone_btn.click(
                     generate_voice_clone,
-                    inputs=[clone_ref_audio, clone_ref_text, clone_target_text, clone_language, clone_xvector, clone_model_size],
-                    outputs=[clone_audio_out, clone_status],
                 )
-            # Tab 3: Giọng nói có sẵn
-            with gr.Tab("🗣️ Giọng nói có sẵn"):
-                gr.Markdown(
-                    """
-### Chuyển văn bản thành giọng nói với các giọng đọc được huấn luyện trước
-Chọn từ nhiều giọng nói chuyên nghiệp với khả năng tùy chỉnh phong cách
-"""
                 )
                 with gr.Row():
-                    with gr.Column(scale=1):
-                        tts_text = gr.Textbox(
-                            label="📝 Văn bản cần chuyển đổi",
-                            lines=6,
-                            placeholder="Nhập văn bản bạn muốn chuyển thành giọng nói...",
-                            value="Xin chào! Chào mừng đến với hệ thống chuyển văn bản thành giọng nói. Đây là bản demo khả năng TTS của chúng tôi.",
-                            elem_classes="input-box"
-                        )
-                        with gr.Row():
-                            tts_language = gr.Dropdown(
-                                label="🌍 Ngôn ngữ",
-                                choices=LANGUAGES,
-                                value="Tiếng Anh",
-                                interactive=True,
-                            )
-                            tts_speaker = gr.Dropdown(
-                                label="👤 Giọng đọc",
-                                choices=SPEAKERS,
-                                value="Ryan",
-                                interactive=True,
-                            )
-                        with gr.Row():
-                            tts_instruct = gr.Textbox(
-                                label="🎨 Hướng dẫn phong cách (Tùy chọn)",
-                                lines=2,
-                                placeholder="VD: Nói với giọng vui vẻ và tràn đầy năng lượng",
-                                elem_classes="input-box"
-                            )
-                            tts_model_size = gr.Dropdown(
-                                label="⚙️ Kích thước Model",
-                                choices=MODEL_SIZES,
-                                value="0.6B",
-                                interactive=True,
-                            )
-                        tts_btn = gr.Button("🎵 Tạo giọng nói", variant="primary", elem_classes="generate-btn")
-                    with gr.Column(scale=1):
-                        tts_audio_out = gr.Audio(label="🔊 Audio đã tạo", type="numpy")
-                        tts_status = gr.Textbox(label="📊 Trạng thái", lines=3, interactive=False)
-                        gr.Markdown(
-                            """
-**👥 Giọng đọc có sẵn:**
-- **Aiden, Dylan, Eric, Ryan**: Giọng nam
-- **Serena, Vivian**: Giọng nữ
-- **Ono_anna, Sohee**: Giọng châu Á
-- **Uncle_fu**: Giọng trưởng thành
-**💡 Mẹo:**
-- Dùng model 0.6B để tạo nhanh hơn
-- Thêm hướng dẫn phong cách để tùy chỉnh cảm xúc
-"""
-                        )
                 tts_btn.click(
                     generate_custom_voice,
                     inputs=[tts_text, tts_language, tts_speaker, tts_instruct, tts_model_size],
-                    outputs=[tts_audio_out, tts_status],
                 )
-        gr.Markdown(
-            """
----
-### ⚙️ Thông tin Hệ thống
-**⚠️ Lưu ý khi chạy trên CPU:**
-- Phiên bản này chạy trên CPU và sẽ **chậm hơn đáng kể** so với GPU
-- Thời gian ước tính: **30 giây đến vài phút** mỗi lần tạo (tùy độ dài văn bản và model)
-- Lần tạo đầu tiên sẽ chậm hơn do phải tải model vào bộ nhớ
-**📋 Mẹo Tối ưu Hiệu suất:**
-- ✅ Sử dụng model **0.6B** để tạo nhanh hơn (khi có sẵn)
-- ✅ Giữ văn bản **ngắn gọn** (1-2 câu) để thời gian chờ hợp lý
-- ✅ Đóng các ứng dụng khác để **giải phóng RAM**
-- ✅ **Kiên nhẫn** - Model đang xử lý ở chế độ nền
-**💻 Yêu cầu Hệ thống:**
-- 🔹 **RAM**: Tối thiểu 8GB (khuyến nghị 16GB cho model 1.7B)
-- 🔹 **Lưu trữ**: ~5GB cho mỗi model
-- 🔹 **CPU**: Khuyến nghị bộ xử lý đa nhân
-**📚 Được xây dựng với:** [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS) bởi Alibaba Qwen Team
 ---
-<p style="text-align: center; color: #888;">Made with ❤️ using Qwen3-TTS | Phiên bản CPU</p>
-"""
-        )
     return demo
 if __name__ == "__main__":
     demo = build_ui()
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        share=False,
-        show_error=True
     )

 # coding=utf-8
+# Qwen3-TTS Gradio Demo - Giao diện Responsive
 import os
 import gradio as gr
 import numpy as np
 import torch
 from huggingface_hub import snapshot_download, login
+# Đăng nhập HuggingFace
 # Authenticate with the Hugging Face Hub only when a token is supplied
 # through the HF_TOKEN environment variable; without one, login is skipped.
 HF_TOKEN = os.environ.get('HF_TOKEN')
 if HF_TOKEN:
     login(token=HF_TOKEN)
 # Cache of loaded models keyed by (model_type, model_size); filled lazily by get_model().
 loaded_models = {}
 # Checkpoint sizes offered in the model-size dropdowns of the UI.
 MODEL_SIZES = ["0.6B", "1.7B"]
 def get_model_path(model_type: str, model_size: str) -> str:
     """Download the requested Qwen3-TTS checkpoint and return what ``snapshot_download`` gives back.

     The Hub repository id is assembled as
     ``Qwen/Qwen3-TTS-12Hz-<model_size>-<model_type>``.
     """
     repo_id = f"Qwen/Qwen3-TTS-12Hz-{model_size}-{model_type}"
     return snapshot_download(repo_id)
 def get_model(model_type: str, model_size: str):
     """Return the TTS model for ``(model_type, model_size)``, loading it on first use.

     Loaded models are kept in the module-level ``loaded_models`` dict, so each
     combination is downloaded and instantiated at most once per process.

     Args:
         model_type: Checkpoint flavour used in the repository name
             (the callers in this file pass "VoiceDesign", "Base" or "CustomVoice").
         model_size: Checkpoint size used in the repository name, e.g. "0.6B".

     Returns:
         The object produced by ``Qwen3TTSModel.from_pretrained``.
     """
     # No ``global`` needed: the cache dict is mutated in place, never rebound.
     key = (model_type, model_size)
     if key in loaded_models:
         return loaded_models[key]

     # Imported lazily so the module can be imported without qwen_tts present.
     from qwen_tts import Qwen3TTSModel

     model_path = get_model_path(model_type, model_size)
     # Use the GPU with bfloat16 when CUDA is available, otherwise CPU with float32.
     device = "cuda" if torch.cuda.is_available() else "cpu"
     dtype = torch.bfloat16 if device == "cuda" else torch.float32
     print(f"Đang tải model {model_type} {model_size} trên {device}")
     loaded_models[key] = Qwen3TTSModel.from_pretrained(
         model_path,
         device_map=device,
         dtype=dtype,
         token=HF_TOKEN,
     )
     return loaded_models[key]
 def _normalize_audio(wav, eps=1e-12, clip=True):
     """Convert ``wav`` to a NumPy array and normalize its samples.

     Integer input is handled by a dedicated branch, and the final ``else``
     branch raises ``TypeError`` with a message reporting an unsupported dtype.
     When ``clip`` is true the result is limited to [-1, 1]. Input with more
     than one dimension is averaged over its last axis and returned as float32.

     NOTE(review): part of this body is not visible in the diff view (see the
     marker below), so the integer and float scaling rules, and the exact role
     of ``eps`` beyond the visible division, are not documented here.
     """
     x = np.asarray(wav)
     if np.issubdtype(x.dtype, np.integer):
         info = np.iinfo(x.dtype)
         if info.min < 0:
         # NOTE(review): the diff hunk boundary elides the lines between the
         # statement above and the one below; ``y`` and ``m`` are defined in
         # the hidden part.
         if m > 1.0 + 1e-6:
             y = y / (m + eps)
     else:
+        raise TypeError(f"Kiểu dữ liệu không hỗ trợ: {x.dtype}")
     if clip:
         y = np.clip(y, -1.0, 1.0)
     if y.ndim > 1:
         # Collapse the last axis by averaging — presumably a channel axis; confirm against callers.
         y = np.mean(y, axis=-1).astype(np.float32)
     return y
 def _audio_to_tuple(audio):
     if audio is None:
         return None
     if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[0], int):
         sr, wav = audio
         wav = _normalize_audio(wav)
         return wav, int(sr)
     if isinstance(audio, dict) and "sampling_rate" in audio and "data" in audio:
         sr = int(audio["sampling_rate"])
         wav = _normalize_audio(audio["data"])
         return wav, sr
     return None
+SPEAKERS = ["Aiden", "Dylan", "Eric", "Ono_anna", "Ryan", "Serena", "Sohee", "Uncle_fu", "Vivian"]
 LANGUAGES = ["Tự động", "Tiếng Trung", "Tiếng Anh", "Tiếng Nhật", "Tiếng Hàn", "Tiếng Pháp", "Tiếng Đức", "Tiếng Tây Ban Nha", "Tiếng Bồ Đào Nha", "Tiếng Nga"]
 LANGUAGE_MAP = {
+    "Tự động": "Auto", "Tiếng Trung": "Chinese", "Tiếng Anh": "English",
+    "Tiếng Nhật": "Japanese", "Tiếng Hàn": "Korean", "Tiếng Pháp": "French",
+    "Tiếng Đức": "German", "Tiếng Tây Ban Nha": "Spanish",
+    "Tiếng Bồ Đào Nha": "Portuguese", "Tiếng Nga": "Russian"
 }
 def generate_voice_design(text, language, voice_description):
     """Synthesize speech with the VoiceDesign model (always the 1.7B checkpoint).

     Args:
         text: Text to speak; surrounding whitespace is stripped.
         language: UI language label, mapped through ``LANGUAGE_MAP``
             (unknown labels fall back to "Auto").
         voice_description: Natural-language description of the desired voice.

     Returns:
         ``((sample_rate, waveform), status)`` on success, or
         ``(None, message)`` when an input is missing or generation fails.
     """
     if not text or not text.strip():
         return None, "❌ Vui lòng nhập văn bản"
     if not voice_description or not voice_description.strip():
         return None, "❌ Vui lòng nhập mô tả giọng nói"
     try:
         tts = get_model("VoiceDesign", "1.7B")
         lang_code = LANGUAGE_MAP.get(language, "Auto")
         wavs, sr = tts.generate_voice_design(
             text=text.strip(),
             language=lang_code,
             instruct=voice_description.strip(),
             non_streaming_mode=True,
             max_new_tokens=2048,
         )
         return (sr, wavs[0]), "✅ Hoàn thành!"
     except Exception as e:
         # UI boundary: the user only sees a short message, so write the full
         # traceback to the server log to keep failures diagnosable.
         import traceback

         traceback.print_exc()
         return None, f"❌ Lỗi: {e}"
 def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector_only, model_size):
     """Clone the voice in ``ref_audio`` with the Base model and speak ``target_text``.

     Args:
         ref_audio: Reference audio as delivered by the Gradio audio input.
         ref_text: Transcript of the reference audio; required unless
             ``use_xvector_only`` is set.
         target_text: Text the cloned voice should speak.
         language: UI language label, mapped through ``LANGUAGE_MAP``
             (unknown labels fall back to "Auto").
         use_xvector_only: Forwarded as ``x_vector_only_mode``; when true the
             reference transcript may be omitted.
         model_size: Checkpoint size passed to ``get_model``.

     Returns:
         ``((sample_rate, waveform), status)`` on success, or
         ``(None, message)`` when an input is missing or generation fails.
     """
     if not target_text or not target_text.strip():
         return None, "❌ Vui lòng nhập văn bản đích"
     audio_tuple = _audio_to_tuple(ref_audio)
     if audio_tuple is None:
         return None, "❌ Vui lòng tải audio tham chiếu"
     # A blank or whitespace-only transcript counts as "not provided".
     clean_ref_text = (ref_text or "").strip() or None
     if not use_xvector_only and clean_ref_text is None:
         return None, "❌ Vui lòng nhập văn bản tham chiếu"
     try:
         tts = get_model("Base", model_size)
         lang_code = LANGUAGE_MAP.get(language, "Auto")
         wavs, sr = tts.generate_voice_clone(
             text=target_text.strip(),
             language=lang_code,
             ref_audio=audio_tuple,
             ref_text=clean_ref_text,
             x_vector_only_mode=use_xvector_only,
             max_new_tokens=2048,
         )
         return (sr, wavs[0]), "✅ Hoàn thành!"
     except Exception as e:
         # UI boundary: the user only sees a short message, so write the full
         # traceback to the server log to keep failures diagnosable.
         import traceback

         traceback.print_exc()
         return None, f"❌ Lỗi: {e}"
 def generate_custom_voice(text, language, speaker, instruct, model_size):
     """Synthesize speech with one of the preset speakers of the CustomVoice model.

     Args:
         text: Text to speak; surrounding whitespace is stripped.
         language: UI language label, mapped through ``LANGUAGE_MAP``
             (unknown labels fall back to "Auto").
         speaker: Speaker name; lower-cased with spaces replaced by
             underscores before it is passed to the model.
         instruct: Optional style instruction; blank input is sent as ``None``.
         model_size: Checkpoint size passed to ``get_model``.

     Returns:
         ``((sample_rate, waveform), status)`` on success, or
         ``(None, message)`` when an input is missing or generation fails.
     """
     if not text or not text.strip():
         return None, "❌ Vui lòng nhập văn bản"
     if not speaker:
         return None, "❌ Vui lòng chọn giọng nói"
     try:
         tts = get_model("CustomVoice", model_size)
         lang_code = LANGUAGE_MAP.get(language, "Auto")
         wavs, sr = tts.generate_custom_voice(
             text=text.strip(),
             language=lang_code,
             speaker=speaker.lower().replace(" ", "_"),
             # Whitespace-only instructions are treated as "no instruction".
             instruct=(instruct or "").strip() or None,
             non_streaming_mode=True,
             max_new_tokens=2048,
         )
         return (sr, wavs[0]), "✅ Hoàn thành!"
     except Exception as e:
         # UI boundary: the user only sees a short message, so write the full
         # traceback to the server log to keep failures diagnosable.
         import traceback

         traceback.print_exc()
         return None, f"❌ Lỗi: {e}"
 def build_ui():
     """Build the Gradio interface and return it without launching.

     The app has a header, three tabs (voice design, voice cloning, preset
     voices) wired to ``generate_voice_design``, ``generate_voice_clone`` and
     ``generate_custom_voice``, and a footer with a CPU performance notice.

     Returns:
         The ``gr.Blocks`` instance.
     """
     theme = gr.themes.Soft(
         font=[gr.themes.GoogleFont("Inter"), "sans-serif"],
         primary_hue="blue",
         radius_size="md",
     )
     css = """
     /* Container chính */
     .gradio-container {
         max-width: 100% !important;
         padding: 10px !important;
     }
     /* Tab style */
     .tab-nav button {
         font-size: 14px !important;
         padding: 10px 15px !important;
     }
     /* Button style */
     button.primary {
         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
         border: none !important;
         font-weight: 600 !important;
         padding: 12px 24px !important;
         border-radius: 8px !important;
         transition: all 0.3s ease !important;
     }
     button.primary:hover {
         transform: translateY(-2px) !important;
         box-shadow: 0 8px 16px rgba(102, 126, 234, 0.3) !important;
     }
     /* Input fields */
     textarea, input, select {
         border-radius: 8px !important;
         border: 1.5px solid #e0e0e0 !important;
         font-size: 14px !important;
     }
     /* Labels */
     label {
         font-weight: 600 !important;
         color: #374151 !important;
         margin-bottom: 8px !important;
     }
     /* Header */
     .app-header {
         text-align: center;
         padding: 20px 10px;
         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
         color: white;
         border-radius: 12px;
         margin-bottom: 20px;
     }
     .app-header h1 {
         margin: 0;
         font-size: clamp(24px, 5vw, 36px);
         font-weight: 700;
     }
     .app-header p {
         margin: 8px 0 0 0;
         font-size: clamp(12px, 3vw, 16px);
         opacity: 0.95;
     }
     /* Card style cho sections */
     .input-card {
         background: white;
         padding: 20px;
         border-radius: 12px;
         box-shadow: 0 2px 8px rgba(0,0,0,0.08);
         margin-bottom: 15px;
     }
     /* Status message */
     .status-box {
         padding: 12px;
         border-radius: 8px;
         margin-top: 10px;
         font-size: 13px;
     }
     /* Info boxes */
     .info-box {
         background: #f0f9ff;
         border-left: 4px solid #3b82f6;
         padding: 12px 15px;
         border-radius: 6px;
         margin: 10px 0;
         font-size: 13px;
     }
     .warning-box {
         background: #fef3c7;
         border-left: 4px solid #f59e0b;
         padding: 12px 15px;
         border-radius: 6px;
         margin: 10px 0;
         font-size: 13px;
     }
     /* Responsive adjustments */
     @media (max-width: 768px) {
         .gradio-container {
             padding: 5px !important;
         }
         .app-header {
             padding: 15px 10px;
             margin-bottom: 15px;
         }
         .input-card {
             padding: 15px;
         }
         button.primary {
             width: 100%;
             padding: 14px 20px !important;
         }
         .tab-nav button {
             font-size: 12px !important;
             padding: 8px 10px !important;
         }
     }
     /* Compact spacing for mobile */
     @media (max-width: 480px) {
         .block {
             margin: 8px 0 !important;
         }
         textarea {
             font-size: 14px !important;
         }
     }
     """
     with gr.Blocks(theme=theme, css=css, title="Qwen3-TTS") as demo:
         # Header banner
         gr.HTML("""
         <div class="app-header">
             <h1>🎙️ Qwen3-TTS</h1>
             <p>Chuyển đổi Văn bản thành Giọng nói bằng AI</p>
         </div>
         """)
         with gr.Tabs():
             # Tab 1: voice design (voice described in natural language)
             with gr.Tab("🎨 Thiết kế Giọng"):
                 gr.Markdown("**Tạo giọng nói tùy chỉnh bằng mô tả** (Model 1.7B)")
                 design_text = gr.Textbox(
                     label="📝 Văn bản",
                     lines=4,
                     placeholder="Nhập nội dung cần đọc...",
                     value="Xin chào! Đây là giọng nói được tạo bởi AI.",
                 )
                 design_language = gr.Dropdown(
                     label="🌍 Ngôn ngữ",
                     choices=LANGUAGES,
                     value="Tự động",
                 )
                 design_instruct = gr.Textbox(
                     label="🎭 Mô tả giọng nói",
                     lines=3,
                     placeholder="VD: Giọng vui vẻ, tràn đầy năng lượng...",
                     value="Nói với giọng thân thiện và nhiệt tình",
                 )
                 design_btn = gr.Button("🚀 Tạo giọng nói", variant="primary")
                 design_audio_out = gr.Audio(label="🔊 Kết quả")
                 design_status = gr.Textbox(label="Trạng thái", lines=2, interactive=False)
                 gr.HTML("""
                 <div class="info-box">
                     <strong>💡 Mẹo:</strong> Mô tả chi tiết cảm xúc, tốc độ, phong cách để có kết quả tốt nhất
                 </div>
                 """)
                 design_btn.click(
                     generate_voice_design,
                     inputs=[design_text, design_language, design_instruct],
                     outputs=[design_audio_out, design_status],
                 )
             # Tab 2: voice cloning from a reference recording
             with gr.Tab("🎤 Nhân bản Giọng"):
                 gr.Markdown("**Sao chép giọng nói từ audio mẫu**")
                 clone_ref_audio = gr.Audio(
                     label="🎵 Audio mẫu",
                     type="numpy",
                 )
                 clone_ref_text = gr.Textbox(
                     label="📄 Nội dung audio mẫu",
                     lines=2,
                     placeholder="Nhập chính xác nội dung trong audio...",
                 )
                 clone_xvector = gr.Checkbox(
                     label="⚡ Chế độ nhanh (không cần nội dung audio)",
                     value=False,
                 )
                 clone_target_text = gr.Textbox(
                     label="✍️ Văn bản cần đọc",
                     lines=3,
                     placeholder="Nhập nội dung muốn giọng nhân bản đọc...",
                 )
                 with gr.Row():
                     clone_language = gr.Dropdown(
                         label="🌍 Ngôn ngữ",
                         choices=LANGUAGES,
                         value="Tự động",
                         scale=1,
                     )
                     clone_model_size = gr.Dropdown(
                         label="⚙️ Model",
                         choices=MODEL_SIZES,
                         value="0.6B",
                         scale=1,
                     )
                 clone_btn = gr.Button("🎬 Nhân bản giọng", variant="primary")
                 clone_audio_out = gr.Audio(label="🔊 Kết quả")
                 clone_status = gr.Textbox(label="Trạng thái", lines=2, interactive=False)
                 gr.HTML("""
                 <div class="info-box">
                     <strong>💡 Lưu ý:</strong> Audio mẫu nên rõ ràng, ít nhiễu và độ dài 3-10 giây
                 </div>
                 """)
                 clone_btn.click(
                     generate_voice_clone,
                     # Order must match the parameter order of generate_voice_clone.
                     inputs=[
                         clone_ref_audio,
                         clone_ref_text,
                         clone_target_text,
                         clone_language,
                         clone_xvector,
                         clone_model_size,
                     ],
                     outputs=[clone_audio_out, clone_status],
                 )
             # Tab 3: preset voices
             with gr.Tab("🗣️ Giọng có sẵn"):
                 gr.Markdown("**Sử dụng giọng đọc được huấn luyện sẵn**")
                 tts_text = gr.Textbox(
                     label="📝 Văn bản",
                     lines=4,
                     placeholder="Nhập nội dung cần đọc...",
                     value="Xin chào! Chào mừng bạn đến với hệ thống TTS.",
                 )
                 with gr.Row():
                     tts_language = gr.Dropdown(
                         label="🌍 Ngôn ngữ",
                         choices=LANGUAGES,
                         value="Tiếng Anh",
                         scale=1,
                     )
                     tts_speaker = gr.Dropdown(
                         label="👤 Giọng đọc",
                         choices=SPEAKERS,
                         value="Ryan",
                         scale=1,
                     )
                 tts_instruct = gr.Textbox(
                     label="🎨 Phong cách (tùy chọn)",
                     lines=2,
                     placeholder="VD: Nói chậm rãi và rõ ràng",
                 )
                 tts_model_size = gr.Dropdown(
                     label="⚙️ Kích thước Model",
                     choices=MODEL_SIZES,
                     value="0.6B",
                 )
                 tts_btn = gr.Button("🎵 Tạo giọng nói", variant="primary")
                 tts_audio_out = gr.Audio(label="🔊 Kết quả")
                 tts_status = gr.Textbox(label="Trạng thái", lines=2, interactive=False)
                 # The help text lists every entry of SPEAKERS, Uncle_fu included.
                 gr.HTML("""
                 <div class="info-box">
                     <strong>👥 Giọng:</strong> Aiden, Dylan, Eric, Ryan (nam) • Serena, Vivian (nữ) • Ono_anna, Sohee (châu Á) • Uncle_fu (trưởng thành)
                 </div>
                 """)
                 tts_btn.click(
                     generate_custom_voice,
                     inputs=[tts_text, tts_language, tts_speaker, tts_instruct, tts_model_size],
                     outputs=[tts_audio_out, tts_status],
                 )
         # Footer: CPU performance notice and attribution
         gr.HTML("""
         <div class="warning-box">
             <strong>⚠️ Lưu ý CPU:</strong> Thời gian xử lý: 30s - vài phút. Dùng model 0.6B để nhanh hơn. Văn bản ngắn tốt hơn.
         </div>
         """)
         gr.Markdown("""
 ---
 <div style="text-align: center; color: #888; font-size: 13px;">
 Powered by <a href="https://github.com/QwenLM/Qwen3-TTS" target="_blank">Qwen3-TTS</a> • Alibaba Qwen Team
 </div>
         """)
     return demo
 if __name__ == "__main__":
     # Script entry point: build the interface, then start the web server.
     demo = build_ui()
     demo.launch(
         # Bind to all network interfaces on port 7860; no public share link.
         server_name="0.0.0.0",
         server_port=7860,
+        share=False
     )