# coding=utf-8
# Qwen3-TTS Gradio Demo - Giao diện Responsive
import os
import gradio as gr
import numpy as np
import torch
from huggingface_hub import snapshot_download, login

# Log in to the Hugging Face Hub, but only when a token is provided through
# the HF_TOKEN environment variable; otherwise run unauthenticated.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(token=HF_TOKEN)

# Process-wide cache of loaded models, keyed by (model_type, model_size).
loaded_models: dict = {}
# Model sizes offered in the UI dropdowns.
MODEL_SIZES = ["0.6B", "1.7B"]

def get_model_path(model_type: str, model_size: str) -> str:
    """Resolve the Hub repository for a Qwen3-TTS variant and fetch its snapshot.

    Args:
        model_type: Variant suffix of the repository name (the callers in this
            file pass "VoiceDesign", "Base" or "CustomVoice").
        model_size: Size tag embedded in the repository name, e.g. "0.6B".

    Returns:
        Whatever ``snapshot_download`` returns for the repository
        (per the huggingface_hub docs, the local snapshot folder path).
    """
    repo_id = f"Qwen/Qwen3-TTS-12Hz-{model_size}-{model_type}"
    return snapshot_download(repo_id)

def get_model(model_type: str, model_size: str):
    """Return the model for ``(model_type, model_size)``, loading it on first use.

    Loaded models are kept in the module-level ``loaded_models`` dict so each
    variant is only fetched and instantiated once per process.

    Args:
        model_type: Variant name forwarded to ``get_model_path``.
        model_size: Size tag forwarded to ``get_model_path``.

    Returns:
        The ``Qwen3TTSModel`` instance cached for this variant.
    """
    cache_key = (model_type, model_size)
    if cache_key in loaded_models:
        return loaded_models[cache_key]

    # Imported lazily so the dependency is only needed once a model is requested.
    from qwen_tts import Qwen3TTSModel

    checkpoint_dir = get_model_path(model_type, model_size)
    # Use bfloat16 on GPU; fall back to float32 on CPU.
    if torch.cuda.is_available():
        device, dtype = "cuda", torch.bfloat16
    else:
        device, dtype = "cpu", torch.float32
    print(f"Đang tải model {model_type} {model_size} trên {device}")

    model = Qwen3TTSModel.from_pretrained(
        checkpoint_dir, device_map=device, dtype=dtype, token=HF_TOKEN
    )
    loaded_models[cache_key] = model
    return model

def _normalize_audio(wav, eps=1e-12, clip=True):
    """Convert raw audio samples to float32 in [-1, 1], averaged over the last axis.

    Args:
        wav: Array-like of integer or floating-point samples.
        eps: Small constant added to the peak when rescaling float input.
        clip: When true, clamp the result to [-1.0, 1.0].

    Returns:
        A float32 NumPy array. Input with more than one dimension is reduced
        by taking the mean over the last axis.

    Raises:
        TypeError: If the sample dtype is neither integer nor floating point.
    """
    samples = np.asarray(wav)
    sample_type = samples.dtype

    if np.issubdtype(sample_type, np.floating):
        scaled = samples.astype(np.float32)
        peak = np.max(np.abs(scaled)) if scaled.size else 0.0
        # Only rescale when the signal actually exceeds full scale.
        if peak > 1.0 + 1e-6:
            scaled = scaled / (peak + eps)
    elif np.issubdtype(sample_type, np.integer):
        limits = np.iinfo(sample_type)
        if limits.min < 0:
            # Signed PCM: divide by the largest representable magnitude.
            scaled = samples.astype(np.float32) / max(abs(limits.min), limits.max)
        else:
            # Unsigned PCM: shift the midpoint to zero, then scale.
            midpoint = (limits.max + 1) / 2.0
            scaled = (samples.astype(np.float32) - midpoint) / midpoint
    else:
        raise TypeError(f"Kiểu dữ liệu không hỗ trợ: {samples.dtype}")

    if clip:
        scaled = np.clip(scaled, -1.0, 1.0)
    if scaled.ndim > 1:
        scaled = np.mean(scaled, axis=-1).astype(np.float32)
    return scaled

def _audio_to_tuple(audio):
    """Turn a supported audio payload into ``(waveform, sample_rate)``.

    Two shapes are recognised:
      * a 2-tuple ``(sample_rate, samples)`` whose first item is an ``int``;
      * a dict holding both ``"sampling_rate"`` and ``"data"``.

    The samples are passed through ``_normalize_audio``.

    Returns:
        ``(waveform, sample_rate)`` for a recognised payload, otherwise
        ``None`` (including when ``audio`` itself is ``None``).
    """
    if isinstance(audio, tuple):
        if len(audio) == 2 and isinstance(audio[0], int):
            sample_rate, samples = audio
            return _normalize_audio(samples), int(sample_rate)
    elif isinstance(audio, dict):
        if all(field in audio for field in ("sampling_rate", "data")):
            sample_rate = int(audio["sampling_rate"])
            return _normalize_audio(audio["data"]), sample_rate
    # None and every unrecognised payload end up here.
    return None

# Speaker names offered in the preset-voice dropdown.
SPEAKERS = [
    "Aiden",
    "Dylan",
    "Eric",
    "Ono_anna",
    "Ryan",
    "Serena",
    "Sohee",
    "Uncle_fu",
    "Vivian",
]

# Vietnamese UI label -> language name handed to the model.
LANGUAGE_MAP = {
    "Tự động": "Auto",
    "Tiếng Trung": "Chinese",
    "Tiếng Anh": "English",
    "Tiếng Nhật": "Japanese",
    "Tiếng Hàn": "Korean",
    "Tiếng Pháp": "French",
    "Tiếng Đức": "German",
    "Tiếng Tây Ban Nha": "Spanish",
    "Tiếng Bồ Đào Nha": "Portuguese",
    "Tiếng Nga": "Russian",
}

# Dropdown choices: the UI labels, in the mapping's insertion order.
LANGUAGES = list(LANGUAGE_MAP)

def generate_voice_design(text, language, voice_description):
    """Synthesize ``text`` with a voice described in natural language.

    Always uses the "VoiceDesign" model at size "1.7B".

    Args:
        text: Text to speak; must contain non-whitespace characters.
        language: UI language label; labels missing from ``LANGUAGE_MAP``
            fall back to "Auto".
        voice_description: Description of the desired voice; must contain
            non-whitespace characters.

    Returns:
        ``((sample_rate, waveform), status)`` on success, or
        ``(None, status)`` when validation fails or generation raises.
    """
    if not (text and text.strip()):
        return None, "❌ Vui lòng nhập văn bản"
    if not (voice_description and voice_description.strip()):
        return None, "❌ Vui lòng nhập mô tả giọng nói"

    try:
        model = get_model("VoiceDesign", "1.7B")
        lang_code = LANGUAGE_MAP.get(language, "Auto")
        request = dict(
            text=text.strip(),
            language=lang_code,
            instruct=voice_description.strip(),
            non_streaming_mode=True,
            max_new_tokens=2048,
        )
        waveforms, sample_rate = model.generate_voice_design(**request)
        return (sample_rate, waveforms[0]), "✅ Hoàn thành!"
    except Exception as err:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"❌ Lỗi: {err!s}"

def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector_only, model_size):
    """Speak ``target_text`` in the voice of a reference recording.

    Uses the "Base" model at the requested size.

    Args:
        ref_audio: Reference audio in a form ``_audio_to_tuple`` accepts.
        ref_text: Transcript of the reference audio; required unless
            ``use_xvector_only`` is set.
        target_text: Text to speak; must contain non-whitespace characters.
        language: UI language label; labels missing from ``LANGUAGE_MAP``
            fall back to "Auto".
        use_xvector_only: Forwarded as ``x_vector_only_mode``; when set, the
            transcript check is skipped.
        model_size: Size tag forwarded to ``get_model``.

    Returns:
        ``((sample_rate, waveform), status)`` on success, or
        ``(None, status)`` when validation fails or generation raises.
    """
    if not (target_text and target_text.strip()):
        return None, "❌ Vui lòng nhập văn bản đích"

    reference = _audio_to_tuple(ref_audio)
    if reference is None:
        return None, "❌ Vui lòng tải audio tham chiếu"

    # A transcript is mandatory only when not running in x-vector-only mode.
    if not (use_xvector_only or (ref_text and ref_text.strip())):
        return None, "❌ Vui lòng nhập văn bản tham chiếu"

    try:
        model = get_model("Base", model_size)
        lang_code = LANGUAGE_MAP.get(language, "Auto")
        request = dict(
            text=target_text.strip(),
            language=lang_code,
            ref_audio=reference,
            ref_text=ref_text.strip() if ref_text else None,
            x_vector_only_mode=use_xvector_only,
            max_new_tokens=2048,
        )
        waveforms, sample_rate = model.generate_voice_clone(**request)
        return (sample_rate, waveforms[0]), "✅ Hoàn thành!"
    except Exception as err:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"❌ Lỗi: {err!s}"

def generate_custom_voice(text, language, speaker, instruct, model_size):
    """Synthesize ``text`` with one of the preset speakers.

    Uses the "CustomVoice" model at the requested size.

    Args:
        text: Text to speak; must contain non-whitespace characters.
        language: UI language label; labels missing from ``LANGUAGE_MAP``
            fall back to "Auto".
        speaker: Speaker name; it is lower-cased and spaces become
            underscores before being sent to the model.
        instruct: Optional style instruction; a falsy value is sent as ``None``.
        model_size: Size tag forwarded to ``get_model``.

    Returns:
        ``((sample_rate, waveform), status)`` on success, or
        ``(None, status)`` when validation fails or generation raises.
    """
    if not (text and text.strip()):
        return None, "❌ Vui lòng nhập văn bản"
    if not speaker:
        return None, "❌ Vui lòng chọn giọng nói"

    try:
        model = get_model("CustomVoice", model_size)
        lang_code = LANGUAGE_MAP.get(language, "Auto")
        request = dict(
            text=text.strip(),
            language=lang_code,
            speaker=speaker.lower().replace(" ", "_"),
            instruct=instruct.strip() if instruct else None,
            non_streaming_mode=True,
            max_new_tokens=2048,
        )
        waveforms, sample_rate = model.generate_custom_voice(**request)
        return (sample_rate, waveforms[0]), "✅ Hoàn thành!"
    except Exception as err:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"❌ Lỗi: {err!s}"

def build_ui():
    """Assemble the Gradio Blocks interface for the demo.

    The layout is a header, three tabs (voice design, voice cloning, preset
    voices) and a footer. Each tab wires its button to the matching
    ``generate_*`` handler, which fills an audio output and a status textbox.

    Returns:
        The constructed ``gr.Blocks`` app; it is not launched here.
    """
    theme = gr.themes.Soft(
        font=[gr.themes.GoogleFont("Inter"), "sans-serif"],
        primary_hue="blue",
        radius_size="md",
    )

    # Custom stylesheet passed to gr.Blocks below. The text (including the
    # CSS comments inside it) is runtime content.
    css = """
    /* Container chính */
    .gradio-container {
        max-width: 100% !important;
        padding: 10px !important;
    }
    
    /* Tab style */
    .tab-nav button {
        font-size: 14px !important;
        padding: 10px 15px !important;
    }
    
    /* Button style */
    button.primary {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
        border: none !important;
        font-weight: 600 !important;
        padding: 12px 24px !important;
        border-radius: 8px !important;
        transition: all 0.3s ease !important;
    }
    
    button.primary:hover {
        transform: translateY(-2px) !important;
        box-shadow: 0 8px 16px rgba(102, 126, 234, 0.3) !important;
    }
    
    /* Input fields */
    textarea, input, select {
        border-radius: 8px !important;
        border: 1.5px solid #e0e0e0 !important;
        font-size: 14px !important;
    }
    
    /* Labels */
    label {
        font-weight: 600 !important;
        color: #374151 !important;
        margin-bottom: 8px !important;
    }
    
    /* Header */
    .app-header {
        text-align: center;
        padding: 20px 10px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border-radius: 12px;
        margin-bottom: 20px;
    }
    
    .app-header h1 {
        margin: 0;
        font-size: clamp(24px, 5vw, 36px);
        font-weight: 700;
    }
    
    .app-header p {
        margin: 8px 0 0 0;
        font-size: clamp(12px, 3vw, 16px);
        opacity: 0.95;
    }
    
    /* Card style cho sections */
    .input-card {
        background: white;
        padding: 20px;
        border-radius: 12px;
        box-shadow: 0 2px 8px rgba(0,0,0,0.08);
        margin-bottom: 15px;
    }
    
    /* Status message */
    .status-box {
        padding: 12px;
        border-radius: 8px;
        margin-top: 10px;
        font-size: 13px;
    }
    
    /* Info boxes */
    .info-box {
        background: #f0f9ff;
        border-left: 4px solid #3b82f6;
        padding: 12px 15px;
        border-radius: 6px;
        margin: 10px 0;
        font-size: 13px;
    }
    
    .warning-box {
        background: #fef3c7;
        border-left: 4px solid #f59e0b;
        padding: 12px 15px;
        border-radius: 6px;
        margin: 10px 0;
        font-size: 13px;
    }
    
    /* Responsive adjustments */
    @media (max-width: 768px) {
        .gradio-container {
            padding: 5px !important;
        }
        
        .app-header {
            padding: 15px 10px;
            margin-bottom: 15px;
        }
        
        .input-card {
            padding: 15px;
        }
        
        button.primary {
            width: 100%;
            padding: 14px 20px !important;
        }
        
        .tab-nav button {
            font-size: 12px !important;
            padding: 8px 10px !important;
        }
    }
    
    /* Compact spacing for mobile */
    @media (max-width: 480px) {
        .block {
            margin: 8px 0 !important;
        }
        
        textarea {
            font-size: 14px !important;
        }
    }
    """

    with gr.Blocks(theme=theme, css=css, title="Qwen3-TTS") as demo:
        # Header banner (styled by the .app-header rules above)
        gr.HTML("""
        <div class="app-header">
            <h1>🎙️ Qwen3-TTS</h1>
            <p>Chuyển đổi Văn bản thành Giọng nói bằng AI</p>
        </div>
        """)

        with gr.Tabs():
            # Tab 1: voice design - speech in a voice described by free text
            with gr.Tab("🎨 Thiết kế Giọng"):
                gr.Markdown("**Tạo giọng nói tùy chỉnh bằng mô tả** (Model 1.7B)")
                
                design_text = gr.Textbox(
                    label="📝 Văn bản",
                    lines=4,
                    placeholder="Nhập nội dung cần đọc...",
                    value="Xin chào! Đây là giọng nói được tạo bởi AI."
                )
                
                design_language = gr.Dropdown(
                    label="🌍 Ngôn ngữ",
                    choices=LANGUAGES,
                    value="Tự động"
                )
                
                design_instruct = gr.Textbox(
                    label="🎭 Mô tả giọng nói",
                    lines=3,
                    placeholder="VD: Giọng vui vẻ, tràn đầy năng lượng...",
                    value="Nói với giọng thân thiện và nhiệt tình"
                )
                
                design_btn = gr.Button("🚀 Tạo giọng nói", variant="primary")
                design_audio_out = gr.Audio(label="🔊 Kết quả")
                design_status = gr.Textbox(label="Trạng thái", lines=2, interactive=False)
                
                gr.HTML("""
                <div class="info-box">
                    <strong>💡 Mẹo:</strong> Mô tả chi tiết cảm xúc, tốc độ, phong cách để có kết quả tốt nhất
                </div>
                """)

                # Input order must match generate_voice_design's parameters.
                design_btn.click(
                    generate_voice_design,
                    inputs=[design_text, design_language, design_instruct],
                    outputs=[design_audio_out, design_status]
                )

            # Tab 2: voice cloning from a reference audio sample
            with gr.Tab("🎤 Nhân bản Giọng"):
                gr.Markdown("**Sao chép giọng nói từ audio mẫu**")
                
                # NOTE(review): type="numpy" presumably yields (sample_rate, ndarray),
                # the tuple form _audio_to_tuple accepts - confirm against Gradio docs.
                clone_ref_audio = gr.Audio(
                    label="🎵 Audio mẫu",
                    type="numpy"
                )
                
                clone_ref_text = gr.Textbox(
                    label="📄 Nội dung audio mẫu",
                    lines=2,
                    placeholder="Nhập chính xác nội dung trong audio..."
                )
                
                # Passed to the handler as use_xvector_only; when checked, the
                # reference transcript is not required.
                clone_xvector = gr.Checkbox(
                    label="⚡ Chế độ nhanh (không cần nội dung audio)",
                    value=False
                )
                
                clone_target_text = gr.Textbox(
                    label="✍️ Văn bản cần đọc",
                    lines=3,
                    placeholder="Nhập nội dung muốn giọng nhân bản đọc..."
                )
                
                with gr.Row():
                    clone_language = gr.Dropdown(
                        label="🌍 Ngôn ngữ",
                        choices=LANGUAGES,
                        value="Tự động",
                        scale=1
                    )
                    clone_model_size = gr.Dropdown(
                        label="⚙️ Model",
                        choices=MODEL_SIZES,
                        value="0.6B",
                        scale=1
                    )
                
                clone_btn = gr.Button("🎬 Nhân bản giọng", variant="primary")
                clone_audio_out = gr.Audio(label="🔊 Kết quả")
                clone_status = gr.Textbox(label="Trạng thái", lines=2, interactive=False)
                
                gr.HTML("""
                <div class="info-box">
                    <strong>💡 Lưu ý:</strong> Audio mẫu nên rõ ràng, ít nhiễu và độ dài 3-10 giây
                </div>
                """)

                # Input order must match generate_voice_clone's parameters.
                clone_btn.click(
                    generate_voice_clone,
                    inputs=[clone_ref_audio, clone_ref_text, clone_target_text, 
                           clone_language, clone_xvector, clone_model_size],
                    outputs=[clone_audio_out, clone_status]
                )

            # Tab 3: preset voices
            with gr.Tab("🗣️ Giọng có sẵn"):
                gr.Markdown("**Sử dụng giọng đọc được huấn luyện sẵn**")
                
                tts_text = gr.Textbox(
                    label="📝 Văn bản",
                    lines=4,
                    placeholder="Nhập nội dung cần đọc...",
                    value="Xin chào! Chào mừng bạn đến với hệ thống TTS."
                )
                
                with gr.Row():
                    tts_language = gr.Dropdown(
                        label="🌍 Ngôn ngữ",
                        choices=LANGUAGES,
                        value="Tiếng Anh",
                        scale=1
                    )
                    tts_speaker = gr.Dropdown(
                        label="👤 Giọng đọc",
                        choices=SPEAKERS,
                        value="Ryan",
                        scale=1
                    )
                
                tts_instruct = gr.Textbox(
                    label="🎨 Phong cách (tùy chọn)",
                    lines=2,
                    placeholder="VD: Nói chậm rãi và rõ ràng"
                )
                
                tts_model_size = gr.Dropdown(
                    label="⚙️ Kích thước Model",
                    choices=MODEL_SIZES,
                    value="0.6B"
                )
                
                tts_btn = gr.Button("🎵 Tạo giọng nói", variant="primary")
                tts_audio_out = gr.Audio(label="🔊 Kết quả")
                tts_status = gr.Textbox(label="Trạng thái", lines=2, interactive=False)
                
                gr.HTML("""
                <div class="info-box">
                    <strong>👥 Giọng:</strong> Aiden, Dylan, Eric, Ryan (nam) • Serena, Vivian (nữ) • Ono_anna, Sohee (châu Á)
                </div>
                """)

                # Input order must match generate_custom_voice's parameters.
                tts_btn.click(
                    generate_custom_voice,
                    inputs=[tts_text, tts_language, tts_speaker, tts_instruct, tts_model_size],
                    outputs=[tts_audio_out, tts_status]
                )

        # Footer: CPU performance warning and project attribution
        gr.HTML("""
        <div class="warning-box">
            <strong>⚠️ Lưu ý CPU:</strong> Thời gian xử lý: 30s - vài phút. Dùng model 0.6B để nhanh hơn. Văn bản ngắn tốt hơn.
        </div>
        """)
        
        gr.Markdown("""
---
<div style="text-align: center; color: #888; font-size: 13px;">
Powered by <a href="https://github.com/QwenLM/Qwen3-TTS" target="_blank">Qwen3-TTS</a> • Alibaba Qwen Team
</div>
        """)

    return demo

if __name__ == "__main__":
    # Serve on every network interface at port 7860, without a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo = build_ui()
    demo.launch(**launch_options)