# coding=utf-8
"""Gradio demo for Qwen3-TTS: voice design, voice cloning and Whisper ASR."""
import os
import sys
import logging
import functools
import uuid
import random

import spaces
import gradio as gr
import numpy as np
import torch
from huggingface_hub import snapshot_download, login
from qwen_tts import Qwen3TTSModel
from qwen_tts.inference.qwen3_tts_model import VoiceClonePromptItem
import whisper
import librosa
from opencc import OpenCC

# Configure logging to stdout so messages show up in the Space's log viewer.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger("Qwen3-TTS-Demo")

# Traditional -> Simplified Chinese converter (Whisper may emit Traditional).
cc = OpenCC('t2s')

HF_TOKEN = os.environ.get('HF_TOKEN')
if HF_TOKEN:
    login(token=HF_TOKEN)

MODEL_SIZES = ["0.6B", "1.7B"]
LANGUAGES = ["Auto", "Chinese", "English", "Japanese", "Korean", "French",
             "German", "Spanish", "Portuguese", "Russian"]


def seed_everything(seed=42):
    """Seed python/numpy/torch (incl. CUDA) RNGs for reproducible generation."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def get_model_path(model_type: str, model_size: str) -> str:
    """Download (or reuse the HF cache for) a model snapshot; return local path."""
    return snapshot_download(f"Qwen/Qwen3-TTS-12Hz-{model_size}-{model_type}")


# maxsize=2 keeps both configurations this demo uses resident
# (("VoiceDesign", "1.7B") and ("Base", "0.6B")); maxsize=1 evicted and
# reloaded a multi-GB model on every switch between the two tabs.
@functools.lru_cache(maxsize=2)
def load_model(model_type, model_size):
    """Load a Qwen3-TTS model onto the GPU in bfloat16."""
    path = get_model_path(model_type, model_size)
    return Qwen3TTSModel.from_pretrained(
        path,
        device_map="cuda",
        dtype=torch.bfloat16,
        token=HF_TOKEN,
        attn_implementation="kernels-community/flash-attn3"
    )


# Only the most recently used Whisper size is cached, to bound GPU memory.
@functools.lru_cache(maxsize=1)
def load_whisper_model(model_name="large-v3"):
    """Load a Whisper ASR model (GPU when available, otherwise CPU)."""
    model = whisper.load_model(
        model_name, device="cuda" if torch.cuda.is_available() else "cpu")
    return model


def _normalize_audio(wav, eps=1e-12, clip=True):
    """Convert audio samples to mono float32 in [-1, 1].

    Integer input is scaled by its dtype range; float input is rescaled only
    when its peak exceeds 1.0. Multi-channel audio is averaged to mono.

    Raises:
        TypeError: for non-numeric (neither integer nor floating) dtypes.
    """
    x = np.asarray(wav)
    if np.issubdtype(x.dtype, np.integer):
        info = np.iinfo(x.dtype)
        # abs(info.min) >= info.max for two's-complement ints, so this keeps
        # the most negative sample within [-1, 1].
        y = x.astype(np.float32) / max(abs(info.min), info.max)
    elif np.issubdtype(x.dtype, np.floating):
        y = x.astype(np.float32)
        m = np.max(np.abs(y)) if y.size else 0.0
        if m > 1.0 + 1e-6:
            y = y / (m + eps)
    else:
        raise TypeError(f"Unsupported dtype: {x.dtype}")
    if clip:
        y = np.clip(y, -1.0, 1.0)
    if y.ndim > 1:
        # Down-mix channels (last axis) to mono.
        y = np.mean(y, axis=-1).astype(np.float32)
    return y


def _audio_to_tuple(audio):
    """Normalize a Gradio audio value to ``(float32_mono_wav, sample_rate)``.

    Accepts either the ``(sr, ndarray)`` tuple or the dict form with
    ``sampling_rate``/``data`` keys. Returns None for anything else.
    """
    if audio is None:
        return None
    if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[0], int):
        sr, wav = audio
        wav = _normalize_audio(wav)
        return wav, int(sr)
    if isinstance(audio, dict) and "sampling_rate" in audio and "data" in audio:
        sr = int(audio["sampling_rate"])
        wav = _normalize_audio(audio["data"])
        return wav, sr
    return None


@spaces.GPU
def infer_voice_design(part, language, voice_description):
    """Synthesize `part` with a voice described in natural language.

    Returns:
        (wav, sample_rate) for the first generated clip.
    """
    voice_design_model = load_model("VoiceDesign", "1.7B")
    seed_everything(42)
    wavs, sr = voice_design_model.generate_voice_design(
        text=part,
        language=language,
        instruct=voice_description.strip(),
        non_streaming_mode=True,
        max_new_tokens=2048,
    )
    return wavs[0], sr


@spaces.GPU
def infer_voice_clone(part, language, audio_tuple, ref_text, use_xvector_only):
    """Clone the voice in `audio_tuple` and speak `part` with it.

    Args:
        audio_tuple: (wav, sr) reference audio from `_audio_to_tuple`.
        ref_text: transcript of the reference audio (may be empty when
            `use_xvector_only` is set).
        use_xvector_only: skip in-context-learning and use speaker embedding only.
    """
    tts = load_model("Base", "0.6B")
    voice_clone_prompt = tts.create_voice_clone_prompt(
        ref_audio=audio_tuple,
        ref_text=ref_text.strip() if ref_text else None,
        x_vector_only_mode=use_xvector_only
    )
    wavs, sr = tts.generate_voice_clone(
        text=part,
        language=language,
        voice_clone_prompt=voice_clone_prompt,
        max_new_tokens=2048,
        seed=42,
        temperature=0.3,
        top_p=0.85
    )
    return wavs[0], sr


@spaces.GPU
def infer_voice_clone_from_prompt(part, language, prompt_file_path):
    """Generate cloned speech from a previously extracted prompt (.pt) file.

    Accepts three on-disk formats: a list of VoiceClonePromptItem, a list of
    plain dicts (re-wrapped into items), or any other object passed through
    as-is.
    """
    # NOTE(security): weights_only=False unpickles arbitrary objects from a
    # user-uploaded file; only enable this in trusted deployments.
    loaded_data = torch.load(prompt_file_path, map_location='cuda', weights_only=False)
    if (isinstance(loaded_data, list) and len(loaded_data) > 0
            and isinstance(loaded_data[0], VoiceClonePromptItem)):
        voice_clone_prompt = loaded_data
    elif (isinstance(loaded_data, list) and len(loaded_data) > 0
            and isinstance(loaded_data[0], dict)):
        voice_clone_prompt = [VoiceClonePromptItem(**item) for item in loaded_data]
    else:
        voice_clone_prompt = loaded_data
    if isinstance(voice_clone_prompt, list):
        for item in voice_clone_prompt:
            # Drop a stray leading batch dim some saved prompts carry.
            if item.ref_code is not None and item.ref_code.ndim == 3:
                item.ref_code = item.ref_code.squeeze(0)
    tts = load_model("Base", "0.6B")
    wavs, sr = tts.generate_voice_clone(
        text=part,
        language=language,
        voice_clone_prompt=voice_clone_prompt,
        max_new_tokens=2048,
        seed=42,
        temperature=0.3,
        top_p=0.85
    )
    return wavs[0], sr


@spaces.GPU
def extract_voice_clone_prompt(ref_audio, ref_text, use_xvector_only):
    """Extract voice-clone features from reference audio and save them to a .pt file.

    When no reference text is supplied, Whisper transcribes the audio; if
    transcription fails, extraction falls back to x-vector-only mode.

    Returns:
        Path of the saved prompt file, or None on invalid input (the single
        Gradio File output expects exactly one value).
    """
    tts = load_model("Base", "0.6B")
    seed_everything(42)
    audio_tuple = _audio_to_tuple(ref_audio)
    if audio_tuple is None:
        # Fix: this handler is wired to a single output component, so the
        # error path must return one value, not a (None, message) tuple.
        logger.error("错误:需要参考音频。")
        return None
    r_text = ref_text
    uxo = use_xvector_only
    if not r_text or (isinstance(r_text, str) and not r_text.strip()):
        whisper_size = "base"
        try:
            whisper_model = load_whisper_model(whisper_size)
            audio_data, sr = audio_tuple
            # Whisper expects 16 kHz input.
            if sr != 16000:
                whisper_audio = librosa.resample(audio_data, orig_sr=sr, target_sr=16000)
            else:
                whisper_audio = audio_data
            result = whisper_model.transcribe(whisper_audio)
            res_val = result.get("text", "")
            if isinstance(res_val, list) and len(res_val) > 0:
                res_val = res_val[0]
            if not isinstance(res_val, str):
                res_val = str(res_val)
            r_text = cc.convert(res_val.strip())
            uxo = False
        except Exception as e:
            logger.error(f"Whisper 识别失败: {str(e)}", exc_info=True)
            # No transcript available: fall back to x-vector-only cloning.
            uxo = True
    r_text_str = ""
    if isinstance(r_text, str):
        r_text_str = r_text.strip()
    elif isinstance(r_text, list) and len(r_text) > 0 and isinstance(r_text[0], str):
        r_text_str = r_text[0].strip()
    logger.info(f"语音识别成功 :{r_text_str}")
    voice_clone_prompt_items = tts.create_voice_clone_prompt(
        ref_audio=audio_tuple,
        ref_text=r_text_str if r_text_str else None,
        x_vector_only_mode=uxo
    )
    # Persist plain dicts (not the dataclass) so the file stays loadable even
    # if the VoiceClonePromptItem class changes.
    prompt_data = []
    for item in voice_clone_prompt_items:
        prompt_data.append({
            "ref_code": item.ref_code,
            "ref_spk_embedding": item.ref_spk_embedding,
            "x_vector_only_mode": item.x_vector_only_mode,
            "icl_mode": item.icl_mode,
            "ref_text": item.ref_text
        })
    file_id = str(uuid.uuid4())[:8]
    file_path = f"voice_clone_prompt_{file_id}.pt"
    torch.save(prompt_data, file_path)
    return file_path


def generate_voice_design(text, language, voice_description):
    """Gradio handler: validate inputs and run voice design.

    Returns:
        ((sr, wav), status_message) — audio is None on failure.
    """
    if not text or not text.strip():
        return None, "错误:文本不能为空。"
    if not voice_description or not voice_description.strip():
        return None, "错误:语音描述不能为空。"
    try:
        wav, sr = infer_voice_design(text.strip(), language, voice_description)
        return (sr, wav), "语音设计生成成功!"
    except Exception as e:
        logger.error(f"Voice Design 生成失败: {str(e)}", exc_info=True)
        return None, f"错误: {e}"


def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector_only):
    """Gradio handler: validate inputs and clone directly from reference audio.

    Returns:
        ((sr, wav), status_message) — audio is None on failure.
    """
    t_text = target_text.strip() if isinstance(target_text, str) else ""
    if not t_text:
        return None, "错误:目标文本不能为空。"
    audio_tuple = _audio_to_tuple(ref_audio)
    if audio_tuple is None:
        return None, "错误:需要参考音频。"
    r_text = ref_text.strip() if isinstance(ref_text, str) else ""
    # ICL mode needs the transcript; only x-vector-only mode can do without it.
    if not use_xvector_only and not r_text:
        return None, "错误:未启用 '仅使用 x-vector' 时需要参考文本。"
    try:
        wav, sr = infer_voice_clone(t_text, language, audio_tuple, r_text, use_xvector_only)
        return (sr, wav), "语音克隆生成成功!"
    except Exception as e:
        logger.error(f"Voice Clone 生成失败: {str(e)}", exc_info=True)
        return None, f"错误: {e}"


def generate_voice_clone_from_prompt_file(prompt_file_path, target_text, language):
    """Gradio handler: clone using a previously extracted feature (.pt) file.

    Returns:
        ((sr, wav), status_message) — audio is None on failure.
    """
    t_text = target_text.strip() if isinstance(target_text, str) else ""
    if not t_text:
        return None, "错误:目标文本不能为空。"
    if not prompt_file_path:
        return None, "错误:需要提供音频特征文件。"
    try:
        wav, sr = infer_voice_clone_from_prompt(t_text, language, prompt_file_path)
        return (sr, wav), "语音克隆生成成功(使用特征文件)!"
    except Exception as e:
        logger.error(f"Voice Clone 生成失败: {str(e)}", exc_info=True)
        return None, f"错误: {e}"


@spaces.GPU
def infer_whisper_audio(audio_path, model_size="base"):
    """Transcribe an audio file with Whisper, converted to Simplified Chinese.

    Returns:
        The transcript string, or a Chinese error message on failure.
    """
    if not audio_path:
        return "错误:请上传音频文件或进行录音。"
    try:
        model = load_whisper_model(model_size)
        result = model.transcribe(audio_path)
        res_val = result.get("text", "")
        if isinstance(res_val, list) and len(res_val) > 0:
            res_val = res_val[0]
        if not isinstance(res_val, str):
            res_val = str(res_val)
        return cc.convert(res_val.strip())
    except Exception as e:
        logger.error(f"Whisper 识别失败: {str(e)}", exc_info=True)
        return f"识别出错: {e}"


def build_ui():
    """Assemble the Gradio Blocks UI (ASR, Voice Design and Voice Clone tabs)."""
    theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"])
    with gr.Blocks(theme=theme, title="Qwen3-TTS Demo") as demo:
        gr.Markdown("# Qwen3-TTS Demo")
        with gr.Tabs():
            with gr.Tab("ASR (Whisper)"):
                with gr.Row():
                    with gr.Column():
                        asr_audio_input = gr.Audio(label="输入音频", type="filepath",
                                                   sources=["microphone", "upload"])
                        asr_model_size = gr.Dropdown(label="Whisper 模型大小",
                                                     choices=["base", "small", "medium", "large-v3"],
                                                     value="base")
                        asr_btn = gr.Button("开始识别", variant="primary")
                    with gr.Column():
                        asr_text_output = gr.Textbox(label="识别结果", lines=10, show_copy_button=True)
                asr_btn.click(infer_whisper_audio,
                              inputs=[asr_audio_input, asr_model_size],
                              outputs=[asr_text_output])
            with gr.Tab("Voice Design"):
                with gr.Row():
                    with gr.Column():
                        design_text = gr.Textbox(label="目标文本", lines=4,
                                                 value="It's in the top drawer... wait, it's empty?")
                        design_language = gr.Dropdown(label="语言", choices=LANGUAGES, value="Auto")
                        design_instruct = gr.Textbox(label="语音描述", lines=3,
                                                     value="Speak in an incredulous tone.")
                        design_btn = gr.Button("开始生成", variant="primary")
                    with gr.Column():
                        design_audio_out = gr.Audio(label="生成音频", type="numpy")
                        design_status = gr.Textbox(label="状态", interactive=False)
                design_btn.click(generate_voice_design,
                                 inputs=[design_text, design_language, design_instruct],
                                 outputs=[design_audio_out, design_status],
                                 api_name="generate_voice_design")
            with gr.Tab("Voice Clone (Base)"):
                # Section 1: extract reusable features from reference audio.
                gr.Markdown("### 1. 提取音频特征")
                with gr.Row():
                    with gr.Column():
                        extract_ref_audio = gr.Audio(label="参考音频", type="numpy")
                        extract_ref_text = gr.Textbox(label="参考文本", lines=2)
                        extract_xvector = gr.Checkbox(label="仅使用 x-vector", value=False)
                        extract_btn = gr.Button("提取音频特征", variant="primary")
                    with gr.Column():
                        extract_file_out = gr.File(label="特征文件 (.pt)")
                extract_btn.click(extract_voice_clone_prompt,
                                  inputs=[extract_ref_audio, extract_ref_text, extract_xvector],
                                  outputs=[extract_file_out],
                                  api_name="extract_voice_clone_prompt")
                # Section 2: generate speech from a saved feature file.
                gr.Markdown("### 2. 使用特征文件生成")
                with gr.Row():
                    with gr.Column():
                        prompt_file = gr.File(label="特征文件 (.pt)")
                        prompt_target_text = gr.Textbox(label="目标文本", lines=4)
                        prompt_language = gr.Dropdown(label="语言", choices=LANGUAGES, value="Auto")
                        prompt_btn = gr.Button("使用特征文件生成", variant="primary")
                    with gr.Column():
                        prompt_audio_out = gr.Audio(label="生成音频", type="numpy")
                        prompt_status = gr.Textbox(label="状态", interactive=False)
                prompt_btn.click(generate_voice_clone_from_prompt_file,
                                 inputs=[prompt_file, prompt_target_text, prompt_language],
                                 outputs=[prompt_audio_out, prompt_status],
                                 api_name="generate_voice_clone_from_prompt")
                gr.Markdown("---")
                # Section 3: Traditional Voice Clone (Original)
                gr.Markdown("### 3. 传统音色克隆(直接使用参考音频)")
                gr.Markdown("直接上传参考音频生成语音(每次都需要提取特征)。")
                with gr.Row():
                    with gr.Column(scale=2):
                        clone_ref_audio = gr.Audio(
                            label="参考音频",
                            type="numpy",
                        )
                        clone_ref_text = gr.Textbox(
                            label="参考文本",
                            lines=2,
                            placeholder="输入参考音频中的确切文字...",
                        )
                        clone_xvector = gr.Checkbox(
                            label="仅使用 x-vector",
                            value=False,
                        )
                    with gr.Column(scale=2):
                        clone_target_text = gr.Textbox(
                            label="目标文本",
                            lines=4,
                            placeholder="输入要让克隆音色说话的文字...",
                        )
                        with gr.Row():
                            clone_language = gr.Dropdown(
                                label="语言",
                                choices=LANGUAGES,
                                value="Auto",
                                interactive=True,
                            )
                        clone_btn = gr.Button("克隆并生成", variant="primary")
                with gr.Row():
                    clone_audio_out = gr.Audio(label="生成的音频", type="numpy")
                    clone_status = gr.Textbox(label="状态", lines=2, interactive=False)
                clone_btn.click(
                    generate_voice_clone,
                    inputs=[clone_ref_audio, clone_ref_text, clone_target_text,
                            clone_language, clone_xvector],
                    outputs=[clone_audio_out, clone_status],
                    api_name="generate_voice_clone"
                )
    return demo


if __name__ == "__main__":
    build_ui().launch()