import json
import os
import re
from pathlib import Path
from typing import Optional

import torch

from shared.mps import mps_device_or
from shared.utils import files_locator as fl

from .prompt_enhancers import TTS_MONOLOGUE_PROMPT, TTS_QWEN3_DIALOGUE_PROMPT


# Supported Qwen3 TTS variants, keyed by base model type.
#   "config_file": file name resolved against _QWEN3_CONFIG_DIR by
#                  get_qwen3_config_path().
#   "repo":        not read anywhere in this file; presumably the upstream
#                  Hugging Face repository id of the variant -- TODO confirm.
QWEN3_TTS_VARIANTS = {
    "qwen3_tts_customvoice": {
        "repo": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
        "config_file": "qwen3_tts_customvoice.json",
    },
    "qwen3_tts_voicedesign": {
        "repo": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
        "config_file": "qwen3_tts_voicedesign.json",
    },
    "qwen3_tts_base": {
        "repo": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
        "config_file": "qwen3_tts_base.json",
    },
}

# File name of the generation config, resolved against _QWEN3_CONFIG_DIR by
# get_qwen3_generation_config_path().
QWEN3_TTS_GENERATION_CONFIG = "qwen3_tts_generation_config.json"
# Directory of the bundled JSON configs: <directory of this module>/qwen3/configs.
_QWEN3_CONFIG_DIR = Path(__file__).resolve().parent / "qwen3" / "configs"

# Tokenizer assets requested by get_qwen3_download_def(): each *_DIR is used as
# the source folder and the matching *_FILES list as the files to fetch from
# QWEN3_TTS_REPO.
QWEN3_TTS_TEXT_TOKENIZER_DIR = "qwen3_tts_text_tokenizer"
QWEN3_TTS_SPEECH_TOKENIZER_DIR = "qwen3_tts_tokenizer_12hz"
QWEN3_TTS_SPEECH_TOKENIZER_WEIGHTS = "qwen3_tts_tokenizer_12hz.safetensors"
QWEN3_TTS_REPO = "DeepBeepMeep/TTS"
QWEN3_TTS_TEXT_TOKENIZER_FILES = [
    "merges.txt",
    "vocab.json",
    "tokenizer_config.json",
    "preprocessor_config.json",
]
QWEN3_TTS_SPEECH_TOKENIZER_FILES = [
    "config.json",
    "configuration.json",
    "preprocessor_config.json",
    QWEN3_TTS_SPEECH_TOKENIZER_WEIGHTS,
]

# Languages returned by get_qwen3_languages() when the variant's config file
# cannot be loaded.
QWEN3_TTS_LANG_FALLBACK = [
    "auto",
    "chinese",
    "english",
    "japanese",
    "korean",
    "german",
    "french",
    "russian",
    "portuguese",
    "spanish",
    "italian",
]
# Speakers returned by get_qwen3_speakers() when the variant's config file
# cannot be loaded or lists no speakers. The first entry becomes the default
# speaker wherever callers pick speakers[0].
QWEN3_TTS_SPEAKER_FALLBACK = [
    "serena",
    "vivian",
    "uncle_fu",
    "ryan",
    "aiden",
    "ono_anna",
    "sohee",
    "eric",
    "dylan",
]
# Per-speaker descriptions, keyed by lower-case speaker name. Used by
# _format_qwen3_speaker_label() to build "Name (style; language)" labels.
QWEN3_TTS_SPEAKER_META = {
    "vivian": {
        "style": "Bright, slightly edgy young female voice",
        "language": "Chinese",
    },
    "serena": {
        "style": "Warm, gentle young female voice",
        "language": "Chinese",
    },
    "uncle_fu": {
        "style": "Seasoned male voice with a low, mellow timbre",
        "language": "Chinese",
    },
    "dylan": {
        "style": "Youthful Beijing male voice with a clear, natural timbre",
        "language": "Chinese (Beijing Dialect)",
    },
    "eric": {
        "style": "Lively Chengdu male voice with a slightly husky brightness",
        "language": "Chinese (Sichuan Dialect)",
    },
    "ryan": {
        "style": "Dynamic male voice with strong rhythmic drive",
        "language": "English",
    },
    "aiden": {
        "style": "Sunny American male voice with a clear midrange",
        "language": "English",
    },
    "ono_anna": {
        "style": "Playful Japanese female voice with a light, nimble timbre",
        "language": "Japanese",
    },
    "sohee": {
        "style": "Warm Korean female voice with rich emotion",
        "language": "Korean",
    },
}
# Duration slider definition. A copy is placed under "duration_slider" in every
# model definition, and its "default" is read by get_qwen3_duration_default().
QWEN3_TTS_DURATION_SLIDER = {
    "label": "Max duration (seconds)",
    "min": 1,
    "max": 600,
    "increment": 1,
    "default": 20,
}
# Audio prompt type selector. A copy is placed under
# "audio_prompt_type_sources" in the qwen3_tts_base model definition.
# "A" selects one reference speaker, "AB" selects two.
QWEN3_TTS_AUDIO_PROMPT_TYPE_SOURCES = {
    "selection": ["A", "AB"],
    "labels": {
        "A": "Voice cloning of 1 speaker",
        "AB": "Voice cloning of 2 speakers (Speaker 1 and Speaker 2)",
    },
    "letters_filter": "AB",
    "default": "A",
}
# Key under which the optional auto-split interval is stored inside a request's
# "custom_settings" dictionary (see family_handler.validate_generative_settings).
QWEN3_TTS_AUTO_SPLIT_SETTING_ID = "auto_split_every_s"
# Inclusive bounds, in seconds, accepted for the auto-split interval.
QWEN3_TTS_AUTO_SPLIT_MIN_SECONDS = 5.0
QWEN3_TTS_AUTO_SPLIT_MAX_SECONDS = 90.0
# Custom settings copied into the qwen3_tts_base model definition.
QWEN3_TTS_CUSTOM_SETTINGS = [
    {
        "id": QWEN3_TTS_AUTO_SPLIT_SETTING_ID,
        "label": "Auto Split Every s (5-90, optional), may reduce VRAM requirements for very long speeches.",
        "name": "Auto Split Every s",
        "type": "float",
    },
]


def _format_qwen3_label(value: str) -> str:
    """Turn an identifier such as ``"uncle_fu"`` into a title-cased label."""
    words = value.split("_")
    return " ".join(words).title()


def _format_qwen3_speaker_label(name: str) -> str:
    """Build the display label for a speaker.

    The title-cased name is returned alone when the speaker has no entry in
    ``QWEN3_TTS_SPEAKER_META`` (looked up by lower-cased name) or when that
    entry has neither a style nor a language. Otherwise the non-empty
    descriptions are appended as ``"Name (style; language)"``.
    """
    base_label = _format_qwen3_label(name)
    info = QWEN3_TTS_SPEAKER_META.get(name.lower())
    if not info:
        return base_label
    candidates = (info.get("style", ""), info.get("language", ""))
    details = [text for text in candidates if text]
    if details:
        return f"{base_label} ({'; '.join(details)})"
    return base_label


def get_qwen3_config_path(base_model_type: str) -> Optional[str]:
    """Return the path of the bundled config for *base_model_type*.

    Returns ``None`` when the model type is not a key of
    ``QWEN3_TTS_VARIANTS`` or when the expected file does not exist under
    ``_QWEN3_CONFIG_DIR``.
    """
    entry = QWEN3_TTS_VARIANTS.get(base_model_type)
    if entry is None:
        return None
    candidate = _QWEN3_CONFIG_DIR / entry["config_file"]
    if not candidate.is_file():
        return None
    return str(candidate)


def get_qwen3_generation_config_path() -> Optional[str]:
    """Return the path of the generation config, or ``None`` if the file is absent."""
    candidate = _QWEN3_CONFIG_DIR / QWEN3_TTS_GENERATION_CONFIG
    if candidate.is_file():
        return str(candidate)
    return None


def load_qwen3_config(base_model_type: str) -> Optional[dict]:
    """Load and parse the JSON config of *base_model_type*.

    Returns ``None`` when ``get_qwen3_config_path`` finds no config file;
    otherwise returns whatever ``json.load`` produces for that file.
    """
    located = get_qwen3_config_path(base_model_type)
    if located:
        with open(located, "r", encoding="utf-8") as stream:
            return json.load(stream)
    return None


def get_qwen3_languages(base_model_type: str) -> list[str]:
    """List the languages selectable for *base_model_type*.

    Without a loadable config, a copy of ``QWEN3_TTS_LANG_FALLBACK`` is
    returned. Otherwise the result is ``"auto"`` followed by the sorted,
    lower-cased, de-duplicated keys of ``talker_config.codec_language_id``,
    skipping every key whose name contains "dialect".
    """
    cfg = load_qwen3_config(base_model_type)
    if cfg is None:
        return list(QWEN3_TTS_LANG_FALLBACK)
    language_ids = cfg.get("talker_config", {}).get("codec_language_id", {})
    unique_names = set()
    for key in language_ids.keys():
        lowered = key.lower()
        if "dialect" in lowered:
            continue
        unique_names.add(lowered)
    return ["auto", *sorted(unique_names)]


def get_qwen3_speakers(base_model_type: str) -> list[str]:
    """List the speakers available for *base_model_type*.

    Returns the sorted, lower-cased, de-duplicated keys of
    ``talker_config.spk_id`` from the variant's config. A copy of
    ``QWEN3_TTS_SPEAKER_FALLBACK`` is returned instead when the config cannot
    be loaded or declares no speakers.
    """
    cfg = load_qwen3_config(base_model_type)
    if cfg is None:
        return list(QWEN3_TTS_SPEAKER_FALLBACK)
    speaker_ids = cfg.get("talker_config", {}).get("spk_id", {})
    normalized = sorted({key.lower() for key in speaker_ids.keys()})
    if normalized:
        return normalized
    return list(QWEN3_TTS_SPEAKER_FALLBACK)


def get_qwen3_language_choices(base_model_type: str) -> list[tuple[str, str]]:
    """Return ``(label, value)`` pairs for every language of *base_model_type*."""
    choices = []
    for code in get_qwen3_languages(base_model_type):
        choices.append((_format_qwen3_label(code), code))
    return choices


def get_qwen3_speaker_choices(base_model_type: str) -> list[tuple[str, str]]:
    """Return ``(label, value)`` pairs for every speaker of *base_model_type*."""
    choices = []
    for speaker in get_qwen3_speakers(base_model_type):
        choices.append((_format_qwen3_speaker_label(speaker), speaker))
    return choices


def get_qwen3_model_def(base_model_type: str) -> dict:
    """Build the model definition dictionary for a Qwen3 TTS variant.

    Every variant shares the same set of common entries. The three known
    variants then add a ``model_modes`` selector and an ``alt_prompt`` field:

    * ``qwen3_tts_customvoice``: speaker selector plus an optional instruction.
    * ``qwen3_tts_voicedesign``: language selector plus a voice instruction.
    * ``qwen3_tts_base``: language selector, reference transcript field and the
      voice-cloning entries (audio prompts, custom settings, dialogue
      prompt-enhancer).

    Any other model type receives only the common entries. A new dictionary
    is built on every call.
    """
    is_base = base_model_type == "qwen3_tts_base"

    # Only the base variant offers the two-person dialogue enhancer ("T1").
    enhancer_def = {
        "selection": ["T", "T1"] if is_base else ["T"],
        "labels": {
            "T": "A Speech based on current Prompt",
            "T1": "A Dialogue between two People based on current Prompt",
        },
        "default": "T",
    }

    model_def = {
        "audio_only": True,
        "image_outputs": False,
        "sliding_window": False,
        "guidance_max_phases": 0,
        "no_negative_prompt": True,
        "inference_steps": False,
        "temperature": True,
        "image_prompt_types_allowed": "",
        "supports_early_stop": True,
        "profiles_dir": [base_model_type],
        "duration_slider": QWEN3_TTS_DURATION_SLIDER.copy(),
        "top_k_slider": True,
        "text_prompt_enhancer_instructions": TTS_MONOLOGUE_PROMPT,
        "text_prompt_enhancer_max_tokens": 512,
        "prompt_enhancer_button_label": "Write",
        "compile": False,
        "parent_model_type": "qwen3_tts_base",
        "lm_engines": ["cg"],
        "prompt_enhancer_def": enhancer_def,
    }

    if base_model_type == "qwen3_tts_customvoice":
        known_speakers = get_qwen3_speakers(base_model_type)
        model_def["model_modes"] = {
            "choices": get_qwen3_speaker_choices(base_model_type),
            "default": known_speakers[0] if known_speakers else "",
            "label": "Speaker",
        }
        model_def["alt_prompt"] = {
            "label": "Instruction (optional)",
            "placeholder": "calm, friendly, slightly husky",
            "lines": 2,
        }
    elif base_model_type == "qwen3_tts_voicedesign":
        model_def["model_modes"] = {
            "choices": get_qwen3_language_choices(base_model_type),
            "default": "auto",
            "label": "Language",
        }
        model_def["alt_prompt"] = {
            "label": "Voice instruction",
            "placeholder": "young female, warm tone, clear articulation",
            "lines": 2,
        }
    elif is_base:
        model_def["model_modes"] = {
            "choices": get_qwen3_language_choices(base_model_type),
            "default": "auto",
            "label": "Language",
        }
        model_def["alt_prompt"] = {
            "label": "Reference transcript(s) (optional, two-speaker: one per line)",
            "placeholder": "Speaker 1 reference transcript\nSpeaker 2 reference transcript",
            "lines": 3,
        }
        model_def["pause_between_sentences"] = True
        model_def["preserve_empty_prompt_lines"] = True
        model_def["any_audio_prompt"] = True
        model_def["audio_prompt_choices"] = True
        model_def["audio_prompt_type_sources"] = QWEN3_TTS_AUDIO_PROMPT_TYPE_SOURCES.copy()
        # Shallow-copy each setting so callers cannot mutate the module constant.
        model_def["custom_settings"] = [dict(setting) for setting in QWEN3_TTS_CUSTOM_SETTINGS]
        model_def["text_prompt_enhancer_instructions1"] = TTS_QWEN3_DIALOGUE_PROMPT
        model_def["text_prompt_enhancer_max_tokens1"] = 512
        model_def["audio_guide_label"] = "Speaker 1 reference voice"
        model_def["audio_guide2_label"] = "Speaker 2 reference voice"

    return model_def


def get_qwen3_duration_default() -> int:
    """Return the slider's default duration in seconds (20 if none is configured)."""
    configured = QWEN3_TTS_DURATION_SLIDER.get("default", 20)
    return int(configured)


def get_qwen3_download_def(base_model_type: str) -> list[dict]:
    """Describe the tokenizer assets to download.

    The result is the same for every *base_model_type*: one entry for the text
    tokenizer and one for the speech tokenizer, both served from
    ``QWEN3_TTS_REPO``.
    """
    tokenizer_sources = (
        (QWEN3_TTS_TEXT_TOKENIZER_DIR, QWEN3_TTS_TEXT_TOKENIZER_FILES),
        (QWEN3_TTS_SPEECH_TOKENIZER_DIR, QWEN3_TTS_SPEECH_TOKENIZER_FILES),
    )
    return [
        {
            "repoId": QWEN3_TTS_REPO,
            "sourceFolderList": [folder],
            "fileList": [files],
        }
        for folder, files in tokenizer_sources
    ]


class family_handler:
    """Static hooks describing and loading the Qwen3 TTS model family.

    All methods are stateless ``@staticmethod`` hooks. The code that invokes
    them is not part of this file.
    """

    @staticmethod
    def query_supported_types():
        """Return the base model types handled here (keys of ``QWEN3_TTS_VARIANTS``)."""
        return list(QWEN3_TTS_VARIANTS)

    @staticmethod
    def query_family_maps():
        """Return two empty mappings; this family declares no family maps."""
        return {}, {}

    @staticmethod
    def query_model_family():
        """Return the family identifier, ``"tts"``."""
        return "tts"

    @staticmethod
    def query_family_infos():
        """Return the family info table.

        NOTE(review): the meaning of ``200`` is not visible in this file --
        presumably a display/sort order; confirm against the caller.
        """
        return {"tts": (200, "TTS")}

    @staticmethod
    def register_lora_cli_args(parser, lora_root):
        """Register the ``--lora-dir-qwen3-tts`` option on *parser*.

        argparse stores the parsed value as ``args.lora_dir_qwen3_tts``.
        """
        parser.add_argument(
            "--lora-dir-qwen3-tts",
            type=str,
            default=None,
            help=f"Path to a directory that contains Qwen3 TTS settings (default: {os.path.join(lora_root, 'qwen3_tts')})",
        )

    @staticmethod
    def get_lora_dir(base_model_type, args, lora_root):
        """Return the Qwen3 TTS settings directory.

        Uses the value of ``--lora-dir-qwen3-tts`` when one was given and
        falls back to ``<lora_root>/qwen3_tts`` otherwise.
        """
        # argparse derives the attribute name from the long option, so the
        # value lives in ``lora_dir_qwen3_tts``. The older ``lora_qwen3_tts``
        # name is still honoured for callers that set it by hand.
        configured = getattr(args, "lora_dir_qwen3_tts", None) or getattr(args, "lora_qwen3_tts", None)
        return configured or os.path.join(lora_root, "qwen3_tts")

    @staticmethod
    def query_model_def(base_model_type, model_def):
        """Return the model definition for *base_model_type*; *model_def* is ignored."""
        return get_qwen3_model_def(base_model_type)

    @staticmethod
    def query_model_files(computeList, base_model_type, model_def=None):
        """Return the download definition for the tokenizer assets."""
        return get_qwen3_download_def(base_model_type)

    @staticmethod
    def load_model(
        model_filename,
        model_type,
        base_model_type,
        model_def,
        quantizeTransformer=False,
        text_encoder_quantization=None,
        dtype=None,
        VAE_dtype=None,
        mixed_precision_transformer=False,
        save_quantized=False,
        submodel_no_list=None,
        text_encoder_filename=None,
        profile=0,
        lm_decoder_engine="legacy",
        **kwargs,
    ):
        """Instantiate the Qwen3 TTS pipeline.

        Args:
            model_filename: Weights file name, or a list/tuple whose first
                entry is used. May be empty, in which case no weights path is
                passed to the pipeline.
            model_type: Forwarded to ``save_quantized_model`` when
                *save_quantized* is set.
            base_model_type: Variant identifier passed to the pipeline.
            dtype: Used only when saving a quantized model (defaults to
                ``torch.bfloat16`` there).
            save_quantized: When true and a weights path was resolved, the
                model is saved through ``wgp.save_quantized_model``.
            lm_decoder_engine: Decoder engine name passed to the pipeline.

        The remaining parameters are accepted but not read by this method.

        Returns:
            ``(pipeline, pipe)`` where ``pipe`` maps ``"transformer"`` to the
            pipeline's model and, when present, ``"speech_tokenizer"`` to the
            speech tokenizer's model.
        """
        from .qwen3.pipeline import Qwen3TTSPipeline

        ckpt_root = fl.get_download_location()

        if isinstance(model_filename, (list, tuple)):
            weights_candidate = model_filename[0] if len(model_filename) > 0 else None
        else:
            weights_candidate = model_filename

        weights_path = None
        if weights_candidate:
            weights_path = fl.locate_file(weights_candidate, error_if_none=False)
            if weights_path is None:
                # Not found by the locator: use the name as given.
                weights_path = weights_candidate

        pipeline = Qwen3TTSPipeline(
            model_weights_path=weights_path,
            base_model_type=base_model_type,
            ckpt_root=ckpt_root,
            device=mps_device_or(torch.device("cpu")),
            lm_decoder_engine=lm_decoder_engine,
        )

        # For the "cg"/"cudagraph" engines, zero ``_budget`` on the model and
        # on its talker / code predictor sub-modules when they exist.
        # NOTE(review): the consumer of ``_budget`` is not visible here --
        # confirm its meaning before changing this.
        if str(lm_decoder_engine).strip().lower() in ("cg", "cudagraph"):
            pipeline.model._budget = 0
            talker = getattr(pipeline.model, "talker", None)
            if talker is not None:
                talker._budget = 0
                code_predictor = getattr(talker, "code_predictor", None)
                if code_predictor is not None:
                    code_predictor._budget = 0

        pipe = {"transformer": pipeline.model}
        if getattr(pipeline, "speech_tokenizer", None) is not None:
            pipe["speech_tokenizer"] = pipeline.speech_tokenizer.model

        if save_quantized and weights_path:
            from wgp import save_quantized_model

            config_path = get_qwen3_config_path(base_model_type)
            if config_path is None:
                # No bundled config: try the locator, else pass the relative path.
                config_candidate = os.path.join("qwen3", "configs", f"{base_model_type}.json")
                config_path = fl.locate_file(config_candidate, error_if_none=False) or config_candidate
            save_quantized_model(pipeline.model, model_type, weights_path, dtype or torch.bfloat16, config_path)
        return pipeline, pipe

    @staticmethod
    def fix_settings(base_model_type, settings_version, model_def, ui_defaults):
        """Fill in missing settings on *ui_defaults* in place.

        Existing values are kept, with two exceptions: an ``audio_prompt_type``
        other than "A"/"AB" (case-insensitive) is reset to "A" for the base
        variant, and ``top_k`` is forced to 50 for settings older than version
        2.44 when the model definition enables ``top_k_slider``.
        """
        if "alt_prompt" not in ui_defaults:
            ui_defaults["alt_prompt"] = ""

        if base_model_type == "qwen3_tts_customvoice":
            speakers = get_qwen3_speakers(base_model_type)
            defaults = {
                "audio_prompt_type": "",
                "model_mode": speakers[0] if speakers else "",
            }
        elif base_model_type == "qwen3_tts_voicedesign":
            defaults = {
                "audio_prompt_type": "",
                "model_mode": "auto",
            }
        elif base_model_type == "qwen3_tts_base":
            defaults = {
                "audio_prompt_type": "A",
                "model_mode": "auto",
                "pause_seconds": 0.5,
            }
        else:
            defaults = {
                "audio_prompt_type": "",
                "model_mode": "auto",
            }
        for key, value in defaults.items():
            ui_defaults.setdefault(key, value)

        if base_model_type == "qwen3_tts_base":
            audio_prompt_type = str(ui_defaults.get("audio_prompt_type", "A") or "A").upper()
            if audio_prompt_type not in ("A", "AB"):
                ui_defaults["audio_prompt_type"] = "A"

        if settings_version < 2.44 and model_def.get("top_k_slider", False):
            ui_defaults["top_k"] = 50

    @staticmethod
    def update_default_settings(base_model_type, model_def, ui_defaults):
        """Overwrite *ui_defaults* in place with the variant's default settings.

        Unknown model types leave *ui_defaults* untouched.
        """
        if base_model_type == "qwen3_tts_customvoice":
            speakers = get_qwen3_speakers(base_model_type)
            default_speaker = speakers[0] if speakers else ""
            ui_defaults.update(
                {
                    "audio_prompt_type": "",
                    "model_mode": default_speaker,
                    "alt_prompt": "",
                    "duration_seconds": get_qwen3_duration_default(),
                    "repeat_generation": 1,
                    "video_length": 0,
                    "num_inference_steps": 0,
                    "negative_prompt": "",
                    "temperature": 0.9,
                    "top_k": 50,
                    "multi_prompts_gen_type": "FG",
                }
            )
            return

        if base_model_type == "qwen3_tts_voicedesign":
            ui_defaults.update(
                {
                    "audio_prompt_type": "",
                    "model_mode": "auto",
                    "alt_prompt": "young female, warm tone, clear articulation",
                    "duration_seconds": get_qwen3_duration_default(),
                    "repeat_generation": 1,
                    "video_length": 0,
                    "num_inference_steps": 0,
                    "negative_prompt": "",
                    "temperature": 0.9,
                    "top_k": 50,
                    "multi_prompts_gen_type": "FG",
                }
            )
            return

        if base_model_type == "qwen3_tts_base":
            ui_defaults.update(
                {
                    "audio_prompt_type": "A",
                    "model_mode": "auto",
                    "alt_prompt": "",
                    "duration_seconds": get_qwen3_duration_default(),
                    "pause_seconds": 0.5,
                    "repeat_generation": 1,
                    "video_length": 0,
                    "num_inference_steps": 0,
                    "negative_prompt": "",
                    "temperature": 0.9,
                    "top_k": 50,
                    "multi_prompts_gen_type": "FG",
                }
            )

    @staticmethod
    def validate_generative_prompt(base_model_type, model_def, inputs, one_prompt):
        """Validate one prompt against the variant's requirements.

        Returns:
            An error message string when the prompt or its inputs are invalid,
            ``None`` otherwise (including for unknown model types).
        """
        prompt_is_empty = one_prompt is None or len(str(one_prompt).strip()) == 0

        if base_model_type == "qwen3_tts_customvoice":
            if prompt_is_empty:
                return "Prompt text cannot be empty for Qwen3 CustomVoice."
            speaker = inputs.get("model_mode", "")
            if not speaker:
                return "Please select a speaker for Qwen3 CustomVoice."
            # str() keeps a non-string selection from raising on .lower().
            if str(speaker).lower() not in get_qwen3_speakers(base_model_type):
                return f"Unsupported speaker '{speaker}'."
            return None

        if base_model_type == "qwen3_tts_voicedesign":
            if prompt_is_empty:
                return "Prompt text cannot be empty for Qwen3 VoiceDesign."
            return None

        if base_model_type == "qwen3_tts_base":
            if prompt_is_empty:
                return "Prompt text cannot be empty for Qwen3 Base voice clone."
            audio_prompt_type = str(inputs.get("audio_prompt_type", "A") or "A").upper()
            if inputs.get("audio_guide") is None:
                return "Qwen3 Base requires Speaker 1 reference audio."

            # "Speaker <n>:" tags mark dialogue lines; they are only valid in
            # two-speaker mode (audio prompt type containing "B").
            speaker_matches = list(
                re.finditer(r"Speaker\s*(\d+)\s*:", str(one_prompt), flags=re.IGNORECASE)
            )
            if "B" in audio_prompt_type:
                if inputs.get("audio_guide2") is None:
                    return "Two-speaker mode requires Speaker 2 reference audio."
                if not speaker_matches:
                    return (
                        "Two-speaker mode requires prompt lines using Speaker 1: and Speaker 2: "
                    )
                speaker_ids = {int(match.group(1)) for match in speaker_matches}
                if len(speaker_ids) != 2:
                    return (
                        "Two-speaker mode requires exactly two speaker IDs. Use Speaker 1: and Speaker 2:. "
                        "For headless settings, keep 'multi_prompts_gen_type' = 'FG'."
                    )
            elif speaker_matches:
                return "Speaker-tag dialogue requires two-speaker mode (set audio prompt mode to Dialogue)."
            return None

        return None

    @staticmethod
    def validate_generative_settings(base_model_type, model_def, inputs):
        """Validate and normalise the auto-split custom setting (base variant only).

        On success the setting is stored back into ``inputs["custom_settings"]``
        as a float. A blank string removes the setting, and
        ``inputs["custom_settings"]`` becomes ``None`` if nothing else remains.

        Returns:
            An error message string when the value is not a number within the
            inclusive range ``QWEN3_TTS_AUTO_SPLIT_MIN_SECONDS`` ..
            ``QWEN3_TTS_AUTO_SPLIT_MAX_SECONDS``; ``None`` otherwise.
        """
        if base_model_type != "qwen3_tts_base":
            return None
        custom_settings = inputs.get("custom_settings", None)
        if custom_settings is None:
            return None
        if not isinstance(custom_settings, dict):
            return "Custom settings must be a dictionary."

        raw_value = custom_settings.get(QWEN3_TTS_AUTO_SPLIT_SETTING_ID, None)
        if raw_value is None:
            return None
        if isinstance(raw_value, str):
            raw_value = raw_value.strip()
            if len(raw_value) == 0:
                # Blank means "not set": drop the key entirely.
                custom_settings.pop(QWEN3_TTS_AUTO_SPLIT_SETTING_ID, None)
                inputs["custom_settings"] = custom_settings if len(custom_settings) > 0 else None
                return None

        try:
            # bool is an int subclass; float(True) would silently yield 1.0.
            if isinstance(raw_value, bool):
                raise ValueError("boolean is not a valid duration")
            auto_split_seconds = float(raw_value)
        except (TypeError, ValueError, OverflowError):
            return (
                f"Auto Split Every s must be a number between "
                f"{int(QWEN3_TTS_AUTO_SPLIT_MIN_SECONDS)} and {int(QWEN3_TTS_AUTO_SPLIT_MAX_SECONDS)} seconds."
            )

        # Written as a negated chained comparison so NaN is rejected too:
        # every comparison with NaN is False, so separate `<` / `>` checks
        # would let float("nan") through.
        if not (QWEN3_TTS_AUTO_SPLIT_MIN_SECONDS <= auto_split_seconds <= QWEN3_TTS_AUTO_SPLIT_MAX_SECONDS):
            return (
                f"Auto Split Every s must be between "
                f"{int(QWEN3_TTS_AUTO_SPLIT_MIN_SECONDS)} and {int(QWEN3_TTS_AUTO_SPLIT_MAX_SECONDS)} seconds."
            )

        custom_settings[QWEN3_TTS_AUTO_SPLIT_SETTING_ID] = auto_split_seconds
        inputs["custom_settings"] = custom_settings
        return None