from __future__ import annotations

import logging
import os
import urllib.request
import uuid
from pathlib import Path

import gradio as gr
import torch

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

# ─── Device detection ─────────────────────────────────────────────────────────
def _get_device() -> str:
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"

DEVICE = _get_device()
# Accelerators (CUDA/MPS) run in bfloat16; the CPU fallback stays in float32
# for stability.
if DEVICE in ("cuda", "mps"):
    DTYPE = torch.bfloat16
else:
    DTYPE = torch.float32
logger.info("Running on device: %s, dtype: %s", DEVICE, DTYPE)

# Checkpoint identifiers passed to the `from_pretrained` loaders.
MODEL_ID = "OpenMOSS-Team/MOSS-TTS-Nano-100M"
AUDIO_TOKENIZER_ID = "OpenMOSS-Team/MOSS-Audio-Tokenizer-Nano"

# Scratch locations: generated speech and downloaded reference clips.
OUTPUT_DIR = Path("/tmp/moss-tts-output")
SAMPLE_DIR = Path("/tmp/moss-tts-samples")

# Create both directories up front; an already existing directory is fine.
for _scratch_dir in (OUTPUT_DIR, SAMPLE_DIR):
    _scratch_dir.mkdir(parents=True, exist_ok=True)

# ─── Load models ──────────────────────────────────────────────────────────────
from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer

# Three objects are loaded at import time and kept as module globals for
# generate_speech: the TTS model, the audio tokenizer and the text tokenizer.
# NOTE(review): trust_remote_code=True lets the loaders run Python code shipped
# with the checkpoint repository — consider pinning a revision.
logger.info("Loading TTS model: %s", MODEL_ID)
tts_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=DTYPE,  # dtype chosen from the detected device
    low_cpu_mem_usage=True,
)
tts_model.eval()
# NOTE(review): underscore-prefixed (private) method; may change between
# library versions — confirm it is still the supported way to select SDPA.
tts_model._set_attention_implementation("sdpa")

# No torch_dtype is passed for the audio tokenizer, so it keeps the loader's
# default dtype. NOTE(review): confirm mixing it with DTYPE above is intended.
logger.info("Loading audio tokenizer: %s", AUDIO_TOKENIZER_ID)
audio_tokenizer = AutoModel.from_pretrained(
    AUDIO_TOKENIZER_ID,
    trust_remote_code=True,
)
audio_tokenizer.eval()

# The text tokenizer is loaded from the same repository as the TTS model.
logger.info("Loading text tokenizer")
text_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Both models are moved to the selected device only after loading finishes.
logger.info("Moving models to %s …", DEVICE)
tts_model.to(DEVICE)
audio_tokenizer.to(DEVICE)
logger.info("All models ready.")

# ─── Sample audio files ───────────────────────────────────────────────────────
# Reference clips fetched once at startup and cached under SAMPLE_DIR.
# SAMPLE_AUDIO maps a language code to the local path of a usable clip; a
# language whose download failed is simply absent from the mapping.
SAMPLE_AUDIO: dict[str, str] = {}
_SAMPLE_URLS = {
    "en": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/en_2.wav",
    "zh": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/zh_1.wav",
    "jp": "https://raw.githubusercontent.com/OpenMOSS/MOSS-TTS-Nano/main/assets/audio/jp_2.wav",
}
# Bound the wait so a stalled connection cannot block application startup.
_SAMPLE_DOWNLOAD_TIMEOUT_S = 30
for _lang, _url in _SAMPLE_URLS.items():
    _dest = SAMPLE_DIR / f"{_lang}_sample.wav"
    # Download to a temporary sibling and rename only once the transfer is
    # complete. Writing straight to _dest would leave a truncated file behind
    # on failure, and the exists() check below would accept it on later runs.
    _partial = _dest.with_name(f"{_dest.name}.{os.getpid()}.part")
    try:
        if not _dest.exists():
            with urllib.request.urlopen(_url, timeout=_SAMPLE_DOWNLOAD_TIMEOUT_S) as _resp:
                _partial.write_bytes(_resp.read())
            os.replace(_partial, _dest)
            logger.info("Downloaded sample audio: %s", _lang)
        SAMPLE_AUDIO[_lang] = str(_dest)
    except Exception as _e:
        # Best effort: the app still works without bundled samples, so a
        # failed download is logged and skipped rather than aborting startup.
        logger.warning("Failed to download sample audio %s: %s", _lang, _e)
        try:
            _partial.unlink()
        except OSError:
            # Nothing was written, or the leftover cannot be removed;
            # neither case is fatal.
            pass

# ─── Example content ──────────────────────────────────────────────────────────
# Sample sentences keyed by language display name. The keys populate the
# "Example language" dropdown and are the lookup keys used by
# on_example_select; the values are the texts pre-filled into the text box.
EXAMPLE_TEXTS: dict[str, str] = {
    "English": (
        "The biggest lesson that can be read from 70 years of AI research is that general methods "
        "that leverage computation are ultimately the most effective, and by a large margin."
    ),
    "Chinese": (
        "欢迎关注模思智能、上海创智学院与复旦大学自然语言处理实验室。"
        "今天我们将为您带来最新的人工智能研究进展。"
    ),
    "French": (
        "Bonjour et bienvenue dans notre émission quotidienne d'actualités. "
        "Nous vous présenterons les nouvelles les plus importantes de la journée."
    ),
    "Japanese": "本日はNHKニュースをご覧いただきありがとうございます。最新のニュースをお伝えします。",
    "German": (
        "Willkommen zu unserem täglichen Nachrichtenüberblick. "
        "Wir berichten über die wichtigsten Ereignisse des Tages."
    ),
    "Spanish": (
        "Bienvenidos al noticiero de la tarde. "
        "Aquí les presentamos las noticias más relevantes del día de hoy."
    ),
    "Korean": "안녕하세요, KBS 뉴스입니다. 오늘의 주요 뉴스를 전해드리겠습니다.",
}

# Display name -> path of the downloaded sample clip, or None when that clip
# is unavailable. Only languages with a bundled sample URL are listed.
LANG_TO_SAMPLE_AUDIO: dict[str, str | None] = {
    _label: SAMPLE_AUDIO.get(_code)
    for _label, _code in (
        ("English", "en"),
        ("Chinese", "zh"),
        ("Japanese", "jp"),
    )
}


# ─── Inference ────────────────────────────────────────────────────────────────
def generate_speech(
    text: str,
    reference_audio: str | None,
    max_new_frames: int,
    do_sample: bool,
    seed: int,
) -> str:
    """Synthesize ``text`` in the voice of ``reference_audio``.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        reference_audio: Path of the prompt audio used for voice cloning.
        max_new_frames: Forwarded to the model as ``max_new_frames``.
        do_sample: Forwarded to the model as ``do_sample``.
        seed: RNG seed. ``0`` (or an empty value) leaves the torch RNG
            untouched; any other value seeds it before inference.

    Returns:
        The ``audio_path`` entry of the model's result, as a string.

    Raises:
        gr.Error: If the text or reference audio is missing, or if the
            inference call fails for any reason.
    """
    # Guard against None as well as blank input so the user always gets a
    # readable message rather than an AttributeError.
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")
    if not reference_audio:
        raise gr.Error("Please upload reference audio for voice cloning.")

    seed_int = int(seed) if seed else 0
    if seed_int != 0:
        torch.manual_seed(seed_int)
        if DEVICE == "cuda":
            torch.cuda.manual_seed(seed_int)

    # A random component gives every request its own file. A name derived from
    # hash(text) is shared by all requests with the same text, so they could
    # overwrite each other's output.
    output_path = str(OUTPUT_DIR / f"output_{os.getpid()}_{uuid.uuid4().hex}.wav")

    try:
        result = tts_model.inference(
            text=text,
            output_audio_path=output_path,
            mode="voice_clone",
            prompt_audio_path=reference_audio,
            text_tokenizer=text_tokenizer,
            audio_tokenizer=audio_tokenizer,
            audio_tokenizer_type="moss-audio-tokenizer-nano",
            device=DEVICE,
            max_new_frames=int(max_new_frames),
            do_sample=bool(do_sample),
            use_kv_cache=True,
            voice_clone_max_text_tokens=75,
        )
        logger.info(
            "Generated: %s  sample_rate=%s",
            result.get("audio_path"),
            result.get("sample_rate"),
        )
        return str(result["audio_path"])
    except Exception as exc:
        # UI boundary: log the full traceback, then surface the message to
        # the user through Gradio's error mechanism.
        logger.exception("TTS inference failed")
        raise gr.Error(str(exc)) from exc


# ─── UI ───────────────────────────────────────────────────────────────────────
def on_example_select(key: str) -> tuple[str, str | None]:
    """Return the example text and sample audio path for a dropdown choice.

    An unknown key yields an empty text; a language without a sample clip
    yields ``None`` for the audio path.
    """
    example_text = EXAMPLE_TEXTS.get(key, "")
    sample_path = LANG_TO_SAMPLE_AUDIO.get(key)
    return example_text, sample_path


# Page-level styling: cap the layout width, centre it, and hide the footer.
css = """
.gradio-container { max-width: 1000px !important; margin: 0 auto !important; }
footer { display: none !important; }
"""

# The stylesheet must be handed to Blocks to take effect; it was previously
# defined but never applied.
# NOTE(review): newer Gradio releases may expect `css` on launch() instead —
# confirm against the installed version.
with gr.Blocks(title="MOSS-TTS-Nano", css=css) as demo:
    gr.Markdown(
        """# MOSS-TTS-Nano
**Multilingual 0.1B TTS with zero-shot voice cloning** — 20 languages — 48 kHz stereo

Upload a reference audio clip (3–15 sec) to clone the voice, then enter text in any of the 20 supported languages.
Model: [OpenMOSS-Team/MOSS-TTS-Nano-100M](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano-100M)"""
    )

    with gr.Row(equal_height=False):
        # Left column: inputs and the generate button.
        with gr.Column(scale=3):
            example_picker = gr.Dropdown(
                choices=list(EXAMPLE_TEXTS.keys()),
                value="English",
                label="Example language",
                info="Pre-fills text and loads a sample reference audio",
            )
            text_input = gr.Textbox(
                label="Text to synthesize",
                value=EXAMPLE_TEXTS["English"],
                lines=5,
                placeholder="Enter text in any supported language…",
            )
            ref_audio = gr.Audio(
                label="Reference audio (voice to clone)",
                type="filepath",
                sources=["upload", "microphone"],
                # None when the English sample could not be downloaded.
                value=SAMPLE_AUDIO.get("en"),
            )
            with gr.Accordion("Advanced settings", open=False):
                max_frames_slider = gr.Slider(
                    minimum=64, maximum=512, value=375, step=16,
                    label="Max new frames",
                    info="Controls the maximum length of generated audio",
                )
                do_sample_cb = gr.Checkbox(
                    value=True,
                    label="Sampling",
                    info="Uncheck for deterministic (but potentially repetitive) output",
                )
                seed_input = gr.Number(
                    value=0, precision=0,
                    label="Seed (0 = random)",
                )
            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")

        # Right column: generated audio plus static reference information.
        with gr.Column(scale=2):
            output_audio = gr.Audio(label="Generated speech", type="filepath")
            # NOTE(review): the list below names 19 languages while the header
            # says 20 — confirm which language is missing.
            gr.Markdown(
                """**Supported languages**

Chinese · English · German · Spanish · French · Japanese · Italian · Hungarian ·
Korean · Russian · Persian · Arabic · Polish · Portuguese · Czech · Danish · Swedish ·
Greek · Turkish

**Resources**: [Paper (arXiv:2603.18090)](https://arxiv.org/abs/2603.18090) ·
[GitHub](https://github.com/OpenMOSS/MOSS-TTS-Nano) ·
[Model card](https://huggingface.co/OpenMOSS-Team/MOSS-TTS-Nano-100M)"""
            )

    # Choosing an example replaces both the text and the reference audio.
    example_picker.change(
        fn=on_example_select,
        inputs=example_picker,
        outputs=[text_input, ref_audio],
    )
    # Input order must match the positional parameters of generate_speech.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, ref_audio, max_frames_slider, do_sample_cb, seed_input],
        outputs=output_audio,
    )

demo.launch(show_error=True, ssr_mode=False)