""" Kanade Tokenizer — Text-to-Audio with Voice Cloning ===================================================== v3 fixes: - kokoro pinned to 0.7.16 (Python 3.13 compatible; 0.9.x requires <3.13) - espeak-ng installed via packages.txt (OS-level, not pip) - Gradio 6 API: theme/css passed to launch(), not Blocks() - No internet required — 100% offline inference Pipeline: Text → Kokoro TTS (offline) → intermediate WAV Reference Audio → Kanade encode → global_embedding (WHO) intermediate WAV → Kanade encode → content_token_indices (WHAT) Kanade decode(content_tokens + speaker_embedding) → mel Vocoder → final WAV ✅ """ import os import tempfile import numpy as np import torch import soundfile as sf import gradio as gr from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode from kokoro import KPipeline # ───────────────────────────────────────────────────────────────────────────── DEVICE = "cuda" if torch.cuda.is_available() else "cpu" MODEL_ID = "frothywater/kanade-25hz-clean" KOKORO_SR = 24000 print(f"[init] Loading Kanade ({DEVICE})…") kanade = KanadeModel.from_pretrained(MODEL_ID).eval().to(DEVICE) vocoder = load_vocoder(kanade.config.vocoder_name).to(DEVICE) SR = kanade.config.sample_rate # 16000 print("[init] Kanade ready.") print("[init] Loading Kokoro TTS…") _kokoro_us = KPipeline(lang_code='a') # American English _kokoro_uk = KPipeline(lang_code='b') # British English print("[init] All models ready.") # ── Voice menu ──────────────────────────────────────────────────────────────── VOICES = { "🇺🇸 Female — Heart (warm)": ("a", "af_heart"), "🇺🇸 Female — Bella (smooth)": ("a", "af_bella"), "🇺🇸 Female — Nicole (breathy)": ("a", "af_nicole"), "🇺🇸 Female — Sarah": ("a", "af_sarah"), "🇺🇸 Male — Adam": ("a", "am_adam"), "🇺🇸 Male — Michael": ("a", "am_michael"), "🇬🇧 Female — Emma": ("b", "bf_emma"), "🇬🇧 Male — George": ("b", "bm_george"), "🇬🇧 Male — Lewis": ("b", "bm_lewis"), } # ── helpers 
# ── helpers ───────────────────────────────────────────────────────────────────
def tts_to_wav(text: str, lang: str, voice_id: str) -> str:
    """Kokoro TTS (offline) → temp WAV at Kanade sample rate.

    Returns the path of a temporary WAV file; the caller is responsible
    for deleting it.

    Raises:
        RuntimeError: if Kokoro yields no audio chunks for the text.
    """
    pipe = _kokoro_us if lang == 'a' else _kokoro_uk
    chunks = [audio for _, _, audio in pipe(text, voice=voice_id, speed=1.0)]
    if not chunks:
        raise RuntimeError("Kokoro produced no audio. Check your text.")
    audio_24k = np.concatenate(chunks)
    # Skip the resample entirely when the rates already match (no-op otherwise).
    if SR != KOKORO_SR:
        import librosa
        audio_out = librosa.resample(audio_24k, orig_sr=KOKORO_SR, target_sr=SR)
    else:
        audio_out = audio_24k
    # mkstemp + close the OS handle BEFORE sf.write re-opens the path:
    # keeping a NamedTemporaryFile open while writing to tmp.name fails on
    # Windows (file locked by the open handle).
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    sf.write(path, audio_out, SR)
    return path


def load_tensor(path: str) -> torch.Tensor:
    """Load an audio file at the Kanade sample rate onto the active device."""
    return load_audio(path, sample_rate=SR).to(DEVICE)


# ── inference ─────────────────────────────────────────────────────────────────
def synthesize(text, voice_label, ref_audio_path, speed):
    """Full pipeline: text → Kokoro TTS → Kanade voice conversion → audio.

    Args:
        text: text to speak.
        voice_label: key into VOICES selecting the base TTS voice.
        ref_audio_path: path to the reference clip whose voice is cloned.
        speed: playback-speed factor applied as a final time-stretch.

    Returns:
        (sample_rate, waveform) tuple for a Gradio numpy Audio output.

    Raises:
        gr.Error: on empty text or missing reference audio.
    """
    if not text.strip():
        raise gr.Error("Please enter some text.")
    if ref_audio_path is None:
        raise gr.Error("Please upload a reference audio clip.")
    lang, voice_id = VOICES[voice_label]

    gr.Info("Step 1/4 — Synthesising text with Kokoro (offline)…")
    tts_path = tts_to_wav(text, lang, voice_id)

    gr.Info("Step 2/4 — Extracting content tokens…")
    # finally: the temp WAV must be removed even if load_tensor raises —
    # the original only unlinked on success, leaking the file on error.
    try:
        tts_wav = load_tensor(tts_path)
    finally:
        os.unlink(tts_path)
    with torch.inference_mode():
        tts_feat = kanade.encode(tts_wav)

    gr.Info("Step 3/4 — Extracting speaker embedding from reference…")
    ref_wav = load_tensor(ref_audio_path)
    with torch.inference_mode():
        ref_feat = kanade.encode(ref_wav)

    gr.Info("Step 4/4 — Decoding with cloned voice…")
    with torch.inference_mode():
        # Content (WHAT is said) from the TTS pass, speaker identity
        # (WHO says it) from the reference clip.
        mel = kanade.decode(
            content_token_indices=tts_feat.content_token_indices,
            global_embedding=ref_feat.global_embedding,
        )
        waveform = vocode(vocoder, mel.unsqueeze(0))

    audio_np = waveform.squeeze().cpu().float().numpy()
    # Only stretch when the slider moved meaningfully off 1.0.
    if abs(speed - 1.0) > 0.05:
        import librosa
        audio_np = librosa.effects.time_stretch(audio_np, rate=speed)
    return int(SR), audio_np
# ── UI ────────────────────────────────────────────────────────────────────────
CSS = """
#title { text-align: center; }
#banner { text-align: center; color: #6366f1; }
footer { display: none !important; }
"""

with gr.Blocks(title="Kanade TTS Voice Cloner") as demo:
    gr.Markdown("# 🎙️ Kanade — Text-to-Audio with Voice Cloning", elem_id="title")
    gr.Markdown(
        "Enter text · Upload a **reference audio** · Get your text spoken "
        "**in that person's voice** — fully offline.",
        elem_id="banner",
    )

    with gr.Row():
        # Left column: the text and base-voice controls.
        with gr.Column(scale=3):
            text_in = gr.Textbox(
                label="📝 Text to synthesise",
                lines=5,
                placeholder="Type anything here…",
            )
            voice_choices = list(VOICES)
            voice_dd = gr.Dropdown(
                label="🔊 Base TTS voice (content only)",
                choices=voice_choices,
                value=voice_choices[0],
            )
            speed_sl = gr.Slider(
                label="⏩ Speed", minimum=0.7, maximum=1.5, value=1.0, step=0.05,
            )
        # Right column: the voice to clone.
        with gr.Column(scale=2):
            ref_audio = gr.Audio(
                label="🎤 Reference audio (voice to clone)",
                type="filepath",
                sources=["upload", "microphone"],
            )
            gr.Markdown("💡 5–30 sec · clean speech · single speaker")

    btn = gr.Button("🚀 Generate", variant="primary", size="lg")
    out = gr.Audio(label="🔈 Output", type="numpy")
    btn.click(
        fn=synthesize,
        inputs=[text_in, voice_dd, ref_audio, speed_sl],
        outputs=out,
    )

    gr.Markdown(
        "---\n"
        "**Models:** "
        "[`frothywater/kanade-25hz-clean`](https://huggingface.co/frothywater/kanade-25hz-clean) · "
        "[`hexgrad/Kokoro-82M`](https://huggingface.co/hexgrad/Kokoro-82M)"
    )

if __name__ == "__main__":
    # Per the header note, theme/css go to launch() under this Gradio version.
    demo.launch(theme=gr.themes.Soft(), css=CSS)