"""
Pronunciation Coach — HuggingFace Space
========================================
1. User types a normal English sentence
2. User records themselves saying it
3. App runs phonological model → 35 CTC feature sequences
4. MDD engine aligns them against canonical sequences → errors + score
5. Feedback generator returns coaching tips
"""

import os
import re
import json
import torch
import numpy as np
import gradio as gr
import librosa
import pronouncing

from huggingface_hub import snapshot_download
from transformers import Wav2Vec2FeatureExtractor

from wav2vec2_phonological import PhonologicalWav2Vec2
from mdd_engine import run_mdd
from feedback_generator import generate_feedback
from phonological_features import CMU_39_PHONEMES

# ─────────────────────────────────────────────────────────────────────────────
# Model globals
# ─────────────────────────────────────────────────────────────────────────────

# Lazily populated by load_model(); both stay None until the first call.
_model = None
_feature_extractor = None

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    _device = torch.device("cuda")
else:
    _device = torch.device("cpu")

# Base checkpoint used for the backbone and the feature extractor.
PRETRAINED_BASE = "facebook/wav2vec2-large-robust"

# Fine-tuned weights location; each value can be overridden from the environment.
MODEL_REPO = os.getenv("HF_MODEL_REPO", "Backlighteu/phonological-mdd")
MODEL_FILENAME = os.getenv("HF_MODEL_FILENAME", "best_model.pt")
# Optional Hub access token; None when HF_TOKEN is not set.
HF_TOKEN = os.getenv("HF_TOKEN")


def load_model():
    """Download the fine-tuned checkpoint and initialise the module globals.

    Populates ``_model`` and ``_feature_extractor`` on the first call and is
    a no-op once both are set. The two globals are published together only
    after every step has succeeded, so a failure part-way through (for
    example while fetching the feature extractor) leaves the module in its
    "not loaded" state and the next call retries instead of returning early
    with a half-initialised setup.

    The checkpoint file name comes from ``MODEL_FILENAME`` so that the
    ``HF_MODEL_FILENAME`` environment override is actually honoured.

    Raises:
        Whatever the download, ``torch.load`` or ``load_state_dict`` raise;
        nothing is caught here.
    """
    global _model, _feature_extractor
    if _model is not None and _feature_extractor is not None:
        return

    cache_dir = "./model_cache"

    print(f"[startup] Downloading {MODEL_REPO}/{MODEL_FILENAME} ...")
    snapshot_download(repo_id=MODEL_REPO, token=HF_TOKEN, local_dir=cache_dir)

    model = PhonologicalWav2Vec2(
        pretrained_model_name=PRETRAINED_BASE,
        num_output_nodes=71,
        freeze_cnn_encoder=True,
    )
    # Use the configured file name rather than a hard-coded "best_model.pt".
    checkpoint_path = os.path.join(cache_dir, MODEL_FILENAME)
    # NOTE(security): torch.load unpickles the file; only point MODEL_REPO at
    # repositories you trust.
    state_dict = torch.load(checkpoint_path, map_location=_device)
    model.load_state_dict(state_dict)
    model.to(_device)
    model.eval()

    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(PRETRAINED_BASE)

    # Publish both globals only now that everything has loaded successfully.
    _model = model
    _feature_extractor = feature_extractor
    print(f"[startup] Ready on {_device}.")


# ─────────────────────────────────────────────────────────────────────────────
# G2P — plain English → CMU-39 phonemes
# ─────────────────────────────────────────────────────────────────────────────

_CMU_39 = set(CMU_39_PHONEMES)


def sentence_to_phonemes(sentence: str) -> tuple[list[str], list[str]]:
    """Convert an English sentence to a flat list of lower-case phonemes.

    Each word is looked up with ``pronouncing.phones_for_word`` and the first
    pronunciation returned is used. Stress digits are stripped from every
    phone, and phones that are not members of ``_CMU_39`` are dropped.

    Tokenisation rules:
      * typographic apostrophes are normalised to ``'``;
      * hyphens, dashes and slashes separate words ("well-known" becomes
        "well" + "known") instead of fusing them into one unknown token;
      * apostrophes inside a word are kept so contractions such as "don't"
        can be looked up as written; if that lookup fails the apostrophe-less
        spelling is tried as a fallback;
      * every other non-letter character is removed.

    Args:
        sentence: Free-form text typed by the user.

    Returns:
        ``(phonemes, unknown)`` where ``phonemes`` is the concatenated
        phoneme sequence for all recognised words and ``unknown`` lists the
        words (original casing) for which no pronunciation was found.
    """
    text = sentence.replace("\u2019", "'").replace("\u2018", "'")
    # Word joiners become spaces so the parts are looked up separately.
    text = re.sub(r"[-/\u2010-\u2015]", " ", text)
    # Drop everything else that is not a letter, apostrophe or whitespace.
    text = re.sub(r"[^a-zA-Z'\s]", "", text)

    phonemes, unknown = [], []
    for token in text.split():
        # Surrounding apostrophes are quotation marks, not part of the word.
        word = token.strip("'")
        if not word:
            continue

        results = pronouncing.phones_for_word(word.lower())
        if not results and "'" in word:
            # Fall back to the spelling with the apostrophes removed.
            results = pronouncing.phones_for_word(word.replace("'", "").lower())
        if not results:
            unknown.append(word)
            continue

        for phone in results[0].split():
            symbol = re.sub(r"[0-9]", "", phone).lower()
            if symbol in _CMU_39:
                phonemes.append(symbol)
    return phonemes, unknown


# ─────────────────────────────────────────────────────────────────────────────
# Audio inference
# ─────────────────────────────────────────────────────────────────────────────

def decode_audio(audio_path: str) -> list[list[int]]:
    """Run the phonological model on one audio file.

    The file is loaded with librosa as 16 kHz mono, passed through the
    feature extractor and the model (spec-augment disabled, gradients off),
    and the first item of the decoded batch is returned with every value
    coerced to ``0`` or ``1`` by truthiness.

    Args:
        audio_path: Path to the recording to analyse.

    Returns:
        One list of 0/1 integers per decoded sequence.
    """
    load_model()

    samples, _sample_rate = librosa.load(audio_path, sr=16000, mono=True)
    features = _feature_extractor(
        samples.astype(np.float32),
        sampling_rate=16000,
        return_tensors="pt",
        padding=True,
    )

    input_values = features.input_values.to(_device)
    # The extractor may not emit a mask at all; pass None through in that case.
    mask = features.get("attention_mask")
    attention_mask = None if mask is None else mask.to(_device)

    with torch.no_grad():
        logits, output_lengths = _model(
            input_values,
            attention_mask,
            apply_spec_augment=False,
        )

    first_item = _model.decode(logits, output_lengths)[0]
    binarised = []
    for sequence in first_item:
        binarised.append([int(bool(value)) for value in sequence])
    return binarised


# ─────────────────────────────────────────────────────────────────────────────
# Main handler
# ─────────────────────────────────────────────────────────────────────────────

def _format_error_detail(err: dict) -> str:
    """Render one entry of the feedback error summary as a Markdown snippet."""
    tag = " *(deleted)*" if err.get("is_deletion") else ""
    missing = ", ".join(err["missing_features"]) or "—"
    extra = ", ".join(err["extra_features"]) or "—"
    return (
        f"**/{err['target']}/** pos {err['position']}{tag} — "
        f"{err['severity']}, {err['accuracy']:.0%} accurate  \n"
        f"Missing: {missing} | "
        f"Extra: {extra}"
    )


def process(audio_input, sentence_text, max_issues):
    """Handle one "Analyse" click: G2P, model inference, MDD, feedback.

    Every failure is reported as a Markdown message in the first return
    value rather than raised, so the UI always has something to display.

    Args:
        audio_input: Path of the recorded/uploaded audio, or ``None`` when
            nothing was provided.
        sentence_text: The sentence the user intended to say. ``None`` is
            treated the same as an empty string.
        max_issues: Maximum number of issues to include in the feedback;
            converted with ``int()`` before use.

    Returns:
        ``(main_markdown, detail_markdown)``. On any error the second
        element is the empty string.
    """
    if audio_input is None:
        return "⚠️ Please record or upload audio.", ""

    # Guard against None so a missing textbox value cannot raise.
    sentence = (sentence_text or "").strip()
    if not sentence:
        return "⚠️ Please type the sentence you want to practise.", ""

    # G2P
    target_phonemes, unknown = sentence_to_phonemes(sentence)
    if not target_phonemes:
        return "⚠️ Could not convert sentence to phonemes. Try simpler English words.", ""

    # Model inference
    try:
        actual_feature_seqs = decode_audio(audio_input)
    except Exception as exc:
        return f"❌ Audio error: {exc}", ""

    # MDD
    try:
        result = run_mdd(actual_feature_seqs=actual_feature_seqs,
                         target_phonemes=target_phonemes)
    except Exception as exc:
        return f"❌ MDD error: {exc}", ""

    # Feedback — guarded like the stages above so a failure here is also
    # surfaced as a message instead of an unhandled exception.
    try:
        feedback_dict = generate_feedback(result, use_llm=False,
                                          max_issues=int(max_issues))
    except Exception as exc:
        return f"❌ Feedback error: {exc}", ""

    score = feedback_dict["score"]
    main_out = f"**Score: {score}/100**\n\n" + feedback_dict["final_feedback"]
    if unknown:
        main_out += f"\n\n⚠️ Words not in dictionary (skipped): *{', '.join(unknown)}*"

    # Detail
    lines = [_format_error_detail(err) for err in feedback_dict["error_summary"]]
    detail_out = "\n\n".join(lines) if lines else "✅ No errors detected!"

    return main_out, detail_out


# ─────────────────────────────────────────────────────────────────────────────
# Gradio UI — clean and simple
# ─────────────────────────────────────────────────────────────────────────────

# Build the UI at import time so `demo` exists as a module-level object.
with gr.Blocks(title="Pronunciation Coach") as demo:
    gr.Markdown("# 🗣️ Pronunciation Coach\nType a sentence, record yourself saying it, get feedback.")

    with gr.Row():
        # Left column: the inputs and the submit button.
        with gr.Column(scale=1):
            sentence_input = gr.Textbox(
                label="Sentence to practise",
                placeholder="The cat sat on the mat",
                lines=2,
            )
            # type="filepath" means process() receives a path string, which
            # is what decode_audio() passes on to librosa.load.
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Your speech",
            )
            max_issues = gr.Slider(1, 5, value=3, step=1, label="Max issues to show")
            submit_btn = gr.Button("Analyse", variant="primary")

        # Right column: headline feedback plus a collapsed per-phoneme breakdown.
        with gr.Column(scale=2):
            feedback_out = gr.Markdown(label="Feedback")
            with gr.Accordion("Per-phoneme detail", open=False):
                detail_out = gr.Markdown()

    # The order of `inputs` matches process(audio_input, sentence_text, max_issues);
    # the order of `outputs` matches its (main_out, detail_out) return tuple.
    submit_btn.click(
        fn=process,
        inputs=[audio_input, sentence_input, max_issues],
        outputs=[feedback_out, detail_out],
    )

if __name__ == "__main__":
    # NOTE(review): passing `theme` to launch() only works on Gradio versions
    # whose launch() accepts it; other versions take `theme` on gr.Blocks()
    # instead — confirm against the pinned Gradio version.
    demo.launch(theme=gr.themes.Soft())