"""
Phrase-library-based conversational response system.
Matches transcribed text against curated Bambara/Fula phrase libraries.

No ML required — the knowledge base is plain JSON files that anyone can edit.
Library files: data/phrases/{lang}.json
User additions: data/phrases/{lang}_additions.json (saved to HF Hub for persistence)

Each phrase entry:
  {
    "patterns":  ["i ni ce", "ini ce", ...],   <- what the user might say (incl. Whisper variants)
    "response":  "I ni ce. N bɛ i dɛmɛ.",     <- what the app says back
    "english":   "Hello! I am here to help.",  <- English translation shown in UI
    "tags":      ["greeting"]
  }
"""
from __future__ import annotations

import json
import logging
from pathlib import Path
from typing import Optional

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Directory holding the phrase libraries. It is derived from this file's
# location: three .parent hops up, then data/phrases/
# (layout: src/conversation/phrase_matcher.py → <project root>/data/phrases/).
DATA_DIR = Path(__file__).parent.parent.parent.joinpath("data", "phrases")

# rapidfuzz is an optional dependency; the flag records whether the fuzzy
# matching names (_fuzz, _process) are available.
try:
    from rapidfuzz import fuzz as _fuzz, process as _process
except ImportError:
    logger.warning("rapidfuzz not installed — using substring matching only (install it for better matching)")
    _HAS_RAPIDFUZZ = False
else:
    _HAS_RAPIDFUZZ = True


class PhraseMatcher:
    """
    Match a transcription against a per-language phrase library.

    Phrases come from two JSON files in ``DATA_DIR``: ``{lang}.json`` (base
    library) and ``{lang}_additions.json`` (user additions). Each file is
    expected to hold a list of entry dicts with ``patterns``, ``response``,
    ``english`` and ``tags`` keys. Loaded libraries are cached per language
    until ``reload`` is called.

    Matching tries substring containment first and, if rapidfuzz is installed,
    falls back to fuzzy ``partial_ratio`` matching.
    """

    def __init__(self) -> None:
        # lang -> list of phrase entries (base library followed by additions)
        self._cache: dict[str, list[dict]] = {}

    # ── Internal helpers ──────────────────────────────────────────────────────

    @staticmethod
    def _additions_path(lang: str) -> Path:
        """Return the path of the user-additions file for ``lang``."""
        return DATA_DIR / f"{lang}_additions.json"

    @staticmethod
    def _read_entries(path: Path, label: str) -> list[dict]:
        """
        Read a JSON phrase file and return its entry dicts.

        A missing file yields an empty list. An unreadable file, invalid JSON,
        or a top-level value that is not a list is logged as a warning and also
        yields an empty list. Items that are not dicts are dropped (with a
        warning) because ``match`` can only work with dict entries.
        """
        try:
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            return []
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.warning("Could not load %s: %s", label, exc)
            return []

        if not isinstance(data, list):
            logger.warning(
                "Ignoring %s: expected a JSON list, got %s", label, type(data).__name__
            )
            return []

        entries = [item for item in data if isinstance(item, dict)]
        if len(entries) != len(data):
            logger.warning(
                "Ignoring %d non-object item(s) in %s", len(data) - len(entries), label
            )
        return entries

    @staticmethod
    def _write_text_atomic(path: Path, text: str) -> None:
        """Write ``text`` to ``path`` via a sibling temp file and a rename."""
        # Writing to a temp file first means an interrupted write cannot leave
        # a truncated library behind.
        tmp_path = path.with_name(path.name + ".tmp")
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(text)
        tmp_path.replace(path)

    @staticmethod
    def _make_entry(
        patterns: list[str],
        response: str,
        english: str,
        tags: Optional[list[str]],
    ) -> dict:
        """Build a phrase entry dict with stripped, non-empty patterns."""
        return {
            "patterns": [p.strip() for p in patterns if p.strip()],
            "response": response.strip(),
            "english": english.strip(),
            "tags": list(tags) if tags else ["user_added"],
        }

    @staticmethod
    def _candidates(phrases: list[dict]) -> list[tuple[str, dict]]:
        """Flatten phrases into ``(normalised_pattern, entry)`` pairs."""
        pairs: list[tuple[str, dict]] = []
        for entry in phrases:
            patterns = entry.get("patterns") or []
            if isinstance(patterns, str):
                # A hand-edited file may hold a bare string; treat it as one
                # pattern instead of iterating over its characters.
                patterns = [patterns]
            for pat in patterns:
                if not isinstance(pat, str):
                    continue
                pat_norm = pat.lower().strip()
                # An empty pattern is a substring of every text and would
                # match everything, so it is skipped.
                if pat_norm:
                    pairs.append((pat_norm, entry))
        return pairs

    def _append_additions(self, lang: str, entries: list[dict]) -> None:
        """Append ``entries`` to the additions file in one write, then reload."""
        if not entries:
            return
        DATA_DIR.mkdir(parents=True, exist_ok=True)
        path = self._additions_path(lang)
        additions = self._read_entries(path, f"phrase additions for {lang}")
        additions.extend(entries)
        self._write_text_atomic(
            path, json.dumps(additions, ensure_ascii=False, indent=2)
        )
        self.reload(lang)

    # ── Library loading ───────────────────────────────────────────────────────

    def _load(self, lang: str) -> list[dict]:
        """
        Load and cache all phrases for a language (base + user additions).

        Missing or unusable files contribute no phrases; problems are logged
        rather than raised so a bad file cannot break every ``match`` call.
        """
        cached = self._cache.get(lang)
        if cached is not None:
            return cached

        phrases = self._read_entries(
            DATA_DIR / f"{lang}.json", f"base phrase library for {lang}"
        )
        phrases.extend(
            self._read_entries(
                self._additions_path(lang), f"phrase additions for {lang}"
            )
        )

        self._cache[lang] = phrases
        logger.info("PhraseMatcher: loaded %d phrases for lang=%s", len(phrases), lang)
        return phrases

    def reload(self, lang: str) -> None:
        """Drop the cached phrases for ``lang`` so the next use re-reads disk."""
        self._cache.pop(lang, None)

    def reload_from_hub_data(self, lang: str, additions_json: str) -> None:
        """
        Replace the local additions file with ``additions_json`` and reload.

        The text is written as given; if it is not a valid JSON list, the next
        load logs a warning and ignores it.
        """
        DATA_DIR.mkdir(parents=True, exist_ok=True)
        self._write_text_atomic(self._additions_path(lang), additions_json)
        self.reload(lang)

    # ── Matching ──────────────────────────────────────────────────────────────

    def match(self, text: str, lang: str, threshold: int = 60) -> Optional[dict]:
        """
        Find the best-matching phrase entry for the transcribed text.

        Strategy:
          1. Substring: a pattern is contained in the text, or the text is
             contained in a pattern. Among such hits the pattern closest in
             length to the text wins, so an identical pattern always beats an
             incidental containment. Ties keep library order.
          2. Fuzzy ``partial_ratio`` via rapidfuzz, if installed (handles
             Whisper spelling variants).

        Args:
            text: Transcribed user text; compared lower-cased and stripped.
            lang: Language code selecting the phrase library.
            threshold: Minimum rapidfuzz score (0-100) for a fuzzy match.

        Returns:
            The matching entry dict, or ``None`` if nothing matches, the text
            is empty, or the library has no usable patterns.
        """
        phrases = self._load(lang)
        if not phrases:
            return None

        text_norm = text.lower().strip()
        if not text_norm:
            return None

        candidates = self._candidates(phrases)
        if not candidates:
            return None

        # 1. Substring check (fast path)
        substring_hits = [
            pair for pair in candidates
            if pair[0] in text_norm or text_norm in pair[0]
        ]
        if substring_hits:
            # min() returns the first minimum, so equal-length hits resolve
            # in library order.
            pattern, entry = min(
                substring_hits, key=lambda pair: abs(len(pair[0]) - len(text_norm))
            )
            logger.debug("Exact phrase match: %r", pattern)
            return entry

        # 2. Fuzzy match (handles Whisper mis-transcriptions)
        if _HAS_RAPIDFUZZ:
            result = _process.extractOne(
                text_norm,
                [pair[0] for pair in candidates],
                scorer=_fuzz.partial_ratio,
                score_cutoff=threshold,
            )
            if result:
                # extractOne returns (matched_string, score, index_in_choices).
                _, score, idx = result
                logger.debug("Fuzzy phrase match score=%d: %r", score, candidates[idx][0])
                return candidates[idx][1]

        return None

    # ── Phrase addition ───────────────────────────────────────────────────────

    def add_phrase(
        self,
        lang: str,
        patterns: list[str],
        response: str,
        english: str,
        tags: Optional[list[str]] = None,
    ) -> None:
        """
        Add a single phrase to the local additions file and reload ``lang``.

        Patterns, response and english are stripped; blank patterns are
        dropped. ``tags`` defaults to ``["user_added"]``. If the existing
        additions file is unreadable, a warning is logged and the file is
        rewritten with only the new entry.
        """
        self._append_additions(
            lang, [self._make_entry(patterns, response, english, tags)]
        )

    def import_pairs(self, lang: str, text: str) -> int:
        """
        Bulk-import from pasted text.

        Format: one entry per line → ``native_phrase | english_translation``.
        Only the first ``|`` separates the two parts. Blank lines, lines
        starting with ``#``, lines without ``|`` and lines with an empty side
        are ignored. The native phrase is used as both pattern and response,
        and entries are tagged ``imported``.

        All accepted entries are written in a single file update.

        Returns:
            Number of entries added.
        """
        entries: list[dict] = []
        for raw_line in text.splitlines():
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            native, sep, english = line.partition("|")
            native, english = native.strip(), english.strip()
            if not sep or not native or not english:
                continue
            entries.append(self._make_entry([native], native, english, ["imported"]))

        self._append_additions(lang, entries)
        return len(entries)

    def get_additions_json(self, lang: str) -> str:
        """Return the raw text of the additions file, or ``"[]"`` if absent."""
        try:
            return self._additions_path(lang).read_text(encoding="utf-8")
        except FileNotFoundError:
            return "[]"

    def phrase_count(self, lang: str) -> int:
        """Return the number of loaded phrase entries (base + additions)."""
        return len(self._load(lang))