"""Minimal NIM fallback for first-turn Gemini 503.

Does ONE thing: when single_brain raises SingleBrainError on the first
turn (no single_brain_sticky yet), give the user a working reply using
NIM.

No tools, no function-calling, no faithfulness gate, no normalizer —
just a single NIM chat call with a tiny system prompt. The user gets a
polite "let me think out loud while I reconnect" reply; the next turn,
single_brain (Gemini) takes over and the session sticks.

NIM-only. We elect via llm_health.get_primary("brain") which walks
BRAIN_CHAIN in priority order over election-eligible models.
If the elector has nothing eligible, we fall back to BRAIN_CHAIN[0] as a
last resort so the user still gets a reply.

Contract:
- Returns a single_brain.TurnResult so /api/chat treats the result
  identically to the happy path.
- Always returns — on NIM failure or timeout we synthesise a graceful
  reply (no exceptions escape).
"""

from __future__ import annotations

import asyncio
import logging
import re
import time
from typing import Optional

from backend import single_brain
from backend.providers.base import ChatMessage
from backend.providers.nvidia_nim_llm import BRAIN_CHAIN, NvidiaNimLLM

try:
    # llm_health is the elector — its get_primary("brain") walks the chain
    # in priority order over election-eligible models. The import is
    # guarded (e.g. in case CLEAN3's edits transiently move the symbol) so
    # that a failed import does not prevent this module from loading.
    from backend import llm_health
except Exception:  # noqa: BLE001
    # None is the "elector unavailable" sentinel that _pick_model checks
    # before falling back to BRAIN_CHAIN[0].
    llm_health = None  # type: ignore[assignment]


# Module-level logger; every warning emitted by this file goes through it.
_LOG = logging.getLogger(__name__)

# Devanagari unicode block (Hindi / Marathi / Sanskrit etc.). Used to tag
# `language` on the TurnResult for downstream logging — matches the
# coarse `indic` vs `en` distinction single_brain uses.
# NOTE(review): only this one block is matched, so text written in other
# Indic scripts is tagged `en` — confirm that is what single_brain expects.
_DEVANAGARI_RE = re.compile(r"[ऀ-ॿ]")

# System prompt sent as the first message of the single NIM chat call
# (see _nim_chat). Also exported via __all__.
PROMPT = (
    "You are an Indian health-insurance advisor. The user just spoke to "
    "you and the primary system is briefly down. Reply in 1-2 sentences "
    "acknowledging what they said and asking them to repeat or rephrase "
    "so the main system can serve a proper recommendation. Keep it warm "
    "and brief. Indian context (use ₹, lakh, IRDAI). Do NOT name "
    "specific policies or insurers — you have no retrieval available."
)

# Graceful synthetic reply used when NIM itself fails / times out. Kept
# short and on-brand with the rest of the codebase's fallback strings.
_GRACEFUL_REPLY = (
    "Sorry, I'm having trouble — please try again in a moment."
)

# Outer per-call budget for the NIM round-trip. Tighter than the caller's
# 20s wait_for so we always emit our own TurnResult rather than letting
# the outer wrapper time out. The same value is passed to the NIM client
# as its timeout and to asyncio.wait_for in handle_turn_fallback.
_NIM_TIMEOUT_S = 15.0


def _detect_language(text: str) -> str:
    """Return the coarse language tag for *text*: ``"indic"`` or ``"en"``.

    ``"indic"`` is returned when any character matches ``_DEVANAGARI_RE``;
    empty or missing text, and everything else, is tagged ``"en"``.
    """
    # Guard clause: nothing to scan means the default bucket.
    if not text:
        return "en"
    if _DEVANAGARI_RE.search(text) is not None:
        return "indic"
    return "en"


def _pick_model() -> str:
    """Choose which NIM brain model serves this fallback turn.

    The elected primary from ``llm_health.get_primary("brain")`` wins when
    the elector is importable and returns a truthy value. If the elector
    is missing, raises, or returns nothing, the chain leader
    ``BRAIN_CHAIN[0]`` is used so the turn is still attempted.
    """
    # Elector never imported: go straight to the chain leader.
    if llm_health is None:
        return BRAIN_CHAIN[0]

    try:
        # `or None` normalises any falsy election result to "no winner".
        winner = llm_health.get_primary("brain") or None
    except Exception as exc:  # noqa: BLE001
        _LOG.warning("nim_fallback: get_primary('brain') failed: %s", exc)
        winner = None

    if winner is not None:
        return winner
    # Last resort — chain leader.
    return BRAIN_CHAIN[0]


def _flatten_history(chat_history: Optional[list[dict]]) -> list[ChatMessage]:
    """Translate the {role, content} chat_history into the NIM
    `ChatMessage` shape. Roles other than user/assistant are dropped;
    assistant aliases collapse to `assistant`.
    """
    out: list[ChatMessage] = []
    for msg in chat_history or []:
        role = (msg.get("role") or "user").lower()
        content = (msg.get("content") or "").strip()
        if not content:
            continue
        if role in ("assistant", "model", "bot"):
            out.append(ChatMessage(role="assistant", content=content))
        else:
            out.append(ChatMessage(role="user", content=content))
    return out


async def _nim_chat(
    model: str,
    chat_history: Optional[list[dict]],
    user_text: str,
) -> str:
    """Run one NIM chat completion and return its stripped text.

    The conversation sent is the module system ``PROMPT``, then the
    flattened prior history, then the current user message. Nothing is
    caught here: any provider error propagates so the caller can
    substitute the graceful synthetic reply.
    """
    nim = NvidiaNimLLM(model=model, timeout=_NIM_TIMEOUT_S)
    conversation: list[ChatMessage] = [
        ChatMessage(role="system", content=PROMPT),
        *_flatten_history(chat_history),
        ChatMessage(role="user", content=user_text),
    ]
    completion = await nim.chat(conversation, temperature=0.4, max_tokens=400)
    # A missing/empty completion text is normalised to "".
    body = completion.text or ""
    return body.strip()


async def handle_turn_fallback(
    session,
    user_text: str,
    chat_history: Optional[list[dict]] = None,
) -> single_brain.TurnResult:
    """First-turn fallback when single_brain (Gemini) raises.

    Elects a NIM model, makes one chat call bounded by ``_NIM_TIMEOUT_S``,
    and wraps the outcome in a ``single_brain.TurnResult``.

    Args:
        session: Caller's session object. Not read by this function.
        user_text: The user's current message.
        chat_history: Optional prior ``{role, content}`` turns.

    Returns:
        A TurnResult whose ``reply_text`` is the NIM reply, or
        ``_GRACEFUL_REPLY`` if model election, the NIM call, or the
        timeout fails. ``brain_used`` records which happened:
        ``nim_fallback::<model>``, ``nim_fallback::timeout::<model>`` or
        ``nim_fallback::error::<model>``.

    Election and provider errors are logged and converted to the graceful
    reply; they do not propagate.
    """
    # Monotonic clock: wall-clock adjustments must not skew the latency.
    started = time.monotonic()
    language = _detect_language(user_text)
    # Placeholder so the failure tags stay well-formed even when model
    # election itself is what raised.
    model = "unelected"

    try:
        # Election lives inside the try so a broken elector / empty chain
        # degrades to the graceful reply instead of escaping to the caller.
        model = _pick_model()
        reply_text = await asyncio.wait_for(
            _nim_chat(model, chat_history, user_text),
            timeout=_NIM_TIMEOUT_S,
        )
        if not reply_text:
            # Model returned empty — treat as failure and fall through.
            raise RuntimeError("empty reply from NIM")
        brain_used = f"nim_fallback::{model}"
    except asyncio.TimeoutError:
        _LOG.warning(
            "nim_fallback: NIM call timed out after %.1fs (model=%s)",
            _NIM_TIMEOUT_S, model,
        )
        reply_text = _GRACEFUL_REPLY
        brain_used = f"nim_fallback::timeout::{model}"
    except Exception as exc:  # noqa: BLE001
        _LOG.warning(
            "nim_fallback: NIM call failed (model=%s): %s", model, exc,
        )
        reply_text = _GRACEFUL_REPLY
        brain_used = f"nim_fallback::error::{model}"

    latency_ms = int((time.monotonic() - started) * 1000)
    # No retrieval, tools or faithfulness gate run on this path, so the
    # related fields are filled with empty / pass-through values.
    return single_brain.TurnResult(
        reply_text=reply_text,
        citations=[],
        retrieved_chunk_ids=[],
        brain_used=brain_used,
        intent="qa",
        language=language,
        latency_ms=latency_ms,
        raw_reply=reply_text,
        faithfulness_passed=True,
        faithfulness_reasons=[],
        blocked=False,
        profile_updates={},
        followup_policy_id=None,
    )


# Public API of this module: the fallback entry point and its system prompt.
__all__ = ["handle_turn_fallback", "PROMPT"]