"""Plain-English risk narrative via MiniCPM5-1B on llama.cpp.

Earns the Llama Champion badge, keeps the app fully offline (Off the Grid), and uses
OpenBMB's model (aligns with the OpenBMB special-category prize). GGUF artifacts:
  repo   openbmb/MiniCPM5-1B-GGUF
  files  MiniCPM5-1B-Q4_K_M.gguf (657 MB, recommended) | -Q8_0 (1.1 GB) | -F16 (2.1 GB)
For offline use, download once then pass `gguf_path` to load from disk and bundle it.
Sampling (no-think): temperature 0.7, top_p 0.95. See skill: minicpm5-deploy-llama-cpp.
"""
from __future__ import annotations

# Prompt sent to the model as a single user message by generate_narrative().
# It is filled with `str.format(**summary)` after the summary has passed through _coerce(),
# so every `{name}` below must be a key of the summary dict. Placeholders carrying a format
# spec (`:.0%`, `:.2f`, `:,.0f`, `:+.0f`) need numeric values; `{planned}` has no format spec
# and is rendered with the value's default string form.
PROMPT_TEMPLATE = """You are a project controls analyst writing for a client report.
Given the forecast below, write a concise, factual 3-4 sentence risk commentary.
Do not invent numbers. Be direct about schedule and cost risk.

Forecast:
- Percent complete: {pct_complete:.0%}
- Cost performance index (CPI): {cpi:.2f}; Schedule performance index (SPI): {spi:.2f}
- Projected finish: period {finish:.0f} (baseline {planned}) -> slippage {slippage:+.0f} periods
- Projected final cost (EAC): {eac:,.0f} vs budget {bac:,.0f} ({overrun:+.0%})
- Probability of cost overrun above 10%: {p_overrun:.0%}

Risk commentary:"""

# Hub coordinates that load_llm() downloads from when no local GGUF is used.
GGUF_REPO = "openbmb/MiniCPM5-1B-GGUF"
GGUF_FILE = "MiniCPM5-1B-Q4_K_M.gguf"  # 657 MB; swap for -Q8_0 / -F16 for fidelity

# Summary fields that _coerce() normalises to numbers before they are compared or formatted.
_NUM_KEYS = (
    "pct_complete",
    "cpi",
    "spi",
    "finish",
    "slippage",
    "eac",
    "bac",
    "overrun",
    "p_overrun",
    "planned",
)


def _coerce(summary: dict) -> dict:
    """Return a shallow copy of `summary` with its numeric fields normalised to numbers.

    Numeric fields can arrive as strings over the JSON/API boundary, which would crash the
    format specs (`:.2f`, `:%`) and the `<` / `>` comparisons used when rendering.

    Whole values come back as `int`, everything else as `float`. An int accepts the same
    float-style format specs, and keeping it integral lets a placeholder that has no format
    spec (`{planned}`) render as "12" rather than "12.0".

    Values that cannot be converted (None, non-numeric text, ints too large for a float) are
    left untouched, as are keys outside `_NUM_KEYS`. The input dict is never mutated.
    """
    out = dict(summary)
    for key in _NUM_KEYS:
        if key not in out:
            continue
        try:
            number = float(out[key])
        except (TypeError, ValueError, OverflowError):
            # Best effort: keep the raw value and let the renderer decide what to do with it.
            continue
        # nan/inf report is_integer() == False, so int() only ever sees finite values.
        out[key] = int(number) if number.is_integer() else number
    return out


def fallback_narrative(s: dict) -> str:
    """Build the risk commentary deterministically, without any LLM.

    Used when llama.cpp / the MiniCPM5 GGUF is unavailable (e.g. on a fresh checkout).
    `s` is the forecast summary; it is passed through `_coerce` first, so numeric fields may
    be given as strings. An index within 0.97..1.03 is reported as "on" schedule/budget.

    Raises KeyError if a required summary key is missing.
    """
    data = _coerce(s)

    # Schedule status from SPI: below the band is late, above it is early.
    spi = data["spi"]
    if spi < 0.97:
        schedule_status = "behind"
    elif spi > 1.03:
        schedule_status = "ahead of"
    else:
        schedule_status = "on"

    # Budget status from CPI: below the band means spending more than earned.
    cpi = data["cpi"]
    if cpi < 0.97:
        budget_status = "over"
    elif cpi > 1.03:
        budget_status = "under"
    else:
        budget_status = "on"

    slippage = data["slippage"]

    status_sentence = (
        f"At {data['pct_complete']:.0%} complete, the project is {schedule_status} schedule "
        f"(SPI {spi:.2f}) and {budget_status} budget (CPI {cpi:.2f})."
    )
    forecast_sentence = (
        f"The forecast projects completion around period {data['finish']:.0f} "
        f"versus a baseline of {data['planned']} ({slippage:+.0f} periods), "
        f"and a final cost near {data['eac']:,.0f} against a {data['bac']:,.0f} budget "
        f"({data['overrun']:+.0%})."
    )
    overrun_sentence = f"Estimated chance of a cost overrun beyond 10%: {data['p_overrun']:.0%}."

    return " ".join((status_sentence, forecast_sentence, overrun_sentence))


# Module-wide cache for the model loaded by load_llm(); stays None until a load succeeds.
_llm = None


def load_llm(gguf_path: str | None = None, repo: str = GGUF_REPO,
             filename: str = GGUF_FILE, n_ctx: int = 8192):
    """Load MiniCPM5-1B once and return the cached model on every later call.

    Resolution order on the first call:
    1. an explicit `gguf_path` is loaded from disk with llama.cpp;
    2. with no `gguf_path`, the distilled model handled by the sibling `local_llm` module is
       tried (`is_available()` -> `ensure_local()` -> `load()`);
    3. otherwise `repo` / `filename` is fetched with `Llama.from_pretrained`.

    Args:
        gguf_path: Path to a GGUF file on disk; skips steps 2 and 3 when given.
        repo: Hub repository id used in step 3.
        filename: GGUF file name inside `repo` used in step 3.
        n_ctx: Context window size passed to whichever loader is used.

    Returns:
        The loaded model. It is stored in the module-level `_llm`, so once a model is
        cached the arguments of later calls are ignored.

    Raises:
        Whatever importing `llama_cpp` or constructing the model raises in steps 1 and 3.
        Failures in step 2 are logged and fall through to the next step.
    """
    import logging

    global _llm
    if _llm is not None:
        return _llm

    if gguf_path is None:                       # reuse the local distilled agent model if present
        try:
            from . import local_llm

            if local_llm.is_available():
                g, t = local_llm.ensure_local()      # download the distilled GGUF if needed
                _llm, _ = local_llm.load(g, t, n_ctx=n_ctx)
                return _llm
        except Exception as exc:
            # Deliberately best-effort, but say why: the next step may hit the network,
            # which matters for an app that is meant to run offline.
            logging.getLogger(__name__).warning(
                "Local distilled model could not be used (%r); falling back to %s/%s",
                exc, repo, filename,
            )

    # Imported lazily so the module (and fallback_narrative) works without llama.cpp installed.
    from llama_cpp import Llama

    if gguf_path:
        _llm = Llama(model_path=gguf_path, n_ctx=n_ctx, verbose=False)
    else:
        _llm = Llama.from_pretrained(repo_id=repo, filename=filename, n_ctx=n_ctx, verbose=False)
    return _llm


def generate_narrative(summary: dict, max_tokens: int = 220, temperature: float = 0.7,
                       top_p: float = 0.95, **load_kw) -> str:
    """Ask the model for a plain-English risk commentary on `summary`.

    `summary` must contain every key referenced in PROMPT_TEMPLATE; numeric fields may be
    strings because they are passed through `_coerce` before formatting. The sampling
    defaults are the MiniCPM5 no-think settings (temp 0.7, top_p 0.95). Extra keyword
    arguments in `load_kw` are forwarded to `load_llm`.

    Returns the text of the first completion choice with surrounding whitespace removed.
    """
    # Load (or fetch the cached) model first, then render the prompt.
    model = load_llm(**load_kw)
    fields = _coerce(summary)
    user_message = {"role": "user", "content": PROMPT_TEMPLATE.format(**fields)}

    sampling = {"max_tokens": max_tokens, "temperature": temperature, "top_p": top_p}
    response = model.create_chat_completion(messages=[user_message], **sampling)

    first_choice = response["choices"][0]
    return first_choice["message"]["content"].strip()