# slipstream-webgpu / src/narrative.py
# ashaibani's picture
# Slipstream WebGPU (in-browser agent)
# c658ad5 verified
"""Plain-English risk narrative via MiniCPM5-1B on llama.cpp.
Earns the Llama Champion badge, keeps the app fully offline (Off the Grid), and uses
OpenBMB's model (aligns with the OpenBMB special-category prize). GGUF artifacts:
repo openbmb/MiniCPM5-1B-GGUF
files MiniCPM5-1B-Q4_K_M.gguf (657 MB, recommended) | -Q8_0 (1.1 GB) | -F16 (2.1 GB)
For offline use, download once then pass `gguf_path` to load from disk and bundle it.
Sampling (no-think): temperature 0.7, top_p 0.95. See skill: minicpm5-deploy-llama-cpp.
"""
from __future__ import annotations
# Prompt sent to the model as a single user message. The {placeholders} are filled
# with str.format() in generate_narrative() from the coerced forecast summary, so the
# summary must provide every key named here.
# NOTE(review): {planned} carries no format spec; because _coerce() converts it to
# float, an integer baseline is rendered like "12.0" - confirm that is intended.
PROMPT_TEMPLATE = """You are a project controls analyst writing for a client report.
Given the forecast below, write a concise, factual 3-4 sentence risk commentary.
Do not invent numbers. Be direct about schedule and cost risk.
Forecast:
- Percent complete: {pct_complete:.0%}
- Cost performance index (CPI): {cpi:.2f}; Schedule performance index (SPI): {spi:.2f}
- Projected finish: period {finish:.0f} (baseline {planned}) -> slippage {slippage:+.0f} periods
- Projected final cost (EAC): {eac:,.0f} vs budget {bac:,.0f} ({overrun:+.0%})
- Probability of cost overrun above 10%: {p_overrun:.0%}
Risk commentary:"""
# Default repository id and file name of the base GGUF model; these are the default
# values of load_llm()'s `repo` and `filename` parameters.
GGUF_REPO = "openbmb/MiniCPM5-1B-GGUF"
GGUF_FILE = "MiniCPM5-1B-Q4_K_M.gguf" # 657 MB; swap for -Q8_0 / -F16 for fidelity
_NUM_KEYS = ("pct_complete", "cpi", "spi", "finish", "slippage", "eac", "bac", "overrun", "p_overrun", "planned")
def _coerce(summary: dict) -> dict:
"""Numeric fields can arrive as strings over the JSON/API boundary; float them so the f-string
format specs (`:.2f`, `:%`) and the `<` comparisons below don't crash."""
out = dict(summary)
for k in _NUM_KEYS:
if k in out:
try:
out[k] = float(out[k])
except (TypeError, ValueError):
pass
return out
def fallback_narrative(s: dict) -> str:
    """Build the risk commentary deterministically, without any LLM.

    Intended for situations where llama.cpp or the MiniCPM5 GGUF is unavailable
    (e.g. a fresh checkout). The summary is passed through `_coerce` first. An index
    below 0.97 is reported as behind schedule / over budget, above 1.03 as ahead of
    schedule / under budget, and anything in between as "on".

    Raises KeyError if a required summary key is missing.
    """
    data = _coerce(s)

    def _position(index, lagging: str, leading: str) -> str:
        # Map a performance index onto its wording using the +/-3% tolerance band.
        if index < 0.97:
            return lagging
        if index > 1.03:
            return leading
        return "on"

    schedule_word = _position(data["spi"], "behind", "ahead of")
    budget_word = _position(data["cpi"], "over", "under")
    slippage = data["slippage"]

    status = (
        f"At {data['pct_complete']:.0%} complete, the project is {schedule_word} schedule "
        f"(SPI {data['spi']:.2f}) and {budget_word} budget (CPI {data['cpi']:.2f})."
    )
    outlook = (
        f"The forecast projects completion around period {data['finish']:.0f} "
        f"versus a baseline of {data['planned']} ({slippage:+.0f} periods), "
        f"and a final cost near {data['eac']:,.0f} against a "
        f"{data['bac']:,.0f} budget ({data['overrun']:+.0%})."
    )
    odds = f"Estimated chance of a cost overrun beyond 10%: {data['p_overrun']:.0%}."
    # The three sentences are separated by single spaces, as in one running paragraph.
    return " ".join((status, outlook, odds))
# Module-wide cache: the first successfully loaded model is reused by every later call.
_llm = None


def load_llm(gguf_path: str | None = None, repo: str = GGUF_REPO,
             filename: str = GGUF_FILE, n_ctx: int = 8192):
    """Load (and cache) MiniCPM5-1B. Order of preference, all offline-friendly:

    1. an explicit `gguf_path`;
    2. the **distilled** GGUF handled by the sibling `local_llm` module, when that
       module reports it is available (intended to reuse the model the agent already
       loads instead of fetching a second one);
    3. otherwise download the base MiniCPM5-1B GGUF from the Hub once and cache it.

    Args:
        gguf_path: Path to a GGUF file on disk. When given (non-empty), the local
            distilled model is not tried and the file is loaded directly.
        repo: Repository id passed to `Llama.from_pretrained` for option 3.
        filename: File name inside `repo` for option 3.
        n_ctx: Context size handed to whichever loader is used.

    Returns:
        The cached model object. Once a model has been loaded, later calls return it
        unchanged and ignore their arguments.

    Raises:
        ImportError: if `llama_cpp` is needed (options 1 and 3) but not installed.
        Errors raised by `Llama(...)` / `Llama.from_pretrained(...)` propagate as-is.
    """
    global _llm
    if _llm is not None:
        return _llm

    if gguf_path is None:  # reuse the local distilled agent model if present
        try:
            from . import local_llm
            if local_llm.is_available():
                g, t = local_llm.ensure_local()  # download the distilled GGUF if needed
                _llm, _ = local_llm.load(g, t, n_ctx=n_ctx)
                return _llm
        except Exception as exc:
            # Still best-effort: fall through to the base model, but leave a trace so a
            # broken local setup is not silently masked by a large Hub download.
            import logging

            logging.getLogger(__name__).warning(
                "Local distilled model could not be loaded (%s: %s); "
                "falling back to %s/%s",
                type(exc).__name__, exc, repo, filename,
            )

    from llama_cpp import Llama

    if gguf_path:
        _llm = Llama(model_path=gguf_path, n_ctx=n_ctx, verbose=False)
    else:
        _llm = Llama.from_pretrained(repo_id=repo, filename=filename, n_ctx=n_ctx, verbose=False)
    return _llm
def generate_narrative(summary: dict, max_tokens: int = 220, temperature: float = 0.7,
                       top_p: float = 0.95, **load_kw) -> str:
    """Ask the cached model for a short risk commentary on `summary`.

    `summary` has to supply every placeholder named in PROMPT_TEMPLATE; its numeric
    fields are passed through `_coerce` before formatting. Extra keyword arguments are
    forwarded unchanged to `load_llm`. The sampling defaults are the MiniCPM5 no-think
    settings (temperature 0.7, top_p 0.95).

    Returns the first choice's message content with surrounding whitespace stripped.
    """
    model = load_llm(**load_kw)
    user_message = {
        "role": "user",
        "content": PROMPT_TEMPLATE.format(**_coerce(summary)),
    }
    completion = model.create_chat_completion(
        messages=[user_message],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"].strip()