Spaces:

ashaibani
/

slipstream-webgpu

Running

App Files Files Community

slipstream-webgpu / src /narrative.py

ashaibani

Slipstream WebGPU (in-browser agent)

c658ad5 verified 5 days ago

raw

history blame contribute delete

4.63 kB

	"""Plain-English risk narrative via MiniCPM5-1B on llama.cpp.

	Earns the Llama Champion badge, keeps the app fully offline (Off the Grid), and uses
	OpenBMB's model (aligns with the OpenBMB special-category prize). GGUF artifacts:
	repo openbmb/MiniCPM5-1B-GGUF
	files MiniCPM5-1B-Q4_K_M.gguf (657 MB, recommended) | -Q8_0 (1.1 GB) | -F16 (2.1 GB)
	For offline use, download once then pass `gguf_path` to load from disk and bundle it.
	Sampling (no-think): temperature 0.7, top_p 0.95. See skill: minicpm5-deploy-llama-cpp.
	"""
	from __future__ import annotations

	# Prompt sent to the model as the single user message by `generate_narrative`.
	# It is filled with `str.format`, so the summary dict must supply every
	# placeholder named below (pct_complete, cpi, spi, finish, planned, slippage,
	# eac, bac, overrun, p_overrun). All placeholders except `planned` carry a
	# numeric format spec and therefore need numeric values.
	PROMPT_TEMPLATE = """You are a project controls analyst writing for a client report.
	Given the forecast below, write a concise, factual 3-4 sentence risk commentary.
	Do not invent numbers. Be direct about schedule and cost risk.

	Forecast:
	- Percent complete: {pct_complete:.0%}
	- Cost performance index (CPI): {cpi:.2f}; Schedule performance index (SPI): {spi:.2f}
	- Projected finish: period {finish:.0f} (baseline {planned}) -> slippage {slippage:+.0f} periods
	- Projected final cost (EAC): {eac:,.0f} vs budget {bac:,.0f} ({overrun:+.0%})
	- Probability of cost overrun above 10%: {p_overrun:.0%}

	Risk commentary:"""

	# Default `repo` / `filename` arguments of `load_llm`; they are passed to
	# `Llama.from_pretrained` when no explicit `gguf_path` is given and the local
	# model could not be used.
	GGUF_REPO = "openbmb/MiniCPM5-1B-GGUF"
	GGUF_FILE = "MiniCPM5-1B-Q4_K_M.gguf" # 657 MB; swap for -Q8_0 / -F16 for fidelity

	# Summary fields that the prompt/fallback text formats or compares as numbers.
	_NUM_KEYS = ("pct_complete", "cpi", "spi", "finish", "slippage", "eac", "bac", "overrun", "p_overrun", "planned")


	def _to_number(value):
	    """Parse one non-numeric value: strings try `int` first, everything else uses `float`."""
	    if isinstance(value, str):
	        try:
	            # Whole-number strings stay integers so an unformatted placeholder
	            # such as `{planned}` renders "12" rather than "12.0".
	            return int(value)
	        except ValueError:
	            pass  # not an integer literal; fall through to float parsing
	    return float(value)


	def _coerce(summary: dict) -> dict:
	    """Return a shallow copy of `summary` with the `_NUM_KEYS` fields made numeric.

	    Numeric fields can arrive as strings over the JSON/API boundary; converting
	    them keeps the format specs (`:.2f`, `:%`) and the `<` / `>` comparisons
	    used by the narrative code from crashing.

	    Values that are already `int` or `float` are left untouched, and strings
	    that look like whole numbers become `int`. Every format spec used in this
	    module accepts ints, and this keeps a baseline of 12 from being printed
	    as "12.0" where no format spec is applied. Values that cannot be parsed
	    (e.g. `None`, free text) are left as they are. The input dict is not
	    modified.
	    """
	    out = dict(summary)
	    for key in _NUM_KEYS:
	        if key not in out:
	            continue
	        value = out[key]
	        # bool is an int subclass; it is still converted with float().
	        if isinstance(value, (int, float)) and not isinstance(value, bool):
	            continue
	        try:
	            out[key] = _to_number(value)
	        except (TypeError, ValueError):
	            pass  # best effort: keep the original value
	    return out


	def fallback_narrative(s: dict) -> str:
	    """Build the risk narrative from the forecast summary without calling a model.

	    SPI and CPI are each classified against a 0.97 / 1.03 band ("behind" /
	    "ahead of" / "on" schedule, "over" / "under" / "on" budget). The numbers
	    are then rendered into three fixed sentences joined by single spaces.
	    Numeric fields are passed through `_coerce` first.
	    """
	    s = _coerce(s)

	    spi = s["spi"]
	    if spi < 0.97:
	        sched = "behind"
	    elif spi > 1.03:
	        sched = "ahead of"
	    else:
	        sched = "on"

	    cpi = s["cpi"]
	    if cpi < 0.97:
	        cost = "over"
	    elif cpi > 1.03:
	        cost = "under"
	    else:
	        cost = "on"

	    slip = s["slippage"]

	    status = (
	        f"At {s['pct_complete']:.0%} complete, the project is {sched} schedule "
	        f"(SPI {spi:.2f}) and {cost} budget (CPI {cpi:.2f})."
	    )
	    projection = (
	        f"The forecast projects completion around period {s['finish']:.0f} "
	        f"versus a baseline of {s['planned']} ({slip:+.0f} periods), "
	        f"and a final cost near {s['eac']:,.0f} against a {s['bac']:,.0f} budget "
	        f"({s['overrun']:+.0%})."
	    )
	    overrun_risk = f"Estimated chance of a cost overrun beyond 10%: {s['p_overrun']:.0%}."
	    return " ".join((status, projection, overrun_risk))


	# Process-wide cache for the loaded model; filled on the first `load_llm` call.
	_llm = None


	def load_llm(gguf_path: str | None = None, repo: str = GGUF_REPO,
	             filename: str = GGUF_FILE, n_ctx: int = 8192):
	    """Load (and cache) the narrative model.

	    Order of preference:
	      1. an explicit `gguf_path`, loaded from disk with `Llama(model_path=...)`;
	      2. when no path is given, the model provided by the sibling `local_llm`
	         module, if `local_llm.is_available()` reports it can be used;
	      3. otherwise `Llama.from_pretrained(repo_id=repo, filename=filename)`.

	    The first successfully loaded model is stored in the module-level `_llm`
	    and returned by every later call, whatever arguments that call passes.

	    Args:
	        gguf_path: Path to a GGUF file on disk, or None to try options 2 and 3.
	        repo: Repo id used for option 3.
	        filename: File name within `repo` used for option 3.
	        n_ctx: Context size forwarded to whichever loader is used.

	    Returns:
	        The cached model object.

	    Raises:
	        Whatever `llama_cpp` raises if it cannot be imported or the model
	        cannot be loaded. Failures of the `local_llm` path are logged and
	        do not propagate.
	    """
	    global _llm
	    if _llm is not None:
	        return _llm

	    if gguf_path is None:  # reuse the local distilled agent model if present
	        try:
	            from . import local_llm

	            if local_llm.is_available():
	                g, t = local_llm.ensure_local()  # download the distilled GGUF if needed
	                _llm, _ = local_llm.load(g, t, n_ctx=n_ctx)
	                return _llm
	        except Exception:
	            # Still best effort, but record why the local model was skipped
	            # before falling back to a potentially large download.
	            import logging

	            logging.getLogger(__name__).warning(
	                "Local model unavailable; falling back to %s/%s",
	                repo, filename, exc_info=True,
	            )

	    from llama_cpp import Llama

	    if gguf_path:
	        _llm = Llama(model_path=gguf_path, n_ctx=n_ctx, verbose=False)
	    else:
	        _llm = Llama.from_pretrained(repo_id=repo, filename=filename, n_ctx=n_ctx, verbose=False)
	    return _llm


	def generate_narrative(summary: dict, max_tokens: int = 220, temperature: float = 0.7,
	                       top_p: float = 0.95, **load_kw) -> str:
	    """Ask the model for a risk commentary on `summary`.

	    `summary` must provide every placeholder used in PROMPT_TEMPLATE. The
	    model is obtained from `load_llm(**load_kw)` and the filled prompt is
	    sent as a single user message. The sampling defaults are temperature 0.7
	    and top_p 0.95.

	    Returns:
	        The content of the first returned choice, with surrounding
	        whitespace stripped.
	    """
	    model = load_llm(**load_kw)
	    fields = _coerce(summary)
	    user_message = {"role": "user", "content": PROMPT_TEMPLATE.format(**fields)}
	    response = model.create_chat_completion(
	        messages=[user_message],
	        max_tokens=max_tokens,
	        temperature=temperature,
	        top_p=top_p,
	    )
	    first_choice = response["choices"][0]
	    return first_choice["message"]["content"].strip()