Spaces:

AshwinP
/

compounding-test

Running on Zero

apingali

perf(hf-space): pre-load model at module init (Option 3 refactor)

c673b37 4 days ago

40.3 kB

	"""The Compounding Test — HuggingFace Space.

	A single-shot Gradio app that runs an AI-initiative description through
	the two-axis Berkshire Test for AI and returns a scored writeup.

	Architecture per specs/004-berkshire-test/contracts/hf-space-interface.md:
	- Inputs: a description (200–5000 words) + 3 optional clarifiers.
	- Three backends, selectable by env (`MODEL_PROVIDER`) or auto-detected
	from available credentials and runtime environment:
	* anthropic — Claude Opus / Sonnet via the Anthropic SDK;
	system block is `cache_control:ephemeral` so
	subsequent calls hit the 5-minute prefix cache.
	* huggingface — Open models (Gemma 2 9B by default, swappable to
	Phi-4, Llama-3.3, Qwen 2.5, etc.) via the
	huggingface_hub InferenceClient. Works on HF
	Spaces with the Space's free inference credits;
	locally requires HF_TOKEN.
	* zerogpu — Open model (Phi-4-mini-instruct by default)
	loaded LOCALLY in the Space via transformers,
	decorated with `@spaces.GPU` so a HuggingFace
	Pro plan gets free on-demand A100/H100 GPU
	allocation per request. No per-call credit burn;
	no API round-trip. Requires the Space to have a
	Pro owner; locally falls back to CPU (slow).
	- Output: two Gradio tabs — markdown writeup + raw JSON.

	Engine/Site boundary (Principle VIII): this app lives in gradio-apps/
	only. Never deployed to mile-hi.ai. Reference JSONs are populated by
	hand from the published articles — no runtime fetch from the site.
	"""
	from __future__ import annotations

	import json
	import os
	import re
	import textwrap
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import Optional

	# ---------------------------------------------------------------------------
	# Parser surface (covered by test_diagnose.py — module-level, no side effects)
	# ---------------------------------------------------------------------------


	class MalformedResponseError(Exception):
	    """Signal that a model reply could not be turned into a `Response`.

	    Raised by the parser when the ```json``` block is missing, is not
	    valid JSON, or violates the response schema.
	    """


	# Accepted values for the `quadrant` field of a parsed response.
	VALID_QUADRANTS = {
	    "compounder",
	    "one-shot-win",
	    "wrong-thing",
	    "roman-candle",
	}
	# Accepted values for the `closest_portrait` field.
	VALID_PORTRAITS = {
	    "progressive",
	    "deere",
	    "mastercard",
	    "mayo",
	}
	# Keys that must all be present under `scores`, in the order they are validated.
	REQUIRED_SCORES = (
	    "proprietary_data",
	    "self_labeling",
	    "decreasing_marginal_cost",
	    "defensible_asymmetry",
	)


	@dataclass
	class Score:
	    """One scored condition from the model's JSON block."""

	    # Integer rating; the parser accepts only 0-4.
	    score: int
	    # The model's justification for the rating.
	    rationale: str
	    # Supporting quote; the parser requires a non-empty string of at most 400 chars.
	    quoted_span: str


	@dataclass
	class Warning:
	    """A cautionary note attached to a diagnosis, with its citation.

	    NOTE: the name shadows the builtin `Warning` exception class inside
	    this module; it is kept because it is part of the module's interface.
	    """

	    # The warning message itself.
	    text: str
	    # Human-readable name of the cited source.
	    citation_source: str
	    # Link to the cited source.
	    citation_url: str


	@dataclass
	class Response:
	    """Validated model output: the JSON fields plus the trailing writeup."""

	    constraint: str
	    # One entry per REQUIRED_SCORES key.
	    scores: dict[str, Score]
	    # One of VALID_QUADRANTS.
	    quadrant: str
	    # One of VALID_PORTRAITS.
	    closest_portrait: str
	    closest_portrait_paragraph: str
	    warnings: list[Warning]
	    # Markdown that followed the ```json``` block in the raw reply.
	    writeup: str


	# Match the FIRST ```json ... ``` fenced block in the response. Trailing
	# whitespace after the opening fence is tolerated (`\s*`), and the lazy
	# `(.*?)` group with DOTALL captures a multi-line JSON body up to the
	# first closing fence.
	JSON_BLOCK_RE = re.compile(r"```json\s*\n(.*?)\n\s*```", re.DOTALL)


	def parse_response(raw: str) -> Response:
	    """Parse a raw model reply into a validated `Response`.

	    The first ```json``` fenced block in `raw` is decoded and checked
	    against the schema; the text after that block, stripped, becomes
	    `writeup`.

	    Args:
	        raw: Full assistant text: a ```json``` block followed by markdown.

	    Returns:
	        A `Response` whose `scores` maps each REQUIRED_SCORES key to a
	        `Score` and whose `warnings` is a list of `Warning` objects.

	    Raises:
	        MalformedResponseError: on any schema violation per the contract
	            in specs/004-berkshire-test/contracts/hf-space-interface.md §4.
	            This includes a non-string `raw`, a JSON block that is not an
	            object, and warning entries that are not objects, so callers
	            only have to catch this one exception type.
	    """
	    # A backend that hands back None (or any non-string) has produced no
	    # block at all; report it the same way instead of letting re raise.
	    if not isinstance(raw, str):
	        raise MalformedResponseError("No ```json``` block found in response")

	    match = JSON_BLOCK_RE.search(raw)
	    if not match:
	        raise MalformedResponseError("No ```json``` block found in response")

	    try:
	        data = json.loads(match.group(1))
	    except json.JSONDecodeError as e:
	        raise MalformedResponseError(f"JSON block did not parse: {e}") from e

	    # Valid JSON can still be an array, string or number; every check
	    # below assumes key lookup on an object.
	    if not isinstance(data, dict):
	        raise MalformedResponseError("JSON block must be a JSON object")

	    required = (
	        "constraint",
	        "scores",
	        "quadrant",
	        "closest_portrait",
	        "closest_portrait_paragraph",
	        "warnings",
	    )
	    for field_name in required:
	        if field_name not in data:
	            raise MalformedResponseError(f"Missing required field: {field_name}")

	    # The isinstance guard keeps an unhashable value (list/object) from
	    # raising TypeError inside the set-membership test.
	    quadrant = data["quadrant"]
	    if not isinstance(quadrant, str) or quadrant not in VALID_QUADRANTS:
	        raise MalformedResponseError(
	            f"Invalid quadrant: {quadrant!r}; expected one of {sorted(VALID_QUADRANTS)}"
	        )
	    portrait = data["closest_portrait"]
	    if not isinstance(portrait, str) or portrait not in VALID_PORTRAITS:
	        raise MalformedResponseError(
	            f"Invalid closest_portrait: {portrait!r}; expected one of {sorted(VALID_PORTRAITS)}"
	        )

	    if not isinstance(data["scores"], dict):
	        raise MalformedResponseError("scores must be a JSON object")

	    scores: dict[str, Score] = {}
	    for key in REQUIRED_SCORES:
	        if key not in data["scores"]:
	            raise MalformedResponseError(f"Missing score key: {key}")
	        s = data["scores"][key]
	        if not isinstance(s, dict):
	            raise MalformedResponseError(f"Score {key} must be an object")
	        for sub in ("score", "rationale", "quoted_span"):
	            if sub not in s:
	                raise MalformedResponseError(f"Score {key} missing sub-field: {sub}")
	        # score must be an int 0-4 (bools are excluded; bool is a subclass of int in Python)
	        if isinstance(s["score"], bool) or not isinstance(s["score"], int):
	            raise MalformedResponseError(
	                f"Score {key}.score must be an integer 0-4, got {type(s['score']).__name__}"
	            )
	        if s["score"] < 0 or s["score"] > 4:
	            raise MalformedResponseError(
	                f"Score {key}.score must be in 0-4, got {s['score']}"
	            )
	        if not isinstance(s["quoted_span"], str) or not s["quoted_span"]:
	            raise MalformedResponseError(f"Score {key}.quoted_span must be a non-empty string")
	        if len(s["quoted_span"]) > 400:
	            raise MalformedResponseError(
	                f"Score {key}.quoted_span must be ≤400 chars, got {len(s['quoted_span'])}"
	            )
	        scores[key] = Score(
	            score=s["score"], rationale=s["rationale"], quoted_span=s["quoted_span"]
	        )

	    if not isinstance(data["warnings"], list):
	        raise MalformedResponseError("warnings must be a JSON array")
	    warnings = []
	    for w in data["warnings"]:
	        # `.get` below needs a mapping; a bare string or number entry
	        # would otherwise surface as AttributeError.
	        if not isinstance(w, dict):
	            raise MalformedResponseError("Each warning must be a JSON object")
	        warnings.append(
	            Warning(
	                text=w.get("text", ""),
	                citation_source=w.get("citation_source", ""),
	                citation_url=w.get("citation_url", ""),
	            )
	        )

	    # Everything after the closing fence is the human-readable writeup.
	    writeup = raw[match.end():].strip()

	    return Response(
	        constraint=data["constraint"],
	        scores=scores,
	        quadrant=quadrant,
	        closest_portrait=portrait,
	        closest_portrait_paragraph=data["closest_portrait_paragraph"],
	        warnings=warnings,
	        writeup=writeup,
	    )


	# ---------------------------------------------------------------------------
	# Configuration (env-driven; see .env.example)
	# ---------------------------------------------------------------------------

	# Directory holding this file; prompts/ and reference/ are resolved against it.
	ROOT = Path(__file__).parent

	# Model identifiers, one per backend, each overridable through the environment.
	ANTHROPIC_MODEL_ID = os.getenv("MODEL_ID", "claude-opus-4-7")
	HF_MODEL_ID = os.getenv("HF_MODEL_ID", "google/gemma-2-9b-it")
	ZEROGPU_MODEL_ID = os.getenv("ZEROGPU_MODEL_ID", "microsoft/Phi-4-mini-instruct")

	# Seconds of the Space owner's daily ZeroGPU quota reserved per request.
	# With the pre-load pattern (model on CPU at module init, .to('cuda') +
	# inference inside @spaces.GPU) a call costs only ~10-25s wall-clock, so
	# 45s leaves generous margin while fitting ~2.5x more submissions per
	# quota window than the original 120s. Pro-tier max is 120s; raise via
	# env if you need bigger headroom.
	ZEROGPU_DURATION_SECONDS = int(os.getenv("ZEROGPU_DURATION_SECONDS", "45"))

	# Word-count bounds enforced on the description before any model call.
	MAX_DESCRIPTION_WORDS = int(os.getenv("MAX_DESCRIPTION_WORDS", "5000"))
	MIN_DESCRIPTION_WORDS = 200


	# Probe for the zerogpu backend's dependencies once, at import time.
	# `spaces` is HuggingFace's runtime for on-demand GPU allocation;
	# `transformers` + `torch` are needed to actually load and run the model.
	# The backend is usable only if every one of these imports succeeds.
	try:
	    import spaces as _spaces
	    import torch as _torch
	    from transformers import AutoModelForCausalLM as _AutoModelForCausalLM
	    from transformers import AutoTokenizer as _AutoTokenizer
	except ImportError:
	    _ZEROGPU_DEPS_AVAILABLE = False
	else:
	    _ZEROGPU_DEPS_AVAILABLE = True


	def _zerogpu_available() -> bool:
	    """Report whether the zerogpu backend's dependencies were importable.

	    Kept as a function rather than a bare flag so tests can monkeypatch
	    the answer without touching the real torch/transformers imports.
	    """
	    return _ZEROGPU_DEPS_AVAILABLE


	# ---------------------------------------------------------------------------
	# Provider abstraction (anthropic / huggingface / zerogpu — selectable at runtime)
	# ---------------------------------------------------------------------------


	def _detect_provider(env=None) -> str:
	    """Pick a model provider from the environment.

	    Order of precedence:
	      1. Explicit MODEL_PROVIDER (anthropic | huggingface | zerogpu),
	         matched case-insensitively after stripping whitespace; any
	         other value is ignored and detection continues.
	      2. Running on a HuggingFace Space (SPACE_ID set) AND the zerogpu
	         deps (spaces + transformers + torch) are importable → zerogpu.
	         This is the Pro-plan free-GPU path.
	      3. Presence of ANTHROPIC_API_KEY → anthropic.
	      4. Presence of HF_TOKEN / HUGGING_FACE_HUB_TOKEN, or running on
	         a HuggingFace Space without zerogpu deps → huggingface.
	      5. Fall through to anthropic (call-time error will tell the user
	         which env to set).

	    Args:
	        env: Mapping to read the variables from; `os.environ` when None.

	    Returns:
	        One of "anthropic", "huggingface" or "zerogpu".
	    """
	    env = env if env is not None else os.environ

	    explicit = env.get("MODEL_PROVIDER", "").strip().lower()
	    if explicit in ("anthropic", "huggingface", "zerogpu"):
	        return explicit

	    on_space = bool(env.get("SPACE_ID"))
	    if on_space and _zerogpu_available():
	        return "zerogpu"
	    if env.get("ANTHROPIC_API_KEY"):
	        return "anthropic"
	    if env.get("HF_TOKEN") or env.get("HUGGING_FACE_HUB_TOKEN") or on_space:
	        return "huggingface"
	    return "anthropic"


	def _call_anthropic(system_block: str, user_prompt: str, *, api_key: Optional[str] = None) -> str:
	    """Anthropic backend: send one request and return the raw assistant text.

	    The system block is marked `cache_control: ephemeral`; the user
	    prompt is sent fresh.

	    `api_key` is an optional per-call key. When given, it is handed
	    straight to the SDK constructor and is NEVER written to os.environ.
	    That matters on a multi-tenant public Space: mutating the process env
	    would leak one visitor's key into a concurrent request from another
	    visitor. When `api_key` is None (or empty) the SDK client is built
	    with no arguments and reads ANTHROPIC_API_KEY from env (the
	    Space-owner's key path).
	    """
	    from anthropic import Anthropic

	    if api_key:
	        client = Anthropic(api_key=api_key)
	    else:
	        client = Anthropic()

	    cached_system = {
	        "type": "text",
	        "text": system_block,
	        "cache_control": {"type": "ephemeral"},
	    }
	    user_turn = {"role": "user", "content": user_prompt}

	    reply = client.messages.create(
	        model=ANTHROPIC_MODEL_ID,
	        max_tokens=2500,
	        system=[cached_system],
	        messages=[user_turn],
	    )
	    # Only the first content block is read.
	    return reply.content[0].text


	def _call_huggingface(system_block: str, user_prompt: str) -> str:
	    """HuggingFace backend: run one chat completion and return its text.

	    Uses the unified chat_completion interface, which routes through HF
	    Inference Providers and supports Gemma 2, Phi-4-mini-instruct,
	    Llama-3.3, Qwen 2.5, and many others. Temperature is set to 0.2 to
	    keep JSON output stable — smaller open models can be looser than
	    Claude on schema adherence.

	    Requires an HF token: HF_TOKEN env var, HUGGING_FACE_HUB_TOKEN env
	    var, or a `hf auth login`-stored token (via huggingface_hub.get_token()).
	    HF Spaces do NOT auto-inject a token on public Spaces — the Space
	    owner has to add it as a Space secret.

	    Returns:
	        The assistant message content, or "" when the provider returns a
	        message with no content, so the declared `str` return always holds.

	    Raises:
	        RuntimeError: if no token can be found, or if no enabled provider
	            serves HF_MODEL_ID (re-raised with an actionable message and
	            the original error chained as the cause).
	        Exception: any other client error is propagated unchanged.
	    """
	    from huggingface_hub import InferenceClient, get_token

	    token = (
	        os.environ.get("HF_TOKEN")
	        or os.environ.get("HUGGING_FACE_HUB_TOKEN")
	        or get_token()  # checks ~/.cache/huggingface/token from `hf auth login`
	    )
	    if not token:
	        raise RuntimeError(
	            "No HuggingFace token found. The Space owner needs to add HF_TOKEN "
	            "as a Space secret (Settings → Repository secrets → New secret → "
	            "name: HF_TOKEN, value: a User Access Token from "
	            "https://huggingface.co/settings/tokens). Then restart the Space. "
	            "Until then, pick a different model from the dropdown."
	        )
	    # `provider="auto"` opts into the modern HF Inference Providers
	    # routing layer (introduced 2024-Q4), which picks the right partner
	    # (featherless-ai / together-ai / hf-inference / etc.) for the model
	    # automatically. Without this flag, InferenceClient falls back to
	    # the legacy hf-inference-only path, which doesn't serve most newer
	    # models and returns a misleading "model not supported" error even
	    # when the user has all providers enabled and access to the model.
	    client = InferenceClient(
	        model=HF_MODEL_ID,
	        token=token,
	        provider="auto",
	        timeout=120,
	    )
	    try:
	        resp = client.chat_completion(
	            messages=[
	                {"role": "system", "content": system_block},
	                {"role": "user", "content": user_prompt},
	            ],
	            max_tokens=2500,
	            temperature=0.2,
	        )
	    except Exception as e:
	        msg = str(e)
	        # HF Inference Providers routes each model through a partner
	        # (featherless-ai, together-ai, hf-inference, etc.). If none of
	        # the enabled providers serves the requested model, the API
	        # returns a BadRequestError with code=model_not_supported. The
	        # raw error is opaque to users, so re-raise with the actual fix
	        # instead of the unhelpful default message.
	        if "model_not_supported" in msg or "not supported by any provider" in msg:
	            raise RuntimeError(
	                f"The model '{HF_MODEL_ID}' isn't available through any of "
	                "the HuggingFace Inference Providers enabled on your account. "
	                "Two fixes: (a) enable a provider that supports this model at "
	                "https://huggingface.co/settings/inference-providers, OR "
	                "(b) set HF_MODEL_ID as a Space variable to a model on your "
	                "enabled providers — microsoft/Phi-4-mini-instruct works "
	                "broadly via featherless-ai."
	            ) from e
	        raise
	    # Normalise a missing message content to "" so callers that parse the
	    # result always receive a string.
	    return resp.choices[0].message.content or ""


	# ZeroGPU backend — pre-load pattern.
	#
	# Model is loaded onto CPU at Space startup (module init), NOT inside
	# `@spaces.GPU`. This is the documented HuggingFace ZeroGPU pattern:
	# - Module init runs once at Space startup, on CPU, with no GPU
	# quota consumed. The expensive part — downloading ~7.6GB of
	# safetensors and deserializing into PyTorch state — happens here.
	# - Inside `@spaces.GPU`, all we do is `.to('cuda')` + tokenize +
	# generate + decode. Wall-clock drops to ~10-15s warm, ~20-25s
	# after Space restart (the .to('cuda') for 7.6GB takes a few
	# seconds over PCIe).
	#
	# Why deliberately NOT `trust_remote_code=True`. Phi-4-mini-instruct's
	# architecture is `phi3`, which transformers 4.46+ supports natively
	# via `Phi3ForCausalLM` — no custom code download required. The custom
	# modeling code that ships with the model on HF Hub (`modeling_phi3.py`)
	# imports `LossKwargs` from `transformers.utils`, which was removed in
	# transformers 4.57+ — loading WITH `trust_remote_code=True` fails
	# with `ImportError: cannot import name 'LossKwargs' from
	# 'transformers.utils'` and bricks the `@spaces.GPU` worker. The
	# native path avoids the upstream pin-mismatch entirely.
	#
	# Tradeoff: ~30-60s slower Space cold-start (the one-time CPU load).
	# Acceptable because Spaces only restart on deploy or after a long
	# idle period. Worth it for the 2.5x quota efficiency.

	# Placeholders for environments without the zerogpu deps; replaced just
	# below when the deps imported successfully.
	_zerogpu_tokenizer = None
	_zerogpu_model = None

	if _ZEROGPU_DEPS_AVAILABLE:
	    _zerogpu_tokenizer = _AutoTokenizer.from_pretrained(ZEROGPU_MODEL_ID)
	    # NO device_map — load to CPU; we move to GPU per-call inside
	    # @spaces.GPU. ZeroGPU has no GPU available at module load.
	    _zerogpu_model = _AutoModelForCausalLM.from_pretrained(
	        ZEROGPU_MODEL_ID,
	        torch_dtype=_torch.bfloat16,
	    )


	def _zerogpu_invoke(system_block: str, user_prompt: str) -> str:
	    """Run one chat-template generation on the pre-loaded ZeroGPU model.

	    The module-level `_zerogpu_model` (loaded on CPU) is moved to "cuda",
	    the two-message chat is tokenized with the generation prompt appended,
	    and only the newly generated tokens are decoded and returned.

	    Reads the module-level globals `_zerogpu_tokenizer` and
	    `_zerogpu_model`, which tests monkeypatch to fake the transformers
	    types. Kept separate from the `@spaces.GPU`-decorated wrapper so it
	    can be unit-tested without actually allocating a GPU.
	    """
	    # Move the pre-loaded weights onto the GPU that @spaces.GPU just
	    # allocated: a memory transfer only, no download or deserialize.
	    _zerogpu_model.to("cuda")

	    chat = [
	        {"role": "system", "content": system_block},
	        {"role": "user", "content": user_prompt},
	    ]
	    prompt_ids = _zerogpu_tokenizer.apply_chat_template(
	        chat,
	        return_tensors="pt",
	        add_generation_prompt=True,
	    ).to("cuda")

	    generated = _zerogpu_model.generate(
	        prompt_ids,
	        max_new_tokens=2500,
	        temperature=0.2,
	        do_sample=True,
	        pad_token_id=_zerogpu_tokenizer.eos_token_id,
	    )

	    # The output sequence starts with the prompt; slice it off so only the
	    # completion is decoded.
	    completion_ids = generated[0][prompt_ids.shape[1]:]
	    return _zerogpu_tokenizer.decode(completion_ids, skip_special_tokens=True)


	if not _ZEROGPU_DEPS_AVAILABLE:

	    def _call_zerogpu(system_block: str, user_prompt: str) -> str:
	        """Stand-in used when the zerogpu deps are missing; always raises
	        RuntimeError explaining how to enable the backend."""
	        raise RuntimeError(
	            "ZeroGPU backend requires `spaces`, `transformers`, and `torch` "
	            "to be importable AND should be run on a HuggingFace Pro Space "
	            "for free on-demand GPU. Install the full requirements.txt and "
	            "deploy to a Space, or pick anthropic / huggingface from the "
	            "provider dropdown."
	        )

	else:

	    @_spaces.GPU(duration=ZEROGPU_DURATION_SECONDS)
	    def _call_zerogpu(system_block: str, user_prompt: str) -> str:
	        """ZeroGPU backend entry point.

	        Runs chat-template inference with the model named by
	        ZEROGPU_MODEL_ID (Phi-4-mini-instruct by default) inside a
	        `@spaces.GPU` allocation of ZEROGPU_DURATION_SECONDS. Thin wrapper
	        around the testable `_zerogpu_invoke` so the decorator is applied
	        at module load time.
	        """
	        return _zerogpu_invoke(system_block, user_prompt)


	# Registry mapping each provider name to the callable that serves it.
	PROVIDERS = dict(
	    anthropic=_call_anthropic,
	    huggingface=_call_huggingface,
	    zerogpu=_call_zerogpu,
	)


	def _call_model(system_block: str, user_prompt: str, provider: str) -> str:
	    """Route the prompt pair to the backend registered under `provider`.

	    Returns whatever that backend returns. Raises ValueError when
	    `provider` is not a key of PROVIDERS; callers are expected to
	    validate before calling.
	    """
	    if provider in PROVIDERS:
	        backend = PROVIDERS[provider]
	        return backend(system_block, user_prompt)
	    raise ValueError(
	        f"Unknown provider: {provider!r}; expected one of {sorted(PROVIDERS)}"
	    )


	# Auto-detected once at module import from the process environment, so
	# later changes to os.environ do not affect it. Used by diagnose() when
	# no provider is passed; the Gradio UI exposes a runtime override via the
	# Provider dropdown.
	DEFAULT_PROVIDER = _detect_provider()

	# Choices offered by the three optional clarifier dropdowns.
	INDUSTRIES = [
	    "insurance",
	    "banking",
	    "healthcare",
	    "retail",
	    "manufacturing",
	    "logistics",
	    "agriculture",
	    "energy",
	    "telecom",
	    "media",
	    "professional services",
	    "real estate",
	    "other",
	]
	SCALES = [
	    "pilot",
	    "department",
	    "business unit",
	    "enterprise",
	]
	BUDGETS = [
	    "<$100K",
	    "$100K–$1M",
	    "$1M–$10M",
	    ">$10M",
	]


	# ---------------------------------------------------------------------------
	# Sample initiatives (gr.Examples) — one per verdict quadrant
	# ---------------------------------------------------------------------------
	# Realistic ~250–400-word AI-initiative descriptions that should land in a
	# specific quadrant of the 2×2 verdict matrix. Used to seed user testing
	# and give first-time visitors something concrete to click.

	# Sample description intended to land in the "compounder" quadrant;
	# offered as a clickable example in the UI.
	_SAMPLE_COMPOUNDER = (
	    "We're a regional commercial insurance carrier specializing in restaurant "
	    "general liability. We write about 8,000 policies a year across the "
	    "Midwest, with average annual premium around $4,500. Underwriting is "
	    "the bottleneck of our business — independent agents wait 36 to 48 "
	    "hours for a quote because our underwriters manually pull industry "
	    "codes, loss runs, and prior-carrier history from three different "
	    "systems and then decide whether to bind, decline, or refer. Roughly "
	    "30% of submissions get declined and another 15% are referred to "
	    "senior underwriters, which adds another day. We're deploying an "
	    "LLM-powered underwriting assistant that pulls the data automatically, "
	    "flags risk factors based on patterns in our 12-year claims database, "
	    "and proposes a base rate with an explanation. The underwriter "
	    "reviews, adjusts, and approves. Every policy we write generates new "
	    "claim outcomes — fires, slip-and-falls, liquor-liability claims, "
	    "food-poisoning suits — and those outcomes feed back into the next "
	    "quarter's model retraining. Our competitors mostly use Verisk's "
	    "industry-standard rating models, which we don't share data with, so "
	    "our model gets better on our specific book of business while theirs "
	    "reflects the industry average. Internal goal: cut time-to-quote from "
	    "36 hours to 4 hours, increase the win rate on profitable risks by "
	    "15%, and progressively shift the loss ratio by 1–2 points per year "
	    "as the model learns from each renewal cycle. Independent agents have "
	    "already started favoring carriers with faster quote turnaround."
	)

	# Sample description intended to land in the "one-shot win" quadrant;
	# offered as a clickable example in the UI.
	_SAMPLE_ONE_SHOT_WIN = (
	    "We're a community bank with $4B in assets and 38 branches across two "
	    "states. Loan officers spend about 6 hours per commercial loan "
	    "reviewing financial statements, tax returns, and corporate documents "
	    "before they can write the credit memo. We're deploying GPT-4 to "
	    "extract key fields — revenue, EBITDA, debt service coverage ratio, "
	    "ownership structure, related-party transactions, collateral "
	    "descriptions — from these documents into a structured form. The loan "
	    "officer reviews the extraction and writes the credit memo by hand. "
	    "We expect to cut document review time from 6 hours to about 90 "
	    "minutes per loan, processing roughly 2,400 commercial loans a year. "
	    "The vendor provides the model, the document templates, and the "
	    "extraction prompts, and is selling the same system to four of our "
	    "peer community banks in the region under identical contracts. The AI "
	    "doesn't learn from the outcome of the loan: defaults, prepayments, "
	    "modifications, restructurings all go into our separate loan "
	    "servicing system, which has never connected back to the extraction "
	    "model. The vendor's three-year roadmap doesn't include any feedback "
	    "loop between loan performance and the model — they treat extraction "
	    "as a deterministic task. We're funding the project from the "
	    "operations budget; the credit team is excited about the time savings "
	    "but the chief credit officer has flagged that the productivity gain "
	    "will be one-time and won't show up in the loss-given-default rate "
	    "over time."
	)

	# Sample description intended to land in the "compounding the wrong
	# thing" quadrant; offered as a clickable example in the UI.
	_SAMPLE_WRONG_THING = (
	    "We're a third-party logistics provider with 8 warehouses on the East "
	    "Coast handling about 20,000 orders a day across the network. We're "
	    "investing in computer vision software to optimize order picking "
	    "routes — the AI looks at the warehouse layout, current orders, and "
	    "worker positions and suggests optimized pick paths in real time. "
	    "Pilot results show a 12% reduction in steps per order on the test "
	    "floor. Our operations team has been excited about this for 18 months "
	    "and we just signed a multi-year contract with the vendor. Some "
	    "context on the operation: our warehouses run 2 shifts. Order volume "
	    "in shift 1 is around 14,000 orders per day; shift 2 is around 6,000. "
	    "The pick wave finishes by 2pm on shift 1, then workers wait 4 to 5 "
	    "hours for shift 2 trucks to arrive at the loading docks. The trucks "
	    "are scheduled by the customer (a major national retailer) and arrive "
	    "in unpredictable windows between 6pm and 10pm. We don't control the "
	    "truck schedule and the customer won't share their advance schedule "
	    "with us. The CFO has been asking us why total throughput per "
	    "warehouse hasn't moved much in three years; our answer has been that "
	    "the legacy warehouse management system is the constraint, which is "
	    "why we're investing in better picking AI. Same-store labor cost is "
	    "up 8% year-over-year because workers are paid through the idle hours."
	)

	# Sample description intended to land in the "Roman Candle" quadrant;
	# offered as a clickable example in the UI.
	_SAMPLE_ROMAN_CANDLE = (
	    "We run a chain of 220 quick-service restaurants across the Southeast "
	    "doing about $480M in annual revenue. Our gross margin has been under "
	    "pressure from rising ingredient costs and we're rolling out an "
	    "AI-powered personalized marketing platform that sends customized "
	    "email and SMS offers based on customer purchase history, location, "
	    "and local weather. The platform is from a major QSR-tech vendor used "
	    "by several of our direct competitors in the same markets we operate "
	    "in. Our customer data — names, emails, phone numbers, purchase "
	    "frequency, average ticket size — lives in our point-of-sale "
	    "provider's cloud, which the marketing platform pulls from via the "
	    "POS provider's standard integration. Both the purchase data feed and "
	    "the modeling are the vendor's stack; we don't see the underlying "
	    "model and our data is commingled with other QSR brands the vendor "
	    "serves on a shared inference fleet. We expect to lift email "
	    "click-through by 8–12% based on the vendor's benchmark studies of "
	    "similar brands. The marketing team is running the rollout; finance "
	    "signed off on the multi-year subscription. We have not measured what "
	    "is actually constraining same-store sales growth — drive-thru wait "
	    "times, menu pricing relative to local competitors, or breakfast "
	    "daypart penetration — we just know revenue has been flat for two "
	    "years and the board wants visible action by Q4."
	)


	def _load_reference():
	    """Read the prompt template + reference JSONs from disk at app start.

	    Files are read as UTF-8 explicitly so the result does not depend on
	    the platform's default encoding.

	    Returns:
	        A `(prompt_template, system_block)` tuple: the raw text of
	        prompts/diagnose.txt, and that same text with the
	        `{{portraits_block}}` and `{{failure_modes_block}}` placeholders
	        filled from reference/portraits.json and
	        reference/failure-modes.json.

	    Raises:
	        OSError: if any of the three files cannot be read.
	        json.JSONDecodeError: if a reference file is not valid JSON.
	        KeyError: if a reference entry lacks one of the rendered keys.
	    """
	    prompt_template = (ROOT / "prompts" / "diagnose.txt").read_text(encoding="utf-8")
	    portraits = json.loads(
	        (ROOT / "reference" / "portraits.json").read_text(encoding="utf-8")
	    )
	    failure_modes = json.loads(
	        (ROOT / "reference" / "failure-modes.json").read_text(encoding="utf-8")
	    )

	    # Each entry is rendered line by line rather than by dedenting an
	    # interpolated template, so a value that itself contains newlines
	    # cannot change how the block is laid out.
	    # NOTE(review): the keys after `- id:` are emitted flush-left, which is
	    # what the dedented template produced here; confirm the prompt does
	    # not rely on them being nested under the list marker.
	    portraits_block = "\n".join(
	        "\n".join(
	            (
	                f"- id: {p['id']}",
	                f"label: {p['label']}",
	                f"bottleneck: {p['bottleneck']}",
	                f"summary: {p['summary']}",
	                f"compounding_summary: {p['compounding_summary']}",
	                f"article_url: {p['article_url']}",
	            )
	        ).rstrip()
	        for p in portraits
	    )

	    failure_modes_block = "\n".join(
	        "\n".join(
	            (
	                f"- id: {fm['id']}",
	                f"label: {fm['label']}",
	                f"applies_to_quadrants: {', '.join(fm['applies_to_quadrants'])}",
	                f"summary: {fm['summary']}",
	                f"url: {fm['url']}",
	            )
	        ).rstrip()
	        for fm in failure_modes
	    )

	    system_block = (
	        prompt_template
	        .replace("{{portraits_block}}", portraits_block)
	        .replace("{{failure_modes_block}}", failure_modes_block)
	    )

	    return prompt_template, system_block


	# Loaded once at module import; cached in memory for the life of the process.
	# PROMPT_TEMPLATE is the raw template text; SYSTEM_BLOCK is the same text
	# with the portraits / failure-modes placeholders filled in. Importing this
	# module therefore reads the prompt and reference files from disk.
	PROMPT_TEMPLATE, SYSTEM_BLOCK = _load_reference()


	# ---------------------------------------------------------------------------
	# Diagnose entrypoint (called by the Gradio Submit handler)
	# ---------------------------------------------------------------------------


	def diagnose(
	    description: str,
	    industry: Optional[str],
	    scale: Optional[str],
	    budget: Optional[str],
	    provider: Optional[str] = None,
	    anthropic_api_key: Optional[str] = None,
	) -> tuple[str, str]:
	    """Validate input, call the selected model with the cached system
	    block, parse the response, and return (markdown_writeup,
	    raw_json_string) for the two Gradio tabs.

	    Args:
	        description: The initiative description; must contain between
	            MIN_DESCRIPTION_WORDS and MAX_DESCRIPTION_WORDS words.
	        industry, scale, budget: Optional clarifiers; a missing value is
	            sent to the model as "(not specified)".
	        provider: anthropic | huggingface | zerogpu. Defaults to
	            DEFAULT_PROVIDER when not supplied — the Gradio dropdown
	            always supplies it on a real submission.
	        anthropic_api_key: Per-call user-supplied key. When provider is
	            "anthropic" and the key is provided, it overrides any
	            ANTHROPIC_API_KEY env var for this single request. The key is
	            passed straight to the SDK client and never written to
	            os.environ.

	    Returns:
	        `(markdown, json_text)`. On success `markdown` is the model's
	        writeup and `json_text` the validated payload, indented by 2. On
	        any failure `markdown` is a "⚠ ..." message and `json_text` is "".

	    Per F14 + contract §2, all error paths surface a user-friendly message
	    in the markdown tab and an empty JSON tab; nothing leaks a stack trace.
	    """
	    description = (description or "").strip()
	    if not description:
	        return "⚠ Please describe your AI initiative.", ""

	    words = len(description.split())
	    if words < MIN_DESCRIPTION_WORDS:
	        return (
	            f"⚠ Please describe your initiative in at least {MIN_DESCRIPTION_WORDS} words "
	            f"(you wrote {words}). The diagnostic needs enough context to score the four "
	            f"compounding conditions with rationale quoting your description.",
	            "",
	        )
	    if words > MAX_DESCRIPTION_WORDS:
	        return (
	            f"⚠ Please keep your description under {MAX_DESCRIPTION_WORDS} words "
	            f"(you wrote {words}). Shorten the description and try again.",
	            "",
	        )

	    provider = provider or DEFAULT_PROVIDER
	    if provider not in PROVIDERS:
	        return (
	            f"⚠ Unknown model provider {provider!r}. Pick one of "
	            f"{sorted(PROVIDERS)} from the dropdown.",
	            "",
	        )

	    # If Premium (Anthropic) is selected, the user must supply a key —
	    # either via the page's API-key field (per-call) or via an
	    # ANTHROPIC_API_KEY env var on the Space. Without either, fail fast
	    # with a friendly explanation before we hit the SDK.
	    user_key_for_anthropic: Optional[str] = None
	    if provider == "anthropic":
	        env_key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
	        user_key = (anthropic_api_key or "").strip()
	        if not env_key and not user_key:
	            return (
	                "⚠ Premium (Claude Opus) needs an Anthropic API key. Either "
	                "paste your key in the field above, or pick one of the free "
	                "options from the model dropdown.",
	                "",
	            )
	        # IMPORTANT: do NOT write the user-supplied key to os.environ.
	        # That would leak the key into concurrent requests from other
	        # visitors on this Space (the process env is shared across all
	        # in-flight requests in the Python worker). Instead it is passed
	        # directly to _call_anthropic below, which scopes it to a single
	        # SDK client instance.
	        user_key_for_anthropic = user_key or None

	    # NOTE(review): the user prompt is built from the same template as
	    # SYSTEM_BLOCK, and only the four placeholders below are substituted
	    # here; confirm the template is meant to be sent in both roles.
	    user_prompt = (
	        PROMPT_TEMPLATE
	        .replace("{{user_input}}", description)
	        .replace("{{industry}}", industry or "(not specified)")
	        .replace("{{scale}}", scale or "(not specified)")
	        .replace("{{budget}}", budget or "(not specified)")
	    )

	    try:
	        # When the visitor supplied their own Anthropic key, bypass the
	        # generic dispatcher so we can pass the key directly via kwarg
	        # without ever touching os.environ. All other paths go through
	        # the dispatcher and read credentials from env as usual.
	        if provider == "anthropic" and user_key_for_anthropic:
	            raw = _call_anthropic(
	                SYSTEM_BLOCK, user_prompt, api_key=user_key_for_anthropic,
	            )
	        else:
	            raw = _call_model(SYSTEM_BLOCK, user_prompt, provider)
	    except Exception as e:
	        # API timeout / rate limit / auth / server / network failure
	        # (Anthropic SDK, huggingface_hub InferenceClient, or
	        # transformers/torch on the zerogpu path). Include both the
	        # exception class AND its string form so unexpected failures
	        # are diagnosable from the UI without server log access.
	        model_label = {
	            "anthropic": ANTHROPIC_MODEL_ID,
	            "huggingface": HF_MODEL_ID,
	            "zerogpu": ZEROGPU_MODEL_ID,
	        }.get(provider, provider)
	        detail = str(e).strip() or "(no message)"
	        # Defense-in-depth: if the user-supplied Anthropic key appears in
	        # the exception message, redact it before surfacing the writeup.
	        # This MUST happen before truncation: cutting first could leave a
	        # partial key at the 400-char boundary that no longer matches the
	        # replace() and would be shown to the user.
	        if user_key_for_anthropic and len(user_key_for_anthropic) >= 8:
	            detail = detail.replace(user_key_for_anthropic, "[redacted]")
	        # Cap the detail so we don't spill multi-paragraph tracebacks
	        # into the UI. 400 chars is enough for a stack-trace summary
	        # without flooding the markdown tab.
	        if len(detail) > 400:
	            detail = detail[:400] + "…"
	        return (
	            f"⚠ The diagnostic call to {provider} ({model_label}) failed.\n\n"
	            f"{type(e).__name__}: {detail}\n\n"
	            f"Try again in a moment, switch providers in the dropdown, "
	            f"or shorten your description.",
	            "",
	        )

	    # A backend could hand back None or another non-string despite its
	    # `-> str` annotation; treat that as an empty reply so it is reported
	    # as malformed output rather than raising inside the regex search.
	    if not isinstance(raw, str):
	        raw = ""

	    try:
	        parsed = parse_response(raw)
	    except (MalformedResponseError, AttributeError, TypeError) as e:
	        # AttributeError / TypeError cover schema-violating JSON shapes the
	        # parser touches before validating them (e.g. a warning entry that
	        # is not an object), keeping the no-stack-trace contract.
	        return (
	            f"⚠ The model returned malformed output. Try again with a different description "
	            f"or shorten the existing one.\n\nDetail: {e}",
	            "",
	        )

	    payload = {
	        "constraint": parsed.constraint,
	        "quadrant": parsed.quadrant,
	        "closest_portrait": parsed.closest_portrait,
	        "closest_portrait_paragraph": parsed.closest_portrait_paragraph,
	        "scores": {
	            k: {"score": v.score, "rationale": v.rationale, "quoted_span": v.quoted_span}
	            for k, v in parsed.scores.items()
	        },
	        "warnings": [
	            {"text": w.text, "citation_source": w.citation_source, "citation_url": w.citation_url}
	            for w in parsed.warnings
	        ],
	    }
	    return parsed.writeup, json.dumps(payload, indent=2)


	# ---------------------------------------------------------------------------
	# Gradio UI (built lazily so `import app` from tests does not require gradio)
	# ---------------------------------------------------------------------------


	def build_demo():
	    """Build and return the Gradio Blocks UI. Called only by __main__.

	    The layout is: intro markdown, description textbox, three optional
	    clarifier dropdowns, clickable sample initiatives, a model dropdown
	    with an Anthropic API-key field, a Diagnose button, and two output
	    tabs (rendered writeup + raw JSON). The button is wired to `diagnose`.

	    Returns:
	        The constructed `gr.Blocks` instance (not yet launched).
	    """
	    import gradio as gr

	    # Free option first, premium second. Plain-English labels with no
	    # ANTHROPIC_API_KEY / SPACE_ID / ZeroGPU jargon — the casual user
	    # shouldn't have to know what any of those mean.
	    #
	    # The HuggingFace Inference Providers backend (provider="huggingface")
	    # is intentionally NOT in this dropdown: it requires the Space owner
	    # to have HF billing set up (credit card on file OR custom provider
	    # API keys), which most Pro users don't have by default. The backend
	    # code remains in PROVIDERS so it's reachable via MODEL_PROVIDER env
	    # override for users who do set up billing — see README.md.
	    provider_choices = []
	    if _zerogpu_available():
	        provider_choices.append((
	            "Free · Phi-4-mini-instruct (Microsoft) — runs on GPU",
	            "zerogpu",
	        ))
	    provider_choices.append((
	        "Premium · Claude Opus 4.7 (Anthropic) — paste your API key below",
	        "anthropic",
	    ))
	    # Default to the first choice: the free option whenever the zerogpu
	    # deps are available. Without them Premium is the only entry, so it
	    # becomes the default and the key field must start out visible.
	    default_choice = provider_choices[0][1]

	    with gr.Blocks(title="The Compounding Test") as demo:
	        gr.Markdown(
	            "# The Compounding Test\n\n"
	            "A diagnostic for AI investments at non-technology companies. "
	            "Describe your AI initiative — get a scored writeup in one of "
	            "four outcomes: **compounder**, **one-shot win**, **compounding "
	            "the wrong thing**, or **Roman Candle**.\n\n"
	            "The default is free — runs an open model (Phi-4-mini) "
	            "on this Space's GPU. Pick Premium · Claude Opus from "
	            "the dropdown if you have an Anthropic API key and want the "
	            "highest-quality writeup. Read the full framework at "
	            "[mile-hi.ai/journal/the-berkshire-test]("
	            "https://www.mile-hi.ai/journal/the-berkshire-test)."
	        )
	        with gr.Row():
	            description = gr.Textbox(
	                label=f"Describe your AI initiative ({MIN_DESCRIPTION_WORDS}–{MAX_DESCRIPTION_WORDS} words)",
	                placeholder=(
	                    "Describe the bottleneck of your operation, the AI "
	                    "investment, what data feeds it, where the labels come "
	                    "from, and how you expect competitors to respond. Be "
	                    "specific about the workflow.\n\n"
	                    "Or pick a sample below to see how it works."
	                ),
	                lines=12,
	            )

	        with gr.Row():
	            industry = gr.Dropdown(INDUSTRIES, label="Industry (optional)", value=None)
	            scale = gr.Dropdown(SCALES, label="Scale (optional)", value=None)
	            budget = gr.Dropdown(BUDGETS, label="Budget tier (optional)", value=None)

	        gr.Examples(
	            examples=[
	                [_SAMPLE_COMPOUNDER, "insurance", "business unit", "$1M–$10M"],
	                [_SAMPLE_ONE_SHOT_WIN, "banking", "business unit", "$100K–$1M"],
	                [_SAMPLE_WRONG_THING, "logistics", "enterprise", "$1M–$10M"],
	                [_SAMPLE_ROMAN_CANDLE, "retail", "enterprise", "$100K–$1M"],
	            ],
	            inputs=[description, industry, scale, budget],
	            label="Sample initiatives — click one to load it (then click Diagnose)",
	            examples_per_page=4,
	        )

	        with gr.Row():
	            provider = gr.Dropdown(
	                choices=provider_choices,
	                value=default_choice,
	                label="Choose a model",
	            )
	            # The API-key field is shown only while Premium is selected.
	            # Its initial visibility follows the default choice, because
	            # the change event below never fires for the initial value.
	            # The key is used per-request and never stored.
	            api_key = gr.Textbox(
	                label="Anthropic API key",
	                placeholder="sk-ant-...",
	                type="password",
	                info=(
	                    "Used only for this request and never stored. "
	                    "Get a key at console.anthropic.com."
	                ),
	                visible=(default_choice == "anthropic"),
	            )

	        def _toggle_api_key(p):
	            # Show the key field for Premium, hide it for everything else.
	            return gr.update(visible=(p == "anthropic"))

	        provider.change(_toggle_api_key, inputs=[provider], outputs=[api_key])

	        submit = gr.Button("Diagnose", variant="primary")
	        with gr.Tabs():
	            with gr.Tab("Diagnosis"):
	                writeup_out = gr.Markdown()
	            with gr.Tab("Raw JSON"):
	                json_out = gr.Code(language="json")
	        submit.click(
	            diagnose,
	            inputs=[description, industry, scale, budget, provider, api_key],
	            outputs=[writeup_out, json_out],
	        )

	    return demo


	if __name__ == "__main__":
	    # Local dev: relies on .env (loaded by python-dotenv) for ANTHROPIC_API_KEY.
	    # HF Spaces: relies on Space secrets.
	    #
	    # NOTE: this runs after the module-level configuration above has
	    # already been evaluated, so values that are read from the environment
	    # at import time (the model IDs, the word/duration limits and
	    # DEFAULT_PROVIDER) do not pick up entries from .env; only variables
	    # read at call time, such as the API keys, do.
	    try:
	        from dotenv import load_dotenv

	        load_dotenv()
	    except ImportError:
	        pass  # dotenv is optional; HF Spaces uses Space secrets.

	    build_demo().launch()