apingali
perf(hf-space): pre-load model at module init (Option 3 refactor)
c673b37
"""The Compounding Test — HuggingFace Space.

A single-shot Gradio app that runs an AI-initiative description through
the two-axis Berkshire Test for AI and returns a scored writeup.

Architecture per specs/004-berkshire-test/contracts/hf-space-interface.md:

- Inputs: a description (200–5000 words) + 3 optional clarifiers.
- Three backends, selectable by env (`MODEL_PROVIDER`) or auto-detected
  from available credentials and runtime environment:
  * anthropic   — Claude Opus / Sonnet via the Anthropic SDK;
                  system block is `cache_control:ephemeral` so
                  subsequent calls hit the 5-minute prefix cache.
  * huggingface — Open models (Gemma 2 9B by default, swappable to
                  Phi-4, Llama-3.3, Qwen 2.5, etc.) via the
                  huggingface_hub InferenceClient. Works on HF
                  Spaces with the Space's free inference credits;
                  locally requires HF_TOKEN.
  * zerogpu     — Open model (Phi-4-mini-instruct by default)
                  loaded LOCALLY in the Space via transformers,
                  decorated with `@spaces.GPU` so a HuggingFace
                  Pro plan gets free on-demand A100/H100 GPU
                  allocation per request. No per-call credit burn;
                  no API round-trip. Requires the Space to have a
                  Pro owner; locally falls back to CPU (slow).
- Output: two Gradio tabs — markdown writeup + raw JSON.

Engine/Site boundary (Principle VIII): this app lives in gradio-apps/
only. Never deployed to mile-hi.ai. Reference JSONs are populated by
hand from the published articles — no runtime fetch from the site.
"""
from __future__ import annotations
import json
import os
import re
import textwrap
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
# ---------------------------------------------------------------------------
# Parser surface (covered by test_diagnose.py — module-level, no side effects)
# ---------------------------------------------------------------------------
class MalformedResponseError(Exception):
    """Raised when the model's response cannot be parsed into a Response.

    `parse_response` raises this for a missing or undecodable ```json```
    block and for schema violations in the decoded object.
    """
# Closed vocabularies for the `quadrant` and `closest_portrait` fields of a
# model response; `parse_response` rejects any other value.
VALID_QUADRANTS = {
    "compounder",
    "one-shot-win",
    "wrong-thing",
    "roman-candle",
}
VALID_PORTRAITS = {
    "progressive",
    "deere",
    "mastercard",
    "mayo",
}

# Keys that must each appear under `scores` in a model response.
REQUIRED_SCORES = (
    "proprietary_data",
    "self_labeling",
    "decreasing_marginal_cost",
    "defensible_asymmetry",
)
@dataclass
class Score:
    """One entry of a response's `scores` object, as built by `parse_response`."""

    # Integer in the range 0-4 (bools are rejected by parse_response).
    score: int
    # Copied from the response as-is; parse_response does not type-check it.
    rationale: str
    # Non-empty string of at most 400 characters (enforced by parse_response).
    quoted_span: str
@dataclass
class Warning:
    """One entry of a response's `warnings` array.

    NOTE(review): this class shadows the builtin `Warning` exception class
    for the rest of this module.
    """

    # parse_response fills each field with "" when the key is absent.
    text: str
    citation_source: str
    citation_url: str
@dataclass
class Response:
    """Validated result of `parse_response`: the decoded JSON fields plus
    the markdown that followed the JSON block."""

    constraint: str
    scores: dict[str, Score]  # one entry per REQUIRED_SCORES key
    quadrant: str  # one of VALID_QUADRANTS
    closest_portrait: str  # one of VALID_PORTRAITS
    closest_portrait_paragraph: str
    warnings: list[Warning]
    writeup: str  # text after the first ```json``` block, stripped
# Match the FIRST ```json ... ``` fenced block in the response.
# The body is captured non-greedily in group 1; DOTALL lets it span lines.
# The opening fence must be followed by a newline and the closing fence
# must start on its own line (optionally indented).
JSON_BLOCK_RE = re.compile(r"```json\s*\n(.*?)\n\s*```", re.DOTALL)
def parse_response(raw: str) -> Response:
    """Parse a raw model reply into a validated `Response`.

    The first ```json``` fenced block in `raw` is decoded and checked
    against the schema; the text after that block, stripped, becomes
    `writeup`.

    Args:
        raw: The complete assistant text returned by a backend.

    Returns:
        A populated `Response`.

    Raises:
        MalformedResponseError: On any schema violation per the contract
            in specs/004-berkshire-test/contracts/hf-space-interface.md §4.
            This includes a non-string `raw`, a JSON block that is not an
            object, and `warnings` entries that are not objects, so callers
            only ever need to catch this one exception type.
    """
    # A backend can hand back None (e.g. an empty chat completion); treat
    # that as malformed output instead of letting re raise TypeError.
    if not isinstance(raw, str):
        raise MalformedResponseError(
            f"Response must be text, got {type(raw).__name__}"
        )

    match = JSON_BLOCK_RE.search(raw)
    if not match:
        raise MalformedResponseError("No ```json``` block found in response")

    try:
        data = json.loads(match.group(1))
    except json.JSONDecodeError as e:
        raise MalformedResponseError(f"JSON block did not parse: {e}") from e

    # Valid JSON is not necessarily an object; a list/number/string here
    # would otherwise fail below with TypeError.
    if not isinstance(data, dict):
        raise MalformedResponseError("JSON block must be a JSON object")

    required = (
        "constraint",
        "scores",
        "quadrant",
        "closest_portrait",
        "closest_portrait_paragraph",
        "warnings",
    )
    for field_name in required:
        if field_name not in data:
            raise MalformedResponseError(f"Missing required field: {field_name}")

    # The isinstance guards keep unhashable values (lists/objects) from
    # raising TypeError in the set-membership tests.
    quadrant = data["quadrant"]
    if not isinstance(quadrant, str) or quadrant not in VALID_QUADRANTS:
        raise MalformedResponseError(
            f"Invalid quadrant: {quadrant!r}; expected one of {sorted(VALID_QUADRANTS)}"
        )
    portrait = data["closest_portrait"]
    if not isinstance(portrait, str) or portrait not in VALID_PORTRAITS:
        raise MalformedResponseError(
            f"Invalid closest_portrait: {portrait!r}; expected one of {sorted(VALID_PORTRAITS)}"
        )

    if not isinstance(data["scores"], dict):
        raise MalformedResponseError("scores must be a JSON object")

    scores: dict[str, Score] = {}
    for key in REQUIRED_SCORES:
        if key not in data["scores"]:
            raise MalformedResponseError(f"Missing score key: {key}")
        s = data["scores"][key]
        if not isinstance(s, dict):
            raise MalformedResponseError(f"Score {key} must be an object")
        for sub in ("score", "rationale", "quoted_span"):
            if sub not in s:
                raise MalformedResponseError(f"Score {key} missing sub-field: {sub}")
        # score must be an int 0-4 (bools are excluded; bool is a subclass
        # of int in Python)
        if isinstance(s["score"], bool) or not isinstance(s["score"], int):
            raise MalformedResponseError(
                f"Score {key}.score must be an integer 0-4, got {type(s['score']).__name__}"
            )
        if s["score"] < 0 or s["score"] > 4:
            raise MalformedResponseError(
                f"Score {key}.score must be in 0-4, got {s['score']}"
            )
        if not isinstance(s["quoted_span"], str) or not s["quoted_span"]:
            raise MalformedResponseError(f"Score {key}.quoted_span must be a non-empty string")
        if len(s["quoted_span"]) > 400:
            raise MalformedResponseError(
                f"Score {key}.quoted_span must be ≤400 chars, got {len(s['quoted_span'])}"
            )
        scores[key] = Score(
            score=s["score"], rationale=s["rationale"], quoted_span=s["quoted_span"]
        )

    if not isinstance(data["warnings"], list):
        raise MalformedResponseError("warnings must be a JSON array")

    warnings = []
    for index, w in enumerate(data["warnings"]):
        # Entries must be objects; `.get` on a string/number would raise
        # AttributeError instead of the documented error type.
        if not isinstance(w, dict):
            raise MalformedResponseError(f"warnings[{index}] must be an object")
        warnings.append(
            Warning(
                text=w.get("text", ""),
                citation_source=w.get("citation_source", ""),
                citation_url=w.get("citation_url", ""),
            )
        )

    writeup = raw[match.end():].strip()
    return Response(
        constraint=data["constraint"],
        scores=scores,
        quadrant=quadrant,
        closest_portrait=portrait,
        closest_portrait_paragraph=data["closest_portrait_paragraph"],
        warnings=warnings,
        writeup=writeup,
    )
# ---------------------------------------------------------------------------
# Configuration (env-driven; see .env.example)
# ---------------------------------------------------------------------------
# Directory containing this file; prompts/ and reference/ are resolved
# relative to it.
ROOT = Path(__file__).parent
# NOTE: the Anthropic model is read from the env var named MODEL_ID (not
# ANTHROPIC_MODEL_ID).
ANTHROPIC_MODEL_ID = os.environ.get("MODEL_ID", "claude-opus-4-7")
HF_MODEL_ID = os.environ.get("HF_MODEL_ID", "google/gemma-2-9b-it")
ZEROGPU_MODEL_ID = os.environ.get("ZEROGPU_MODEL_ID", "microsoft/Phi-4-mini-instruct")
# ZeroGPU reserves this many seconds from the Space owner's daily quota
# per request. With the pre-load pattern below (model on CPU at module
# init, .to('cuda') + inference inside @spaces.GPU), per-call cost is
# only ~10-25s wall-clock. 45s gives generous margin while squeezing
# ~2.5x more submissions per quota window vs the original 120s.
# Pro-tier max is 120s; raise via env if you need bigger headroom.
# A non-integer value raises ValueError at import time.
ZEROGPU_DURATION_SECONDS = int(os.environ.get("ZEROGPU_DURATION_SECONDS", "45"))
# Upper word limit is env-tunable (non-integer values raise ValueError at
# import time); the lower limit is fixed.
MAX_DESCRIPTION_WORDS = int(os.environ.get("MAX_DESCRIPTION_WORDS", "5000"))
MIN_DESCRIPTION_WORDS = 200
# ZeroGPU availability is detected at import time. The `spaces` package
# is HuggingFace's runtime for on-demand GPU allocation; `transformers`
# + `torch` are required to actually load and run the model. All three
# must be importable for the zerogpu backend to function.
# Only ImportError is treated as "not available"; any other exception
# raised while importing these packages propagates.
try:
    import spaces as _spaces
    import torch as _torch
    from transformers import AutoModelForCausalLM as _AutoModelForCausalLM
    from transformers import AutoTokenizer as _AutoTokenizer
    _ZEROGPU_DEPS_AVAILABLE = True
except ImportError:
    _ZEROGPU_DEPS_AVAILABLE = False
def _zerogpu_available() -> bool:
    """Return whether the zerogpu backend can be used.

    Returns the import-time flag `_ZEROGPU_DEPS_AVAILABLE`. Wrapped as a
    function so tests can monkeypatch the answer without touching the
    real torch/transformers imports.
    """
    return _ZEROGPU_DEPS_AVAILABLE
# ---------------------------------------------------------------------------
# Provider abstraction (anthropic / huggingface / zerogpu — selectable at runtime)
# ---------------------------------------------------------------------------
def _detect_provider(env=None) -> str:
"""Pick a model provider from env. Order of precedence:
1. Explicit MODEL_PROVIDER (anthropic | huggingface | zerogpu).
2. Running on a HuggingFace Space (SPACE_ID set) AND the zerogpu
deps (spaces + transformers + torch) are importable β†’ zerogpu.
This is the Pro-plan free-GPU path.
3. Presence of ANTHROPIC_API_KEY β†’ anthropic.
4. Presence of HF_TOKEN / HUGGING_FACE_HUB_TOKEN, or running on
a HuggingFace Space without zerogpu deps β†’ huggingface.
5. Fall through to anthropic (call-time error will tell the user
which env to set).
"""
env = env if env is not None else os.environ
explicit = env.get("MODEL_PROVIDER", "").strip().lower()
if explicit in ("anthropic", "huggingface", "zerogpu"):
return explicit
if env.get("SPACE_ID") and _zerogpu_available():
return "zerogpu"
if env.get("ANTHROPIC_API_KEY"):
return "anthropic"
if (
env.get("HF_TOKEN")
or env.get("HUGGING_FACE_HUB_TOKEN")
or env.get("SPACE_ID")
):
return "huggingface"
return "anthropic"
def _call_anthropic(system_block: str, user_prompt: str, *, api_key: Optional[str] = None) -> str:
    """Run one request against the Anthropic backend and return its text.

    The system block is sent with an ephemeral `cache_control` marker;
    the user prompt is sent as a single fresh user turn.

    Args:
        system_block: System prompt text.
        user_prompt: Content of the user message.
        api_key: Optional per-call key. When truthy it is handed straight
            to the SDK constructor and is NEVER written to os.environ —
            on a multi-tenant public Space, mutating env would leak one
            visitor's key into another visitor's concurrent request.
            When falsy, the SDK client is built with no arguments (the
            Space owner's ANTHROPIC_API_KEY env path).

    Returns:
        The `text` of the first content block of the reply.
    """
    from anthropic import Anthropic

    client_kwargs = {"api_key": api_key} if api_key else {}
    client = Anthropic(**client_kwargs)

    cached_system = {
        "type": "text",
        "text": system_block,
        "cache_control": {"type": "ephemeral"},
    }
    user_turn = {"role": "user", "content": user_prompt}

    reply = client.messages.create(
        model=ANTHROPIC_MODEL_ID,
        max_tokens=2500,
        system=[cached_system],
        messages=[user_turn],
    )
    first_block = reply.content[0]
    return first_block.text
def _call_huggingface(system_block: str, user_prompt: str) -> str:
    """Run one chat completion through the HuggingFace backend.

    Uses the unified chat_completion interface, which routes through HF
    Inference Providers and supports Gemma 2, Phi-4-mini-instruct,
    Llama-3.3, Qwen 2.5, and many others. Temperature is pinned low (0.2)
    to keep JSON output stable — smaller open models can be looser than
    Claude on schema adherence.

    Token lookup order: HF_TOKEN env var, HUGGING_FACE_HUB_TOKEN env var,
    then `huggingface_hub.get_token()`. HF Spaces do NOT auto-inject a
    token on public Spaces — the Space owner has to add it as a Space
    secret.

    Args:
        system_block: System prompt text.
        user_prompt: Content of the user message.

    Returns:
        The assistant message content of the first choice, or "" when the
        backend returns no content (so downstream parsing reports
        malformed output instead of failing on None).

    Raises:
        RuntimeError: If no token is found, or if no enabled provider
            serves HF_MODEL_ID (chained from the original error).
        Exception: Any other error from the client is re-raised unchanged.
    """
    from huggingface_hub import InferenceClient, get_token

    token = (
        os.environ.get("HF_TOKEN")
        or os.environ.get("HUGGING_FACE_HUB_TOKEN")
        or get_token()  # checks ~/.cache/huggingface/token from `hf auth login`
    )
    if not token:
        raise RuntimeError(
            "No HuggingFace token found. The Space owner needs to add HF_TOKEN "
            "as a Space secret (Settings → Repository secrets → New secret → "
            "name: HF_TOKEN, value: a User Access Token from "
            "https://huggingface.co/settings/tokens). Then restart the Space. "
            "Until then, pick a different model from the dropdown."
        )

    # `provider="auto"` opts into the modern HF Inference Providers
    # routing layer (introduced 2024-Q4), which picks the right partner
    # (featherless-ai / together-ai / hf-inference / etc.) for the model
    # automatically. Without this flag, InferenceClient falls back to
    # the legacy hf-inference-only path, which doesn't serve most newer
    # models and returns a misleading "model not supported" error even
    # when the user has all providers enabled and access to the model.
    client = InferenceClient(
        model=HF_MODEL_ID,
        token=token,
        provider="auto",
        timeout=120,
    )
    try:
        resp = client.chat_completion(
            messages=[
                {"role": "system", "content": system_block},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=2500,
            temperature=0.2,
        )
    except Exception as e:
        msg = str(e)
        # HF Inference Providers routes each model through a partner
        # (featherless-ai, together-ai, hf-inference, etc.). If none of
        # the enabled providers serves the requested model, the API
        # returns a BadRequestError with code=model_not_supported. The
        # raw error is opaque to users, so re-raise with the actual fix
        # instead of the unhelpful default message.
        if "model_not_supported" in msg or "not supported by any provider" in msg:
            raise RuntimeError(
                f"The model '{HF_MODEL_ID}' isn't available through any of "
                f"the HuggingFace Inference Providers enabled on your account. "
                f"Two fixes: (a) enable a provider that supports this model at "
                f"https://huggingface.co/settings/inference-providers, OR "
                f"(b) set HF_MODEL_ID as a Space variable to a model on your "
                f"enabled providers — microsoft/Phi-4-mini-instruct works "
                f"broadly via featherless-ai."
            ) from e
        raise
    # `content` can be None on an empty completion; honor the `-> str`
    # contract so callers never have to special-case it.
    return resp.choices[0].message.content or ""
# ZeroGPU backend — pre-load pattern.
#
# Model is loaded onto CPU at Space startup (module init), NOT inside
# `@spaces.GPU`. This is the documented HuggingFace ZeroGPU pattern:
#   - Module init runs once at Space startup, on CPU, with no GPU
#     quota consumed. The expensive part — downloading ~7.6GB of
#     safetensors and deserializing into PyTorch state — happens here.
#   - Inside `@spaces.GPU`, all we do is `.to('cuda')` + tokenize +
#     generate + decode. Wall-clock drops to ~10-15s warm, ~20-25s
#     after Space restart (the .to('cuda') for 7.6GB takes a few
#     seconds over PCIe).
#
# Why deliberately NOT `trust_remote_code=True`. Phi-4-mini-instruct's
# architecture is `phi3`, which transformers 4.46+ supports natively
# via `Phi3ForCausalLM` — no custom code download required. The custom
# modeling code that ships with the model on HF Hub (`modeling_phi3.py`)
# imports `LossKwargs` from `transformers.utils`, which was removed in
# transformers 4.57+ — loading WITH `trust_remote_code=True` fails
# with `ImportError: cannot import name 'LossKwargs' from
# 'transformers.utils'` and bricks the `@spaces.GPU` worker. The
# native path avoids the upstream pin-mismatch entirely.
#
# Tradeoff: ~30-60s slower Space cold-start (the one-time CPU load).
# Acceptable because Spaces only restart on deploy or after a long
# idle period. Worth it for the 2.5x quota efficiency.
#
# When the deps are missing both globals are None; `_zerogpu_invoke`
# reads them at call time.
if _ZEROGPU_DEPS_AVAILABLE:
    _zerogpu_tokenizer = _AutoTokenizer.from_pretrained(ZEROGPU_MODEL_ID)
    _zerogpu_model = _AutoModelForCausalLM.from_pretrained(
        ZEROGPU_MODEL_ID,
        torch_dtype=_torch.bfloat16,
        # NO device_map — load to CPU; we move to GPU per-call inside
        # @spaces.GPU. ZeroGPU has no GPU available at module load.
    )
else:
    _zerogpu_tokenizer = None
    _zerogpu_model = None
def _zerogpu_invoke(system_block: str, user_prompt: str) -> str:
    """Run chat-template inference with the pre-loaded ZeroGPU model.

    Reads the module-level globals `_zerogpu_tokenizer` and
    `_zerogpu_model` (tests monkeypatch these with fakes). Kept separate
    from the `@spaces.GPU`-decorated wrapper so it can be unit-tested
    without actually allocating a GPU.

    Args:
        system_block: System prompt text.
        user_prompt: Content of the user message.

    Returns:
        The decoded completion, special tokens skipped, with the prompt
        tokens sliced off.
    """
    # The weights were loaded on CPU at import; move them onto the GPU
    # that @spaces.GPU allocated for this call (a transfer only — no
    # download or deserialization happens here).
    _zerogpu_model.to("cuda")

    chat = [
        {"role": "system", "content": system_block},
        {"role": "user", "content": user_prompt},
    ]
    prompt_ids = _zerogpu_tokenizer.apply_chat_template(
        chat,
        return_tensors="pt",
        add_generation_prompt=True,
    ).to("cuda")

    generated = _zerogpu_model.generate(
        prompt_ids,
        max_new_tokens=2500,
        temperature=0.2,
        do_sample=True,
        pad_token_id=_zerogpu_tokenizer.eos_token_id,
    )

    # generate() output starts with the prompt; keep only what follows it.
    completion_ids = generated[0][prompt_ids.shape[1]:]
    return _zerogpu_tokenizer.decode(completion_ids, skip_special_tokens=True)
# Exactly one `_call_zerogpu` is defined, chosen at import time by whether
# the zerogpu dependencies imported.
if _ZEROGPU_DEPS_AVAILABLE:
    @_spaces.GPU(duration=ZEROGPU_DURATION_SECONDS)
    def _call_zerogpu(system_block: str, user_prompt: str) -> str:
        """ZeroGPU backend entry point.

        Thin wrapper around the testable `_zerogpu_invoke`, which runs
        chat-template inference with the model named by ZEROGPU_MODEL_ID.
        The wrapper exists so the `@spaces.GPU` decorator is applied at
        module load time."""
        return _zerogpu_invoke(system_block, user_prompt)
else:
    def _call_zerogpu(system_block: str, user_prompt: str) -> str:
        """Fallback when the zerogpu dependencies are not importable.

        Raises:
            RuntimeError: Always, with instructions for enabling the
                backend or choosing another provider.
        """
        raise RuntimeError(
            "ZeroGPU backend requires `spaces`, `transformers`, and `torch` "
            "to be importable AND should be run on a HuggingFace Pro Space "
            "for free on-demand GPU. Install the full requirements.txt and "
            "deploy to a Space, or pick anthropic / huggingface from the "
            "provider dropdown."
        )
# Provider name -> backend callable. `_call_model` invokes each entry as
# `fn(system_block, user_prompt)`; `diagnose` validates provider names
# against these keys.
PROVIDERS = {
    "anthropic": _call_anthropic,
    "huggingface": _call_huggingface,
    "zerogpu": _call_zerogpu,
}
def _call_model(system_block: str, user_prompt: str, provider: str) -> str:
    """Look up `provider` in PROVIDERS and invoke it.

    Args:
        system_block: System prompt text passed through to the backend.
        user_prompt: User message passed through to the backend.
        provider: Key into PROVIDERS.

    Returns:
        Whatever the selected backend returns.

    Raises:
        ValueError: If `provider` is not a PROVIDERS key; callers are
            expected to validate before calling.
    """
    backend = PROVIDERS.get(provider)
    if backend is None:
        raise ValueError(
            f"Unknown provider: {provider!r}; expected one of {sorted(PROVIDERS)}"
        )
    return backend(system_block, user_prompt)
# Auto-detected once at module import; the Gradio UI exposes a runtime
# override via the Provider dropdown. Because detection runs at import,
# later changes to os.environ do not affect this value.
DEFAULT_PROVIDER = _detect_provider()
# Choices for the three optional clarifier dropdowns in the UI.
INDUSTRIES = [
    "insurance",
    "banking",
    "healthcare",
    "retail",
    "manufacturing",
    "logistics",
    "agriculture",
    "energy",
    "telecom",
    "media",
    "professional services",
    "real estate",
    "other",
]
SCALES = [
    "pilot",
    "department",
    "business unit",
    "enterprise",
]
BUDGETS = [
    "<$100K",
    "$100K–$1M",
    "$1M–$10M",
    ">$10M",
]
# ---------------------------------------------------------------------------
# Sample initiatives (gr.Examples) — one per verdict quadrant
# ---------------------------------------------------------------------------
# Realistic ~250–400-word AI-initiative descriptions that should land in a
# specific quadrant of the 2×2 verdict matrix. Used to seed user testing
# and give first-time visitors something concrete to click.
_SAMPLE_COMPOUNDER = (
"We're a regional commercial insurance carrier specializing in restaurant "
"general liability. We write about 8,000 policies a year across the "
"Midwest, with average annual premium around $4,500. Underwriting is "
"the bottleneck of our business β€” independent agents wait 36 to 48 "
"hours for a quote because our underwriters manually pull industry "
"codes, loss runs, and prior-carrier history from three different "
"systems and then decide whether to bind, decline, or refer. Roughly "
"30% of submissions get declined and another 15% are referred to "
"senior underwriters, which adds another day. We're deploying an "
"LLM-powered underwriting assistant that pulls the data automatically, "
"flags risk factors based on patterns in our 12-year claims database, "
"and proposes a base rate with an explanation. The underwriter "
"reviews, adjusts, and approves. Every policy we write generates new "
"claim outcomes β€” fires, slip-and-falls, liquor-liability claims, "
"food-poisoning suits β€” and those outcomes feed back into the next "
"quarter's model retraining. Our competitors mostly use Verisk's "
"industry-standard rating models, which we don't share data with, so "
"our model gets better on our specific book of business while theirs "
"reflects the industry average. Internal goal: cut time-to-quote from "
"36 hours to 4 hours, increase the win rate on profitable risks by "
"15%, and progressively shift the loss ratio by 1–2 points per year "
"as the model learns from each renewal cycle. Independent agents have "
"already started favoring carriers with faster quote turnaround."
)
_SAMPLE_ONE_SHOT_WIN = (
"We're a community bank with $4B in assets and 38 branches across two "
"states. Loan officers spend about 6 hours per commercial loan "
"reviewing financial statements, tax returns, and corporate documents "
"before they can write the credit memo. We're deploying GPT-4 to "
"extract key fields β€” revenue, EBITDA, debt service coverage ratio, "
"ownership structure, related-party transactions, collateral "
"descriptions β€” from these documents into a structured form. The loan "
"officer reviews the extraction and writes the credit memo by hand. "
"We expect to cut document review time from 6 hours to about 90 "
"minutes per loan, processing roughly 2,400 commercial loans a year. "
"The vendor provides the model, the document templates, and the "
"extraction prompts, and is selling the same system to four of our "
"peer community banks in the region under identical contracts. The AI "
"doesn't learn from the outcome of the loan: defaults, prepayments, "
"modifications, restructurings all go into our separate loan "
"servicing system, which has never connected back to the extraction "
"model. The vendor's three-year roadmap doesn't include any feedback "
"loop between loan performance and the model β€” they treat extraction "
"as a deterministic task. We're funding the project from the "
"operations budget; the credit team is excited about the time savings "
"but the chief credit officer has flagged that the productivity gain "
"will be one-time and won't show up in the loss-given-default rate "
"over time."
)
_SAMPLE_WRONG_THING = (
"We're a third-party logistics provider with 8 warehouses on the East "
"Coast handling about 20,000 orders a day across the network. We're "
"investing in computer vision software to optimize order picking "
"routes β€” the AI looks at the warehouse layout, current orders, and "
"worker positions and suggests optimized pick paths in real time. "
"Pilot results show a 12% reduction in steps per order on the test "
"floor. Our operations team has been excited about this for 18 months "
"and we just signed a multi-year contract with the vendor. Some "
"context on the operation: our warehouses run 2 shifts. Order volume "
"in shift 1 is around 14,000 orders per day; shift 2 is around 6,000. "
"The pick wave finishes by 2pm on shift 1, then workers wait 4 to 5 "
"hours for shift 2 trucks to arrive at the loading docks. The trucks "
"are scheduled by the customer (a major national retailer) and arrive "
"in unpredictable windows between 6pm and 10pm. We don't control the "
"truck schedule and the customer won't share their advance schedule "
"with us. The CFO has been asking us why total throughput per "
"warehouse hasn't moved much in three years; our answer has been that "
"the legacy warehouse management system is the constraint, which is "
"why we're investing in better picking AI. Same-store labor cost is "
"up 8% year-over-year because workers are paid through the idle hours."
)
_SAMPLE_ROMAN_CANDLE = (
"We run a chain of 220 quick-service restaurants across the Southeast "
"doing about $480M in annual revenue. Our gross margin has been under "
"pressure from rising ingredient costs and we're rolling out an "
"AI-powered personalized marketing platform that sends customized "
"email and SMS offers based on customer purchase history, location, "
"and local weather. The platform is from a major QSR-tech vendor used "
"by several of our direct competitors in the same markets we operate "
"in. Our customer data β€” names, emails, phone numbers, purchase "
"frequency, average ticket size β€” lives in our point-of-sale "
"provider's cloud, which the marketing platform pulls from via the "
"POS provider's standard integration. Both the purchase data feed and "
"the modeling are the vendor's stack; we don't see the underlying "
"model and our data is commingled with other QSR brands the vendor "
"serves on a shared inference fleet. We expect to lift email "
"click-through by 8–12% based on the vendor's benchmark studies of "
"similar brands. The marketing team is running the rollout; finance "
"signed off on the multi-year subscription. We have not measured what "
"is actually constraining same-store sales growth β€” drive-thru wait "
"times, menu pricing relative to local competitors, or breakfast "
"daypart penetration β€” we just know revenue has been flat for two "
"years and the board wants visible action by Q4."
)
def _load_reference():
    """Read the prompt template + reference JSONs from disk at app start.

    Files are read as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.

    Returns:
        A `(prompt_template, system_block)` tuple: the raw contents of
        prompts/diagnose.txt, and that template with the
        `{{portraits_block}}` and `{{failure_modes_block}}` placeholders
        filled from reference/portraits.json and
        reference/failure-modes.json.

    Raises:
        OSError: If any of the three files cannot be read.
        json.JSONDecodeError: If a reference file is not valid JSON.
        KeyError: If a reference entry lacks one of the expected keys.
    """
    prompt_template = (ROOT / "prompts" / "diagnose.txt").read_text(encoding="utf-8")
    portraits = json.loads(
        (ROOT / "reference" / "portraits.json").read_text(encoding="utf-8")
    )
    failure_modes = json.loads(
        (ROOT / "reference" / "failure-modes.json").read_text(encoding="utf-8")
    )

    # Entries are assembled line by line rather than dedenting an f-string:
    # dedent would run AFTER interpolation, so a multi-line value could
    # change the common indent and defeat it.
    # NOTE(review): the two-space continuation indent is reconstructed;
    # confirm it matches the layout the prompt expects.
    portraits_block = "\n".join(
        "\n".join((
            f"- id: {p['id']}",
            f"  label: {p['label']}",
            f"  bottleneck: {p['bottleneck']}",
            f"  summary: {p['summary']}",
            f"  compounding_summary: {p['compounding_summary']}",
            f"  article_url: {p['article_url']}",
        )).rstrip()
        for p in portraits
    )
    failure_modes_block = "\n".join(
        "\n".join((
            f"- id: {f['id']}",
            f"  label: {f['label']}",
            f"  applies_to_quadrants: {', '.join(f['applies_to_quadrants'])}",
            f"  summary: {f['summary']}",
            f"  url: {f['url']}",
        )).rstrip()
        for f in failure_modes
    )

    system_block = (
        prompt_template
        .replace("{{portraits_block}}", portraits_block)
        .replace("{{failure_modes_block}}", failure_modes_block)
    )
    return prompt_template, system_block
# Loaded once at module import; cached in memory for the life of the process.
# PROMPT_TEMPLATE is the raw template (used by `diagnose` to build the user
# prompt); SYSTEM_BLOCK is the template with the reference blocks filled in.
PROMPT_TEMPLATE, SYSTEM_BLOCK = _load_reference()
# ---------------------------------------------------------------------------
# Diagnose entrypoint (called by the Gradio Submit handler)
# ---------------------------------------------------------------------------
def diagnose(
    description: str,
    industry: Optional[str],
    scale: Optional[str],
    budget: Optional[str],
    provider: Optional[str] = None,
    anthropic_api_key: Optional[str] = None,
) -> tuple[str, str]:
    """Validate input, call the selected model, and format the result.

    Args:
        description: The initiative description; must contain between
            MIN_DESCRIPTION_WORDS and MAX_DESCRIPTION_WORDS words.
        industry: Optional clarifier; "(not specified)" is used when empty.
        scale: Optional clarifier; "(not specified)" is used when empty.
        budget: Optional clarifier; "(not specified)" is used when empty.
        provider: anthropic | huggingface | zerogpu. Defaults to
            DEFAULT_PROVIDER when not supplied — the Gradio dropdown
            always supplies it on a real submission.
        anthropic_api_key: Per-call user-supplied key. When provider is
            "anthropic" and the key is provided, it is used for this
            single request instead of any ANTHROPIC_API_KEY env var. The
            key is never persisted.

    Returns:
        `(markdown_writeup, raw_json_string)` for the two Gradio tabs.
        Per F14 + contract §2, error paths return a user-friendly message
        in the markdown slot and an empty JSON slot rather than raising.
    """
    description = (description or "").strip()
    if not description:
        return "⚠ Please describe your AI initiative.", ""

    words = len(description.split())
    if words < MIN_DESCRIPTION_WORDS:
        return (
            f"⚠ Please describe your initiative in at least {MIN_DESCRIPTION_WORDS} words "
            f"(you wrote {words}). The diagnostic needs enough context to score the four "
            f"compounding conditions with rationale quoting your description.",
            "",
        )
    if words > MAX_DESCRIPTION_WORDS:
        return (
            f"⚠ Please keep your description under {MAX_DESCRIPTION_WORDS} words "
            f"(you wrote {words}). Shorten the description and try again.",
            "",
        )

    provider = provider or DEFAULT_PROVIDER
    if provider not in PROVIDERS:
        return (
            f"⚠ Unknown model provider {provider!r}. Pick one of "
            f"{sorted(PROVIDERS)} from the dropdown.",
            "",
        )

    # If Premium (Anthropic) is selected, the user must supply a key —
    # either via the page's API-key field (per-call) or via an
    # ANTHROPIC_API_KEY env var on the Space. Without either, fail fast
    # with a friendly explanation before we hit the SDK.
    user_key_for_anthropic: Optional[str] = None
    if provider == "anthropic":
        env_key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
        user_key = (anthropic_api_key or "").strip()
        if not env_key and not user_key:
            return (
                "⚠ Premium (Claude Opus) needs an Anthropic API key. Either "
                "paste your key in the field above, or pick one of the free "
                "options from the model dropdown.",
                "",
            )
        if user_key:
            # IMPORTANT: do NOT write the user-supplied key to os.environ.
            # That would leak the key into concurrent requests from other
            # visitors on this Space (the process env is shared across
            # all in-flight requests in the Python worker). Instead we
            # pass it directly to _call_anthropic below, which scopes it
            # to a single SDK client instance.
            user_key_for_anthropic = user_key

    user_prompt = (
        PROMPT_TEMPLATE
        .replace("{{user_input}}", description)
        .replace("{{industry}}", industry or "(not specified)")
        .replace("{{scale}}", scale or "(not specified)")
        .replace("{{budget}}", budget or "(not specified)")
    )

    try:
        # When the visitor supplied their own Anthropic key, bypass the
        # generic dispatcher so we can pass the key directly via kwarg
        # without ever touching os.environ. All other paths go through
        # the dispatcher and read credentials from env as usual.
        if provider == "anthropic" and user_key_for_anthropic:
            raw = _call_anthropic(
                SYSTEM_BLOCK, user_prompt, api_key=user_key_for_anthropic,
            )
        else:
            raw = _call_model(SYSTEM_BLOCK, user_prompt, provider)
    except Exception as e:
        # API timeout / rate limit / auth / server / network failure
        # (Anthropic SDK, huggingface_hub InferenceClient, or
        # transformers/torch on the zerogpu path). Include both the
        # exception class AND its string form so unexpected failures
        # are diagnosable from the UI without server log access.
        model_label = {
            "anthropic": ANTHROPIC_MODEL_ID,
            "huggingface": HF_MODEL_ID,
            "zerogpu": ZEROGPU_MODEL_ID,
        }.get(provider, provider)
        detail = str(e).strip() or "(no message)"
        # Defense-in-depth: if the user-supplied Anthropic key somehow
        # appears in the exception message, redact it before surfacing
        # the writeup. Symmetric with redactKey() in
        # src/lib/anthropic-direct.ts.
        # This MUST run before truncation: cutting first could split the
        # key at the 400-char boundary, leaving a partial key that
        # `replace` no longer matches.
        if user_key_for_anthropic and len(user_key_for_anthropic) >= 8:
            detail = detail.replace(user_key_for_anthropic, "[redacted]")
        # Cap the detail so we don't spill multi-paragraph messages
        # into the UI.
        if len(detail) > 400:
            detail = detail[:400] + "…"
        return (
            f"⚠ The diagnostic call to {provider} ({model_label}) failed.\n\n"
            f"**{type(e).__name__}:** {detail}\n\n"
            f"Try again in a moment, switch providers in the dropdown, "
            f"or shorten your description.",
            "",
        )

    try:
        parsed = parse_response(raw)
    except MalformedResponseError as e:
        return (
            f"⚠ The model returned malformed output. Try again with a different description "
            f"or shorten the existing one.\n\nDetail: {e}",
            "",
        )

    payload = {
        "constraint": parsed.constraint,
        "quadrant": parsed.quadrant,
        "closest_portrait": parsed.closest_portrait,
        "closest_portrait_paragraph": parsed.closest_portrait_paragraph,
        "scores": {
            k: {"score": v.score, "rationale": v.rationale, "quoted_span": v.quoted_span}
            for k, v in parsed.scores.items()
        },
        "warnings": [
            {"text": w.text, "citation_source": w.citation_source, "citation_url": w.citation_url}
            for w in parsed.warnings
        ],
    }
    return parsed.writeup, json.dumps(payload, indent=2)
# ---------------------------------------------------------------------------
# Gradio UI (built lazily so `import app` from tests does not require gradio)
# ---------------------------------------------------------------------------
def build_demo():
    """Build and return the Gradio Blocks UI. Called only by __main__.

    gradio is imported inside the function so that importing this module
    (e.g. from tests) does not require gradio.

    Returns:
        The `gr.Blocks` app, with the Diagnose button wired to `diagnose`.
    """
    import gradio as gr

    # Free option first, premium second. Plain-English labels with no
    # ANTHROPIC_API_KEY / SPACE_ID / ZeroGPU jargon — the casual user
    # shouldn't have to know what any of those mean.
    #
    # The HuggingFace Inference Providers backend (provider="huggingface")
    # is intentionally NOT in this dropdown: it requires the Space owner
    # to have HF billing set up (credit card on file OR custom provider
    # API keys), which most Pro users don't have by default. The backend
    # code remains in PROVIDERS so it's reachable via MODEL_PROVIDER env
    # override for users who do set up billing — see README.md.
    provider_choices = []
    if _zerogpu_available():
        provider_choices.append((
            "Free · Phi-4-mini-instruct (Microsoft) — runs on GPU",
            "zerogpu",
        ))
    provider_choices.append((
        "Premium · Claude Opus 4.7 (Anthropic) — paste your API key below",
        "anthropic",
    ))
    # Default to the first option: the free one when zerogpu is available,
    # otherwise Premium (the only remaining choice).
    default_choice = provider_choices[0][1]

    # NOTE(review): the nesting of components inside the `with` blocks
    # below is reconstructed; confirm the row layout.
    with gr.Blocks(title="The Compounding Test") as demo:
        gr.Markdown(
            "# The Compounding Test\n\n"
            "A diagnostic for AI investments at non-technology companies. "
            "Describe your AI initiative — get a scored writeup in one of "
            "four outcomes: **compounder**, **one-shot win**, **compounding "
            "the wrong thing**, or **Roman Candle**.\n\n"
            "**The default is free** — runs an open model (Phi-4-mini) "
            "on this Space's GPU. Pick **Premium · Claude Opus** from "
            "the dropdown if you have an Anthropic API key and want the "
            "highest-quality writeup. Read the full framework at "
            "[mile-hi.ai/journal/the-berkshire-test]("
            "https://www.mile-hi.ai/journal/the-berkshire-test)."
        )
        with gr.Row():
            description = gr.Textbox(
                label=f"Describe your AI initiative ({MIN_DESCRIPTION_WORDS}–{MAX_DESCRIPTION_WORDS} words)",
                placeholder=(
                    "Describe the bottleneck of your operation, the AI "
                    "investment, what data feeds it, where the labels come "
                    "from, and how you expect competitors to respond. Be "
                    "specific about the workflow.\n\n"
                    "Or pick a sample below to see how it works."
                ),
                lines=12,
            )
        with gr.Row():
            industry = gr.Dropdown(INDUSTRIES, label="Industry (optional)", value=None)
            scale = gr.Dropdown(SCALES, label="Scale (optional)", value=None)
            budget = gr.Dropdown(BUDGETS, label="Budget tier (optional)", value=None)
        gr.Examples(
            examples=[
                [_SAMPLE_COMPOUNDER, "insurance", "business unit", "$1M–$10M"],
                [_SAMPLE_ONE_SHOT_WIN, "banking", "business unit", "$100K–$1M"],
                [_SAMPLE_WRONG_THING, "logistics", "enterprise", "$1M–$10M"],
                [_SAMPLE_ROMAN_CANDLE, "retail", "enterprise", "$100K–$1M"],
            ],
            inputs=[description, industry, scale, budget],
            label="Sample initiatives — click one to load it (then click Diagnose)",
            examples_per_page=4,
        )
        with gr.Row():
            provider = gr.Dropdown(
                choices=provider_choices,
                value=default_choice,
                label="Choose a model",
            )
        # The API-key field appears only when Premium is selected. The
        # key is used per-request and never stored.
        api_key = gr.Textbox(
            label="Anthropic API key",
            placeholder="sk-ant-...",
            type="password",
            info=(
                "Used only for this request and never stored. "
                "Get a key at console.anthropic.com."
            ),
            # Must mirror the initial dropdown value: when zerogpu is
            # unavailable, Premium is the only choice, so the `change`
            # event below can never fire to reveal a hidden field.
            visible=(default_choice == "anthropic"),
        )

        def _toggle_api_key(p):
            """Show the key field only while Premium is selected."""
            return gr.update(visible=(p == "anthropic"))

        provider.change(_toggle_api_key, inputs=[provider], outputs=[api_key])
        submit = gr.Button("Diagnose", variant="primary")
        with gr.Tabs():
            with gr.Tab("Diagnosis"):
                writeup_out = gr.Markdown()
            with gr.Tab("Raw JSON"):
                json_out = gr.Code(language="json")
        submit.click(
            diagnose,
            inputs=[description, industry, scale, budget, provider, api_key],
            outputs=[writeup_out, json_out],
        )
    return demo
if __name__ == "__main__":
    # Local dev: relies on .env (loaded by python-dotenv) for ANTHROPIC_API_KEY.
    # HF Spaces: relies on Space secrets.
    # NOTE: this runs after the module-level constants (model IDs,
    # DEFAULT_PROVIDER, ...) were already read from os.environ, so .env
    # only affects values read at call time, such as API keys and tokens.
    try:
        from dotenv import load_dotenv
        load_dotenv()
    except ImportError:
        pass  # dotenv is optional; HF Spaces uses Space secrets.
    build_demo().launch()