Spaces:

AshwinP
/

compounding-test

Running on Zero

File size: 40,267 Bytes

"""The Compounding Test — HuggingFace Space.

A single-shot Gradio app that runs an AI-initiative description through
the two-axis Berkshire Test for AI and returns a scored writeup.

Architecture per specs/004-berkshire-test/contracts/hf-space-interface.md:
  - Inputs: a description (200–5000 words) + 3 optional clarifiers.
  - Three backends, selectable by env (`MODEL_PROVIDER`) or auto-detected
    from available credentials and runtime environment:
      * anthropic   — Claude Opus / Sonnet via the Anthropic SDK;
                      system block is `cache_control:ephemeral` so
                      subsequent calls hit the 5-minute prefix cache.
      * huggingface — Open models (Gemma 2 9B by default, swappable to
                      Phi-4, Llama-3.3, Qwen 2.5, etc.) via the
                      huggingface_hub InferenceClient. Works on HF
                      Spaces with the Space's free inference credits;
                      locally requires HF_TOKEN.
      * zerogpu     — Open model (Phi-4-mini-instruct by default)
                      loaded LOCALLY in the Space via transformers,
                      decorated with `@spaces.GPU` so a HuggingFace
                      Pro plan gets free on-demand A100/H100 GPU
                      allocation per request. No per-call credit burn;
                      no API round-trip. Requires the Space to have a
                      Pro owner; locally falls back to CPU (slow).
  - Output: two Gradio tabs — markdown writeup + raw JSON.

Engine/Site boundary (Principle VIII): this app lives in gradio-apps/
only. Never deployed to mile-hi.ai. Reference JSONs are populated by
hand from the published articles — no runtime fetch from the site.
"""
from __future__ import annotations

import json
import os
import re
import textwrap
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

# ---------------------------------------------------------------------------
# Parser surface (covered by test_diagnose.py — module-level, no side effects)
# ---------------------------------------------------------------------------


class MalformedResponseError(Exception):
    """Raised when the model's response cannot be parsed into a Response."""


# Closed vocabularies: a response naming anything else is rejected.
VALID_QUADRANTS = {"compounder", "one-shot-win", "wrong-thing", "roman-candle"}
VALID_PORTRAITS = {"progressive", "deere", "mastercard", "mayo"}
# Score entries every response must carry (extra keys are ignored).
REQUIRED_SCORES = (
    "proprietary_data",
    "self_labeling",
    "decreasing_marginal_cost",
    "defensible_asymmetry",
)


@dataclass
class Score:
    """One scored condition: an integer 0-4, its rationale, and the quoted
    span (1-400 chars) backing it."""

    score: int
    rationale: str
    quoted_span: str


@dataclass
class Warning:
    """A warning attached to the verdict, with its citation.

    NOTE: this name shadows the builtin ``Warning`` inside this module; it
    is kept unchanged because it is part of the module's public surface.
    """

    text: str
    citation_source: str
    citation_url: str


@dataclass
class Response:
    """The fully validated model response."""

    constraint: str
    scores: dict  # str → Score (one entry per REQUIRED_SCORES key)
    quadrant: str
    closest_portrait: str
    closest_portrait_paragraph: str
    warnings: list  # list[Warning]
    writeup: str


# Match the FIRST ```json ... ``` fenced block in the response.
JSON_BLOCK_RE = re.compile(r"```json\s*\n(.*?)\n\s*```", re.DOTALL)


def parse_response(raw: str) -> Response:
    """Extract the first ```json``` block from `raw`, validate the schema,
    and return a populated Response. Text after the block becomes `writeup`.

    Every schema violation — including wrongly-typed values such as a
    non-object JSON document, a non-string quadrant, or a non-object
    warning entry — is reported as MalformedResponseError, so callers need
    to handle exactly one exception type.

    Raises:
        MalformedResponseError: on any schema violation per the contract
            in specs/004-berkshire-test/contracts/hf-space-interface.md §4.
    """
    # Backends may hand back None (or another non-str) instead of text;
    # report that as malformed output rather than a TypeError from `re`.
    if not isinstance(raw, str):
        raise MalformedResponseError(
            f"Response must be text, got {type(raw).__name__}"
        )

    match = JSON_BLOCK_RE.search(raw)
    if not match:
        raise MalformedResponseError("No ```json``` block found in response")

    try:
        data = json.loads(match.group(1))
    except json.JSONDecodeError as e:
        raise MalformedResponseError(f"JSON block did not parse: {e}") from e

    # A bare array / number / string is valid JSON but not a valid payload.
    if not isinstance(data, dict):
        raise MalformedResponseError("JSON block must be a JSON object")

    required = (
        "constraint",
        "scores",
        "quadrant",
        "closest_portrait",
        "closest_portrait_paragraph",
        "warnings",
    )
    for field_name in required:
        if field_name not in data:
            raise MalformedResponseError(f"Missing required field: {field_name}")

    # The isinstance guards keep unhashable values (lists, objects) from
    # raising TypeError in the set-membership test.
    quadrant = data["quadrant"]
    if not isinstance(quadrant, str) or quadrant not in VALID_QUADRANTS:
        raise MalformedResponseError(
            f"Invalid quadrant: {quadrant!r}; expected one of {sorted(VALID_QUADRANTS)}"
        )
    portrait = data["closest_portrait"]
    if not isinstance(portrait, str) or portrait not in VALID_PORTRAITS:
        raise MalformedResponseError(
            f"Invalid closest_portrait: {portrait!r}; expected one of {sorted(VALID_PORTRAITS)}"
        )

    if not isinstance(data["scores"], dict):
        raise MalformedResponseError("scores must be a JSON object")

    scores: dict[str, Score] = {}
    for key in REQUIRED_SCORES:
        if key not in data["scores"]:
            raise MalformedResponseError(f"Missing score key: {key}")
        s = data["scores"][key]
        if not isinstance(s, dict):
            raise MalformedResponseError(f"Score {key} must be an object")
        for sub in ("score", "rationale", "quoted_span"):
            if sub not in s:
                raise MalformedResponseError(f"Score {key} missing sub-field: {sub}")
        # score must be an int 0-4 (bools are excluded; bool is a subclass of int in Python)
        if isinstance(s["score"], bool) or not isinstance(s["score"], int):
            raise MalformedResponseError(
                f"Score {key}.score must be an integer 0-4, got {type(s['score']).__name__}"
            )
        if s["score"] < 0 or s["score"] > 4:
            raise MalformedResponseError(
                f"Score {key}.score must be in 0-4, got {s['score']}"
            )
        if not isinstance(s["quoted_span"], str) or not s["quoted_span"]:
            raise MalformedResponseError(f"Score {key}.quoted_span must be a non-empty string")
        if len(s["quoted_span"]) > 400:
            raise MalformedResponseError(
                f"Score {key}.quoted_span must be ≤400 chars, got {len(s['quoted_span'])}"
            )
        scores[key] = Score(
            score=s["score"], rationale=s["rationale"], quoted_span=s["quoted_span"]
        )

    if not isinstance(data["warnings"], list):
        raise MalformedResponseError("warnings must be a JSON array")
    warnings = []
    for index, w in enumerate(data["warnings"]):
        # Each entry must be an object; `.get` on a str/int would otherwise
        # raise AttributeError instead of the documented error type.
        if not isinstance(w, dict):
            raise MalformedResponseError(f"warnings[{index}] must be an object")
        warnings.append(
            Warning(
                text=w.get("text", ""),
                citation_source=w.get("citation_source", ""),
                citation_url=w.get("citation_url", ""),
            )
        )

    # Everything after the closing fence is the human-readable writeup.
    writeup = raw[match.end():].strip()

    return Response(
        constraint=data["constraint"],
        scores=scores,
        quadrant=quadrant,
        closest_portrait=portrait,
        closest_portrait_paragraph=data["closest_portrait_paragraph"],
        warnings=warnings,
        writeup=writeup,
    )


# ---------------------------------------------------------------------------
# Configuration (env-driven; see .env.example)
# ---------------------------------------------------------------------------

# Directory holding this module; prompts/ and reference/ are resolved under it.
ROOT = Path(__file__).parent

# Model identifiers, one per backend, each overridable through the environment.
ANTHROPIC_MODEL_ID = os.getenv("MODEL_ID", "claude-opus-4-7")
HF_MODEL_ID = os.getenv("HF_MODEL_ID", "google/gemma-2-9b-it")
ZEROGPU_MODEL_ID = os.getenv("ZEROGPU_MODEL_ID", "microsoft/Phi-4-mini-instruct")

# Seconds of GPU time requested from ZeroGPU for each call (passed to the
# `spaces.GPU` decorator as `duration`). The model is pre-loaded on CPU at
# module init and only moved to the GPU inside the decorated call, so a
# request is expected to need ~10-25s wall-clock; 45s leaves margin while
# reserving far less of the Space owner's daily quota than the earlier
# 120s setting. Override via the environment when more headroom is needed.
ZEROGPU_DURATION_SECONDS = int(os.getenv("ZEROGPU_DURATION_SECONDS", "45"))

# Accepted description length, in whitespace-separated words.
MAX_DESCRIPTION_WORDS = int(os.getenv("MAX_DESCRIPTION_WORDS", "5000"))
MIN_DESCRIPTION_WORDS = 200


# ZeroGPU availability is detected at import time. The `spaces` package
# is HuggingFace's runtime for on-demand GPU allocation; `transformers`
# + `torch` are required to actually load and run the model. All three
# must be importable for the zerogpu backend to function.
# Probe the optional ZeroGPU stack. `spaces` is imported first, ahead of
# torch, matching the original import order.
try:
    import spaces as _spaces
    import torch as _torch
    from transformers import AutoModelForCausalLM as _AutoModelForCausalLM
    from transformers import AutoTokenizer as _AutoTokenizer
except ImportError:
    # At least one of the three packages is missing: backend unavailable.
    _ZEROGPU_DEPS_AVAILABLE = False
else:
    _ZEROGPU_DEPS_AVAILABLE = True


def _zerogpu_available() -> bool:
    """Report whether the zerogpu backend's dependencies were importable.

    Kept as a function (rather than having callers read the flag) so tests
    can monkeypatch the answer without touching torch/transformers.
    """
    return _ZEROGPU_DEPS_AVAILABLE


# ---------------------------------------------------------------------------
# Provider abstraction (anthropic / huggingface / zerogpu — selectable at runtime)
# ---------------------------------------------------------------------------


def _detect_provider(env=None) -> str:
    """Choose the model provider from an environment mapping.

    `env` defaults to `os.environ`; any mapping with `.get` works, which
    keeps the function testable. Rules are tried in this order:

      1. MODEL_PROVIDER explicitly set to anthropic | huggingface |
         zerogpu (case- and whitespace-insensitive). Any other value is
         ignored and detection continues.
      2. SPACE_ID set and the zerogpu dependencies importable → zerogpu.
      3. ANTHROPIC_API_KEY set → anthropic.
      4. HF_TOKEN, HUGGING_FACE_HUB_TOKEN or SPACE_ID set → huggingface.
      5. Otherwise anthropic; the missing credential is reported when the
         call is actually attempted.
    """
    if env is None:
        env = os.environ

    requested = env.get("MODEL_PROVIDER", "").strip().lower()
    if requested in ("anthropic", "huggingface", "zerogpu"):
        return requested

    on_space = env.get("SPACE_ID")
    if on_space and _zerogpu_available():
        return "zerogpu"

    if env.get("ANTHROPIC_API_KEY"):
        return "anthropic"

    hf_signals = ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "SPACE_ID")
    if any(env.get(name) for name in hf_signals):
        return "huggingface"
    return "anthropic"


def _call_anthropic(system_block: str, user_prompt: str, *, api_key: Optional[str] = None) -> str:
    """Anthropic backend: send one message and return the assistant text.

    The system block is sent as a single text part marked
    `cache_control: ephemeral`; the user prompt is sent as-is.

    `api_key` is an optional per-call key. When given it is handed
    straight to the SDK client constructor and is never written to
    os.environ — on a shared public Space, mutating the process
    environment would expose one visitor's key to other in-flight
    requests. When it is falsy the client is built with no arguments,
    leaving credential lookup (ANTHROPIC_API_KEY) to the SDK.

    Returns the `.text` of the first content block of the response.
    """
    from anthropic import Anthropic

    client_kwargs = {"api_key": api_key} if api_key else {}
    client = Anthropic(**client_kwargs)

    cached_system_part = {
        "type": "text",
        "text": system_block,
        "cache_control": {"type": "ephemeral"},
    }
    user_message = {"role": "user", "content": user_prompt}

    response = client.messages.create(
        model=ANTHROPIC_MODEL_ID,
        max_tokens=2500,
        system=[cached_system_part],
        messages=[user_message],
    )
    first_part = response.content[0]
    return first_part.text


def _call_huggingface(system_block: str, user_prompt: str) -> str:
    """HuggingFace backend: one chat completion against HF_MODEL_ID.

    Goes through `InferenceClient.chat_completion` with a system + user
    message pair, `max_tokens=2500`, and `temperature=0.2` — kept low so
    smaller open models stay closer to the required JSON schema.

    Token lookup order: HF_TOKEN env var, HUGGING_FACE_HUB_TOKEN env var,
    then `huggingface_hub.get_token()` (a token stored by `hf auth login`).
    Public Spaces do not get a token injected automatically, so the Space
    owner must add one as a secret.

    Raises:
        RuntimeError: when no token is found, or when the API reports that
            no enabled provider serves the model (re-worded with the fix).
        Exception: any other client error is re-raised unchanged.
    """
    from huggingface_hub import InferenceClient, get_token

    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
    if not token:
        # Last resort: ~/.cache/huggingface/token written by `hf auth login`.
        token = get_token()
    if not token:
        raise RuntimeError(
            "No HuggingFace token found. The Space owner needs to add HF_TOKEN "
            "as a Space secret (Settings → Repository secrets → New secret → "
            "name: HF_TOKEN, value: a User Access Token from "
            "https://huggingface.co/settings/tokens). Then restart the Space. "
            "Until then, pick a different model from the dropdown."
        )

    # `provider="auto"` selects the Inference Providers routing layer, which
    # picks a partner (featherless-ai / together-ai / hf-inference / ...)
    # able to serve the model. Per the original author's note, omitting it
    # falls back to the legacy hf-inference-only path, which rejects most
    # newer models with a misleading "model not supported" error.
    client = InferenceClient(
        model=HF_MODEL_ID,
        token=token,
        provider="auto",
        timeout=120,
    )
    conversation = [
        {"role": "system", "content": system_block},
        {"role": "user", "content": user_prompt},
    ]
    try:
        completion = client.chat_completion(
            messages=conversation,
            max_tokens=2500,
            temperature=0.2,
        )
    except Exception as exc:
        error_text = str(exc)
        no_provider = (
            "model_not_supported" in error_text
            or "not supported by any provider" in error_text
        )
        if not no_provider:
            raise
        # The raw "model_not_supported" error is opaque to users; replace
        # it with a message that states the two ways to fix it.
        raise RuntimeError(
            f"The model '{HF_MODEL_ID}' isn't available through any of "
            f"the HuggingFace Inference Providers enabled on your account. "
            f"Two fixes: (a) enable a provider that supports this model at "
            f"https://huggingface.co/settings/inference-providers, OR "
            f"(b) set HF_MODEL_ID as a Space variable to a model on your "
            f"enabled providers — microsoft/Phi-4-mini-instruct works "
            f"broadly via featherless-ai."
        )
    first_choice = completion.choices[0]
    return first_choice.message.content


# ZeroGPU backend — pre-load pattern.
#
# Model is loaded onto CPU at Space startup (module init), NOT inside
# `@spaces.GPU`. This is the documented HuggingFace ZeroGPU pattern:
#   - Module init runs once at Space startup, on CPU, with no GPU
#     quota consumed. The expensive part — downloading ~7.6GB of
#     safetensors and deserializing into PyTorch state — happens here.
#   - Inside `@spaces.GPU`, all we do is `.to('cuda')` + tokenize +
#     generate + decode. Wall-clock drops to ~10-15s warm, ~20-25s
#     after Space restart (the .to('cuda') for 7.6GB takes a few
#     seconds over PCIe).
#
# Why we deliberately do NOT pass `trust_remote_code=True`: Phi-4-mini-instruct's
# architecture is `phi3`, which transformers 4.46+ supports natively
# via `Phi3ForCausalLM` — no custom code download required. The custom
# modeling code that ships with the model on HF Hub (`modeling_phi3.py`)
# imports `LossKwargs` from `transformers.utils`, which was removed in
# transformers 4.57+ — loading WITH `trust_remote_code=True` fails
# with `ImportError: cannot import name 'LossKwargs' from
# 'transformers.utils'` and bricks the `@spaces.GPU` worker. The
# native path avoids the upstream pin-mismatch entirely.
#
# Tradeoff: ~30-60s slower Space cold-start (the one-time CPU load).
# Acceptable because Spaces only restart on deploy or after a long
# idle period. Worth it for the 2.5x quota efficiency.

# Both globals always exist; they stay None when the ZeroGPU stack is not
# importable so that other code (and tests) can reference them safely.
_zerogpu_tokenizer = None
_zerogpu_model = None
if _ZEROGPU_DEPS_AVAILABLE:
    _zerogpu_tokenizer = _AutoTokenizer.from_pretrained(ZEROGPU_MODEL_ID)
    # No `device_map` is passed, so the weights are loaded without any
    # device placement here; `_zerogpu_invoke` moves the model to "cuda"
    # on each call.
    _zerogpu_model = _AutoModelForCausalLM.from_pretrained(
        ZEROGPU_MODEL_ID,
        torch_dtype=_torch.bfloat16,
    )


def _zerogpu_invoke(system_block: str, user_prompt: str) -> str:
    """Run one chat-style generation on the pre-loaded ZeroGPU model.

    Uses the module-level `_zerogpu_model` and `_zerogpu_tokenizer`
    (tests monkeypatch both with fakes). The model is moved to "cuda",
    the system + user messages are rendered through the tokenizer's chat
    template, and up to 2500 new tokens are sampled at temperature 0.2.

    Returns only the newly generated text: the prompt tokens are sliced
    off before decoding, and special tokens are skipped.

    Deliberately not decorated with `@spaces.GPU` so it can be unit-tested
    without allocating a GPU.
    """
    tokenizer = _zerogpu_tokenizer
    model = _zerogpu_model

    # Weights were loaded at import time; this only relocates them.
    model.to("cuda")

    chat = [
        {"role": "system", "content": system_block},
        {"role": "user", "content": user_prompt},
    ]
    prompt_ids = tokenizer.apply_chat_template(
        chat,
        return_tensors="pt",
        add_generation_prompt=True,
    ).to("cuda")

    generated = model.generate(
        prompt_ids,
        max_new_tokens=2500,
        temperature=0.2,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The output sequence starts with the prompt; keep only what follows.
    prompt_length = prompt_ids.shape[1]
    completion_ids = generated[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)


# `_call_zerogpu` is defined exactly once at import time, in one of two forms
# depending on whether the ZeroGPU dependencies were importable. Both forms
# share the (system_block, user_prompt) -> str signature used by PROVIDERS.
if _ZEROGPU_DEPS_AVAILABLE:

    # The decorator is applied at module load, with the requested GPU
    # duration taken from ZEROGPU_DURATION_SECONDS.
    @_spaces.GPU(duration=ZEROGPU_DURATION_SECONDS)
    def _call_zerogpu(system_block: str, user_prompt: str) -> str:
        """ZeroGPU backend entry point.

        Thin wrapper that delegates to `_zerogpu_invoke`, which moves the
        pre-loaded model (ZEROGPU_MODEL_ID) to the GPU and runs
        chat-template inference. The logic lives in `_zerogpu_invoke` so
        it can be unit-tested without this decorator."""
        return _zerogpu_invoke(system_block, user_prompt)

else:

    def _call_zerogpu(system_block: str, user_prompt: str) -> str:
        """Fallback used when the ZeroGPU dependencies are missing.

        Always raises RuntimeError explaining what to install or which
        other provider to pick."""
        raise RuntimeError(
            "ZeroGPU backend requires `spaces`, `transformers`, and `torch` "
            "to be importable AND should be run on a HuggingFace Pro Space "
            "for free on-demand GPU. Install the full requirements.txt and "
            "deploy to a Space, or pick anthropic / huggingface from the "
            "provider dropdown."
        )


# Registry mapping provider name → backend callable. Every backend accepts
# (system_block, user_prompt) positionally and returns the raw model text.
# `_call_model` dispatches through this dict and `diagnose` validates the
# requested provider against its keys.
PROVIDERS = {
    "anthropic": _call_anthropic,
    "huggingface": _call_huggingface,
    "zerogpu": _call_zerogpu,
}


def _call_model(system_block: str, user_prompt: str, provider: str) -> str:
    """Look up `provider` in PROVIDERS and invoke that backend.

    Returns whatever the backend returns (the raw model text).

    Raises:
        ValueError: if `provider` is not a key of PROVIDERS. Callers are
            expected to have validated the name already.
    """
    backend = PROVIDERS.get(provider)
    if backend is None:
        raise ValueError(
            f"Unknown provider: {provider!r}; expected one of {sorted(PROVIDERS)}"
        )
    return backend(system_block, user_prompt)


# Resolved once at module import from os.environ (see _detect_provider for
# the precedence rules). `diagnose` falls back to this value when it is
# called without an explicit provider; the Gradio UI exposes a runtime
# override via the Provider dropdown.
DEFAULT_PROVIDER = _detect_provider()

# Choice lists for the optional clarifier inputs.
INDUSTRIES = [
    "insurance",
    "banking",
    "healthcare",
    "retail",
    "manufacturing",
    "logistics",
    "agriculture",
    "energy",
    "telecom",
    "media",
    "professional services",
    "real estate",
    "other",
]
SCALES = [
    "pilot",
    "department",
    "business unit",
    "enterprise",
]
BUDGETS = [
    "<$100K",
    "$100K–$1M",
    "$1M–$10M",
    ">$10M",
]


# ---------------------------------------------------------------------------
# Sample initiatives (gr.Examples) — one per verdict quadrant
# ---------------------------------------------------------------------------
# Realistic ~250–400-word AI-initiative descriptions that should land in a
# specific quadrant of the 2×2 verdict matrix. Used to seed user testing
# and give first-time visitors something concrete to click.

# Sample intended to land in the "compounder" quadrant: the initiative
# attacks the stated bottleneck (underwriting turnaround) and retrains on
# the carrier's own claim outcomes each quarter.
_SAMPLE_COMPOUNDER = (
    "We're a regional commercial insurance carrier specializing in restaurant "
    "general liability. We write about 8,000 policies a year across the "
    "Midwest, with average annual premium around $4,500. Underwriting is "
    "the bottleneck of our business — independent agents wait 36 to 48 "
    "hours for a quote because our underwriters manually pull industry "
    "codes, loss runs, and prior-carrier history from three different "
    "systems and then decide whether to bind, decline, or refer. Roughly "
    "30% of submissions get declined and another 15% are referred to "
    "senior underwriters, which adds another day. We're deploying an "
    "LLM-powered underwriting assistant that pulls the data automatically, "
    "flags risk factors based on patterns in our 12-year claims database, "
    "and proposes a base rate with an explanation. The underwriter "
    "reviews, adjusts, and approves. Every policy we write generates new "
    "claim outcomes — fires, slip-and-falls, liquor-liability claims, "
    "food-poisoning suits — and those outcomes feed back into the next "
    "quarter's model retraining. Our competitors mostly use Verisk's "
    "industry-standard rating models, which we don't share data with, so "
    "our model gets better on our specific book of business while theirs "
    "reflects the industry average. Internal goal: cut time-to-quote from "
    "36 hours to 4 hours, increase the win rate on profitable risks by "
    "15%, and progressively shift the loss ratio by 1–2 points per year "
    "as the model learns from each renewal cycle. Independent agents have "
    "already started favoring carriers with faster quote turnaround."
)

# Sample intended to land in the "one-shot-win" quadrant: a real, one-time
# productivity gain from a vendor system with no feedback loop from loan
# outcomes and no differentiation from peer banks.
_SAMPLE_ONE_SHOT_WIN = (
    "We're a community bank with $4B in assets and 38 branches across two "
    "states. Loan officers spend about 6 hours per commercial loan "
    "reviewing financial statements, tax returns, and corporate documents "
    "before they can write the credit memo. We're deploying GPT-4 to "
    "extract key fields — revenue, EBITDA, debt service coverage ratio, "
    "ownership structure, related-party transactions, collateral "
    "descriptions — from these documents into a structured form. The loan "
    "officer reviews the extraction and writes the credit memo by hand. "
    "We expect to cut document review time from 6 hours to about 90 "
    "minutes per loan, processing roughly 2,400 commercial loans a year. "
    "The vendor provides the model, the document templates, and the "
    "extraction prompts, and is selling the same system to four of our "
    "peer community banks in the region under identical contracts. The AI "
    "doesn't learn from the outcome of the loan: defaults, prepayments, "
    "modifications, restructurings all go into our separate loan "
    "servicing system, which has never connected back to the extraction "
    "model. The vendor's three-year roadmap doesn't include any feedback "
    "loop between loan performance and the model — they treat extraction "
    "as a deterministic task. We're funding the project from the "
    "operations budget; the credit team is excited about the time savings "
    "but the chief credit officer has flagged that the productivity gain "
    "will be one-time and won't show up in the loss-given-default rate "
    "over time."
)

# Sample intended to land in the "wrong-thing" quadrant: the AI optimizes
# picking routes while the description's own numbers point at idle time
# waiting on customer-scheduled trucks as the actual constraint.
_SAMPLE_WRONG_THING = (
    "We're a third-party logistics provider with 8 warehouses on the East "
    "Coast handling about 20,000 orders a day across the network. We're "
    "investing in computer vision software to optimize order picking "
    "routes — the AI looks at the warehouse layout, current orders, and "
    "worker positions and suggests optimized pick paths in real time. "
    "Pilot results show a 12% reduction in steps per order on the test "
    "floor. Our operations team has been excited about this for 18 months "
    "and we just signed a multi-year contract with the vendor. Some "
    "context on the operation: our warehouses run 2 shifts. Order volume "
    "in shift 1 is around 14,000 orders per day; shift 2 is around 6,000. "
    "The pick wave finishes by 2pm on shift 1, then workers wait 4 to 5 "
    "hours for shift 2 trucks to arrive at the loading docks. The trucks "
    "are scheduled by the customer (a major national retailer) and arrive "
    "in unpredictable windows between 6pm and 10pm. We don't control the "
    "truck schedule and the customer won't share their advance schedule "
    "with us. The CFO has been asking us why total throughput per "
    "warehouse hasn't moved much in three years; our answer has been that "
    "the legacy warehouse management system is the constraint, which is "
    "why we're investing in better picking AI. Same-store labor cost is "
    "up 8% year-over-year because workers are paid through the idle hours."
)

# Sample intended to land in the "roman-candle" quadrant: a shared vendor
# platform on commingled data, aimed at a metric (email click-through)
# while the real constraint on sales has not been measured.
_SAMPLE_ROMAN_CANDLE = (
    "We run a chain of 220 quick-service restaurants across the Southeast "
    "doing about $480M in annual revenue. Our gross margin has been under "
    "pressure from rising ingredient costs and we're rolling out an "
    "AI-powered personalized marketing platform that sends customized "
    "email and SMS offers based on customer purchase history, location, "
    "and local weather. The platform is from a major QSR-tech vendor used "
    "by several of our direct competitors in the same markets we operate "
    "in. Our customer data — names, emails, phone numbers, purchase "
    "frequency, average ticket size — lives in our point-of-sale "
    "provider's cloud, which the marketing platform pulls from via the "
    "POS provider's standard integration. Both the purchase data feed and "
    "the modeling are the vendor's stack; we don't see the underlying "
    "model and our data is commingled with other QSR brands the vendor "
    "serves on a shared inference fleet. We expect to lift email "
    "click-through by 8–12% based on the vendor's benchmark studies of "
    "similar brands. The marketing team is running the rollout; finance "
    "signed off on the multi-year subscription. We have not measured what "
    "is actually constraining same-store sales growth — drive-thru wait "
    "times, menu pricing relative to local competitors, or breakfast "
    "daypart penetration — we just know revenue has been flat for two "
    "years and the board wants visible action by Q4."
)


def _format_reference_entry(pairs) -> str:
    """Render (key, value) pairs as one YAML-style list item.

    The first pair is prefixed with "- " and the rest with two spaces, so
    the keys line up. Formatting is done line by line instead of running
    `textwrap.dedent` over an interpolated template, because a value that
    contains a newline would defeat dedent's common-margin detection and
    leave the whole entry indented.
    """
    lines = [
        f"{'- ' if index == 0 else '  '}{key}: {value}"
        for index, (key, value) in enumerate(pairs)
    ]
    return "\n".join(lines).rstrip()


def _load_reference():
    """Read the prompt template + reference JSONs from disk at app start.

    Reads `prompts/diagnose.txt`, `reference/portraits.json` and
    `reference/failure-modes.json` under ROOT. All three are decoded as
    UTF-8 explicitly, so the result does not depend on the host's locale
    encoding.

    Returns:
        (prompt_template, system_block): the raw template text, and the
        template with `{{portraits_block}}` and `{{failure_modes_block}}`
        replaced by the rendered reference lists.

    Raises:
        OSError: if a file is missing or unreadable.
        json.JSONDecodeError: if a reference file is not valid JSON.
        KeyError: if a reference entry lacks one of the expected keys.
    """
    prompt_template = (ROOT / "prompts" / "diagnose.txt").read_text(encoding="utf-8")
    portraits = json.loads(
        (ROOT / "reference" / "portraits.json").read_text(encoding="utf-8")
    )
    failure_modes = json.loads(
        (ROOT / "reference" / "failure-modes.json").read_text(encoding="utf-8")
    )

    portrait_keys = (
        "id",
        "label",
        "bottleneck",
        "summary",
        "compounding_summary",
        "article_url",
    )
    portraits_block = "\n".join(
        _format_reference_entry([(key, p[key]) for key in portrait_keys])
        for p in portraits
    )

    failure_modes_block = "\n".join(
        _format_reference_entry(
            [
                ("id", f["id"]),
                ("label", f["label"]),
                # Stored as a list in the JSON; rendered comma-separated.
                ("applies_to_quadrants", ", ".join(f["applies_to_quadrants"])),
                ("summary", f["summary"]),
                ("url", f["url"]),
            ]
        )
        for f in failure_modes
    )

    system_block = (
        prompt_template
        .replace("{{portraits_block}}", portraits_block)
        .replace("{{failure_modes_block}}", failure_modes_block)
    )

    return prompt_template, system_block


# Loaded once at module import; cached in memory for the life of the process.
PROMPT_TEMPLATE, SYSTEM_BLOCK = _load_reference()


# ---------------------------------------------------------------------------
# Diagnose entrypoint (called by the Gradio Submit handler)
# ---------------------------------------------------------------------------


# Matches one `{{name}}` placeholder in the prompt template.
_PLACEHOLDER_RE = re.compile(r"\{\{(\w+)\}\}")


def _fill_placeholders(template: str, values: dict) -> str:
    """Substitute `{{name}}` placeholders in a single pass.

    Names missing from `values` are left untouched. Because the template is
    scanned once, placeholder-looking text inside a substituted value (for
    example a user description containing "{{industry}}") is never itself
    expanded — unlike a chain of `str.replace` calls.
    """
    return _PLACEHOLDER_RE.sub(
        lambda m: values.get(m.group(1), m.group(0)), template
    )


def diagnose(
    description: str,
    industry: Optional[str],
    scale: Optional[str],
    budget: Optional[str],
    provider: Optional[str] = None,
    anthropic_api_key: Optional[str] = None,
) -> tuple[str, str]:
    """Validate input, call the selected model with the cached system
    block, parse the response, and return (markdown_writeup,
    raw_json_string) for the two Gradio tabs.

    Args:
        description: free-text initiative description; must contain
            between MIN_DESCRIPTION_WORDS and MAX_DESCRIPTION_WORDS
            whitespace-separated words.
        industry, scale, budget: optional clarifiers; a falsy value is
            sent to the model as "(not specified)".
        provider: anthropic | huggingface | zerogpu. Defaults to
            DEFAULT_PROVIDER when not supplied.
        anthropic_api_key: per-call user-supplied key. When provider is
            "anthropic" and the key is non-blank, it is passed directly to
            `_call_anthropic` for this single request and is never written
            to os.environ.

    Returns:
        (markdown, json_text). On success, the model's writeup and the
        validated payload as indented JSON. On any failure, a "⚠ ..."
        message and an empty string.

    Per F14 + contract §2, all error paths surface a user-friendly message
    in the markdown tab and an empty JSON tab; nothing leaks a stack trace.
    """
    description = (description or "").strip()
    if not description:
        return "⚠ Please describe your AI initiative.", ""

    words = len(description.split())
    if words < MIN_DESCRIPTION_WORDS:
        return (
            f"⚠ Please describe your initiative in at least {MIN_DESCRIPTION_WORDS} words "
            f"(you wrote {words}). The diagnostic needs enough context to score the four "
            f"compounding conditions with rationale quoting your description.",
            "",
        )
    if words > MAX_DESCRIPTION_WORDS:
        return (
            f"⚠ Please keep your description under {MAX_DESCRIPTION_WORDS} words "
            f"(you wrote {words}). Shorten the description and try again.",
            "",
        )

    provider = provider or DEFAULT_PROVIDER
    if provider not in PROVIDERS:
        return (
            f"⚠ Unknown model provider {provider!r}. Pick one of "
            f"{sorted(PROVIDERS)} from the dropdown.",
            "",
        )

    # If Premium (Anthropic) is selected, a key must come from somewhere —
    # the page's API-key field (per-call) or an ANTHROPIC_API_KEY env var
    # on the Space. Without either, fail fast before we hit the SDK.
    user_key_for_anthropic: Optional[str] = None
    if provider == "anthropic":
        env_key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
        user_key = (anthropic_api_key or "").strip()
        if not env_key and not user_key:
            return (
                "⚠ Premium (Claude Opus) needs an Anthropic API key. Either "
                "paste your key in the field above, or pick one of the free "
                "options from the model dropdown.",
                "",
            )
        if user_key:
            # IMPORTANT: do NOT write the user-supplied key to os.environ.
            # The process env is shared across all in-flight requests, so
            # that would leak the key to other visitors. It is passed
            # directly to _call_anthropic below instead.
            user_key_for_anthropic = user_key

    # Single-pass substitution: placeholder-looking text typed by the user
    # stays literal instead of being filled in by a later replacement.
    user_prompt = _fill_placeholders(
        PROMPT_TEMPLATE,
        {
            "user_input": description,
            "industry": industry or "(not specified)",
            "scale": scale or "(not specified)",
            "budget": budget or "(not specified)",
        },
    )

    try:
        # When the visitor supplied their own Anthropic key, bypass the
        # generic dispatcher so the key travels as a kwarg. All other
        # paths go through the dispatcher and read credentials from env.
        if provider == "anthropic" and user_key_for_anthropic:
            raw = _call_anthropic(
                SYSTEM_BLOCK, user_prompt, api_key=user_key_for_anthropic,
            )
        else:
            raw = _call_model(SYSTEM_BLOCK, user_prompt, provider)
    except Exception as e:
        # API timeout / rate limit / auth / server / network failure from
        # any backend. Surface both the exception class and its message so
        # failures are diagnosable from the UI without server log access.
        model_label = {
            "anthropic": ANTHROPIC_MODEL_ID,
            "huggingface": HF_MODEL_ID,
            "zerogpu": ZEROGPU_MODEL_ID,
        }.get(provider, provider)
        detail = str(e).strip() or "(no message)"
        # Defense-in-depth: redact the user-supplied key if it appears in
        # the exception text. This MUST run before truncation — a key that
        # straddles the cut-off would otherwise no longer match in full
        # and its leading characters would be shown.
        if user_key_for_anthropic and len(user_key_for_anthropic) >= 8:
            detail = detail.replace(user_key_for_anthropic, "[redacted]")
        # Cap the detail so long error bodies don't flood the markdown tab.
        if len(detail) > 400:
            detail = detail[:400] + "…"
        return (
            f"⚠ The diagnostic call to {provider} ({model_label}) failed.\n\n"
            f"**{type(e).__name__}:** {detail}\n\n"
            f"Try again in a moment, switch providers in the dropdown, "
            f"or shorten your description.",
            "",
        )

    try:
        parsed = parse_response(raw)
    except MalformedResponseError as e:
        return (
            f"⚠ The model returned malformed output. Try again with a different description "
            f"or shorten the existing one.\n\nDetail: {e}",
            "",
        )
    except Exception as e:
        # UI boundary: an unexpected parser failure (e.g. a backend that
        # returned None instead of text) must still become a friendly
        # message rather than a stack trace.
        return (
            f"⚠ The model returned malformed output. Try again with a different description "
            f"or shorten the existing one.\n\nDetail: {type(e).__name__}: {e}",
            "",
        )

    payload = {
        "constraint": parsed.constraint,
        "quadrant": parsed.quadrant,
        "closest_portrait": parsed.closest_portrait,
        "closest_portrait_paragraph": parsed.closest_portrait_paragraph,
        "scores": {
            k: {"score": v.score, "rationale": v.rationale, "quoted_span": v.quoted_span}
            for k, v in parsed.scores.items()
        },
        "warnings": [
            {"text": w.text, "citation_source": w.citation_source, "citation_url": w.citation_url}
            for w in parsed.warnings
        ],
    }
    return parsed.writeup, json.dumps(payload, indent=2)


# ---------------------------------------------------------------------------
# Gradio UI (built lazily so `import app` from tests does not require gradio)
# ---------------------------------------------------------------------------


def build_demo():
    """Build and return the Gradio Blocks UI. Called only by __main__.

    Gradio is imported inside the function so that importing this module
    (e.g. from tests) does not require gradio to be installed.

    Layout, top to bottom: intro markdown, the description textbox, three
    optional clarifier dropdowns, clickable sample initiatives, the model
    dropdown, a password-style Anthropic API-key field, the Diagnose
    button, and two output tabs (markdown writeup + raw JSON).

    Returns:
        The assembled ``gr.Blocks`` instance, not yet launched.
    """
    import gradio as gr

    # Free option first, premium second. Plain-English labels with no
    # ANTHROPIC_API_KEY / SPACE_ID / ZeroGPU jargon — the casual user
    # shouldn't have to know what any of those mean.
    #
    # The HuggingFace Inference Providers backend (provider="huggingface")
    # is intentionally NOT in this dropdown: it requires the Space owner
    # to have HF billing set up (credit card on file OR custom provider
    # API keys), which most Pro users don't have by default. The backend
    # code remains in PROVIDERS so it's reachable via MODEL_PROVIDER env
    # override for users who do set up billing — see README.md.
    provider_choices = []
    if _zerogpu_available():
        provider_choices.append((
            "Free · Phi-4-mini-instruct (Microsoft) — runs on GPU",
            "zerogpu",
        ))
    provider_choices.append((
        "Premium · Claude Opus 4.7 (Anthropic) — paste your API key below",
        "anthropic",
    ))
    # Default to the first listed option: the free model whenever ZeroGPU
    # is available. Premium becomes the default only when it is the sole
    # choice (no ZeroGPU in this environment).
    default_choice = provider_choices[0][1]

    with gr.Blocks(title="The Compounding Test") as demo:
        gr.Markdown(
            "# The Compounding Test\n\n"
            "A diagnostic for AI investments at non-technology companies. "
            "Describe your AI initiative — get a scored writeup in one of "
            "four outcomes: **compounder**, **one-shot win**, **compounding "
            "the wrong thing**, or **Roman Candle**.\n\n"
            "**The default is free** — runs an open model (Phi-4-mini) "
            "on this Space's GPU. Pick **Premium · Claude Opus** from "
            "the dropdown if you have an Anthropic API key and want the "
            "highest-quality writeup. Read the full framework at "
            "[mile-hi.ai/journal/the-berkshire-test]("
            "https://www.mile-hi.ai/journal/the-berkshire-test)."
        )
        with gr.Row():
            description = gr.Textbox(
                label=f"Describe your AI initiative ({MIN_DESCRIPTION_WORDS}–{MAX_DESCRIPTION_WORDS} words)",
                placeholder=(
                    "Describe the bottleneck of your operation, the AI "
                    "investment, what data feeds it, where the labels come "
                    "from, and how you expect competitors to respond. Be "
                    "specific about the workflow.\n\n"
                    "Or pick a sample below to see how it works."
                ),
                lines=12,
            )

        with gr.Row():
            industry = gr.Dropdown(INDUSTRIES, label="Industry (optional)", value=None)
            scale = gr.Dropdown(SCALES, label="Scale (optional)", value=None)
            budget = gr.Dropdown(BUDGETS, label="Budget tier (optional)", value=None)

        # Each example row fills the description plus the three clarifiers.
        gr.Examples(
            examples=[
                [_SAMPLE_COMPOUNDER, "insurance", "business unit", "$1M–$10M"],
                [_SAMPLE_ONE_SHOT_WIN, "banking", "business unit", "$100K–$1M"],
                [_SAMPLE_WRONG_THING, "logistics", "enterprise", "$1M–$10M"],
                [_SAMPLE_ROMAN_CANDLE, "retail", "enterprise", "$100K–$1M"],
            ],
            inputs=[description, industry, scale, budget],
            label="Sample initiatives — click one to load it (then click Diagnose)",
            examples_per_page=4,
        )

        with gr.Row():
            provider = gr.Dropdown(
                choices=provider_choices,
                value=default_choice,
                label="Choose a model",
            )
        # The API-key field appears only when Premium is selected. The
        # key is used per-request and never stored.
        #
        # Initial visibility must follow the default selection: the
        # `change` event below only fires when the dropdown value changes,
        # so if Premium is the default (ZeroGPU unavailable, making it the
        # only choice) a hard-coded visible=False would leave the field
        # hidden with no way for the user to reveal it.
        api_key = gr.Textbox(
            label="Anthropic API key",
            placeholder="sk-ant-...",
            type="password",
            info=(
                "Used only for this request and never stored. "
                "Get a key at console.anthropic.com."
            ),
            visible=(default_choice == "anthropic"),
        )

        def _toggle_api_key(p):
            """Show the API-key field only while the Premium provider is selected."""
            return gr.update(visible=(p == "anthropic"))

        provider.change(_toggle_api_key, inputs=[provider], outputs=[api_key])

        submit = gr.Button("Diagnose", variant="primary")
        with gr.Tabs():
            with gr.Tab("Diagnosis"):
                writeup_out = gr.Markdown()
            with gr.Tab("Raw JSON"):
                json_out = gr.Code(language="json")
        # diagnose receives the six inputs in this order and returns the
        # (writeup, json) pair that fills the two tabs.
        submit.click(
            diagnose,
            inputs=[description, industry, scale, budget, provider, api_key],
            outputs=[writeup_out, json_out],
        )

    return demo


if __name__ == "__main__":
    # Script entry point. For local development the ANTHROPIC_API_KEY is
    # expected in a .env file, read via python-dotenv when that package is
    # installed. On HF Spaces the app relies on Space secrets instead, so
    # a missing dotenv package is deliberately ignored.
    try:
        from dotenv import load_dotenv as _load_env

        _load_env()
    except ImportError:
        pass  # Optional dependency; nothing to load without it.

    app = build_demo()
    app.launch()