"""The Compounding Test — HuggingFace Space.

A single-shot Gradio app that runs an AI-initiative description through
the two-axis Berkshire Test for AI and returns a scored writeup.
Architecture per specs/004-berkshire-test/contracts/hf-space-interface.md:
- Inputs: a description (200–5000 words) + 3 optional clarifiers.
- Three backends, selectable by env (`MODEL_PROVIDER`) or auto-detected
  from available credentials and runtime environment:
    * anthropic   — Claude Opus / Sonnet via the Anthropic SDK;
                    system block is `cache_control:ephemeral` so
                    subsequent calls hit the 5-minute prefix cache.
    * huggingface — Open models (Gemma 2 9B by default, swappable to
                    Phi-4, Llama-3.3, Qwen 2.5, etc.) via the
                    huggingface_hub InferenceClient. Works on HF
                    Spaces with the Space's free inference credits;
                    locally requires HF_TOKEN.
    * zerogpu     — Open model (Phi-4-mini-instruct by default)
                    loaded LOCALLY in the Space via transformers,
                    decorated with `@spaces.GPU` so a HuggingFace
                    Pro plan gets free on-demand A100/H100 GPU
                    allocation per request. No per-call credit burn;
                    no API round-trip. Requires the Space to have a
                    Pro owner; locally falls back to CPU (slow).
- Output: two Gradio tabs — markdown writeup + raw JSON.
Engine/Site boundary (Principle VIII): this app lives in gradio-apps/
only. Never deployed to mile-hi.ai. Reference JSONs are populated by
hand from the published articles — no runtime fetch from the site.
"""
from __future__ import annotations
import json
import os
import re
import textwrap
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
# ---------------------------------------------------------------------------
# Parser surface (covered by test_diagnose.py — module-level, no side effects)
# ---------------------------------------------------------------------------
class MalformedResponseError(Exception):
    """Signal that a model reply could not be turned into a `Response`.

    Raised by the parser when the reply has no JSON block or the block
    violates the expected schema.
    """
# Allowed values for the `quadrant` field of a parsed response.
VALID_QUADRANTS = {"compounder", "one-shot-win", "wrong-thing", "roman-candle"}
# Allowed values for the `closest_portrait` field of a parsed response.
VALID_PORTRAITS = {"progressive", "deere", "mastercard", "mayo"}
# Keys that must all be present under `scores`; `parse_response` walks
# them in this order.
REQUIRED_SCORES = (
    "proprietary_data",
    "self_labeling",
    "decreasing_marginal_cost",
    "defensible_asymmetry",
)
@dataclass
class Score:
    """One scored condition taken from the model's JSON reply."""

    score: int  # parse_response only accepts integers in 0-4
    rationale: str  # copied from the reply without further validation
    quoted_span: str  # parse_response requires a non-empty string, max 400 chars
@dataclass
class Warning:
    """A warning entry from the model's JSON reply, with its citation.

    NOTE(review): this name shadows the builtin `Warning` inside this
    module. It is kept as-is because other code may import it by name.
    """

    text: str
    citation_source: str
    citation_url: str
@dataclass
class Response:
    """Parsed model reply: the JSON fields plus the trailing writeup text."""

    constraint: str
    scores: dict[str, Score]  # one entry per REQUIRED_SCORES key
    quadrant: str  # one of VALID_QUADRANTS
    closest_portrait: str  # one of VALID_PORTRAITS
    closest_portrait_paragraph: str
    warnings: list[Warning]
    writeup: str  # text that follows the JSON block in the raw reply
# Match the FIRST ```json ... ``` fenced block in the response.
# The payload group is non-greedy and re.DOTALL lets it span multiple
# lines, so matching stops at the nearest closing fence.
JSON_BLOCK_RE = re.compile(r"```json\s*\n(.*?)\n\s*```", re.DOTALL)
def parse_response(raw: str) -> Response:
    """Parse a raw model reply into a validated `Response`.

    The first ```json``` fenced block is decoded and checked against the
    schema; the text that follows the block (stripped) becomes `writeup`.

    Args:
        raw: Full text returned by the model.

    Returns:
        A `Response` built from the JSON block and the trailing text.

    Raises:
        MalformedResponseError: if `raw` is not a string, has no JSON
            block, the block does not decode, or any field violates the
            schema (contract:
            specs/004-berkshire-test/contracts/hf-space-interface.md §4).
    """
    # A backend can hand back None (or another non-string); report that
    # through the parser's own error type instead of a TypeError from `re`.
    if not isinstance(raw, str):
        raise MalformedResponseError("No ```json``` block found in response")

    match = JSON_BLOCK_RE.search(raw)
    if not match:
        raise MalformedResponseError("No ```json``` block found in response")

    try:
        data = json.loads(match.group(1))
    except json.JSONDecodeError as e:
        raise MalformedResponseError(f"JSON block did not parse: {e}") from e

    # A top-level array/string/number/null would otherwise surface as a
    # TypeError (or a misleading substring match) in the checks below.
    if not isinstance(data, dict):
        raise MalformedResponseError("JSON block must be a JSON object")

    required = (
        "constraint",
        "scores",
        "quadrant",
        "closest_portrait",
        "closest_portrait_paragraph",
        "warnings",
    )
    for field_name in required:
        if field_name not in data:
            raise MalformedResponseError(f"Missing required field: {field_name}")

    # The isinstance guards keep unhashable values (arrays/objects) from
    # raising TypeError on the set-membership test.
    quadrant = data["quadrant"]
    if not isinstance(quadrant, str) or quadrant not in VALID_QUADRANTS:
        raise MalformedResponseError(
            f"Invalid quadrant: {quadrant!r}; expected one of {sorted(VALID_QUADRANTS)}"
        )
    portrait = data["closest_portrait"]
    if not isinstance(portrait, str) or portrait not in VALID_PORTRAITS:
        raise MalformedResponseError(
            f"Invalid closest_portrait: {portrait!r}; expected one of {sorted(VALID_PORTRAITS)}"
        )

    if not isinstance(data["scores"], dict):
        raise MalformedResponseError("scores must be a JSON object")

    scores: dict[str, Score] = {}
    for key in REQUIRED_SCORES:
        if key not in data["scores"]:
            raise MalformedResponseError(f"Missing score key: {key}")
        s = data["scores"][key]
        if not isinstance(s, dict):
            raise MalformedResponseError(f"Score {key} must be an object")
        for sub in ("score", "rationale", "quoted_span"):
            if sub not in s:
                raise MalformedResponseError(f"Score {key} missing sub-field: {sub}")
        # score must be an int 0-4 (bools are excluded; bool is a subclass of int in Python)
        if isinstance(s["score"], bool) or not isinstance(s["score"], int):
            raise MalformedResponseError(
                f"Score {key}.score must be an integer 0-4, got {type(s['score']).__name__}"
            )
        if s["score"] < 0 or s["score"] > 4:
            raise MalformedResponseError(
                f"Score {key}.score must be in 0-4, got {s['score']}"
            )
        if not isinstance(s["quoted_span"], str) or not s["quoted_span"]:
            raise MalformedResponseError(f"Score {key}.quoted_span must be a non-empty string")
        if len(s["quoted_span"]) > 400:
            raise MalformedResponseError(
                f"Score {key}.quoted_span must be ≤400 chars, got {len(s['quoted_span'])}"
            )
        scores[key] = Score(
            score=s["score"], rationale=s["rationale"], quoted_span=s["quoted_span"]
        )

    if not isinstance(data["warnings"], list):
        raise MalformedResponseError("warnings must be a JSON array")

    warnings = []
    for w in data["warnings"]:
        # Entries are read with .get(); anything but an object would raise
        # AttributeError rather than the documented error type.
        if not isinstance(w, dict):
            raise MalformedResponseError("Each warning must be a JSON object")
        warnings.append(
            Warning(
                text=w.get("text", ""),
                citation_source=w.get("citation_source", ""),
                citation_url=w.get("citation_url", ""),
            )
        )

    writeup = raw[match.end():].strip()
    return Response(
        constraint=data["constraint"],
        scores=scores,
        quadrant=quadrant,
        closest_portrait=portrait,
        closest_portrait_paragraph=data["closest_portrait_paragraph"],
        warnings=warnings,
        writeup=writeup,
    )
# ---------------------------------------------------------------------------
# Configuration (env-driven; see .env.example)
# ---------------------------------------------------------------------------
# Directory containing this file; the prompt template and reference JSONs
# are resolved relative to it.
ROOT = Path(__file__).parent
# NOTE: the Anthropic model id is read from the `MODEL_ID` env var (there
# is no `ANTHROPIC_MODEL_ID` variable).
ANTHROPIC_MODEL_ID = os.environ.get("MODEL_ID", "claude-opus-4-7")
HF_MODEL_ID = os.environ.get("HF_MODEL_ID", "google/gemma-2-9b-it")
ZEROGPU_MODEL_ID = os.environ.get("ZEROGPU_MODEL_ID", "microsoft/Phi-4-mini-instruct")
# ZeroGPU reserves this many seconds from the Space owner's daily quota
# per request. With the pre-load pattern below (model on CPU at module
# init, .to('cuda') + inference inside @spaces.GPU), per-call cost is
# only ~10-25s wall-clock. 45s gives generous margin while squeezing
# ~2.5x more submissions per quota window vs the original 120s.
# Pro-tier max is 120s; raise via env if you need bigger headroom.
# int() raises ValueError at import time if the env value is not numeric.
ZEROGPU_DURATION_SECONDS = int(os.environ.get("ZEROGPU_DURATION_SECONDS", "45"))
# Word-count bounds for the submitted description. Only the upper bound is
# env-configurable; the lower bound is fixed.
MAX_DESCRIPTION_WORDS = int(os.environ.get("MAX_DESCRIPTION_WORDS", "5000"))
MIN_DESCRIPTION_WORDS = 200
# ZeroGPU availability is detected at import time. The `spaces` package
# is HuggingFace's runtime for on-demand GPU allocation; `transformers`
# + `torch` are required to actually load and run the model. All three
# must be importable for the zerogpu backend to function.
# Only ImportError is treated as "not available"; any other exception
# raised while importing these packages propagates.
try:
    import spaces as _spaces
    import torch as _torch
    from transformers import AutoModelForCausalLM as _AutoModelForCausalLM
    from transformers import AutoTokenizer as _AutoTokenizer

    _ZEROGPU_DEPS_AVAILABLE = True
except ImportError:
    _ZEROGPU_DEPS_AVAILABLE = False
def _zerogpu_available() -> bool:
    """Report whether the zerogpu backend's dependencies were importable.

    This is a function rather than a direct read of the flag so tests can
    monkeypatch the answer without touching the real torch/transformers
    imports.
    """
    return _ZEROGPU_DEPS_AVAILABLE
# ---------------------------------------------------------------------------
# Provider abstraction (anthropic vs huggingface — selectable at runtime)
# ---------------------------------------------------------------------------
def _detect_provider(env=None) -> str:
"""Pick a model provider from env. Order of precedence:
1. Explicit MODEL_PROVIDER (anthropic | huggingface | zerogpu).
2. Running on a HuggingFace Space (SPACE_ID set) AND the zerogpu
deps (spaces + transformers + torch) are importable β zerogpu.
This is the Pro-plan free-GPU path.
3. Presence of ANTHROPIC_API_KEY β anthropic.
4. Presence of HF_TOKEN / HUGGING_FACE_HUB_TOKEN, or running on
a HuggingFace Space without zerogpu deps β huggingface.
5. Fall through to anthropic (call-time error will tell the user
which env to set).
"""
env = env if env is not None else os.environ
explicit = env.get("MODEL_PROVIDER", "").strip().lower()
if explicit in ("anthropic", "huggingface", "zerogpu"):
return explicit
if env.get("SPACE_ID") and _zerogpu_available():
return "zerogpu"
if env.get("ANTHROPIC_API_KEY"):
return "anthropic"
if (
env.get("HF_TOKEN")
or env.get("HUGGING_FACE_HUB_TOKEN")
or env.get("SPACE_ID")
):
return "huggingface"
return "anthropic"
def _call_anthropic(system_block: str, user_prompt: str, *, api_key: Optional[str] = None) -> str:
    """Send one request to the Anthropic backend and return the reply text.

    The system block is sent with an ephemeral `cache_control` marker; the
    user prompt is sent fresh each time.

    Args:
        system_block: System prompt text.
        user_prompt: The single user message.
        api_key: Optional per-call key. When truthy it is handed straight
            to the SDK constructor and this function never writes it to
            os.environ. That matters on a multi-tenant public Space, where
            mutating env would leak one visitor's key into another
            visitor's concurrent request. When omitted, the client is
            built with no arguments and the SDK falls back to
            ANTHROPIC_API_KEY from env (the Space owner's key path).

    Returns:
        The `text` of the first content block in the response.
    """
    from anthropic import Anthropic

    if api_key:
        client = Anthropic(api_key=api_key)
    else:
        client = Anthropic()

    cached_system = {
        "type": "text",
        "text": system_block,
        "cache_control": {"type": "ephemeral"},
    }
    user_turn = {"role": "user", "content": user_prompt}

    reply = client.messages.create(
        model=ANTHROPIC_MODEL_ID,
        max_tokens=2500,
        system=[cached_system],
        messages=[user_turn],
    )
    first_block = reply.content[0]
    return first_block.text
def _call_huggingface(system_block: str, user_prompt: str) -> str:
    """Run one chat completion through the HuggingFace backend.

    Uses the unified chat_completion interface, which routes through HF
    Inference Providers and supports Gemma 2, Phi-4-mini-instruct,
    Llama-3.3, Qwen 2.5, and many others. Temperature is set to 0.2 to
    keep JSON output stable; smaller open models can be looser than
    Claude on schema adherence.

    The token is looked up in order: HF_TOKEN env var,
    HUGGING_FACE_HUB_TOKEN env var, then `huggingface_hub.get_token()`
    (a `hf auth login`-stored token). HF Spaces do NOT auto-inject a token
    on public Spaces; the Space owner has to add it as a Space secret.

    Args:
        system_block: System prompt text.
        user_prompt: The single user message.

    Returns:
        The message content of the first choice in the completion.

    Raises:
        RuntimeError: if no token is found, or if the provider reports
            that the configured model is not supported (chained to the
            original provider error).
        Exception: any other error from the client is re-raised unchanged.
    """
    from huggingface_hub import InferenceClient, get_token

    token = (
        os.environ.get("HF_TOKEN")
        or os.environ.get("HUGGING_FACE_HUB_TOKEN")
        or get_token()  # checks ~/.cache/huggingface/token from `hf auth login`
    )
    if not token:
        raise RuntimeError(
            "No HuggingFace token found. The Space owner needs to add HF_TOKEN "
            "as a Space secret (Settings → Repository secrets → New secret → "
            "name: HF_TOKEN, value: a User Access Token from "
            "https://huggingface.co/settings/tokens). Then restart the Space. "
            "Until then, pick a different model from the dropdown."
        )

    # `provider="auto"` opts into the modern HF Inference Providers
    # routing layer (introduced 2024-Q4), which picks the right partner
    # (featherless-ai / together-ai / hf-inference / etc.) for the model
    # automatically. Without this flag, InferenceClient falls back to
    # the legacy hf-inference-only path, which doesn't serve most newer
    # models and returns a misleading "model not supported" error even
    # when the user has all providers enabled and access to the model.
    client = InferenceClient(
        model=HF_MODEL_ID,
        token=token,
        provider="auto",
        timeout=120,
    )
    try:
        resp = client.chat_completion(
            messages=[
                {"role": "system", "content": system_block},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=2500,
            temperature=0.2,
        )
    except Exception as e:
        msg = str(e)
        # HF Inference Providers routes each model through a partner
        # (featherless-ai, together-ai, hf-inference, etc.). If none of
        # the enabled providers serves the requested model, the API
        # returns a BadRequestError with code=model_not_supported. The
        # raw error is opaque to users, so re-raise with the actual fix
        # instead of the unhelpful default message. `from e` keeps the
        # provider's error attached as the explicit cause for debugging.
        if "model_not_supported" in msg or "not supported by any provider" in msg:
            raise RuntimeError(
                f"The model '{HF_MODEL_ID}' isn't available through any of "
                "the HuggingFace Inference Providers enabled on your account. "
                "Two fixes: (a) enable a provider that supports this model at "
                "https://huggingface.co/settings/inference-providers, OR "
                "(b) set HF_MODEL_ID as a Space variable to a model on your "
                "enabled providers — microsoft/Phi-4-mini-instruct works "
                "broadly via featherless-ai."
            ) from e
        raise
    return resp.choices[0].message.content
# ZeroGPU backend — pre-load pattern.
#
# Model is loaded onto CPU at Space startup (module init), NOT inside
# `@spaces.GPU`. This is the documented HuggingFace ZeroGPU pattern:
#   - Module init runs once at Space startup, on CPU, with no GPU
#     quota consumed. The expensive part — downloading ~7.6GB of
#     safetensors and deserializing into PyTorch state — happens here.
#   - Inside `@spaces.GPU`, all we do is `.to('cuda')` + tokenize +
#     generate + decode. Wall-clock drops to ~10-15s warm, ~20-25s
#     after Space restart (the .to('cuda') for 7.6GB takes a few
#     seconds over PCIe).
#
# Why deliberately NOT `trust_remote_code=True`. Phi-4-mini-instruct's
# architecture is `phi3`, which transformers 4.46+ supports natively
# via `Phi3ForCausalLM` — no custom code download required. The custom
# modeling code that ships with the model on HF Hub (`modeling_phi3.py`)
# imports `LossKwargs` from `transformers.utils`, which was removed in
# transformers 4.57+ — loading WITH `trust_remote_code=True` fails
# with `ImportError: cannot import name 'LossKwargs' from
# 'transformers.utils'` and bricks the `@spaces.GPU` worker. The
# native path avoids the upstream pin-mismatch entirely.
#
# Tradeoff: ~30-60s slower Space cold-start (the one-time CPU load).
# Acceptable because Spaces only restart on deploy or after a long
# idle period. Worth it for the 2.5x quota efficiency.
#
# When the optional deps are missing, both globals are set to None so the
# names always exist at module level.
if _ZEROGPU_DEPS_AVAILABLE:
    _zerogpu_tokenizer = _AutoTokenizer.from_pretrained(ZEROGPU_MODEL_ID)
    _zerogpu_model = _AutoModelForCausalLM.from_pretrained(
        ZEROGPU_MODEL_ID,
        torch_dtype=_torch.bfloat16,
        # NO device_map — load to CPU; we move to GPU per-call inside
        # @spaces.GPU. ZeroGPU has no GPU available at module load.
    )
else:
    _zerogpu_tokenizer = None
    _zerogpu_model = None
def _zerogpu_invoke(system_block: str, user_prompt: str) -> str:
    """Run chat-template inference with the pre-loaded ZeroGPU model.

    Moves the module-level model to "cuda", tokenizes the system + user
    messages through the tokenizer's chat template, samples up to 2500
    new tokens at temperature 0.2, and decodes the tokens after the
    prompt.

    Reads the module-level globals `_zerogpu_tokenizer` and
    `_zerogpu_model`, which tests monkeypatch to fake the transformers
    types. Kept separate from the `@spaces.GPU`-decorated wrapper so it
    can be unit-tested without allocating a GPU.

    Args:
        system_block: System prompt text.
        user_prompt: The single user message.

    Returns:
        The decoded completion, with special tokens skipped.
    """
    device = "cuda"
    # The weights are already loaded in-process; this only transfers them
    # to the device and involves no download or deserialization.
    _zerogpu_model.to(device)

    chat = [
        {"role": "system", "content": system_block},
        {"role": "user", "content": user_prompt},
    ]
    prompt_ids = _zerogpu_tokenizer.apply_chat_template(
        chat,
        return_tensors="pt",
        add_generation_prompt=True,
    )
    prompt_ids = prompt_ids.to(device)

    generated = _zerogpu_model.generate(
        prompt_ids,
        max_new_tokens=2500,
        temperature=0.2,
        do_sample=True,
        pad_token_id=_zerogpu_tokenizer.eos_token_id,
    )

    # Drop the first `prompt length` positions so only the tokens after
    # the prompt are decoded.
    completion_ids = generated[0][prompt_ids.shape[1]:]
    return _zerogpu_tokenizer.decode(completion_ids, skip_special_tokens=True)
# `_call_zerogpu` is defined exactly once at import time: a stub that
# raises when the optional deps are missing, otherwise the GPU-decorated
# wrapper around `_zerogpu_invoke`.
if not _ZEROGPU_DEPS_AVAILABLE:

    def _call_zerogpu(system_block: str, user_prompt: str) -> str:
        """Stub used when the ZeroGPU deps are missing; always raises RuntimeError."""
        raise RuntimeError(
            "ZeroGPU backend requires `spaces`, `transformers`, and `torch` "
            "to be importable AND should be run on a HuggingFace Pro Space "
            "for free on-demand GPU. Install the full requirements.txt and "
            "deploy to a Space, or pick anthropic / huggingface from the "
            "provider dropdown."
        )

else:

    @_spaces.GPU(duration=ZEROGPU_DURATION_SECONDS)
    def _call_zerogpu(system_block: str, user_prompt: str) -> str:
        """ZeroGPU backend entry point.

        Thin wrapper that delegates to the testable `_zerogpu_invoke`, so
        the `@spaces.GPU` decorator is applied once at module load. The
        model is whatever ZEROGPU_MODEL_ID points at.
        """
        return _zerogpu_invoke(system_block, user_prompt)
# Provider name -> backend callable. The keys are the set of valid provider
# names; each callable is invoked as fn(system_block, user_prompt).
PROVIDERS = {
    "anthropic": _call_anthropic,
    "huggingface": _call_huggingface,
    "zerogpu": _call_zerogpu,
}
def _call_model(system_block: str, user_prompt: str, provider: str) -> str:
    """Invoke the backend registered under `provider` and return its text.

    Args:
        system_block: System prompt text passed to the backend.
        user_prompt: User message passed to the backend.
        provider: A key of `PROVIDERS`.

    Raises:
        ValueError: if `provider` is not a key of `PROVIDERS`. Callers are
            expected to validate before calling.
    """
    if provider in PROVIDERS:
        backend = PROVIDERS[provider]
        return backend(system_block, user_prompt)
    raise ValueError(
        f"Unknown provider: {provider!r}; expected one of {sorted(PROVIDERS)}"
    )
# Auto-detected once at module import; the Gradio UI exposes a runtime
# override via the Provider dropdown.
# Called with no argument, so detection reads os.environ as it is at import.
DEFAULT_PROVIDER = _detect_provider()
# Choice lists; presumably the options for the three optional clarifier
# inputs (industry / scale / budget) — confirm against the UI wiring.
INDUSTRIES = [
    "insurance", "banking", "healthcare", "retail", "manufacturing",
    "logistics", "agriculture", "energy", "telecom", "media",
    "professional services", "real estate", "other",
]
SCALES = ["pilot", "department", "business unit", "enterprise"]
# Range labels use an en dash; the previous text carried a mis-encoded
# character in its place.
BUDGETS = ["<$100K", "$100K–$1M", "$1M–$10M", ">$10M"]
# ---------------------------------------------------------------------------
# Sample initiatives (gr.Examples) — one per verdict quadrant
# ---------------------------------------------------------------------------
# Realistic ~250–400-word AI-initiative descriptions that should land in a
# specific quadrant of the 2×2 verdict matrix. Used to seed user testing
# and give first-time visitors something concrete to click.
_SAMPLE_COMPOUNDER = (
"We're a regional commercial insurance carrier specializing in restaurant "
"general liability. We write about 8,000 policies a year across the "
"Midwest, with average annual premium around $4,500. Underwriting is "
"the bottleneck of our business β independent agents wait 36 to 48 "
"hours for a quote because our underwriters manually pull industry "
"codes, loss runs, and prior-carrier history from three different "
"systems and then decide whether to bind, decline, or refer. Roughly "
"30% of submissions get declined and another 15% are referred to "
"senior underwriters, which adds another day. We're deploying an "
"LLM-powered underwriting assistant that pulls the data automatically, "
"flags risk factors based on patterns in our 12-year claims database, "
"and proposes a base rate with an explanation. The underwriter "
"reviews, adjusts, and approves. Every policy we write generates new "
"claim outcomes β fires, slip-and-falls, liquor-liability claims, "
"food-poisoning suits β and those outcomes feed back into the next "
"quarter's model retraining. Our competitors mostly use Verisk's "
"industry-standard rating models, which we don't share data with, so "
"our model gets better on our specific book of business while theirs "
"reflects the industry average. Internal goal: cut time-to-quote from "
"36 hours to 4 hours, increase the win rate on profitable risks by "
"15%, and progressively shift the loss ratio by 1β2 points per year "
"as the model learns from each renewal cycle. Independent agents have "
"already started favoring carriers with faster quote turnaround."
)
_SAMPLE_ONE_SHOT_WIN = (
"We're a community bank with $4B in assets and 38 branches across two "
"states. Loan officers spend about 6 hours per commercial loan "
"reviewing financial statements, tax returns, and corporate documents "
"before they can write the credit memo. We're deploying GPT-4 to "
"extract key fields β revenue, EBITDA, debt service coverage ratio, "
"ownership structure, related-party transactions, collateral "
"descriptions β from these documents into a structured form. The loan "
"officer reviews the extraction and writes the credit memo by hand. "
"We expect to cut document review time from 6 hours to about 90 "
"minutes per loan, processing roughly 2,400 commercial loans a year. "
"The vendor provides the model, the document templates, and the "
"extraction prompts, and is selling the same system to four of our "
"peer community banks in the region under identical contracts. The AI "
"doesn't learn from the outcome of the loan: defaults, prepayments, "
"modifications, restructurings all go into our separate loan "
"servicing system, which has never connected back to the extraction "
"model. The vendor's three-year roadmap doesn't include any feedback "
"loop between loan performance and the model β they treat extraction "
"as a deterministic task. We're funding the project from the "
"operations budget; the credit team is excited about the time savings "
"but the chief credit officer has flagged that the productivity gain "
"will be one-time and won't show up in the loss-given-default rate "
"over time."
)
_SAMPLE_WRONG_THING = (
"We're a third-party logistics provider with 8 warehouses on the East "
"Coast handling about 20,000 orders a day across the network. We're "
"investing in computer vision software to optimize order picking "
"routes β the AI looks at the warehouse layout, current orders, and "
"worker positions and suggests optimized pick paths in real time. "
"Pilot results show a 12% reduction in steps per order on the test "
"floor. Our operations team has been excited about this for 18 months "
"and we just signed a multi-year contract with the vendor. Some "
"context on the operation: our warehouses run 2 shifts. Order volume "
"in shift 1 is around 14,000 orders per day; shift 2 is around 6,000. "
"The pick wave finishes by 2pm on shift 1, then workers wait 4 to 5 "
"hours for shift 2 trucks to arrive at the loading docks. The trucks "
"are scheduled by the customer (a major national retailer) and arrive "
"in unpredictable windows between 6pm and 10pm. We don't control the "
"truck schedule and the customer won't share their advance schedule "
"with us. The CFO has been asking us why total throughput per "
"warehouse hasn't moved much in three years; our answer has been that "
"the legacy warehouse management system is the constraint, which is "
"why we're investing in better picking AI. Same-store labor cost is "
"up 8% year-over-year because workers are paid through the idle hours."
)
_SAMPLE_ROMAN_CANDLE = (
"We run a chain of 220 quick-service restaurants across the Southeast "
"doing about $480M in annual revenue. Our gross margin has been under "
"pressure from rising ingredient costs and we're rolling out an "
"AI-powered personalized marketing platform that sends customized "
"email and SMS offers based on customer purchase history, location, "
"and local weather. The platform is from a major QSR-tech vendor used "
"by several of our direct competitors in the same markets we operate "
"in. Our customer data β names, emails, phone numbers, purchase "
"frequency, average ticket size β lives in our point-of-sale "
"provider's cloud, which the marketing platform pulls from via the "
"POS provider's standard integration. Both the purchase data feed and "
"the modeling are the vendor's stack; we don't see the underlying "
"model and our data is commingled with other QSR brands the vendor "
"serves on a shared inference fleet. We expect to lift email "
"click-through by 8β12% based on the vendor's benchmark studies of "
"similar brands. The marketing team is running the rollout; finance "
"signed off on the multi-year subscription. We have not measured what "
"is actually constraining same-store sales growth β drive-thru wait "
"times, menu pricing relative to local competitors, or breakfast "
"daypart penetration β we just know revenue has been flat for two "
"years and the board wants visible action by Q4."
)
def _load_reference() -> tuple[str, str]:
    """Read the prompt template + reference JSONs from disk and build the system block.

    Reads `prompts/diagnose.txt`, `reference/portraits.json` and
    `reference/failure-modes.json` under `ROOT`, renders each JSON list as
    a text block, and substitutes the blocks for the
    `{{portraits_block}}` and `{{failure_modes_block}}` placeholders in
    the template.

    Returns:
        `(prompt_template, system_block)`: the raw template text and the
        template with both placeholders filled in.

    Raises:
        OSError: if a file cannot be read.
        json.JSONDecodeError: if a reference file is not valid JSON.
        KeyError: if a reference entry lacks one of the keys rendered below.
    """
    # Decode explicitly as UTF-8: read_text() otherwise uses the platform's
    # default encoding, which would garble non-ASCII text in these files on
    # hosts whose locale is not UTF-8.
    prompt_template = (ROOT / "prompts" / "diagnose.txt").read_text(encoding="utf-8")
    portraits = json.loads(
        (ROOT / "reference" / "portraits.json").read_text(encoding="utf-8")
    )
    failure_modes = json.loads(
        (ROOT / "reference" / "failure-modes.json").read_text(encoding="utf-8")
    )

    # NOTE(review): the two-space indent under each "- id:" line is an
    # assumed list-item layout — confirm against the intended prompt format.
    # dedent() runs after interpolation, so a value containing a newline
    # would change the margin that gets stripped.
    portraits_block = "\n".join(
        textwrap.dedent(f"""\
            - id: {p['id']}
              label: {p['label']}
              bottleneck: {p['bottleneck']}
              summary: {p['summary']}
              compounding_summary: {p['compounding_summary']}
              article_url: {p['article_url']}
            """).rstrip()
        for p in portraits
    )
    failure_modes_block = "\n".join(
        textwrap.dedent(f"""\
            - id: {f['id']}
              label: {f['label']}
              applies_to_quadrants: {', '.join(f['applies_to_quadrants'])}
              summary: {f['summary']}
              url: {f['url']}
            """).rstrip()
        for f in failure_modes
    )
    system_block = (
        prompt_template
        .replace("{{portraits_block}}", portraits_block)
        .replace("{{failure_modes_block}}", failure_modes_block)
    )
    return prompt_template, system_block
# Loaded once at module import; cached in memory for the life of the process.
# Because this runs at import, a missing or invalid prompt/reference file
# makes importing this module fail.
PROMPT_TEMPLATE, SYSTEM_BLOCK = _load_reference()
# ---------------------------------------------------------------------------
# Diagnose entrypoint (called by the Gradio Submit handler)
# ---------------------------------------------------------------------------
def diagnose(
description: str,
industry: Optional[str],
scale: Optional[str],
budget: Optional[str],
provider: Optional[str] = None,
anthropic_api_key: Optional[str] = None,
) -> tuple[str, str]:
"""Validate input, call the selected model with the cached system
block, parse the response, and return (markdown_writeup,
raw_json_string) for the two Gradio tabs.
`provider` (anthropic | huggingface | zerogpu) defaults to
DEFAULT_PROVIDER when not supplied β the Gradio dropdown always
supplies it on a real submission.
`anthropic_api_key` is a per-call user-supplied key. When provider
is "anthropic" and the key is provided, it overrides any
ANTHROPIC_API_KEY env var for this single request. The key is never
persisted (Anthropic SDK uses it once and the client object is
garbage-collected at function exit).
Per F14 + contract Β§2, all error paths surface a user-friendly message
in the markdown tab and an empty JSON tab; nothing leaks a stack trace.
"""
description = (description or "").strip()
words = len(description.split())
if not description:
return "β Please describe your AI initiative.", ""
if words < MIN_DESCRIPTION_WORDS:
return (
f"β Please describe your initiative in at least {MIN_DESCRIPTION_WORDS} words "
f"(you wrote {words}). The diagnostic needs enough context to score the four "
f"compounding conditions with rationale quoting your description.",
"",
)
if words > MAX_DESCRIPTION_WORDS:
return (
f"β Please keep your description under {MAX_DESCRIPTION_WORDS} words "
f"(you wrote {words}). Shorten the description and try again.",
"",
)
provider = provider or DEFAULT_PROVIDER
if provider not in PROVIDERS:
return (
f"β Unknown model provider {provider!r}. Pick one of "
f"{sorted(PROVIDERS)} from the dropdown.",
"",
)
# If Premium (Anthropic) is selected, the user must supply a key β
# either via the page's API-key field (per-call) or via an
# ANTHROPIC_API_KEY env var on the Space. Without either, fail fast
# with a friendly explanation before we hit the SDK.
user_key_for_anthropic: Optional[str] = None
if provider == "anthropic":
env_key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
user_key = (anthropic_api_key or "").strip()
if not env_key and not user_key:
return (
"β Premium (Claude Opus) needs an Anthropic API key. Either "
"paste your key in the field above, or pick one of the free "
"options from the model dropdown.",
"",
)
if user_key:
# IMPORTANT: do NOT write the user-supplied key to os.environ.
# That would leak the key into concurrent requests from other
# visitors on this Space (the process env is shared across
# all in-flight requests in the Python worker). Instead we
# pass it directly to _call_anthropic below, which scopes it
# to a single SDK client instance that's garbage-collected
# when the call returns.
user_key_for_anthropic = user_key
user_prompt = (
PROMPT_TEMPLATE
.replace("{{user_input}}", description)
.replace("{{industry}}", industry or "(not specified)")
.replace("{{scale}}", scale or "(not specified)")
.replace("{{budget}}", budget or "(not specified)")
)
try:
# When the visitor supplied their own Anthropic key, bypass the
# generic dispatcher so we can pass the key directly via kwarg
# without ever touching os.environ. All other paths go through
# the dispatcher and read credentials from env as usual.
if provider == "anthropic" and user_key_for_anthropic:
raw = _call_anthropic(
SYSTEM_BLOCK, user_prompt, api_key=user_key_for_anthropic,
)
else:
raw = _call_model(SYSTEM_BLOCK, user_prompt, provider)
except Exception as e:
# API timeout / rate limit / auth / server / network failure
# (Anthropic SDK, huggingface_hub InferenceClient, or
# transformers/torch on the zerogpu path). Include both the
# exception class AND its string form so unexpected failures
# are diagnosable from the UI without server log access.
model_label = {
"anthropic": ANTHROPIC_MODEL_ID,
"huggingface": HF_MODEL_ID,
"zerogpu": ZEROGPU_MODEL_ID,
}.get(provider, provider)
detail = str(e).strip() or "(no message)"
# Cap the detail so we don't spill multi-paragraph tracebacks
# into the UI. 400 chars is enough for a stack-trace summary
# without flooding the markdown tab.
if len(detail) > 400:
detail = detail[:400] + "β¦"
# Defense-in-depth: if the user-supplied Anthropic key somehow
# appears in the exception message (no current SDK version does
# this, but a future debug-mode override might), redact it
# before surfacing the writeup. Symmetric with redactKey() in
# src/lib/anthropic-direct.ts.
if user_key_for_anthropic and len(user_key_for_anthropic) >= 8:
detail = detail.replace(user_key_for_anthropic, "[redacted]")
return (
f"β The diagnostic call to {provider} ({model_label}) failed.\n\n"
f"**{type(e).__name__}:** {detail}\n\n"
f"Try again in a moment, switch providers in the dropdown, "
f"or shorten your description.",
"",
)
try:
parsed = parse_response(raw)
except MalformedResponseError as e:
return (
f"β The model returned malformed output. Try again with a different description "
f"or shorten the existing one.\n\nDetail: {e}",
"",
)
payload = {
"constraint": parsed.constraint,
"quadrant": parsed.quadrant,
"closest_portrait": parsed.closest_portrait,
"closest_portrait_paragraph": parsed.closest_portrait_paragraph,
"scores": {
k: {"score": v.score, "rationale": v.rationale, "quoted_span": v.quoted_span}
for k, v in parsed.scores.items()
},
"warnings": [
{"text": w.text, "citation_source": w.citation_source, "citation_url": w.citation_url}
for w in parsed.warnings
],
}
return parsed.writeup, json.dumps(payload, indent=2)
# ---------------------------------------------------------------------------
# Gradio UI (built lazily so `import app` from tests does not require gradio)
# ---------------------------------------------------------------------------
def build_demo():
    """Build and return the Gradio Blocks UI for the diagnostic.

    Gradio is imported inside the function so that importing this module
    (for example from tests) does not require gradio to be installed.

    Returns:
        The ``gr.Blocks`` instance. The ``__main__`` block calls
        ``.launch()`` on it.
    """
    import gradio as gr

    # Free option first, premium second. Plain-English labels with no
    # ANTHROPIC_API_KEY / SPACE_ID / ZeroGPU jargon: the casual user
    # shouldn't have to know what any of those mean.
    #
    # The HuggingFace Inference Providers backend (provider="huggingface")
    # is intentionally NOT in this dropdown: it requires the Space owner
    # to have HF billing set up (credit card on file OR custom provider
    # API keys), which most Pro users don't have by default. The backend
    # code remains in PROVIDERS so it's reachable via MODEL_PROVIDER env
    # override for users who do set up billing; see README.md.
    provider_choices = []
    if _zerogpu_available():
        provider_choices.append((
            "Free Β· Phi-4-mini-instruct (Microsoft) β runs on GPU",
            "zerogpu",
        ))
    provider_choices.append((
        "Premium Β· Claude Opus 4.7 (Anthropic) β paste your API key below",
        "anthropic",
    ))

    # Default to the first entry: the free option when ZeroGPU is available,
    # otherwise Premium (the only remaining choice).
    default_choice = provider_choices[0][1]

    with gr.Blocks(title="The Compounding Test") as demo:
        gr.Markdown(
            "# The Compounding Test\n\n"
            "A diagnostic for AI investments at non-technology companies. "
            "Describe your AI initiative β get a scored writeup in one of "
            "four outcomes: **compounder**, **one-shot win**, **compounding "
            "the wrong thing**, or **Roman Candle**.\n\n"
            "**The default is free** β runs an open model (Phi-4-mini) "
            "on this Space's GPU. Pick **Premium Β· Claude Opus** from "
            "the dropdown if you have an Anthropic API key and want the "
            "highest-quality writeup. Read the full framework at "
            "[mile-hi.ai/journal/the-berkshire-test]("
            "https://www.mile-hi.ai/journal/the-berkshire-test)."
        )

        with gr.Row():
            description = gr.Textbox(
                label=f"Describe your AI initiative ({MIN_DESCRIPTION_WORDS}β{MAX_DESCRIPTION_WORDS} words)",
                placeholder=(
                    "Describe the bottleneck of your operation, the AI "
                    "investment, what data feeds it, where the labels come "
                    "from, and how you expect competitors to respond. Be "
                    "specific about the workflow.\n\n"
                    "Or pick a sample below to see how it works."
                ),
                lines=12,
            )

        with gr.Row():
            industry = gr.Dropdown(INDUSTRIES, label="Industry (optional)", value=None)
            scale = gr.Dropdown(SCALES, label="Scale (optional)", value=None)
            budget = gr.Dropdown(BUDGETS, label="Budget tier (optional)", value=None)

        # Each example row fills the description plus the three dropdowns.
        gr.Examples(
            examples=[
                [_SAMPLE_COMPOUNDER, "insurance", "business unit", "$1Mβ$10M"],
                [_SAMPLE_ONE_SHOT_WIN, "banking", "business unit", "$100Kβ$1M"],
                [_SAMPLE_WRONG_THING, "logistics", "enterprise", "$1Mβ$10M"],
                [_SAMPLE_ROMAN_CANDLE, "retail", "enterprise", "$100Kβ$1M"],
            ],
            inputs=[description, industry, scale, budget],
            label="Sample initiatives β click one to load it (then click Diagnose)",
            examples_per_page=4,
        )

        with gr.Row():
            provider = gr.Dropdown(
                choices=provider_choices,
                value=default_choice,
                label="Choose a model",
            )

        # The API-key field is shown only while Premium is selected. Its
        # initial visibility follows the default choice: when Premium is the
        # only option the dropdown's change event can never fire, so a field
        # that started hidden would stay hidden and no key could be entered.
        # NOTE(review): placement of this field relative to the dropdown's
        # Row could not be confirmed from the copy under review.
        api_key = gr.Textbox(
            label="Anthropic API key",
            placeholder="sk-ant-...",
            type="password",
            info=(
                "Used only for this request and never stored. "
                "Get a key at console.anthropic.com."
            ),
            visible=(default_choice == "anthropic"),
        )

        def _toggle_api_key(p):
            # Show the key field only for the Anthropic provider.
            return gr.update(visible=(p == "anthropic"))

        provider.change(_toggle_api_key, inputs=[provider], outputs=[api_key])

        submit = gr.Button("Diagnose", variant="primary")

        with gr.Tabs():
            with gr.Tab("Diagnosis"):
                writeup_out = gr.Markdown()
            with gr.Tab("Raw JSON"):
                json_out = gr.Code(language="json")

        # Input order matches diagnose()'s positional parameters.
        submit.click(
            diagnose,
            inputs=[description, industry, scale, budget, provider, api_key],
            outputs=[writeup_out, json_out],
        )

    return demo
if __name__ == "__main__":
    # Script entry point. ANTHROPIC_API_KEY comes from the environment: a
    # local .env file during development (loaded by python-dotenv when it is
    # installed) or Space secrets when hosted on HF Spaces.
    try:
        from dotenv import load_dotenv
        load_dotenv()
    except ImportError:
        # python-dotenv is optional; without it the process environment is
        # used as-is.
        pass
    demo_app = build_demo()
    demo_app.launch()
|