Spaces:
Running on Zero
Running on Zero
"""The Compounding Test — HuggingFace Space.

A single-shot Gradio app that runs an AI-initiative description through
the two-axis Berkshire Test for AI and returns a scored writeup.

Architecture per specs/004-berkshire-test/contracts/hf-space-interface.md:

- Inputs: a description (200–5000 words) + 3 optional clarifiers.
- Three backends, selectable by env (`MODEL_PROVIDER`) or auto-detected
  from available credentials and runtime environment:
    * anthropic   — Claude Opus / Sonnet via the Anthropic SDK;
                    system block is `cache_control:ephemeral` so
                    subsequent calls hit the 5-minute prefix cache.
    * huggingface — Open models (Gemma 2 9B by default, swappable to
                    Phi-4, Llama-3.3, Qwen 2.5, etc.) via the
                    huggingface_hub InferenceClient. Works on HF
                    Spaces with the Space's free inference credits;
                    locally requires HF_TOKEN.
    * zerogpu     — Open model (Phi-4-mini-instruct by default)
                    loaded LOCALLY in the Space via transformers,
                    decorated with `@spaces.GPU` so a HuggingFace
                    Pro plan gets free on-demand A100/H100 GPU
                    allocation per request. No per-call credit burn;
                    no API round-trip. Requires the Space to have a
                    Pro owner; locally falls back to CPU (slow).
- Output: two Gradio tabs — markdown writeup + raw JSON.

Engine/Site boundary (Principle VIII): this app lives in gradio-apps/
only. Never deployed to mile-hi.ai. Reference JSONs are populated by
hand from the published articles — no runtime fetch from the site.
"""
| from __future__ import annotations | |
| import json | |
| import os | |
| import re | |
| import textwrap | |
| from dataclasses import dataclass, field | |
| from pathlib import Path | |
| from typing import Optional | |
| # --------------------------------------------------------------------------- | |
# Parser surface (covered by test_diagnose.py — module-level, no side effects)
| # --------------------------------------------------------------------------- | |
class MalformedResponseError(Exception):
    """Raised when the model's response cannot be parsed into a Response."""


# Closed vocabularies the model is allowed to emit for the verdict fields.
VALID_QUADRANTS = {"compounder", "one-shot-win", "wrong-thing", "roman-candle"}
VALID_PORTRAITS = {"progressive", "deere", "mastercard", "mayo"}

# Every one of these keys must be present under "scores".
REQUIRED_SCORES = (
    "proprietary_data",
    "self_labeling",
    "decreasing_marginal_cost",
    "defensible_asymmetry",
)


# The three record types below are constructed with keyword arguments by
# parse_response(), so they must be dataclasses: a bare annotated class
# has no generated __init__ and `Score(score=...)` raises TypeError.
@dataclass
class Score:
    """One scored condition: an integer 0-4, its rationale, and the quote
    from the user's description that supports it."""

    score: int
    rationale: str
    quoted_span: str


@dataclass
class Warning:  # NOTE: shadows the builtin `Warning`; name kept for callers.
    """A warning attached to the verdict, with its citation."""

    text: str
    citation_source: str
    citation_url: str


@dataclass
class Response:
    """Validated model output plus the trailing markdown writeup."""

    constraint: str
    scores: dict  # str -> Score (one entry per REQUIRED_SCORES key)
    quadrant: str
    closest_portrait: str
    closest_portrait_paragraph: str
    warnings: list  # list[Warning]
    writeup: str


# Match the FIRST ```json ... ``` fenced block in the response.
JSON_BLOCK_RE = re.compile(r"```json\s*\n(.*?)\n\s*```", re.DOTALL)


def _parse_score(key: str, s: object) -> Score:
    """Validate one entry of the `scores` object and convert it to a Score.

    Raises MalformedResponseError if the entry is not an object, lacks a
    sub-field, has a non-integer or out-of-range score, or has an empty or
    over-long quoted_span.
    """
    if not isinstance(s, dict):
        raise MalformedResponseError(f"Score {key} must be an object")
    for sub in ("score", "rationale", "quoted_span"):
        if sub not in s:
            raise MalformedResponseError(f"Score {key} missing sub-field: {sub}")

    value = s["score"]
    # score must be an int 0-4 (bools are excluded; bool is a subclass of
    # int in Python, so `true` would otherwise pass as 1).
    if isinstance(value, bool) or not isinstance(value, int):
        raise MalformedResponseError(
            f"Score {key}.score must be an integer 0-4, got {type(value).__name__}"
        )
    if not 0 <= value <= 4:
        raise MalformedResponseError(
            f"Score {key}.score must be in 0-4, got {value}"
        )

    span = s["quoted_span"]
    if not isinstance(span, str) or not span:
        raise MalformedResponseError(
            f"Score {key}.quoted_span must be a non-empty string"
        )
    if len(span) > 400:
        raise MalformedResponseError(
            f"Score {key}.quoted_span must be ≤400 chars, got {len(span)}"
        )
    return Score(score=value, rationale=s["rationale"], quoted_span=span)


def parse_response(raw: str) -> Response:
    """Extract the first ```json``` block from `raw`, validate the schema,
    and return a populated Response. Trailing markdown becomes `writeup`.

    Raises MalformedResponseError on any schema violation per the contract
    in specs/004-berkshire-test/contracts/hf-space-interface.md §4. Shape
    errors (top-level value not an object, non-string quadrant, warning
    entry not an object) are reported the same way rather than leaking a
    TypeError/AttributeError to the caller.
    """
    match = JSON_BLOCK_RE.search(raw)
    if not match:
        raise MalformedResponseError("No ```json``` block found in response")

    try:
        data = json.loads(match.group(1))
    except json.JSONDecodeError as e:
        raise MalformedResponseError(f"JSON block did not parse: {e}") from e

    # A fenced block holding a list/string/number is valid JSON but not a
    # valid response; subscripting it by field name would raise TypeError.
    if not isinstance(data, dict):
        raise MalformedResponseError("JSON block must be a JSON object")

    required = (
        "constraint",
        "scores",
        "quadrant",
        "closest_portrait",
        "closest_portrait_paragraph",
        "warnings",
    )
    for field_name in required:
        if field_name not in data:
            raise MalformedResponseError(f"Missing required field: {field_name}")

    # isinstance guard first: an unhashable value (list/dict) would make
    # the set-membership test itself raise TypeError.
    quadrant = data["quadrant"]
    if not isinstance(quadrant, str) or quadrant not in VALID_QUADRANTS:
        raise MalformedResponseError(
            f"Invalid quadrant: {quadrant!r}; expected one of {sorted(VALID_QUADRANTS)}"
        )
    portrait = data["closest_portrait"]
    if not isinstance(portrait, str) or portrait not in VALID_PORTRAITS:
        raise MalformedResponseError(
            f"Invalid closest_portrait: {portrait!r}; expected one of {sorted(VALID_PORTRAITS)}"
        )

    if not isinstance(data["scores"], dict):
        raise MalformedResponseError("scores must be a JSON object")
    scores: dict[str, Score] = {}
    for key in REQUIRED_SCORES:
        if key not in data["scores"]:
            raise MalformedResponseError(f"Missing score key: {key}")
        scores[key] = _parse_score(key, data["scores"][key])

    if not isinstance(data["warnings"], list):
        raise MalformedResponseError("warnings must be a JSON array")
    warnings = []
    for index, w in enumerate(data["warnings"]):
        if not isinstance(w, dict):
            raise MalformedResponseError(f"warnings[{index}] must be an object")
        # Missing warning sub-fields are tolerated and default to "".
        warnings.append(
            Warning(
                text=w.get("text", ""),
                citation_source=w.get("citation_source", ""),
                citation_url=w.get("citation_url", ""),
            )
        )

    # Everything after the closing fence is the human-readable writeup.
    writeup = raw[match.end():].strip()
    return Response(
        constraint=data["constraint"],
        scores=scores,
        quadrant=quadrant,
        closest_portrait=portrait,
        closest_portrait_paragraph=data["closest_portrait_paragraph"],
        warnings=warnings,
        writeup=writeup,
    )
| # --------------------------------------------------------------------------- | |
| # Configuration (env-driven; see .env.example) | |
| # --------------------------------------------------------------------------- | |
# Directory containing this module; prompt and reference files are
# resolved relative to it.
ROOT = Path(__file__).parent

# Model identifiers, one per backend, each overridable from the environment.
# Note the Anthropic override is read from MODEL_ID (not ANTHROPIC_MODEL_ID).
ANTHROPIC_MODEL_ID = os.getenv("MODEL_ID", "claude-opus-4-7")
HF_MODEL_ID = os.getenv("HF_MODEL_ID", "google/gemma-2-9b-it")
ZEROGPU_MODEL_ID = os.getenv("ZEROGPU_MODEL_ID", "microsoft/Phi-4-mini-instruct")

# ZeroGPU reserves this many seconds from the Space owner's daily quota
# per request. With the pre-load pattern (model on CPU at module init,
# .to('cuda') + inference inside @spaces.GPU), per-call cost is only
# ~10-25s wall-clock. 45s gives generous margin while squeezing ~2.5x
# more submissions per quota window vs the original 120s. Pro-tier max
# is 120s; raise via env if you need bigger headroom.
ZEROGPU_DURATION_SECONDS = int(os.getenv("ZEROGPU_DURATION_SECONDS", "45"))

# Accepted description length, in whitespace-separated words. Only the
# upper bound is configurable from the environment.
MAX_DESCRIPTION_WORDS = int(os.getenv("MAX_DESCRIPTION_WORDS", "5000"))
MIN_DESCRIPTION_WORDS = 200
# ZeroGPU availability is probed once, at import time. `spaces` is
# HuggingFace's runtime for on-demand GPU allocation; `transformers` and
# `torch` are needed to actually load and run the model. The backend is
# usable only if all three import cleanly.
_ZEROGPU_DEPS_AVAILABLE = False
try:
    import spaces as _spaces
    import torch as _torch
    from transformers import AutoModelForCausalLM as _AutoModelForCausalLM
    from transformers import AutoTokenizer as _AutoTokenizer
except ImportError:
    # Any missing dependency leaves the flag False; the zerogpu backend
    # then reports itself as unavailable instead of failing at import.
    pass
else:
    _ZEROGPU_DEPS_AVAILABLE = True


def _zerogpu_available() -> bool:
    """Report whether the zerogpu backend's dependencies were importable.

    Exposed as a function (rather than reading the flag directly) so tests
    can monkeypatch the answer without touching the real torch /
    transformers imports.
    """
    return _ZEROGPU_DEPS_AVAILABLE
| # --------------------------------------------------------------------------- | |
# Provider abstraction (anthropic vs huggingface vs zerogpu — selectable at runtime)
| # --------------------------------------------------------------------------- | |
def _detect_provider(env=None) -> str:
    """Choose the model provider from an environment mapping.

    `env` defaults to `os.environ`; any mapping with `.get` works, which
    is what tests pass in. Rules, first match wins:

    1. MODEL_PROVIDER, if it names a known provider (anthropic |
       huggingface | zerogpu; case and surrounding whitespace ignored).
       Any other value is ignored and detection continues.
    2. SPACE_ID set AND the zerogpu deps (spaces + transformers + torch)
       importable -> zerogpu. This is the Pro-plan free-GPU path.
    3. ANTHROPIC_API_KEY set -> anthropic.
    4. HF_TOKEN or HUGGING_FACE_HUB_TOKEN set, or running on a Space
       without zerogpu deps -> huggingface.
    5. Otherwise anthropic (the call-time error will tell the user which
       env var to set).
    """
    if env is None:
        env = os.environ

    requested = env.get("MODEL_PROVIDER", "").strip().lower()
    if requested in ("anthropic", "huggingface", "zerogpu"):
        return requested

    on_space = env.get("SPACE_ID")
    if on_space and _zerogpu_available():
        return "zerogpu"

    if env.get("ANTHROPIC_API_KEY"):
        return "anthropic"

    has_hf_token = env.get("HF_TOKEN") or env.get("HUGGING_FACE_HUB_TOKEN")
    if has_hf_token or on_space:
        return "huggingface"

    return "anthropic"
def _call_anthropic(system_block: str, user_prompt: str, *, api_key: Optional[str] = None) -> str:
    """Anthropic backend: send one message and return the raw assistant text.

    The system block is marked `cache_control: ephemeral`; the user prompt
    is sent fresh on every call.

    `api_key` is an optional per-call key. When given, it is handed
    straight to the SDK client constructor and is NEVER written to
    os.environ — on a multi-tenant public Space, mutating the process
    env would leak one visitor's key into a concurrent request from
    another visitor. When it is None (or empty), the client is built with
    no arguments and the SDK reads ANTHROPIC_API_KEY from env (the
    Space-owner's key path).
    """
    from anthropic import Anthropic

    client_kwargs = {"api_key": api_key} if api_key else {}
    client = Anthropic(**client_kwargs)

    cached_system = [
        {
            "type": "text",
            "text": system_block,
            "cache_control": {"type": "ephemeral"},
        }
    ]
    reply = client.messages.create(
        model=ANTHROPIC_MODEL_ID,
        max_tokens=2500,
        system=cached_system,
        messages=[{"role": "user", "content": user_prompt}],
    )
    first_block = reply.content[0]
    return first_block.text
def _call_huggingface(system_block: str, user_prompt: str) -> str:
    """HuggingFace backend: one chat completion, returned as raw text.

    Uses the unified chat_completion interface, which routes through HF
    Inference Providers and supports Gemma 2, Phi-4-mini-instruct,
    Llama-3.3, Qwen 2.5, and many others. Temperature is pinned low (0.2)
    to keep JSON output stable — smaller open models can be looser than
    Claude on schema adherence.

    Requires an HF token: HF_TOKEN env var, HUGGING_FACE_HUB_TOKEN env
    var, or a `hf auth login`-stored token (huggingface_hub.get_token()
    checks all three sources). HF Spaces do NOT auto-inject a token on
    public Spaces — the Space owner has to add it as a Space secret.

    Returns:
        The assistant message content, or "" if the provider returned no
        content. Returning "" rather than None keeps the `-> str`
        contract, so the downstream parser reports a malformed response
        instead of raising TypeError on a None input.

    Raises:
        RuntimeError: no token could be found, or none of the enabled
            inference providers serves HF_MODEL_ID.
        Exception: any other client error is re-raised unchanged.
    """
    from huggingface_hub import InferenceClient, get_token

    token = (
        os.environ.get("HF_TOKEN")
        or os.environ.get("HUGGING_FACE_HUB_TOKEN")
        or get_token()  # checks ~/.cache/huggingface/token from `hf auth login`
    )
    if not token:
        raise RuntimeError(
            "No HuggingFace token found. The Space owner needs to add HF_TOKEN "
            "as a Space secret (Settings → Repository secrets → New secret → "
            "name: HF_TOKEN, value: a User Access Token from "
            "https://huggingface.co/settings/tokens). Then restart the Space. "
            "Until then, pick a different model from the dropdown."
        )

    # `provider="auto"` opts into the modern HF Inference Providers
    # routing layer (introduced 2024-Q4), which picks the right partner
    # (featherless-ai / together-ai / hf-inference / etc.) for the model
    # automatically. Without this flag, InferenceClient falls back to
    # the legacy hf-inference-only path, which doesn't serve most newer
    # models and returns a misleading "model not supported" error even
    # when the user has all providers enabled and access to the model.
    client = InferenceClient(
        model=HF_MODEL_ID,
        token=token,
        provider="auto",
        timeout=120,
    )
    try:
        resp = client.chat_completion(
            messages=[
                {"role": "system", "content": system_block},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=2500,
            temperature=0.2,
        )
    except Exception as e:
        msg = str(e)
        # HF Inference Providers routes each model through a partner
        # (featherless-ai, together-ai, hf-inference, etc.). If none of
        # the enabled providers serves the requested model, the API
        # returns a BadRequestError with code=model_not_supported. The
        # raw error is opaque to users, so re-raise with the actual fix
        # instead of the unhelpful default message.
        if "model_not_supported" in msg or "not supported by any provider" in msg:
            raise RuntimeError(
                f"The model '{HF_MODEL_ID}' isn't available through any of "
                f"the HuggingFace Inference Providers enabled on your account. "
                f"Two fixes: (a) enable a provider that supports this model at "
                f"https://huggingface.co/settings/inference-providers, OR "
                f"(b) set HF_MODEL_ID as a Space variable to a model on your "
                f"enabled providers — microsoft/Phi-4-mini-instruct works "
                f"broadly via featherless-ai."
            ) from e
        raise

    # `content` can be None on an empty completion; normalize to "".
    return resp.choices[0].message.content or ""
# ZeroGPU backend — pre-load pattern.
#
# The model is loaded onto CPU at Space startup (module init), NOT inside
# `@spaces.GPU`. This is the documented HuggingFace ZeroGPU pattern:
#   - Module init runs once at Space startup, on CPU, with no GPU quota
#     consumed. The expensive part — downloading ~7.6GB of safetensors
#     and deserializing into PyTorch state — happens here.
#   - Inside `@spaces.GPU`, all we do is `.to('cuda')` + tokenize +
#     generate + decode. Wall-clock drops to ~10-15s warm, ~20-25s after
#     a Space restart (the .to('cuda') for 7.6GB takes a few seconds
#     over PCIe).
#
# Why deliberately NOT `trust_remote_code=True`: Phi-4-mini-instruct's
# architecture is `phi3`, which transformers 4.46+ supports natively via
# `Phi3ForCausalLM` — no custom code download required. The custom
# modeling code that ships with the model on HF Hub (`modeling_phi3.py`)
# imports `LossKwargs` from `transformers.utils`, which was removed in
# transformers 4.57+ — loading WITH `trust_remote_code=True` fails with
# `ImportError: cannot import name 'LossKwargs' from 'transformers.utils'`
# and bricks the `@spaces.GPU` worker. The native path avoids the
# upstream pin-mismatch entirely.
#
# Tradeoff: ~30-60s slower Space cold-start (the one-time CPU load).
# Acceptable because Spaces only restart on deploy or after a long idle
# period. Worth it for the 2.5x quota efficiency.

# Default to "not loaded"; both names always exist so the inference path
# and tests can reference (or monkeypatch) them unconditionally.
_zerogpu_tokenizer = None
_zerogpu_model = None

if _ZEROGPU_DEPS_AVAILABLE:
    _zerogpu_tokenizer = _AutoTokenizer.from_pretrained(ZEROGPU_MODEL_ID)
    # No device_map: load to CPU and move to GPU per-call inside
    # @spaces.GPU. ZeroGPU has no GPU available at module load.
    _zerogpu_model = _AutoModelForCausalLM.from_pretrained(
        ZEROGPU_MODEL_ID,
        torch_dtype=_torch.bfloat16,
    )
def _zerogpu_invoke(system_block: str, user_prompt: str) -> str:
    """Run one chat-template generation on the pre-loaded ZeroGPU model.

    Reads the module-level `_zerogpu_tokenizer` and `_zerogpu_model` at
    call time (tests monkeypatch those globals with fakes), moves the
    model to "cuda", generates up to 2500 new tokens with sampling at
    temperature 0.2, and returns only the newly generated text with
    special tokens stripped.

    Kept separate from the `@spaces.GPU`-decorated wrapper so it can be
    unit-tested without actually allocating a GPU.
    """
    tokenizer = _zerogpu_tokenizer
    model = _zerogpu_model

    # The weights are already deserialized on CPU; this is only the
    # transfer to the GPU that @spaces.GPU allocated — no download.
    model.to("cuda")

    chat = [
        {"role": "system", "content": system_block},
        {"role": "user", "content": user_prompt},
    ]
    prompt_ids = tokenizer.apply_chat_template(
        chat,
        return_tensors="pt",
        add_generation_prompt=True,
    ).to("cuda")

    generated = model.generate(
        prompt_ids,
        max_new_tokens=2500,
        temperature=0.2,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() output starts with the prompt tokens; slice them off so
    # only the completion is decoded.
    completion_ids = generated[0][prompt_ids.shape[1]:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)
if _ZEROGPU_DEPS_AVAILABLE:

    # The decorator is what makes ZeroGPU attach a GPU for the duration
    # of the call; without it `_zerogpu_invoke`'s `.to("cuda")` runs with
    # no device allocated. `duration` is the per-request reservation (in
    # seconds) taken from the Space owner's quota.
    @_spaces.GPU(duration=ZEROGPU_DURATION_SECONDS)
    def _call_zerogpu(system_block: str, user_prompt: str) -> str:
        """ZeroGPU backend. Runs chat-template inference with
        Phi-4-mini-instruct (or whatever ZEROGPU_MODEL_ID points at) on
        the GPU allocated for this call. Thin wrapper around the testable
        `_zerogpu_invoke` so the decorator is applied once, at module
        load time."""
        return _zerogpu_invoke(system_block, user_prompt)

else:

    def _call_zerogpu(system_block: str, user_prompt: str) -> str:
        """Placeholder used when the ZeroGPU dependencies are missing.

        Always raises RuntimeError with instructions, so selecting the
        zerogpu provider fails with an actionable message rather than a
        NameError/ImportError.
        """
        raise RuntimeError(
            "ZeroGPU backend requires `spaces`, `transformers`, and `torch` "
            "to be importable AND should be run on a HuggingFace Pro Space "
            "for free on-demand GPU. Install the full requirements.txt and "
            "deploy to a Space, or pick anthropic / huggingface from the "
            "provider dropdown."
        )
# Dispatch table: provider name -> backend callable taking
# (system_block, user_prompt) and returning the raw model text.
PROVIDERS = {
    "anthropic": _call_anthropic,
    "huggingface": _call_huggingface,
    "zerogpu": _call_zerogpu,
}


def _call_model(system_block: str, user_prompt: str, provider: str) -> str:
    """Invoke the backend registered under `provider` and return its text.

    Raises ValueError if `provider` is not a key of PROVIDERS; callers
    are expected to validate before calling.
    """
    backend = PROVIDERS.get(provider)
    if backend is None:
        raise ValueError(
            f"Unknown provider: {provider!r}; expected one of {sorted(PROVIDERS)}"
        )
    return backend(system_block, user_prompt)


# Auto-detected once at module import; the Gradio UI exposes a runtime
# override via the Provider dropdown.
DEFAULT_PROVIDER = _detect_provider()
# Choices for the three optional clarifier inputs.
INDUSTRIES = [
    "insurance", "banking", "healthcare", "retail", "manufacturing",
    "logistics", "agriculture", "energy", "telecom", "media",
    "professional services", "real estate", "other",
]
SCALES = ["pilot", "department", "business unit", "enterprise"]
# Ranges use an en dash; the previous labels carried a mis-encoded
# character in its place.
BUDGETS = ["<$100K", "$100K–$1M", "$1M–$10M", ">$10M"]
| # --------------------------------------------------------------------------- | |
# Sample initiatives (gr.Examples) — one per verdict quadrant
# ---------------------------------------------------------------------------
# Realistic ~250–400-word AI-initiative descriptions that should land in a
# specific quadrant of the 2×2 verdict matrix. Used to seed user testing
# and give first-time visitors something concrete to click.
# Example description intended to land in the "compounder" quadrant.
# Mis-encoded dash characters in the text have been restored (em dashes
# for asides, en dash for the numeric range).
_SAMPLE_COMPOUNDER = (
    "We're a regional commercial insurance carrier specializing in restaurant "
    "general liability. We write about 8,000 policies a year across the "
    "Midwest, with average annual premium around $4,500. Underwriting is "
    "the bottleneck of our business — independent agents wait 36 to 48 "
    "hours for a quote because our underwriters manually pull industry "
    "codes, loss runs, and prior-carrier history from three different "
    "systems and then decide whether to bind, decline, or refer. Roughly "
    "30% of submissions get declined and another 15% are referred to "
    "senior underwriters, which adds another day. We're deploying an "
    "LLM-powered underwriting assistant that pulls the data automatically, "
    "flags risk factors based on patterns in our 12-year claims database, "
    "and proposes a base rate with an explanation. The underwriter "
    "reviews, adjusts, and approves. Every policy we write generates new "
    "claim outcomes — fires, slip-and-falls, liquor-liability claims, "
    "food-poisoning suits — and those outcomes feed back into the next "
    "quarter's model retraining. Our competitors mostly use Verisk's "
    "industry-standard rating models, which we don't share data with, so "
    "our model gets better on our specific book of business while theirs "
    "reflects the industry average. Internal goal: cut time-to-quote from "
    "36 hours to 4 hours, increase the win rate on profitable risks by "
    "15%, and progressively shift the loss ratio by 1–2 points per year "
    "as the model learns from each renewal cycle. Independent agents have "
    "already started favoring carriers with faster quote turnaround."
)
# Example description intended to land in the "one-shot-win" quadrant.
# Mis-encoded em dashes in the text have been restored.
_SAMPLE_ONE_SHOT_WIN = (
    "We're a community bank with $4B in assets and 38 branches across two "
    "states. Loan officers spend about 6 hours per commercial loan "
    "reviewing financial statements, tax returns, and corporate documents "
    "before they can write the credit memo. We're deploying GPT-4 to "
    "extract key fields — revenue, EBITDA, debt service coverage ratio, "
    "ownership structure, related-party transactions, collateral "
    "descriptions — from these documents into a structured form. The loan "
    "officer reviews the extraction and writes the credit memo by hand. "
    "We expect to cut document review time from 6 hours to about 90 "
    "minutes per loan, processing roughly 2,400 commercial loans a year. "
    "The vendor provides the model, the document templates, and the "
    "extraction prompts, and is selling the same system to four of our "
    "peer community banks in the region under identical contracts. The AI "
    "doesn't learn from the outcome of the loan: defaults, prepayments, "
    "modifications, restructurings all go into our separate loan "
    "servicing system, which has never connected back to the extraction "
    "model. The vendor's three-year roadmap doesn't include any feedback "
    "loop between loan performance and the model — they treat extraction "
    "as a deterministic task. We're funding the project from the "
    "operations budget; the credit team is excited about the time savings "
    "but the chief credit officer has flagged that the productivity gain "
    "will be one-time and won't show up in the loss-given-default rate "
    "over time."
)
# Example description intended to land in the "wrong-thing" quadrant.
# The mis-encoded em dash in the text has been restored.
_SAMPLE_WRONG_THING = (
    "We're a third-party logistics provider with 8 warehouses on the East "
    "Coast handling about 20,000 orders a day across the network. We're "
    "investing in computer vision software to optimize order picking "
    "routes — the AI looks at the warehouse layout, current orders, and "
    "worker positions and suggests optimized pick paths in real time. "
    "Pilot results show a 12% reduction in steps per order on the test "
    "floor. Our operations team has been excited about this for 18 months "
    "and we just signed a multi-year contract with the vendor. Some "
    "context on the operation: our warehouses run 2 shifts. Order volume "
    "in shift 1 is around 14,000 orders per day; shift 2 is around 6,000. "
    "The pick wave finishes by 2pm on shift 1, then workers wait 4 to 5 "
    "hours for shift 2 trucks to arrive at the loading docks. The trucks "
    "are scheduled by the customer (a major national retailer) and arrive "
    "in unpredictable windows between 6pm and 10pm. We don't control the "
    "truck schedule and the customer won't share their advance schedule "
    "with us. The CFO has been asking us why total throughput per "
    "warehouse hasn't moved much in three years; our answer has been that "
    "the legacy warehouse management system is the constraint, which is "
    "why we're investing in better picking AI. Same-store labor cost is "
    "up 8% year-over-year because workers are paid through the idle hours."
)
# Example description intended to land in the "roman-candle" quadrant.
# Mis-encoded dash characters in the text have been restored (em dashes
# for asides, en dash for the numeric range).
_SAMPLE_ROMAN_CANDLE = (
    "We run a chain of 220 quick-service restaurants across the Southeast "
    "doing about $480M in annual revenue. Our gross margin has been under "
    "pressure from rising ingredient costs and we're rolling out an "
    "AI-powered personalized marketing platform that sends customized "
    "email and SMS offers based on customer purchase history, location, "
    "and local weather. The platform is from a major QSR-tech vendor used "
    "by several of our direct competitors in the same markets we operate "
    "in. Our customer data — names, emails, phone numbers, purchase "
    "frequency, average ticket size — lives in our point-of-sale "
    "provider's cloud, which the marketing platform pulls from via the "
    "POS provider's standard integration. Both the purchase data feed and "
    "the modeling are the vendor's stack; we don't see the underlying "
    "model and our data is commingled with other QSR brands the vendor "
    "serves on a shared inference fleet. We expect to lift email "
    "click-through by 8–12% based on the vendor's benchmark studies of "
    "similar brands. The marketing team is running the rollout; finance "
    "signed off on the multi-year subscription. We have not measured what "
    "is actually constraining same-store sales growth — drive-thru wait "
    "times, menu pricing relative to local competitors, or breakfast "
    "daypart penetration — we just know revenue has been flat for two "
    "years and the board wants visible action by Q4."
)
def _format_reference_entry(pairs) -> str:
    """Render (key, value) pairs as one YAML-style list item.

    The first pair is prefixed with "- "; the rest are indented two
    spaces to align under it.
    NOTE(review): the continuation indent is reconstructed — confirm it
    matches what prompts/diagnose.txt expects.
    """
    return "- " + "\n  ".join(f"{key}: {value}" for key, value in pairs)


def _load_reference(root: Optional[Path] = None):
    """Read the prompt template + reference JSONs from disk at app start.

    Args:
        root: directory holding `prompts/diagnose.txt`,
            `reference/portraits.json` and `reference/failure-modes.json`.
            Defaults to the module-level ROOT; the parameter exists so the
            loader can be pointed at a fixture directory.

    Returns:
        `(prompt_template, system_block)`: the template text exactly as
        read, and the same text with `{{portraits_block}}` and
        `{{failure_modes_block}}` replaced by the rendered reference data.

    Raises:
        OSError if a file is missing or unreadable, json.JSONDecodeError on
        invalid JSON, KeyError if an entry lacks an expected field.
    """
    base = ROOT if root is None else Path(root)

    # Decode explicitly as UTF-8: the platform default encoding is
    # locale-dependent and would garble non-ASCII text in these files.
    prompt_template = (base / "prompts" / "diagnose.txt").read_text(encoding="utf-8")
    portraits = json.loads(
        (base / "reference" / "portraits.json").read_text(encoding="utf-8")
    )
    failure_modes = json.loads(
        (base / "reference" / "failure-modes.json").read_text(encoding="utf-8")
    )

    # Entries are assembled line by line instead of dedenting an f-string
    # after interpolation: a value containing a newline would change the
    # common indent and leave the template's indentation in the prompt.
    portraits_block = "\n".join(
        _format_reference_entry(
            [
                ("id", p["id"]),
                ("label", p["label"]),
                ("bottleneck", p["bottleneck"]),
                ("summary", p["summary"]),
                ("compounding_summary", p["compounding_summary"]),
                ("article_url", p["article_url"]),
            ]
        )
        for p in portraits
    )
    failure_modes_block = "\n".join(
        _format_reference_entry(
            [
                ("id", f["id"]),
                ("label", f["label"]),
                ("applies_to_quadrants", ", ".join(f["applies_to_quadrants"])),
                ("summary", f["summary"]),
                ("url", f["url"]),
            ]
        )
        for f in failure_modes
    )

    system_block = (
        prompt_template
        .replace("{{portraits_block}}", portraits_block)
        .replace("{{failure_modes_block}}", failure_modes_block)
    )
    return prompt_template, system_block
# Loaded once at module import; cached in memory for the life of the process.
# PROMPT_TEMPLATE is the template text as read from disk; SYSTEM_BLOCK is
# that same text with the portraits / failure-modes placeholders filled in.
# A missing or malformed prompt/reference file therefore fails at import,
# not on the first request.
PROMPT_TEMPLATE, SYSTEM_BLOCK = _load_reference()
| # --------------------------------------------------------------------------- | |
| # Diagnose entrypoint (called by the Gradio Submit handler) | |
| # --------------------------------------------------------------------------- | |
| def diagnose( | |
| description: str, | |
| industry: Optional[str], | |
| scale: Optional[str], | |
| budget: Optional[str], | |
| provider: Optional[str] = None, | |
| anthropic_api_key: Optional[str] = None, | |
| ) -> tuple[str, str]: | |
| """Validate input, call the selected model with the cached system | |
| block, parse the response, and return (markdown_writeup, | |
| raw_json_string) for the two Gradio tabs. | |
| `provider` (anthropic | huggingface | zerogpu) defaults to | |
| DEFAULT_PROVIDER when not supplied β the Gradio dropdown always | |
| supplies it on a real submission. | |
| `anthropic_api_key` is a per-call user-supplied key. When provider | |
| is "anthropic" and the key is provided, it overrides any | |
| ANTHROPIC_API_KEY env var for this single request. The key is never | |
| persisted (Anthropic SDK uses it once and the client object is | |
| garbage-collected at function exit). | |
| Per F14 + contract Β§2, all error paths surface a user-friendly message | |
| in the markdown tab and an empty JSON tab; nothing leaks a stack trace. | |
| """ | |
| description = (description or "").strip() | |
| words = len(description.split()) | |
| if not description: | |
| return "β Please describe your AI initiative.", "" | |
| if words < MIN_DESCRIPTION_WORDS: | |
| return ( | |
| f"β Please describe your initiative in at least {MIN_DESCRIPTION_WORDS} words " | |
| f"(you wrote {words}). The diagnostic needs enough context to score the four " | |
| f"compounding conditions with rationale quoting your description.", | |
| "", | |
| ) | |
| if words > MAX_DESCRIPTION_WORDS: | |
| return ( | |
| f"β Please keep your description under {MAX_DESCRIPTION_WORDS} words " | |
| f"(you wrote {words}). Shorten the description and try again.", | |
| "", | |
| ) | |
| provider = provider or DEFAULT_PROVIDER | |
| if provider not in PROVIDERS: | |
| return ( | |
| f"β Unknown model provider {provider!r}. Pick one of " | |
| f"{sorted(PROVIDERS)} from the dropdown.", | |
| "", | |
| ) | |
| # If Premium (Anthropic) is selected, the user must supply a key β | |
| # either via the page's API-key field (per-call) or via an | |
| # ANTHROPIC_API_KEY env var on the Space. Without either, fail fast | |
| # with a friendly explanation before we hit the SDK. | |
| user_key_for_anthropic: Optional[str] = None | |
| if provider == "anthropic": | |
| env_key = os.environ.get("ANTHROPIC_API_KEY", "").strip() | |
| user_key = (anthropic_api_key or "").strip() | |
| if not env_key and not user_key: | |
| return ( | |
| "β Premium (Claude Opus) needs an Anthropic API key. Either " | |
| "paste your key in the field above, or pick one of the free " | |
| "options from the model dropdown.", | |
| "", | |
| ) | |
| if user_key: | |
| # IMPORTANT: do NOT write the user-supplied key to os.environ. | |
| # That would leak the key into concurrent requests from other | |
| # visitors on this Space (the process env is shared across | |
| # all in-flight requests in the Python worker). Instead we | |
| # pass it directly to _call_anthropic below, which scopes it | |
| # to a single SDK client instance that's garbage-collected | |
| # when the call returns. | |
| user_key_for_anthropic = user_key | |
| user_prompt = ( | |
| PROMPT_TEMPLATE | |
| .replace("{{user_input}}", description) | |
| .replace("{{industry}}", industry or "(not specified)") | |
| .replace("{{scale}}", scale or "(not specified)") | |
| .replace("{{budget}}", budget or "(not specified)") | |
| ) | |
| try: | |
| # When the visitor supplied their own Anthropic key, bypass the | |
| # generic dispatcher so we can pass the key directly via kwarg | |
| # without ever touching os.environ. All other paths go through | |
| # the dispatcher and read credentials from env as usual. | |
| if provider == "anthropic" and user_key_for_anthropic: | |
| raw = _call_anthropic( | |
| SYSTEM_BLOCK, user_prompt, api_key=user_key_for_anthropic, | |
| ) | |
| else: | |
| raw = _call_model(SYSTEM_BLOCK, user_prompt, provider) | |
| except Exception as e: | |
| # API timeout / rate limit / auth / server / network failure | |
| # (Anthropic SDK, huggingface_hub InferenceClient, or | |
| # transformers/torch on the zerogpu path). Include both the | |
| # exception class AND its string form so unexpected failures | |
| # are diagnosable from the UI without server log access. | |
| model_label = { | |
| "anthropic": ANTHROPIC_MODEL_ID, | |
| "huggingface": HF_MODEL_ID, | |
| "zerogpu": ZEROGPU_MODEL_ID, | |
| }.get(provider, provider) | |
| detail = str(e).strip() or "(no message)" | |
| # Cap the detail so we don't spill multi-paragraph tracebacks | |
| # into the UI. 400 chars is enough for a stack-trace summary | |
| # without flooding the markdown tab. | |
| if len(detail) > 400: | |
| detail = detail[:400] + "β¦" | |
| # Defense-in-depth: if the user-supplied Anthropic key somehow | |
| # appears in the exception message (no current SDK version does | |
| # this, but a future debug-mode override might), redact it | |
| # before surfacing the writeup. Symmetric with redactKey() in | |
| # src/lib/anthropic-direct.ts. | |
| if user_key_for_anthropic and len(user_key_for_anthropic) >= 8: | |
| detail = detail.replace(user_key_for_anthropic, "[redacted]") | |
| return ( | |
| f"β The diagnostic call to {provider} ({model_label}) failed.\n\n" | |
| f"**{type(e).__name__}:** {detail}\n\n" | |
| f"Try again in a moment, switch providers in the dropdown, " | |
| f"or shorten your description.", | |
| "", | |
| ) | |
| try: | |
| parsed = parse_response(raw) | |
| except MalformedResponseError as e: | |
| return ( | |
| f"β The model returned malformed output. Try again with a different description " | |
| f"or shorten the existing one.\n\nDetail: {e}", | |
| "", | |
| ) | |
| payload = { | |
| "constraint": parsed.constraint, | |
| "quadrant": parsed.quadrant, | |
| "closest_portrait": parsed.closest_portrait, | |
| "closest_portrait_paragraph": parsed.closest_portrait_paragraph, | |
| "scores": { | |
| k: {"score": v.score, "rationale": v.rationale, "quoted_span": v.quoted_span} | |
| for k, v in parsed.scores.items() | |
| }, | |
| "warnings": [ | |
| {"text": w.text, "citation_source": w.citation_source, "citation_url": w.citation_url} | |
| for w in parsed.warnings | |
| ], | |
| } | |
| return parsed.writeup, json.dumps(payload, indent=2) | |
| # --------------------------------------------------------------------------- | |
| # Gradio UI (built lazily so `import app` from tests does not require gradio) | |
| # --------------------------------------------------------------------------- | |
def build_demo():
    """Build and return the Gradio Blocks UI.

    Gradio is imported inside the function so that importing this module
    (e.g. from tests) does not require gradio to be installed. Called from
    the ``__main__`` guard.

    Returns:
        The assembled ``gr.Blocks`` app, with the Diagnose button wired to
        ``diagnose``; the caller is responsible for ``.launch()``.
    """
    import gradio as gr

    # Free option first, premium second. Plain-English labels with no
    # ANTHROPIC_API_KEY / SPACE_ID / ZeroGPU jargon: the casual user
    # shouldn't have to know what any of those mean.
    #
    # The HuggingFace Inference Providers backend (provider="huggingface")
    # is intentionally NOT in this dropdown: it requires the Space owner
    # to have HF billing set up (credit card on file OR custom provider
    # API keys), which most Pro users don't have by default. The backend
    # code remains in PROVIDERS so it's reachable via MODEL_PROVIDER env
    # override for users who do set up billing (see README.md).
    provider_choices = []
    if _zerogpu_available():
        provider_choices.append((
            "Free Β· Phi-4-mini-instruct (Microsoft) β runs on GPU",
            "zerogpu",
        ))
    provider_choices.append((
        "Premium Β· Claude Opus 4.7 (Anthropic) β paste your API key below",
        "anthropic",
    ))

    # Default to the first entry: the free option whenever ZeroGPU is
    # available. When it is not, Premium is the only entry and therefore
    # also the default.
    default_choice = provider_choices[0][1]

    with gr.Blocks(title="The Compounding Test") as demo:
        gr.Markdown(
            "# The Compounding Test\n\n"
            "A diagnostic for AI investments at non-technology companies. "
            "Describe your AI initiative β get a scored writeup in one of "
            "four outcomes: **compounder**, **one-shot win**, **compounding "
            "the wrong thing**, or **Roman Candle**.\n\n"
            "**The default is free** β runs an open model (Phi-4-mini) "
            "on this Space's GPU. Pick **Premium Β· Claude Opus** from "
            "the dropdown if you have an Anthropic API key and want the "
            "highest-quality writeup. Read the full framework at "
            "[mile-hi.ai/journal/the-berkshire-test]("
            "https://www.mile-hi.ai/journal/the-berkshire-test)."
        )
        with gr.Row():
            description = gr.Textbox(
                label=f"Describe your AI initiative ({MIN_DESCRIPTION_WORDS}β{MAX_DESCRIPTION_WORDS} words)",
                placeholder=(
                    "Describe the bottleneck of your operation, the AI "
                    "investment, what data feeds it, where the labels come "
                    "from, and how you expect competitors to respond. Be "
                    "specific about the workflow.\n\n"
                    "Or pick a sample below to see how it works."
                ),
                lines=12,
            )
        with gr.Row():
            industry = gr.Dropdown(INDUSTRIES, label="Industry (optional)", value=None)
            scale = gr.Dropdown(SCALES, label="Scale (optional)", value=None)
            budget = gr.Dropdown(BUDGETS, label="Budget tier (optional)", value=None)
        gr.Examples(
            examples=[
                [_SAMPLE_COMPOUNDER, "insurance", "business unit", "$1Mβ$10M"],
                [_SAMPLE_ONE_SHOT_WIN, "banking", "business unit", "$100Kβ$1M"],
                [_SAMPLE_WRONG_THING, "logistics", "enterprise", "$1Mβ$10M"],
                [_SAMPLE_ROMAN_CANDLE, "retail", "enterprise", "$100Kβ$1M"],
            ],
            inputs=[description, industry, scale, budget],
            label="Sample initiatives β click one to load it (then click Diagnose)",
            examples_per_page=4,
        )
        # NOTE(review): the nesting of api_key relative to this Row was
        # reconstructed from a copy with no indentation; confirm the
        # intended layout.
        with gr.Row():
            provider = gr.Dropdown(
                choices=provider_choices,
                value=default_choice,
                label="Choose a model",
            )
            # The API-key field is shown only while Premium is selected. Its
            # initial visibility follows the default choice: when Premium is
            # the only (and thus default) option, the dropdown can never
            # fire a change event, so a hard-coded visible=False would hide
            # the field permanently.
            api_key = gr.Textbox(
                label="Anthropic API key",
                placeholder="sk-ant-...",
                type="password",
                info=(
                    "Used only for this request and never stored. "
                    "Get a key at console.anthropic.com."
                ),
                visible=(default_choice == "anthropic"),
            )

        def _toggle_api_key(p):
            # Show the key field for the Anthropic backend, hide otherwise.
            return gr.update(visible=(p == "anthropic"))

        provider.change(_toggle_api_key, inputs=[provider], outputs=[api_key])
        submit = gr.Button("Diagnose", variant="primary")
        with gr.Tabs():
            with gr.Tab("Diagnosis"):
                writeup_out = gr.Markdown()
            with gr.Tab("Raw JSON"):
                json_out = gr.Code(language="json")
        # Input order must match diagnose()'s positional parameters.
        submit.click(
            diagnose,
            inputs=[description, industry, scale, budget, provider, api_key],
            outputs=[writeup_out, json_out],
        )
    return demo
if __name__ == "__main__":
    # Credentials come from the environment. In local development a .env
    # file (read by python-dotenv, when installed) can supply
    # ANTHROPIC_API_KEY; on HF Spaces the Space secrets provide it.
    try:
        from dotenv import load_dotenv as _load_env
        _load_env()
    except ImportError:
        # python-dotenv is an optional dependency; carry on without it.
        pass
    demo_app = build_demo()
    demo_app.launch()