"""The Compounding Test — HuggingFace Space. A single-shot Gradio app that runs an AI-initiative description through the two-axis Berkshire Test for AI and returns a scored writeup. Architecture per specs/004-berkshire-test/contracts/hf-space-interface.md: - Inputs: a description (200–5000 words) + 3 optional clarifiers. - Three backends, selectable by env (`MODEL_PROVIDER`) or auto-detected from available credentials and runtime environment: * anthropic — Claude Opus / Sonnet via the Anthropic SDK; system block is `cache_control:ephemeral` so subsequent calls hit the 5-minute prefix cache. * huggingface — Open models (Gemma 2 9B by default, swappable to Phi-4, Llama-3.3, Qwen 2.5, etc.) via the huggingface_hub InferenceClient. Works on HF Spaces with the Space's free inference credits; locally requires HF_TOKEN. * zerogpu — Open model (Phi-4-mini-instruct by default) loaded LOCALLY in the Space via transformers, decorated with `@spaces.GPU` so a HuggingFace Pro plan gets free on-demand A100/H100 GPU allocation per request. No per-call credit burn; no API round-trip. Requires the Space to have a Pro owner; locally falls back to CPU (slow). - Output: two Gradio tabs — markdown writeup + raw JSON. Engine/Site boundary (Principle VIII): this app lives in gradio-apps/ only. Never deployed to mile-hi.ai. Reference JSONs are populated by hand from the published articles — no runtime fetch from the site. 
""" from __future__ import annotations import json import os import re import textwrap from dataclasses import dataclass, field from pathlib import Path from typing import Optional # --------------------------------------------------------------------------- # Parser surface (covered by test_diagnose.py — module-level, no side effects) # --------------------------------------------------------------------------- class MalformedResponseError(Exception): """Raised when the model's response cannot be parsed into a Response.""" VALID_QUADRANTS = {"compounder", "one-shot-win", "wrong-thing", "roman-candle"} VALID_PORTRAITS = {"progressive", "deere", "mastercard", "mayo"} REQUIRED_SCORES = ( "proprietary_data", "self_labeling", "decreasing_marginal_cost", "defensible_asymmetry", ) @dataclass class Score: score: int rationale: str quoted_span: str @dataclass class Warning: text: str citation_source: str citation_url: str @dataclass class Response: constraint: str scores: dict # str → Score (one entry per REQUIRED_SCORES key) quadrant: str closest_portrait: str closest_portrait_paragraph: str warnings: list # list[Warning] writeup: str # Match the FIRST ```json ... ``` fenced block in the response. JSON_BLOCK_RE = re.compile(r"```json\s*\n(.*?)\n\s*```", re.DOTALL) def parse_response(raw: str) -> Response: """Extract the first ```json``` block from `raw`, validate the schema, and return a populated Response. Trailing markdown becomes `writeup`. Raises MalformedResponseError on any schema violation per the contract in specs/004-berkshire-test/contracts/hf-space-interface.md §4. 
""" match = JSON_BLOCK_RE.search(raw) if not match: raise MalformedResponseError("No ```json``` block found in response") json_text = match.group(1) try: data = json.loads(json_text) except json.JSONDecodeError as e: raise MalformedResponseError(f"JSON block did not parse: {e}") required = ( "constraint", "scores", "quadrant", "closest_portrait", "closest_portrait_paragraph", "warnings", ) for field_name in required: if field_name not in data: raise MalformedResponseError(f"Missing required field: {field_name}") if data["quadrant"] not in VALID_QUADRANTS: raise MalformedResponseError( f"Invalid quadrant: {data['quadrant']!r}; expected one of {sorted(VALID_QUADRANTS)}" ) if data["closest_portrait"] not in VALID_PORTRAITS: raise MalformedResponseError( f"Invalid closest_portrait: {data['closest_portrait']!r}; expected one of {sorted(VALID_PORTRAITS)}" ) if not isinstance(data["scores"], dict): raise MalformedResponseError("scores must be a JSON object") scores: dict[str, Score] = {} for key in REQUIRED_SCORES: if key not in data["scores"]: raise MalformedResponseError(f"Missing score key: {key}") s = data["scores"][key] if not isinstance(s, dict): raise MalformedResponseError(f"Score {key} must be an object") for sub in ("score", "rationale", "quoted_span"): if sub not in s: raise MalformedResponseError(f"Score {key} missing sub-field: {sub}") # score must be an int 0-4 (bools are excluded; bool is a subclass of int in Python) if isinstance(s["score"], bool) or not isinstance(s["score"], int): raise MalformedResponseError( f"Score {key}.score must be an integer 0-4, got {type(s['score']).__name__}" ) if s["score"] < 0 or s["score"] > 4: raise MalformedResponseError( f"Score {key}.score must be in 0-4, got {s['score']}" ) if not isinstance(s["quoted_span"], str) or not s["quoted_span"]: raise MalformedResponseError(f"Score {key}.quoted_span must be a non-empty string") if len(s["quoted_span"]) > 400: raise MalformedResponseError( f"Score {key}.quoted_span must be ≤400 
chars, got {len(s['quoted_span'])}" ) scores[key] = Score( score=s["score"], rationale=s["rationale"], quoted_span=s["quoted_span"] ) if not isinstance(data["warnings"], list): raise MalformedResponseError("warnings must be a JSON array") warnings = [ Warning( text=w.get("text", ""), citation_source=w.get("citation_source", ""), citation_url=w.get("citation_url", ""), ) for w in data["warnings"] ] writeup = raw[match.end():].strip() return Response( constraint=data["constraint"], scores=scores, quadrant=data["quadrant"], closest_portrait=data["closest_portrait"], closest_portrait_paragraph=data["closest_portrait_paragraph"], warnings=warnings, writeup=writeup, ) # --------------------------------------------------------------------------- # Configuration (env-driven; see .env.example) # --------------------------------------------------------------------------- ROOT = Path(__file__).parent ANTHROPIC_MODEL_ID = os.environ.get("MODEL_ID", "claude-opus-4-7") HF_MODEL_ID = os.environ.get("HF_MODEL_ID", "google/gemma-2-9b-it") ZEROGPU_MODEL_ID = os.environ.get("ZEROGPU_MODEL_ID", "microsoft/Phi-4-mini-instruct") # ZeroGPU reserves this many seconds from the Space owner's daily quota # per request. With the pre-load pattern below (model on CPU at module # init, .to('cuda') + inference inside @spaces.GPU), per-call cost is # only ~10-25s wall-clock. 45s gives generous margin while squeezing # ~2.5x more submissions per quota window vs the original 120s. # Pro-tier max is 120s; raise via env if you need bigger headroom. ZEROGPU_DURATION_SECONDS = int(os.environ.get("ZEROGPU_DURATION_SECONDS", "45")) MAX_DESCRIPTION_WORDS = int(os.environ.get("MAX_DESCRIPTION_WORDS", "5000")) MIN_DESCRIPTION_WORDS = 200 # ZeroGPU availability is detected at import time. The `spaces` package # is HuggingFace's runtime for on-demand GPU allocation; `transformers` # + `torch` are required to actually load and run the model. All three # must be importable for the zerogpu backend to function. 
try:
    import spaces as _spaces
    import torch as _torch
    from transformers import AutoModelForCausalLM as _AutoModelForCausalLM
    from transformers import AutoTokenizer as _AutoTokenizer

    _ZEROGPU_DEPS_AVAILABLE = True
except ImportError:
    # Any one of the three missing disables the zerogpu backend as a whole.
    _ZEROGPU_DEPS_AVAILABLE = False


def _zerogpu_available() -> bool:
    """Return whether the zerogpu backend can be used.

    Wrapped as a function so tests can monkeypatch the answer without
    touching the real torch/transformers imports."""
    return _ZEROGPU_DEPS_AVAILABLE


# ---------------------------------------------------------------------------
# Provider abstraction (anthropic vs huggingface — selectable at runtime)
# ---------------------------------------------------------------------------


def _detect_provider(env=None) -> str:
    """Pick a model provider from env.

    `env` is any mapping with `.get`; it defaults to `os.environ` and exists
    so callers (and tests) can pass an explicit environment.

    Order of precedence:
    1. Explicit MODEL_PROVIDER (anthropic | huggingface | zerogpu), matched
       case-insensitively after stripping whitespace. Any other value is
       ignored and detection continues.
    2. Running on a HuggingFace Space (SPACE_ID set) AND the zerogpu deps
       (spaces + transformers + torch) are importable → zerogpu. This is
       the Pro-plan free-GPU path.
    3. Presence of ANTHROPIC_API_KEY → anthropic.
    4. Presence of HF_TOKEN / HUGGING_FACE_HUB_TOKEN, or running on a
       HuggingFace Space without zerogpu deps → huggingface.
    5. Fall through to anthropic (call-time error will tell the user which
       env to set).
    """
    env = env if env is not None else os.environ

    explicit = env.get("MODEL_PROVIDER", "").strip().lower()
    if explicit in ("anthropic", "huggingface", "zerogpu"):
        return explicit

    if env.get("SPACE_ID") and _zerogpu_available():
        return "zerogpu"
    if env.get("ANTHROPIC_API_KEY"):
        return "anthropic"
    if (
        env.get("HF_TOKEN")
        or env.get("HUGGING_FACE_HUB_TOKEN")
        or env.get("SPACE_ID")
    ):
        return "huggingface"
    return "anthropic"


def _call_anthropic(system_block: str, user_prompt: str, *, api_key: Optional[str] = None) -> str:
    """Anthropic backend. System block is cache-marked; the user prompt is
    sent fresh. Returns the raw assistant text (the first content block).

    `api_key`: an optional per-call key. When provided, it goes directly to
    the SDK constructor and is NEVER written to os.environ. This is
    important on a multi-tenant public Space — mutating env would leak one
    visitor's key into a concurrent request from another visitor. When
    `api_key` is None, the SDK reads ANTHROPIC_API_KEY from env (the
    Space-owner's key path)."""
    # Imported lazily so the module can be imported without the SDK installed.
    from anthropic import Anthropic

    client = Anthropic(api_key=api_key) if api_key else Anthropic()
    resp = client.messages.create(
        model=ANTHROPIC_MODEL_ID,
        max_tokens=2500,
        system=[
            {
                "type": "text",
                "text": system_block,
                "cache_control": {"type": "ephemeral"},
            }
        ],
        messages=[{"role": "user", "content": user_prompt}],
    )
    return resp.content[0].text


def _call_huggingface(system_block: str, user_prompt: str) -> str:
    """HuggingFace backend. Uses the unified chat_completion interface,
    which routes through HF Inference Providers and supports Gemma 2,
    Phi-4-mini-instruct, Llama-3.3, Qwen 2.5, and many others.

    Lower temperature (0.2) than the SDK default to keep JSON output stable —
    smaller open models can be looser than Claude on schema adherence.

    Requires an HF token: HF_TOKEN env var, HUGGING_FACE_HUB_TOKEN env var,
    or a `hf auth login`-stored token (huggingface_hub.get_token() checks
    all three sources). HF Spaces do NOT auto-inject a token on public
    Spaces — the Space owner has to add it as a Space secret.

    Returns the assistant message text of the first choice.

    Raises RuntimeError with an actionable message when no token is found,
    when no enabled provider serves HF_MODEL_ID, or when the completion
    carries no text. Any other client error propagates unchanged.
    """
    from huggingface_hub import InferenceClient, get_token

    token = (
        os.environ.get("HF_TOKEN")
        or os.environ.get("HUGGING_FACE_HUB_TOKEN")
        or get_token()  # checks ~/.cache/huggingface/token from `hf auth login`
    )
    if not token:
        raise RuntimeError(
            "No HuggingFace token found. The Space owner needs to add HF_TOKEN "
            "as a Space secret (Settings → Repository secrets → New secret → "
            "name: HF_TOKEN, value: a User Access Token from "
            "https://huggingface.co/settings/tokens). Then restart the Space. "
            "Until then, pick a different model from the dropdown."
        )

    # `provider="auto"` opts into the modern HF Inference Providers
    # routing layer (introduced 2024-Q4), which picks the right partner
    # (featherless-ai / together-ai / hf-inference / etc.) for the model
    # automatically. Without this flag, InferenceClient falls back to
    # the legacy hf-inference-only path, which doesn't serve most newer
    # models and returns a misleading "model not supported" error even
    # when the user has all providers enabled and access to the model.
    client = InferenceClient(
        model=HF_MODEL_ID,
        token=token,
        provider="auto",
        timeout=120,
    )
    try:
        resp = client.chat_completion(
            messages=[
                {"role": "system", "content": system_block},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=2500,
            temperature=0.2,
        )
    except Exception as e:
        msg = str(e)
        # HF Inference Providers routes each model through a partner
        # (featherless-ai, together-ai, hf-inference, etc.). If none of
        # the enabled providers serves the requested model, the API
        # returns a BadRequestError with code=model_not_supported. The
        # raw error is opaque to users, so re-raise with the actual fix
        # instead of the unhelpful default message.
        if "model_not_supported" in msg or "not supported by any provider" in msg:
            raise RuntimeError(
                f"The model '{HF_MODEL_ID}' isn't available through any of "
                "the HuggingFace Inference Providers enabled on your account. "
                "Two fixes: (a) enable a provider that supports this model at "
                "https://huggingface.co/settings/inference-providers, OR "
                "(b) set HF_MODEL_ID as a Space variable to a model on your "
                "enabled providers — microsoft/Phi-4-mini-instruct works "
                "broadly via featherless-ai."
            ) from e
        raise

    # `message.content` is optional in the chat-completion schema. Handing
    # None to the response parser would fail with an opaque TypeError, so
    # turn an empty completion into an explicit, user-readable error here.
    content = resp.choices[0].message.content
    if not content:
        raise RuntimeError(
            f"The model '{HF_MODEL_ID}' returned an empty completion "
            "(no text content). Try again, or set HF_MODEL_ID to a "
            "different chat model."
        )
    return content


# ZeroGPU backend — pre-load pattern.
#
# Model is loaded onto CPU at Space startup (module init), NOT inside
# `@spaces.GPU`. This is the documented HuggingFace ZeroGPU pattern:
#   - Module init runs once at Space startup, on CPU, with no GPU
#     quota consumed.
#     The expensive part — downloading ~7.6GB of
#     safetensors and deserializing into PyTorch state — happens here.
#   - Inside `@spaces.GPU`, all we do is `.to('cuda')` + tokenize +
#     generate + decode. Wall-clock drops to ~10-15s warm, ~20-25s
#     after Space restart (the .to('cuda') for 7.6GB takes a few
#     seconds over PCIe).
#
# Why deliberately NOT `trust_remote_code=True`. Phi-4-mini-instruct's
# architecture is `phi3`, which transformers 4.46+ supports natively
# via `Phi3ForCausalLM` — no custom code download required. The custom
# modeling code that ships with the model on HF Hub (`modeling_phi3.py`)
# imports `LossKwargs` from `transformers.utils`, which was removed in
# transformers 4.57+ — loading WITH `trust_remote_code=True` fails
# with `ImportError: cannot import name 'LossKwargs' from
# 'transformers.utils'` and bricks the `@spaces.GPU` worker. The
# native path avoids the upstream pin-mismatch entirely.
#
# Tradeoff: ~30-60s slower Space cold-start (the one-time CPU load).
# Acceptable because Spaces only restart on deploy or after a long
# idle period. Worth it for the 2.5x quota efficiency.
if _ZEROGPU_DEPS_AVAILABLE:
    _zerogpu_tokenizer = _AutoTokenizer.from_pretrained(ZEROGPU_MODEL_ID)
    _zerogpu_model = _AutoModelForCausalLM.from_pretrained(
        ZEROGPU_MODEL_ID,
        torch_dtype=_torch.bfloat16,
        # NO device_map — load to CPU; we move to GPU per-call inside
        # @spaces.GPU. ZeroGPU has no GPU available at module load.
    )
else:
    _zerogpu_tokenizer = None
    _zerogpu_model = None


def _zerogpu_invoke(system_block: str, user_prompt: str) -> str:
    """Model invocation logic for the ZeroGPU backend.

    Pre-loaded model (on CPU) is moved to GPU on entry, then inference +
    decode. Reads module-level globals (`_zerogpu_tokenizer`,
    `_zerogpu_model`) which tests monkeypatch to fake the transformers
    types. Separated from the `@spaces.GPU` decoration below so it can be
    unit-tested without actually allocating a GPU.

    Returns only the newly generated text (the prompt tokens are sliced
    off before decoding)."""
    # Move pre-loaded model from CPU to the GPU that @spaces.GPU just
    # allocated. Fast — just PCIe memory transfer of already-loaded
    # weights, no download or deserialize.
    _zerogpu_model.to("cuda")

    messages = [
        {"role": "system", "content": system_block},
        {"role": "user", "content": user_prompt},
    ]
    inputs = _zerogpu_tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        add_generation_prompt=True,
    ).to("cuda")
    outputs = _zerogpu_model.generate(
        inputs,
        max_new_tokens=2500,
        temperature=0.2,
        do_sample=True,
        pad_token_id=_zerogpu_tokenizer.eos_token_id,
    )
    # generate() returns prompt + completion; keep only the completion.
    prompt_len = inputs.shape[1]
    return _zerogpu_tokenizer.decode(
        outputs[0][prompt_len:], skip_special_tokens=True
    )


if _ZEROGPU_DEPS_AVAILABLE:

    @_spaces.GPU(duration=ZEROGPU_DURATION_SECONDS)
    def _call_zerogpu(system_block: str, user_prompt: str) -> str:
        """ZeroGPU backend. Loads Phi-4-mini-instruct (or whatever
        ZEROGPU_MODEL_ID points at) into the Space's allocated GPU and runs
        chat-template inference. Thin wrapper around the testable
        `_zerogpu_invoke` so the decorator stays at module load time."""
        return _zerogpu_invoke(system_block, user_prompt)

else:

    def _call_zerogpu(system_block: str, user_prompt: str) -> str:
        """Placeholder used when the zerogpu deps are missing; always raises."""
        raise RuntimeError(
            "ZeroGPU backend requires `spaces`, `transformers`, and `torch` "
            "to be importable AND should be run on a HuggingFace Pro Space "
            "for free on-demand GPU. Install the full requirements.txt and "
            "deploy to a Space, or pick anthropic / huggingface from the "
            "provider dropdown."
        )


PROVIDERS = {
    "anthropic": _call_anthropic,
    "huggingface": _call_huggingface,
    "zerogpu": _call_zerogpu,
}


def _call_model(system_block: str, user_prompt: str, provider: str) -> str:
    """Dispatch to the named provider and return its raw response text.

    Raises ValueError on unknown provider; callers are expected to validate
    before calling."""
    if provider not in PROVIDERS:
        raise ValueError(
            f"Unknown provider: {provider!r}; expected one of {sorted(PROVIDERS)}"
        )
    return PROVIDERS[provider](system_block, user_prompt)


# Auto-detected once at module import; the Gradio UI exposes a runtime
# override via the Provider dropdown.
DEFAULT_PROVIDER = _detect_provider()

INDUSTRIES = [
    "insurance",
    "banking",
    "healthcare",
    "retail",
    "manufacturing",
    "logistics",
    "agriculture",
    "energy",
    "telecom",
    "media",
    "professional services",
    "real estate",
    "other",
]
SCALES = ["pilot", "department", "business unit", "enterprise"]
BUDGETS = ["<$100K", "$100K–$1M", "$1M–$10M", ">$10M"]

# ---------------------------------------------------------------------------
# Sample initiatives (gr.Examples) — one per verdict quadrant
# ---------------------------------------------------------------------------
# Realistic ~250–400-word AI-initiative descriptions that should land in a
# specific quadrant of the 2×2 verdict matrix. Used to seed user testing
# and give first-time visitors something concrete to click.

_SAMPLE_COMPOUNDER = (
    "We're a regional commercial insurance carrier specializing in restaurant "
    "general liability. We write about 8,000 policies a year across the "
    "Midwest, with average annual premium around $4,500. Underwriting is "
    "the bottleneck of our business — independent agents wait 36 to 48 "
    "hours for a quote because our underwriters manually pull industry "
    "codes, loss runs, and prior-carrier history from three different "
    "systems and then decide whether to bind, decline, or refer. Roughly "
    "30% of submissions get declined and another 15% are referred to "
    "senior underwriters, which adds another day. We're deploying an "
    "LLM-powered underwriting assistant that pulls the data automatically, "
    "flags risk factors based on patterns in our 12-year claims database, "
    "and proposes a base rate with an explanation. The underwriter "
    "reviews, adjusts, and approves. Every policy we write generates new "
    "claim outcomes — fires, slip-and-falls, liquor-liability claims, "
    "food-poisoning suits — and those outcomes feed back into the next "
    "quarter's model retraining. Our competitors mostly use Verisk's "
    "industry-standard rating models, which we don't share data with, so "
    "our model gets better on our specific book of business while theirs "
    "reflects the industry average. Internal goal: cut time-to-quote from "
    "36 hours to 4 hours, increase the win rate on profitable risks by "
    "15%, and progressively shift the loss ratio by 1–2 points per year "
    "as the model learns from each renewal cycle. Independent agents have "
    "already started favoring carriers with faster quote turnaround."
)

_SAMPLE_ONE_SHOT_WIN = (
    "We're a community bank with $4B in assets and 38 branches across two "
    "states. Loan officers spend about 6 hours per commercial loan "
    "reviewing financial statements, tax returns, and corporate documents "
    "before they can write the credit memo. We're deploying GPT-4 to "
    "extract key fields — revenue, EBITDA, debt service coverage ratio, "
    "ownership structure, related-party transactions, collateral "
    "descriptions — from these documents into a structured form. The loan "
    "officer reviews the extraction and writes the credit memo by hand. "
    "We expect to cut document review time from 6 hours to about 90 "
    "minutes per loan, processing roughly 2,400 commercial loans a year. "
    "The vendor provides the model, the document templates, and the "
    "extraction prompts, and is selling the same system to four of our "
    "peer community banks in the region under identical contracts. The AI "
    "doesn't learn from the outcome of the loan: defaults, prepayments, "
    "modifications, restructurings all go into our separate loan "
    "servicing system, which has never connected back to the extraction "
    "model. The vendor's three-year roadmap doesn't include any feedback "
    "loop between loan performance and the model — they treat extraction "
    "as a deterministic task. We're funding the project from the "
    "operations budget; the credit team is excited about the time savings "
    "but the chief credit officer has flagged that the productivity gain "
    "will be one-time and won't show up in the loss-given-default rate "
    "over time."
)

_SAMPLE_WRONG_THING = (
    "We're a third-party logistics provider with 8 warehouses on the East "
    "Coast handling about 20,000 orders a day across the network. We're "
    "investing in computer vision software to optimize order picking "
    "routes — the AI looks at the warehouse layout, current orders, and "
    "worker positions and suggests optimized pick paths in real time. "
    "Pilot results show a 12% reduction in steps per order on the test "
    "floor. Our operations team has been excited about this for 18 months "
    "and we just signed a multi-year contract with the vendor. Some "
    "context on the operation: our warehouses run 2 shifts. Order volume "
    "in shift 1 is around 14,000 orders per day; shift 2 is around 6,000. "
    "The pick wave finishes by 2pm on shift 1, then workers wait 4 to 5 "
    "hours for shift 2 trucks to arrive at the loading docks. The trucks "
    "are scheduled by the customer (a major national retailer) and arrive "
    "in unpredictable windows between 6pm and 10pm. We don't control the "
    "truck schedule and the customer won't share their advance schedule "
    "with us. The CFO has been asking us why total throughput per "
    "warehouse hasn't moved much in three years; our answer has been that "
    "the legacy warehouse management system is the constraint, which is "
    "why we're investing in better picking AI. Same-store labor cost is "
    "up 8% year-over-year because workers are paid through the idle hours."
)

_SAMPLE_ROMAN_CANDLE = (
    "We run a chain of 220 quick-service restaurants across the Southeast "
    "doing about $480M in annual revenue. Our gross margin has been under "
    "pressure from rising ingredient costs and we're rolling out an "
    "AI-powered personalized marketing platform that sends customized "
    "email and SMS offers based on customer purchase history, location, "
    "and local weather. The platform is from a major QSR-tech vendor used "
    "by several of our direct competitors in the same markets we operate "
    "in. Our customer data — names, emails, phone numbers, purchase "
    "frequency, average ticket size — lives in our point-of-sale "
    "provider's cloud, which the marketing platform pulls from via the "
    "POS provider's standard integration. Both the purchase data feed and "
    "the modeling are the vendor's stack; we don't see the underlying "
    "model and our data is commingled with other QSR brands the vendor "
    "serves on a shared inference fleet. We expect to lift email "
    "click-through by 8–12% based on the vendor's benchmark studies of "
    "similar brands. The marketing team is running the rollout; finance "
    "signed off on the multi-year subscription. We have not measured what "
    "is actually constraining same-store sales growth — drive-thru wait "
    "times, menu pricing relative to local competitors, or breakfast "
    "daypart penetration — we just know revenue has been flat for two "
    "years and the board wants visible action by Q4."
)


def _load_reference():
    """Read the prompt template + reference JSONs from disk at app start.

    Returns `(prompt_template, system_block)`: the raw template text, and the
    same template with `{{portraits_block}}` / `{{failure_modes_block}}`
    filled in from reference/portraits.json and reference/failure-modes.json.

    Files are read as UTF-8 explicitly; relying on the locale default breaks
    on hosts whose preferred encoding is not UTF-8.
    """
    prompt_template = (ROOT / "prompts" / "diagnose.txt").read_text(encoding="utf-8")
    portraits = json.loads(
        (ROOT / "reference" / "portraits.json").read_text(encoding="utf-8")
    )
    failure_modes = json.loads(
        (ROOT / "reference" / "failure-modes.json").read_text(encoding="utf-8")
    )

    # Entries are assembled line by line rather than by dedenting an
    # interpolated triple-quoted string: a multi-line field value would
    # otherwise change the common margin and leave source indentation in
    # the prompt.
    portraits_block = "\n".join(
        "\n".join(
            (
                f"- id: {p['id']}",
                f"  label: {p['label']}",
                f"  bottleneck: {p['bottleneck']}",
                f"  summary: {p['summary']}",
                f"  compounding_summary: {p['compounding_summary']}",
                f"  article_url: {p['article_url']}",
            )
        ).rstrip()
        for p in portraits
    )
    failure_modes_block = "\n".join(
        "\n".join(
            (
                f"- id: {f['id']}",
                f"  label: {f['label']}",
                f"  applies_to_quadrants: {', '.join(f['applies_to_quadrants'])}",
                f"  summary: {f['summary']}",
                f"  url: {f['url']}",
            )
        ).rstrip()
        for f in failure_modes
    )

    system_block = (
        prompt_template
        .replace("{{portraits_block}}", portraits_block)
        .replace("{{failure_modes_block}}", failure_modes_block)
    )
    return prompt_template, system_block


# Loaded once at module import; cached in memory for the life of the process.
PROMPT_TEMPLATE, SYSTEM_BLOCK = _load_reference()

# ---------------------------------------------------------------------------
# Diagnose entrypoint (called by the Gradio Submit handler)
# ---------------------------------------------------------------------------

# `{{name}}` placeholders in the prompt template.
_PLACEHOLDER_RE = re.compile(r"\{\{(\w+)\}\}")


def _render_user_prompt(
    description: str,
    industry: Optional[str],
    scale: Optional[str],
    budget: Optional[str],
) -> str:
    """Fill the user-facing placeholders of PROMPT_TEMPLATE in a single pass.

    A single regex pass (instead of chained `.replace()` calls) means a
    description that itself contains e.g. `{{industry}}` is inserted verbatim
    and never re-expanded. Placeholders this function does not know are left
    untouched."""
    values = {
        "user_input": description,
        "industry": industry or "(not specified)",
        "scale": scale or "(not specified)",
        "budget": budget or "(not specified)",
    }
    return _PLACEHOLDER_RE.sub(
        lambda m: values.get(m.group(1), m.group(0)), PROMPT_TEMPLATE
    )


def _format_error_detail(exc: BaseException, secret: Optional[str]) -> str:
    """Return a UI-safe, length-capped rendering of `exc`.

    Redaction runs BEFORE truncation: cutting first could slice through the
    key and leave an unredacted prefix of it in the message."""
    detail = str(exc).strip() or "(no message)"
    # Defense-in-depth: if the user-supplied Anthropic key somehow
    # appears in the exception message (no current SDK version does
    # this, but a future debug-mode override might), redact it
    # before surfacing the writeup. Symmetric with redactKey() in
    # src/lib/anthropic-direct.ts.
    if secret and len(secret) >= 8:
        detail = detail.replace(secret, "[redacted]")
    # Cap the detail so we don't spill multi-paragraph tracebacks
    # into the UI. 400 chars is enough for a stack-trace summary
    # without flooding the markdown tab.
    if len(detail) > 400:
        detail = detail[:400] + "…"
    return detail


def diagnose(
    description: str,
    industry: Optional[str],
    scale: Optional[str],
    budget: Optional[str],
    provider: Optional[str] = None,
    anthropic_api_key: Optional[str] = None,
) -> tuple[str, str]:
    """Validate input, call the selected model with the cached system block,
    parse the response, and return (markdown_writeup, raw_json_string) for
    the two Gradio tabs.

    `provider` (anthropic | huggingface | zerogpu) defaults to
    DEFAULT_PROVIDER when not supplied — the Gradio dropdown always supplies
    it on a real submission.

    `anthropic_api_key` is a per-call user-supplied key. When provider is
    "anthropic" and the key is provided, it overrides any ANTHROPIC_API_KEY
    env var for this single request. The key is never persisted (Anthropic
    SDK uses it once and the client object is garbage-collected at function
    exit).

    Per F14 + contract §2, all error paths surface a user-friendly message in
    the markdown tab and an empty JSON tab; nothing leaks a stack trace.
    """
    description = (description or "").strip()
    words = len(description.split())
    if not description:
        return "⚠ Please describe your AI initiative.", ""
    if words < MIN_DESCRIPTION_WORDS:
        return (
            f"⚠ Please describe your initiative in at least {MIN_DESCRIPTION_WORDS} words "
            f"(you wrote {words}). The diagnostic needs enough context to score the four "
            f"compounding conditions with rationale quoting your description.",
            "",
        )
    if words > MAX_DESCRIPTION_WORDS:
        return (
            f"⚠ Please keep your description under {MAX_DESCRIPTION_WORDS} words "
            f"(you wrote {words}). Shorten the description and try again.",
            "",
        )

    provider = provider or DEFAULT_PROVIDER
    if provider not in PROVIDERS:
        return (
            f"⚠ Unknown model provider {provider!r}. Pick one of "
            f"{sorted(PROVIDERS)} from the dropdown.",
            "",
        )

    # If Premium (Anthropic) is selected, the user must supply a key —
    # either via the page's API-key field (per-call) or via an
    # ANTHROPIC_API_KEY env var on the Space. Without either, fail fast
    # with a friendly explanation before we hit the SDK.
    user_key_for_anthropic: Optional[str] = None
    if provider == "anthropic":
        env_key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
        user_key = (anthropic_api_key or "").strip()
        if not env_key and not user_key:
            return (
                "⚠ Premium (Claude Opus) needs an Anthropic API key. Either "
                "paste your key in the field above, or pick one of the free "
                "options from the model dropdown.",
                "",
            )
        if user_key:
            # IMPORTANT: do NOT write the user-supplied key to os.environ.
            # That would leak the key into concurrent requests from other
            # visitors on this Space (the process env is shared across
            # all in-flight requests in the Python worker). Instead we
            # pass it directly to _call_anthropic below, which scopes it
            # to a single SDK client instance that's garbage-collected
            # when the call returns.
            user_key_for_anthropic = user_key

    user_prompt = _render_user_prompt(description, industry, scale, budget)

    try:
        # When the visitor supplied their own Anthropic key, bypass the
        # generic dispatcher so we can pass the key directly via kwarg
        # without ever touching os.environ. All other paths go through
        # the dispatcher and read credentials from env as usual.
        if provider == "anthropic" and user_key_for_anthropic:
            raw = _call_anthropic(
                SYSTEM_BLOCK,
                user_prompt,
                api_key=user_key_for_anthropic,
            )
        else:
            raw = _call_model(SYSTEM_BLOCK, user_prompt, provider)
    except Exception as e:
        # API timeout / rate limit / auth / server / network failure
        # (Anthropic SDK, huggingface_hub InferenceClient, or
        # transformers/torch on the zerogpu path). Include both the
        # exception class AND its string form so unexpected failures
        # are diagnosable from the UI without server log access.
        model_label = {
            "anthropic": ANTHROPIC_MODEL_ID,
            "huggingface": HF_MODEL_ID,
            "zerogpu": ZEROGPU_MODEL_ID,
        }.get(provider, provider)
        detail = _format_error_detail(e, user_key_for_anthropic)
        return (
            f"⚠ The diagnostic call to {provider} ({model_label}) failed.\n\n"
            f"**{type(e).__name__}:** {detail}\n\n"
            f"Try again in a moment, switch providers in the dropdown, "
            f"or shorten your description.",
            "",
        )

    try:
        parsed = parse_response(raw)
    except Exception as e:
        # This handler is the UI boundary, and the contract above says no
        # path may leak a stack trace. Schema violations keep their plain
        # message; anything unexpected (e.g. a backend returned a non-string)
        # is reported the same way, prefixed with its exception class.
        if isinstance(e, MalformedResponseError):
            parse_detail = str(e)
        else:
            parse_detail = f"{type(e).__name__}: {e}"
        return (
            f"⚠ The model returned malformed output. Try again with a different description "
            f"or shorten the existing one.\n\nDetail: {parse_detail}",
            "",
        )

    payload = {
        "constraint": parsed.constraint,
        "quadrant": parsed.quadrant,
        "closest_portrait": parsed.closest_portrait,
        "closest_portrait_paragraph": parsed.closest_portrait_paragraph,
        "scores": {
            k: {"score": v.score, "rationale": v.rationale, "quoted_span": v.quoted_span}
            for k, v in parsed.scores.items()
        },
        "warnings": [
            {"text": w.text, "citation_source": w.citation_source, "citation_url": w.citation_url}
            for w in parsed.warnings
        ],
    }
    return parsed.writeup, json.dumps(payload, indent=2)


# ---------------------------------------------------------------------------
# Gradio UI (built lazily so `import app` from tests does not require gradio)
# ---------------------------------------------------------------------------


def build_demo():
    """Build and return the Gradio Blocks UI. Called only by __main__."""
    import gradio as gr

    # Free option first, premium second. Plain-English labels with no
    # ANTHROPIC_API_KEY / SPACE_ID / ZeroGPU jargon — the casual user
    # shouldn't have to know what any of those mean.
    #
    # The HuggingFace Inference Providers backend (provider="huggingface")
    # is intentionally NOT in this dropdown by default: it requires the
    # Space owner to have HF billing set up (credit card on file OR custom
    # provider API keys), which most Pro users don't have by default. The
    # backend code remains in PROVIDERS so it's reachable via MODEL_PROVIDER
    # env override for users who do set up billing — see README.md.
    provider_choices = []
    if os.environ.get("MODEL_PROVIDER", "").strip().lower() == "huggingface":
        # The dropdown always submits an explicit provider, so the env
        # override can only take effect if it is offered as a choice.
        # Listed first so it is also the default selection.
        provider_choices.append((
            f"Open model · {HF_MODEL_ID} (HuggingFace Inference Providers)",
            "huggingface",
        ))
    if _zerogpu_available():
        provider_choices.append((
            "Free · Phi-4-mini-instruct (Microsoft) — runs on GPU",
            "zerogpu",
        ))
    provider_choices.append((
        "Premium · Claude Opus 4.7 (Anthropic) — paste your API key below",
        "anthropic",
    ))
    # Default to the first free option; user can pick Premium if they
    # have a key. Anthropic is the default only when it is the sole choice.
    default_choice = provider_choices[0][1]

    with gr.Blocks(title="The Compounding Test") as demo:
        gr.Markdown(
            "# The Compounding Test\n\n"
            "A diagnostic for AI investments at non-technology companies. "
            "Describe your AI initiative — get a scored writeup in one of "
            "four outcomes: **compounder**, **one-shot win**, **compounding "
            "the wrong thing**, or **Roman Candle**.\n\n"
            "**The default is free** — runs an open model (Phi-4-mini) "
            "on this Space's GPU. Pick **Premium · Claude Opus** from "
            "the dropdown if you have an Anthropic API key and want the "
            "highest-quality writeup. Read the full framework at "
            "[mile-hi.ai/journal/the-berkshire-test]("
            "https://www.mile-hi.ai/journal/the-berkshire-test)."
        )

        with gr.Row():
            description = gr.Textbox(
                label=f"Describe your AI initiative ({MIN_DESCRIPTION_WORDS}–{MAX_DESCRIPTION_WORDS} words)",
                placeholder=(
                    "Describe the bottleneck of your operation, the AI "
                    "investment, what data feeds it, where the labels come "
                    "from, and how you expect competitors to respond. Be "
                    "specific about the workflow.\n\n"
                    "Or pick a sample below to see how it works."
                ),
                lines=12,
            )

        with gr.Row():
            industry = gr.Dropdown(INDUSTRIES, label="Industry (optional)", value=None)
            scale = gr.Dropdown(SCALES, label="Scale (optional)", value=None)
            budget = gr.Dropdown(BUDGETS, label="Budget tier (optional)", value=None)

        gr.Examples(
            examples=[
                [_SAMPLE_COMPOUNDER, "insurance", "business unit", "$1M–$10M"],
                [_SAMPLE_ONE_SHOT_WIN, "banking", "business unit", "$100K–$1M"],
                [_SAMPLE_WRONG_THING, "logistics", "enterprise", "$1M–$10M"],
                [_SAMPLE_ROMAN_CANDLE, "retail", "enterprise", "$100K–$1M"],
            ],
            inputs=[description, industry, scale, budget],
            label="Sample initiatives — click one to load it (then click Diagnose)",
            examples_per_page=4,
        )

        with gr.Row():
            provider = gr.Dropdown(
                choices=provider_choices,
                value=default_choice,
                label="Choose a model",
            )

        # The API-key field appears only when Premium is selected. The
        # key is used per-request and never stored. Initial visibility
        # follows the default selection: when Premium is the only choice
        # the dropdown never fires `change`, so a field that starts hidden
        # could never be revealed.
        api_key = gr.Textbox(
            label="Anthropic API key",
            placeholder="sk-ant-...",
            type="password",
            info=(
                "Used only for this request and never stored. "
                "Get a key at console.anthropic.com."
            ),
            visible=(default_choice == "anthropic"),
        )

        def _toggle_api_key(p):
            return gr.update(visible=(p == "anthropic"))

        provider.change(_toggle_api_key, inputs=[provider], outputs=[api_key])

        submit = gr.Button("Diagnose", variant="primary")

        with gr.Tabs():
            with gr.Tab("Diagnosis"):
                writeup_out = gr.Markdown()
            with gr.Tab("Raw JSON"):
                json_out = gr.Code(language="json")

        submit.click(
            diagnose,
            inputs=[description, industry, scale, budget, provider, api_key],
            outputs=[writeup_out, json_out],
        )

    return demo


if __name__ == "__main__":
    # Local dev: relies on .env (loaded by python-dotenv) for ANTHROPIC_API_KEY.
    # HF Spaces: relies on Space secrets.
    try:
        from dotenv import load_dotenv

        load_dotenv()
    except ImportError:
        pass  # dotenv is optional; HF Spaces uses Space secrets.
    build_demo().launch()