File size: 40,267 Bytes
60518c1
 
 
 
 
 
 
971e3f4
 
033a83e
 
 
 
 
 
 
 
971e3f4
 
 
 
 
 
 
60518c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e084f76
60518c1
e084f76
60518c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
033a83e
 
971e3f4
3c7d5bb
c673b37
 
 
 
 
 
60518c1
 
 
033a83e
971e3f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
033a83e
 
 
 
 
 
 
971e3f4
 
 
 
 
 
 
 
033a83e
 
 
 
971e3f4
033a83e
971e3f4
 
033a83e
 
 
 
 
 
 
 
 
 
 
37d5cf4
033a83e
37d5cf4
 
 
 
 
 
 
 
033a83e
 
37d5cf4
033a83e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e0a9313
 
 
 
 
 
033a83e
e0a9313
033a83e
 
 
 
e0a9313
033a83e
e0a9313
 
 
 
 
 
 
 
8f09671
 
 
 
 
 
 
 
 
 
 
 
 
66ce4cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
033a83e
 
 
c673b37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75a8a07
971e3f4
 
 
c673b37
 
971e3f4
c673b37
 
 
971e3f4
 
3c77cd5
c673b37
 
 
 
 
 
 
 
 
 
 
3c77cd5
 
 
 
 
 
 
 
c673b37
3c77cd5
 
 
 
 
 
 
 
 
 
 
 
 
971e3f4
 
 
 
 
 
3c77cd5
 
 
971e3f4
 
 
 
 
 
 
 
 
 
 
 
 
033a83e
 
 
971e3f4
033a83e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60518c1
 
 
 
 
 
 
 
 
ebbca73
 
 
 
 
 
 
 
 
 
17c3a19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ebbca73
 
 
17c3a19
ebbca73
 
 
 
17c3a19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ebbca73
 
 
17c3a19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ebbca73
 
 
17c3a19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ebbca73
 
 
60518c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
033a83e
ebbca73
60518c1
033a83e
 
 
 
ebbca73
 
 
 
 
 
 
 
 
60518c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
033a83e
 
 
 
 
 
 
 
ebbca73
 
 
 
37d5cf4
ebbca73
 
 
 
 
 
 
 
 
 
 
37d5cf4
 
 
 
 
 
 
 
ebbca73
60518c1
 
 
 
 
 
 
 
 
37d5cf4
 
 
 
 
 
 
 
 
 
60518c1
033a83e
6fc254e
 
 
 
971e3f4
 
 
 
 
6fc254e
 
 
 
 
 
7b8b778
 
 
 
 
 
 
60518c1
6fc254e
 
 
 
60518c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61393d7
 
ebbca73
61393d7
 
 
 
 
 
 
ebbca73
971e3f4
 
ebbca73
971e3f4
 
ebbca73
 
 
 
 
 
 
033a83e
60518c1
 
 
ebbca73
 
 
 
61393d7
 
 
 
ebbca73
 
60518c1
 
 
 
 
ebbca73
 
 
 
 
60518c1
 
 
ebbca73
60518c1
 
 
 
ebbca73
 
 
 
 
 
 
 
 
 
 
 
 
033a83e
 
 
ebbca73
 
033a83e
ebbca73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60518c1
 
 
 
 
 
 
 
ebbca73
60518c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
"""The Compounding Test — HuggingFace Space.

A single-shot Gradio app that runs an AI-initiative description through
the two-axis Berkshire Test for AI and returns a scored writeup.

Architecture per specs/004-berkshire-test/contracts/hf-space-interface.md:
  - Inputs: a description (200–5000 words) + 3 optional clarifiers.
  - Three backends, selectable by env (`MODEL_PROVIDER`) or auto-detected
    from available credentials and runtime environment:
      * anthropic   — Claude Opus / Sonnet via the Anthropic SDK;
                      system block is `cache_control:ephemeral` so
                      subsequent calls hit the 5-minute prefix cache.
      * huggingface — Open models (Gemma 2 9B by default, swappable to
                      Phi-4, Llama-3.3, Qwen 2.5, etc.) via the
                      huggingface_hub InferenceClient. Needs an HF
                      token: on a Space the owner adds HF_TOKEN as a
                      Space secret; locally set HF_TOKEN or run
                      `hf auth login`.
      * zerogpu     — Open model (Phi-4-mini-instruct by default)
                      loaded LOCALLY in the Space via transformers,
                      decorated with `@spaces.GPU` so a HuggingFace
                      Pro plan gets free on-demand A100/H100 GPU
                      allocation per request. No per-call credit burn;
                      no API round-trip. Requires the Space to have a
                      Pro owner; locally falls back to CPU (slow).
  - Output: two Gradio tabs — markdown writeup + raw JSON.

Engine/Site boundary (Principle VIII): this app lives in gradio-apps/
only. Never deployed to mile-hi.ai. Reference JSONs are populated by
hand from the published articles — no runtime fetch from the site.
"""
from __future__ import annotations

import json
import os
import re
import textwrap
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

# ---------------------------------------------------------------------------
# Parser surface (covered by test_diagnose.py — module-level, no side effects)
# ---------------------------------------------------------------------------


class MalformedResponseError(Exception):
    """Signal that a model reply could not be converted into a `Response`.

    `parse_response` raises this for every schema problem it detects, so
    callers can catch a single exception type for "the model answered,
    but not in the agreed format".
    """


# Closed vocabularies that `parse_response` validates the model's JSON
# against. Membership is checked with `in`, hence plain sets.
VALID_QUADRANTS = {
    "compounder",
    "one-shot-win",
    "wrong-thing",
    "roman-candle",
}
VALID_PORTRAITS = {
    "progressive",
    "deere",
    "mastercard",
    "mayo",
}

# Keys that must all be present under "scores"; the tuple order is the
# order in which the parser validates them.
REQUIRED_SCORES = (
    "proprietary_data",
    "self_labeling",
    "decreasing_marginal_cost",
    "defensible_asymmetry",
)


@dataclass
class Score:
    """Result for a single scored dimension of the model's JSON payload."""

    # Integer rating; `parse_response` only accepts 0 through 4.
    score: int
    # Justification text supplied alongside the rating.
    rationale: str
    # Supporting quote; `parse_response` requires a non-empty string of
    # at most 400 characters.
    quoted_span: str


@dataclass
class Warning:
    """A cautionary note from the model together with its citation.

    NOTE(review): this name shadows the builtin `Warning` inside this
    module. It is kept because other code constructs it by this name.
    """

    # The warning message itself.
    text: str
    # Label of the cited source.
    citation_source: str
    # Link to the cited source.
    citation_url: str


@dataclass
class Response:
    """Fully parsed model reply: validated JSON fields plus the prose writeup."""

    # Value of the payload's "constraint" field, passed through as-is.
    constraint: str
    # Maps each REQUIRED_SCORES key to its Score.
    scores: dict
    # One of VALID_QUADRANTS.
    quadrant: str
    # One of VALID_PORTRAITS.
    closest_portrait: str
    # Value of the payload's "closest_portrait_paragraph" field.
    closest_portrait_paragraph: str
    # List of Warning instances (possibly empty).
    warnings: list
    # Whatever text followed the JSON block in the raw reply, stripped.
    writeup: str


# Pattern for a ```json ... ``` fenced block; group 1 is the fence body.
# DOTALL lets the non-greedy group span lines. The parser uses
# `.search()`, so the first fenced block in a reply is the one taken.
JSON_BLOCK_RE = re.compile(
    r"```json\s*\n(.*?)\n\s*```",
    flags=re.DOTALL,
)


def parse_response(raw: str) -> Response:
    """Parse a raw model reply into a validated `Response`.

    The first ```json``` fenced block in `raw` is decoded and checked
    against the schema in
    specs/004-berkshire-test/contracts/hf-space-interface.md §4; any text
    after that block becomes `Response.writeup` (stripped).

    Args:
        raw: Full assistant text, expected to contain a ```json``` block
            optionally followed by a markdown writeup.

    Returns:
        A populated `Response`.

    Raises:
        MalformedResponseError: on any schema violation. This includes
            payloads that are not a JSON object, enum fields that are not
            strings, and `warnings` entries that are not objects, so
            callers never see a stray TypeError/AttributeError for a
            badly shaped reply.
    """
    match = JSON_BLOCK_RE.search(raw)
    if not match:
        raise MalformedResponseError("No ```json``` block found in response")

    try:
        data = json.loads(match.group(1))
    except json.JSONDecodeError as e:
        # Chain the decoder error so the offending position stays visible.
        raise MalformedResponseError(f"JSON block did not parse: {e}") from e

    # A list/str/number is valid JSON but not a valid payload; reject it
    # here so the key lookups below cannot raise TypeError.
    if not isinstance(data, dict):
        raise MalformedResponseError(
            f"JSON block must be an object, got {type(data).__name__}"
        )

    required = (
        "constraint",
        "scores",
        "quadrant",
        "closest_portrait",
        "closest_portrait_paragraph",
        "warnings",
    )
    for field_name in required:
        if field_name not in data:
            raise MalformedResponseError(f"Missing required field: {field_name}")

    # The isinstance guards keep unhashable values (lists, objects) from
    # raising TypeError in the set membership test.
    quadrant = data["quadrant"]
    if not isinstance(quadrant, str) or quadrant not in VALID_QUADRANTS:
        raise MalformedResponseError(
            f"Invalid quadrant: {quadrant!r}; expected one of {sorted(VALID_QUADRANTS)}"
        )
    portrait = data["closest_portrait"]
    if not isinstance(portrait, str) or portrait not in VALID_PORTRAITS:
        raise MalformedResponseError(
            f"Invalid closest_portrait: {portrait!r}; expected one of {sorted(VALID_PORTRAITS)}"
        )

    if not isinstance(data["scores"], dict):
        raise MalformedResponseError("scores must be a JSON object")

    scores: dict[str, Score] = {}
    for key in REQUIRED_SCORES:
        if key not in data["scores"]:
            raise MalformedResponseError(f"Missing score key: {key}")
        s = data["scores"][key]
        if not isinstance(s, dict):
            raise MalformedResponseError(f"Score {key} must be an object")
        for sub in ("score", "rationale", "quoted_span"):
            if sub not in s:
                raise MalformedResponseError(f"Score {key} missing sub-field: {sub}")
        # score must be an int 0-4 (bools are excluded; bool is a subclass of int in Python)
        if isinstance(s["score"], bool) or not isinstance(s["score"], int):
            raise MalformedResponseError(
                f"Score {key}.score must be an integer 0-4, got {type(s['score']).__name__}"
            )
        if s["score"] < 0 or s["score"] > 4:
            raise MalformedResponseError(
                f"Score {key}.score must be in 0-4, got {s['score']}"
            )
        if not isinstance(s["quoted_span"], str) or not s["quoted_span"]:
            raise MalformedResponseError(f"Score {key}.quoted_span must be a non-empty string")
        if len(s["quoted_span"]) > 400:
            raise MalformedResponseError(
                f"Score {key}.quoted_span must be ≤400 chars, got {len(s['quoted_span'])}"
            )
        scores[key] = Score(
            score=s["score"], rationale=s["rationale"], quoted_span=s["quoted_span"]
        )

    if not isinstance(data["warnings"], list):
        raise MalformedResponseError("warnings must be a JSON array")
    warnings = []
    for index, w in enumerate(data["warnings"]):
        # Each entry must be an object; `.get` on a str/int would raise
        # AttributeError instead of the documented error type.
        if not isinstance(w, dict):
            raise MalformedResponseError(
                f"warnings[{index}] must be a JSON object, got {type(w).__name__}"
            )
        warnings.append(
            Warning(
                text=w.get("text", ""),
                citation_source=w.get("citation_source", ""),
                citation_url=w.get("citation_url", ""),
            )
        )

    # Everything after the closing fence is the human-readable writeup.
    writeup = raw[match.end():].strip()

    return Response(
        constraint=data["constraint"],
        scores=scores,
        quadrant=quadrant,
        closest_portrait=portrait,
        closest_portrait_paragraph=data["closest_portrait_paragraph"],
        warnings=warnings,
        writeup=writeup,
    )


# ---------------------------------------------------------------------------
# Configuration (env-driven; see .env.example)
# ---------------------------------------------------------------------------

# Directory containing this module; prompt and reference files are
# resolved relative to it.
ROOT = Path(__file__).parent

# Per-backend model identifiers, each overridable from the environment.
ANTHROPIC_MODEL_ID = os.getenv("MODEL_ID", "claude-opus-4-7")
HF_MODEL_ID = os.getenv("HF_MODEL_ID", "google/gemma-2-9b-it")
ZEROGPU_MODEL_ID = os.getenv("ZEROGPU_MODEL_ID", "microsoft/Phi-4-mini-instruct")

# Seconds of the Space owner's daily ZeroGPU quota reserved per request.
# The model is pre-loaded on CPU at module init and only moved to the
# GPU inside @spaces.GPU, so a call needs roughly 10-25s of wall-clock;
# 45s leaves generous margin while allowing about 2.5x more submissions
# per quota window than the original 120s setting. 120s is the Pro-tier
# maximum; raise the value via env if more headroom is needed.
ZEROGPU_DURATION_SECONDS = int(os.getenv("ZEROGPU_DURATION_SECONDS", "45"))

# Description length bounds, in words. Only the upper bound is
# configurable from the environment.
MAX_DESCRIPTION_WORDS = int(os.getenv("MAX_DESCRIPTION_WORDS", "5000"))
MIN_DESCRIPTION_WORDS = 200


# ZeroGPU availability is detected at import time. The `spaces` package
# is HuggingFace's runtime for on-demand GPU allocation; `transformers`
# + `torch` are required to actually load and run the model. All three
# must be importable for the zerogpu backend to function.
# Assume the optional stack is absent until every import below succeeds.
_ZEROGPU_DEPS_AVAILABLE = False
try:
    import spaces as _spaces
    import torch as _torch
    from transformers import AutoModelForCausalLM as _AutoModelForCausalLM
    from transformers import AutoTokenizer as _AutoTokenizer
except ImportError:
    # At least one of the three packages is missing; the zerogpu backend
    # stays disabled and the flag keeps its False default.
    pass
else:
    _ZEROGPU_DEPS_AVAILABLE = True


def _zerogpu_available() -> bool:
    """Report whether the zerogpu backend's dependencies were importable.

    This returns the import-time `_ZEROGPU_DEPS_AVAILABLE` flag. It is a
    function rather than a bare constant so tests can monkeypatch the
    answer without touching the real torch/transformers imports.
    """
    return _ZEROGPU_DEPS_AVAILABLE


# ---------------------------------------------------------------------------
# Provider abstraction (anthropic | huggingface | zerogpu — selectable at runtime)
# ---------------------------------------------------------------------------


def _detect_provider(env=None) -> str:
    """Choose the model provider name from an environment mapping.

    Args:
        env: Mapping to read settings from; defaults to `os.environ`.

    Returns:
        "anthropic", "huggingface", or "zerogpu", using the first rule
        that matches:
          1. MODEL_PROVIDER is one of those three names (surrounding
             whitespace and letter case are ignored).
          2. SPACE_ID is set and `_zerogpu_available()` is true
             -> "zerogpu" (the Pro-plan free-GPU path).
          3. ANTHROPIC_API_KEY is set -> "anthropic".
          4. HF_TOKEN, HUGGING_FACE_HUB_TOKEN, or SPACE_ID is set
             -> "huggingface".
          5. Otherwise "anthropic"; the call-time error then tells the
             user which variable to set.
    """
    source = os.environ if env is None else env

    requested = source.get("MODEL_PROVIDER", "").strip().lower()
    if requested in ("anthropic", "huggingface", "zerogpu"):
        return requested

    on_space = source.get("SPACE_ID")
    if on_space and _zerogpu_available():
        return "zerogpu"

    if source.get("ANTHROPIC_API_KEY"):
        return "anthropic"

    has_hf_token = source.get("HF_TOKEN") or source.get("HUGGING_FACE_HUB_TOKEN")
    if has_hf_token or on_space:
        return "huggingface"

    return "anthropic"


def _call_anthropic(system_block: str, user_prompt: str, *, api_key: Optional[str] = None) -> str:
    """Anthropic backend: send one message and return the reply text.

    The system block is sent with `cache_control: ephemeral`; the user
    prompt is sent fresh on every call.

    Args:
        system_block: System prompt text (cache-marked).
        user_prompt: Content of the single user turn.
        api_key: Optional per-call key. When truthy it is handed straight
            to the SDK constructor and is never written to os.environ.
            That matters on a multi-tenant public Space, where mutating
            the environment would leak one visitor's key into another
            visitor's concurrent request. When omitted, the client is
            built with no arguments and the SDK reads ANTHROPIC_API_KEY
            from the environment (the Space owner's key).

    Returns:
        The `.text` of the first content block in the response.
    """
    from anthropic import Anthropic

    if api_key:
        client = Anthropic(api_key=api_key)
    else:
        client = Anthropic()

    cached_system = {
        "type": "text",
        "text": system_block,
        "cache_control": {"type": "ephemeral"},
    }
    user_turn = {"role": "user", "content": user_prompt}

    reply = client.messages.create(
        model=ANTHROPIC_MODEL_ID,
        max_tokens=2500,
        system=[cached_system],
        messages=[user_turn],
    )
    return reply.content[0].text


def _call_huggingface(system_block: str, user_prompt: str) -> str:
    """HuggingFace backend: one chat completion through `InferenceClient`.

    Uses the unified chat_completion interface, which routes through HF
    Inference Providers and supports Gemma 2, Phi-4-mini-instruct,
    Llama-3.3, Qwen 2.5, and many others. Temperature is set to 0.2 to
    keep JSON output stable, since smaller open models can be looser
    than Claude on schema adherence.

    A token is required. It is looked up in HF_TOKEN, then
    HUGGING_FACE_HUB_TOKEN, then `huggingface_hub.get_token()` (which
    covers a token stored by `hf auth login`). HF Spaces do NOT
    auto-inject a token on public Spaces; the Space owner has to add it
    as a Space secret.

    Args:
        system_block: System prompt text.
        user_prompt: Content of the single user turn.

    Returns:
        The message content of the first choice in the completion.

    Raises:
        RuntimeError: if no token is found, or if the provider layer
            reports the model as unsupported (chained to the original
            client error).
        Exception: any other client error is re-raised unchanged.
    """
    from huggingface_hub import InferenceClient, get_token

    token = (
        os.environ.get("HF_TOKEN")
        or os.environ.get("HUGGING_FACE_HUB_TOKEN")
        or get_token()  # checks ~/.cache/huggingface/token from `hf auth login`
    )
    if not token:
        raise RuntimeError(
            "No HuggingFace token found. The Space owner needs to add HF_TOKEN "
            "as a Space secret (Settings → Repository secrets → New secret → "
            "name: HF_TOKEN, value: a User Access Token from "
            "https://huggingface.co/settings/tokens). Then restart the Space. "
            "Until then, pick a different model from the dropdown."
        )
    # `provider="auto"` opts into the modern HF Inference Providers
    # routing layer (introduced 2024-Q4), which picks the right partner
    # (featherless-ai / together-ai / hf-inference / etc.) for the model
    # automatically. Without this flag, InferenceClient falls back to
    # the legacy hf-inference-only path, which doesn't serve most newer
    # models and returns a misleading "model not supported" error even
    # when the user has all providers enabled and access to the model.
    client = InferenceClient(
        model=HF_MODEL_ID,
        token=token,
        provider="auto",
        timeout=120,
    )
    try:
        resp = client.chat_completion(
            messages=[
                {"role": "system", "content": system_block},
                {"role": "user", "content": user_prompt},
            ],
            max_tokens=2500,
            temperature=0.2,
        )
    except Exception as e:
        msg = str(e)
        # HF Inference Providers routes each model through a partner
        # (featherless-ai, together-ai, hf-inference, etc.). If none of
        # the enabled providers serves the requested model, the API
        # returns a BadRequestError with code=model_not_supported. The
        # raw error is opaque to users, so translate it into the actual
        # fix, chained with `from e` so the original error is kept as
        # the cause.
        if "model_not_supported" in msg or "not supported by any provider" in msg:
            raise RuntimeError(
                f"The model '{HF_MODEL_ID}' isn't available through any of "
                "the HuggingFace Inference Providers enabled on your account. "
                "Two fixes: (a) enable a provider that supports this model at "
                "https://huggingface.co/settings/inference-providers, OR "
                "(b) set HF_MODEL_ID as a Space variable to a model on your "
                "enabled providers — microsoft/Phi-4-mini-instruct works "
                "broadly via featherless-ai."
            ) from e
        raise
    return resp.choices[0].message.content


# ZeroGPU backend — pre-load pattern.
#
# Model is loaded onto CPU at Space startup (module init), NOT inside
# `@spaces.GPU`. This is the documented HuggingFace ZeroGPU pattern:
#   - Module init runs once at Space startup, on CPU, with no GPU
#     quota consumed. The expensive part — downloading ~7.6GB of
#     safetensors and deserializing into PyTorch state — happens here.
#   - Inside `@spaces.GPU`, all we do is `.to('cuda')` + tokenize +
#     generate + decode. Wall-clock drops to ~10-15s warm, ~20-25s
#     after Space restart (the .to('cuda') for 7.6GB takes a few
#     seconds over PCIe).
#
# Why we deliberately do NOT pass `trust_remote_code=True`:
# Phi-4-mini-instruct's architecture is `phi3`, which transformers
# 4.46+ supports natively via `Phi3ForCausalLM` — no custom code
# download required. The custom modeling code that ships with the
# model on HF Hub (`modeling_phi3.py`) imports `LossKwargs` from
# `transformers.utils`, which was removed in transformers 4.57+ —
# loading WITH `trust_remote_code=True` fails with
# `ImportError: cannot import name 'LossKwargs' from
# 'transformers.utils'` and bricks the `@spaces.GPU` worker. The
# native path avoids the upstream pin-mismatch entirely.
#
# Tradeoff: ~30-60s slower Space cold-start (the one-time CPU load).
# Acceptable because Spaces only restart on deploy or after a long
# idle period. Worth it for the 2.5x quota efficiency.

# Both handles default to None and are populated only when the optional
# zerogpu dependencies were importable.
_zerogpu_tokenizer = None
_zerogpu_model = None
if _ZEROGPU_DEPS_AVAILABLE:
    _zerogpu_tokenizer = _AutoTokenizer.from_pretrained(ZEROGPU_MODEL_ID)
    # No device_map is passed, so the weights load on CPU. ZeroGPU has no
    # GPU available at module load; the model is moved to the GPU per
    # call inside @spaces.GPU.
    _zerogpu_model = _AutoModelForCausalLM.from_pretrained(
        ZEROGPU_MODEL_ID,
        torch_dtype=_torch.bfloat16,
    )


def _zerogpu_invoke(system_block: str, user_prompt: str) -> str:
    """Run one chat-template generation on the pre-loaded ZeroGPU model.

    Reads the module-level `_zerogpu_tokenizer` and `_zerogpu_model`
    (which tests monkeypatch with fakes). It is kept separate from the
    `@spaces.GPU`-decorated wrapper so it can be unit-tested without
    allocating a GPU.

    Args:
        system_block: System prompt text.
        user_prompt: Content of the single user turn.

    Returns:
        The decoded newly generated tokens, with the prompt tokens
        sliced off and special tokens skipped.
    """
    device = "cuda"
    # The weights were already loaded at module init; this only moves
    # them onto the GPU that @spaces.GPU allocated for the call.
    _zerogpu_model.to(device)

    chat = [
        {"role": "system", "content": system_block},
        {"role": "user", "content": user_prompt},
    ]
    prompt_ids = _zerogpu_tokenizer.apply_chat_template(
        chat,
        return_tensors="pt",
        add_generation_prompt=True,
    ).to(device)

    generated = _zerogpu_model.generate(
        prompt_ids,
        max_new_tokens=2500,
        temperature=0.2,
        do_sample=True,
        pad_token_id=_zerogpu_tokenizer.eos_token_id,
    )

    # The output starts with the prompt tokens; keep only what follows.
    completion_ids = generated[0][prompt_ids.shape[1]:]
    return _zerogpu_tokenizer.decode(completion_ids, skip_special_tokens=True)


if not _ZEROGPU_DEPS_AVAILABLE:

    def _call_zerogpu(system_block: str, user_prompt: str) -> str:
        """Fallback used when the zerogpu dependencies are missing.

        Always raises RuntimeError explaining how to enable the backend
        or which other providers to pick.
        """
        raise RuntimeError(
            "ZeroGPU backend requires `spaces`, `transformers`, and `torch` "
            "to be importable AND should be run on a HuggingFace Pro Space "
            "for free on-demand GPU. Install the full requirements.txt and "
            "deploy to a Space, or pick anthropic / huggingface from the "
            "provider dropdown."
        )

else:

    @_spaces.GPU(duration=ZEROGPU_DURATION_SECONDS)
    def _call_zerogpu(system_block: str, user_prompt: str) -> str:
        """ZeroGPU backend entry point.

        A thin wrapper around the testable `_zerogpu_invoke`, so that
        the `@spaces.GPU` decoration happens at module load time. The
        model run is whatever ZEROGPU_MODEL_ID points at
        (Phi-4-mini-instruct by default).
        """
        return _zerogpu_invoke(system_block, user_prompt)


# Registry mapping each provider name to the function implementing it.
# `_call_model` dispatches through this table.
PROVIDERS = dict(
    anthropic=_call_anthropic,
    huggingface=_call_huggingface,
    zerogpu=_call_zerogpu,
)


def _call_model(system_block: str, user_prompt: str, provider: str) -> str:
    """Run the prompt pair through the backend registered under `provider`.

    Args:
        system_block: System prompt text.
        user_prompt: Content of the single user turn.
        provider: A key of `PROVIDERS`.

    Returns:
        Whatever the selected backend returns (the raw model text).

    Raises:
        ValueError: if `provider` is not a key of `PROVIDERS`; callers
            are expected to validate before calling.
    """
    backend = PROVIDERS.get(provider)
    if backend is None:
        raise ValueError(
            f"Unknown provider: {provider!r}; expected one of {sorted(PROVIDERS)}"
        )
    return backend(system_block, user_prompt)


# Provider name chosen once, at module import, by `_detect_provider()`
# reading os.environ. The Gradio UI exposes a runtime override via the
# Provider dropdown.
DEFAULT_PROVIDER = _detect_provider()

# Option lists for the clarifier inputs (presumably the three optional
# clarifiers mentioned in the module docstring).
INDUSTRIES = [
    "insurance",
    "banking",
    "healthcare",
    "retail",
    "manufacturing",
    "logistics",
    "agriculture",
    "energy",
    "telecom",
    "media",
    "professional services",
    "real estate",
    "other",
]
SCALES = [
    "pilot",
    "department",
    "business unit",
    "enterprise",
]
BUDGETS = [
    "<$100K",
    "$100K–$1M",
    "$1M–$10M",
    ">$10M",
]


# ---------------------------------------------------------------------------
# Sample initiatives (gr.Examples) — one per verdict quadrant
# ---------------------------------------------------------------------------
# Realistic ~250–400-word AI-initiative descriptions that should land in a
# specific quadrant of the 2×2 verdict matrix. Used to seed user testing
# and give first-time visitors something concrete to click.

# Sample description intended to land in the "compounder" quadrant.
# The em dashes are real U+2014 characters; the mis-encoded "β€”"
# sequences would otherwise be shown to users and sent to the model.
_SAMPLE_COMPOUNDER = (
    "We're a regional commercial insurance carrier specializing in restaurant "
    "general liability. We write about 8,000 policies a year across the "
    "Midwest, with average annual premium around $4,500. Underwriting is "
    "the bottleneck of our business — independent agents wait 36 to 48 "
    "hours for a quote because our underwriters manually pull industry "
    "codes, loss runs, and prior-carrier history from three different "
    "systems and then decide whether to bind, decline, or refer. Roughly "
    "30% of submissions get declined and another 15% are referred to "
    "senior underwriters, which adds another day. We're deploying an "
    "LLM-powered underwriting assistant that pulls the data automatically, "
    "flags risk factors based on patterns in our 12-year claims database, "
    "and proposes a base rate with an explanation. The underwriter "
    "reviews, adjusts, and approves. Every policy we write generates new "
    "claim outcomes — fires, slip-and-falls, liquor-liability claims, "
    "food-poisoning suits — and those outcomes feed back into the next "
    "quarter's model retraining. Our competitors mostly use Verisk's "
    "industry-standard rating models, which we don't share data with, so "
    "our model gets better on our specific book of business while theirs "
    "reflects the industry average. Internal goal: cut time-to-quote from "
    "36 hours to 4 hours, increase the win rate on profitable risks by "
    "15%, and progressively shift the loss ratio by 1–2 points per year "
    "as the model learns from each renewal cycle. Independent agents have "
    "already started favoring carriers with faster quote turnaround."
)

# Sample description intended to land in the "one-shot-win" quadrant.
# The em dashes are real U+2014 characters; the mis-encoded "β€”"
# sequences would otherwise be shown to users and sent to the model.
_SAMPLE_ONE_SHOT_WIN = (
    "We're a community bank with $4B in assets and 38 branches across two "
    "states. Loan officers spend about 6 hours per commercial loan "
    "reviewing financial statements, tax returns, and corporate documents "
    "before they can write the credit memo. We're deploying GPT-4 to "
    "extract key fields — revenue, EBITDA, debt service coverage ratio, "
    "ownership structure, related-party transactions, collateral "
    "descriptions — from these documents into a structured form. The loan "
    "officer reviews the extraction and writes the credit memo by hand. "
    "We expect to cut document review time from 6 hours to about 90 "
    "minutes per loan, processing roughly 2,400 commercial loans a year. "
    "The vendor provides the model, the document templates, and the "
    "extraction prompts, and is selling the same system to four of our "
    "peer community banks in the region under identical contracts. The AI "
    "doesn't learn from the outcome of the loan: defaults, prepayments, "
    "modifications, restructurings all go into our separate loan "
    "servicing system, which has never connected back to the extraction "
    "model. The vendor's three-year roadmap doesn't include any feedback "
    "loop between loan performance and the model — they treat extraction "
    "as a deterministic task. We're funding the project from the "
    "operations budget; the credit team is excited about the time savings "
    "but the chief credit officer has flagged that the productivity gain "
    "will be one-time and won't show up in the loss-given-default rate "
    "over time."
)

# Sample description intended to land in the "wrong-thing" quadrant.
# The em dash is a real U+2014 character; the mis-encoded "β€”"
# sequence would otherwise be shown to users and sent to the model.
_SAMPLE_WRONG_THING = (
    "We're a third-party logistics provider with 8 warehouses on the East "
    "Coast handling about 20,000 orders a day across the network. We're "
    "investing in computer vision software to optimize order picking "
    "routes — the AI looks at the warehouse layout, current orders, and "
    "worker positions and suggests optimized pick paths in real time. "
    "Pilot results show a 12% reduction in steps per order on the test "
    "floor. Our operations team has been excited about this for 18 months "
    "and we just signed a multi-year contract with the vendor. Some "
    "context on the operation: our warehouses run 2 shifts. Order volume "
    "in shift 1 is around 14,000 orders per day; shift 2 is around 6,000. "
    "The pick wave finishes by 2pm on shift 1, then workers wait 4 to 5 "
    "hours for shift 2 trucks to arrive at the loading docks. The trucks "
    "are scheduled by the customer (a major national retailer) and arrive "
    "in unpredictable windows between 6pm and 10pm. We don't control the "
    "truck schedule and the customer won't share their advance schedule "
    "with us. The CFO has been asking us why total throughput per "
    "warehouse hasn't moved much in three years; our answer has been that "
    "the legacy warehouse management system is the constraint, which is "
    "why we're investing in better picking AI. Same-store labor cost is "
    "up 8% year-over-year because workers are paid through the idle hours."
)

# Sample description intended to land in the "roman-candle" quadrant.
# The em dashes are real U+2014 characters; the mis-encoded "β€”"
# sequences would otherwise be shown to users and sent to the model.
_SAMPLE_ROMAN_CANDLE = (
    "We run a chain of 220 quick-service restaurants across the Southeast "
    "doing about $480M in annual revenue. Our gross margin has been under "
    "pressure from rising ingredient costs and we're rolling out an "
    "AI-powered personalized marketing platform that sends customized "
    "email and SMS offers based on customer purchase history, location, "
    "and local weather. The platform is from a major QSR-tech vendor used "
    "by several of our direct competitors in the same markets we operate "
    "in. Our customer data — names, emails, phone numbers, purchase "
    "frequency, average ticket size — lives in our point-of-sale "
    "provider's cloud, which the marketing platform pulls from via the "
    "POS provider's standard integration. Both the purchase data feed and "
    "the modeling are the vendor's stack; we don't see the underlying "
    "model and our data is commingled with other QSR brands the vendor "
    "serves on a shared inference fleet. We expect to lift email "
    "click-through by 8–12% based on the vendor's benchmark studies of "
    "similar brands. The marketing team is running the rollout; finance "
    "signed off on the multi-year subscription. We have not measured what "
    "is actually constraining same-store sales growth — drive-thru wait "
    "times, menu pricing relative to local competitors, or breakfast "
    "daypart penetration — we just know revenue has been flat for two "
    "years and the board wants visible action by Q4."
)


def _format_reference_entry(fields):
    """Render one reference record as a YAML-style list item.

    `fields` is an ordered list of ``(key, value)`` pairs. The first pair is
    emitted as ``- key: value`` and each later pair as ``  key: value``.

    The lines are built explicitly rather than by dedenting an interpolated
    f-string: dedenting after interpolation breaks as soon as a value contains
    a newline, because the value's own lines then define the common margin and
    the whole entry keeps its source-code indentation.
    """
    (first_key, first_value), *rest = fields
    lines = [f"- {first_key}: {first_value}"]
    lines.extend(f"  {key}: {value}" for key, value in rest)
    # Trailing whitespace at the end of an entry is trimmed, as before.
    return "\n".join(lines).rstrip()


def _load_reference() -> tuple[str, str]:
    """Read the prompt template and reference JSON files from disk.

    Returns:
        ``(prompt_template, system_block)``. ``prompt_template`` is the raw
        text of ``prompts/diagnose.txt``; ``system_block`` is that text with
        the ``{{portraits_block}}`` and ``{{failure_modes_block}}``
        placeholders filled in from ``reference/portraits.json`` and
        ``reference/failure-modes.json``.

    Raises:
        OSError: if any of the three files cannot be read.
        json.JSONDecodeError: if a reference file is not valid JSON.
        KeyError: if a reference record lacks one of the expected keys.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the host's
    # locale default encoding.
    prompt_template = (ROOT / "prompts" / "diagnose.txt").read_text(encoding="utf-8")
    portraits = json.loads(
        (ROOT / "reference" / "portraits.json").read_text(encoding="utf-8")
    )
    failure_modes = json.loads(
        (ROOT / "reference" / "failure-modes.json").read_text(encoding="utf-8")
    )

    portrait_keys = (
        "id",
        "label",
        "bottleneck",
        "summary",
        "compounding_summary",
        "article_url",
    )
    portraits_block = "\n".join(
        _format_reference_entry([(key, portrait[key]) for key in portrait_keys])
        for portrait in portraits
    )

    failure_modes_block = "\n".join(
        _format_reference_entry(
            [
                ("id", mode["id"]),
                ("label", mode["label"]),
                # Stored as a list in the JSON; shown as a comma-separated line.
                ("applies_to_quadrants", ", ".join(mode["applies_to_quadrants"])),
                ("summary", mode["summary"]),
                ("url", mode["url"]),
            ]
        )
        for mode in failure_modes
    )

    system_block = (
        prompt_template
        .replace("{{portraits_block}}", portraits_block)
        .replace("{{failure_modes_block}}", failure_modes_block)
    )

    return prompt_template, system_block


# Loaded once at module import; cached in memory for the life of the process.
# PROMPT_TEMPLATE is the raw template text and SYSTEM_BLOCK is the same text
# with the reference placeholders filled in; diagnose() reuses both on every
# request. Because this runs at import time, a missing or unreadable reference
# file makes `import` of this module fail.
PROMPT_TEMPLATE, SYSTEM_BLOCK = _load_reference()


# ---------------------------------------------------------------------------
# Diagnose entrypoint (called by the Gradio Submit handler)
# ---------------------------------------------------------------------------


def _scrub_error_detail(detail: str, sensitive_values: list[str], limit: int = 400) -> str:
    """Redact secrets from an error message, then cap its length.

    Redaction must happen before truncation: if the message were cut first, a
    key straddling the cut-off would no longer match as a whole string and its
    leading characters would be shown to the user.
    """
    for value in sensitive_values:
        # Values shorter than 8 characters are ignored (same threshold as the
        # original guard) so a stray fragment cannot blank out unrelated text.
        if value and len(value) >= 8:
            detail = detail.replace(value, "[redacted]")
    if len(detail) > limit:
        detail = detail[:limit] + "…"
    return detail


def _render_user_prompt(description: str, industry, scale, budget) -> str:
    """Fill the per-request placeholders of PROMPT_TEMPLATE in a single pass.

    A single regex pass means substituted text is never re-scanned, so a
    description that itself contains e.g. ``{{industry}}`` is inserted
    verbatim instead of being rewritten by a later replacement.
    """
    import re  # local import: keeps this block independent of the file header

    values = {
        "user_input": description,
        "industry": industry or "(not specified)",
        "scale": scale or "(not specified)",
        "budget": budget or "(not specified)",
    }
    # A function replacement is used so backslashes in user text are not
    # interpreted as regex group references.
    return re.sub(
        r"\{\{(user_input|industry|scale|budget)\}\}",
        lambda match: values[match.group(1)],
        PROMPT_TEMPLATE,
    )


def diagnose(
    description: str,
    industry: Optional[str],
    scale: Optional[str],
    budget: Optional[str],
    provider: Optional[str] = None,
    anthropic_api_key: Optional[str] = None,
) -> tuple[str, str]:
    """Validate the input, call the selected model, and format the result.

    Args:
        description: Free-text description of the initiative. It is stripped
            and must contain between MIN_DESCRIPTION_WORDS and
            MAX_DESCRIPTION_WORDS whitespace-separated words.
        industry, scale, budget: Optional dropdown values; a missing value is
            sent to the model as "(not specified)".
        provider: Key into PROVIDERS; falls back to DEFAULT_PROVIDER when
            empty.
        anthropic_api_key: Per-call key used only when provider is
            "anthropic". When given, it takes precedence over the
            ANTHROPIC_API_KEY environment variable for this request. It is
            passed straight to _call_anthropic and is never written to
            os.environ by this function.

    Returns:
        ``(markdown_writeup, raw_json_string)`` for the two UI tabs. On every
        handled error path (validation, provider call failure, malformed model
        output) the first element is a user-facing message and the second is
        an empty string.
    """
    # NOTE(review): the "⚠" prefix and the "…" suffix used in the messages
    # below look like mis-decoded "warning sign" / "ellipsis" characters.
    # Confirm the file's encoding before changing them; they are reproduced
    # exactly as found because they are user-facing runtime text.
    description = (description or "").strip()
    if not description:
        return "⚠ Please describe your AI initiative.", ""

    words = len(description.split())
    if words < MIN_DESCRIPTION_WORDS:
        return (
            f"⚠ Please describe your initiative in at least {MIN_DESCRIPTION_WORDS} words "
            f"(you wrote {words}). The diagnostic needs enough context to score the four "
            f"compounding conditions with rationale quoting your description.",
            "",
        )
    if words > MAX_DESCRIPTION_WORDS:
        return (
            f"⚠ Please keep your description under {MAX_DESCRIPTION_WORDS} words "
            f"(you wrote {words}). Shorten the description and try again.",
            "",
        )

    provider = provider or DEFAULT_PROVIDER
    if provider not in PROVIDERS:
        return (
            f"⚠ Unknown model provider {provider!r}. Pick one of "
            f"{sorted(PROVIDERS)} from the dropdown.",
            "",
        )

    # Premium (Anthropic) needs a key from either the page's API-key field
    # (per call) or an ANTHROPIC_API_KEY env var. Without either, fail fast
    # with a friendly explanation before the SDK is touched.
    env_key = os.environ.get("ANTHROPIC_API_KEY", "").strip()
    user_key_for_anthropic: Optional[str] = None
    if provider == "anthropic":
        user_key = (anthropic_api_key or "").strip()
        if not env_key and not user_key:
            return (
                "⚠ Premium (Claude Opus) needs an Anthropic API key. Either "
                "paste your key in the field above, or pick one of the free "
                "options from the model dropdown.",
                "",
            )
        if user_key:
            # IMPORTANT: do NOT write the user-supplied key to os.environ.
            # The process environment is shared by every in-flight request,
            # so that would expose one visitor's key to another. The key is
            # handed directly to _call_anthropic below instead.
            user_key_for_anthropic = user_key

    user_prompt = _render_user_prompt(description, industry, scale, budget)

    try:
        # With a visitor-supplied key, bypass the generic dispatcher so the
        # key travels as a keyword argument. Every other path goes through
        # the dispatcher.
        if provider == "anthropic" and user_key_for_anthropic:
            raw = _call_anthropic(
                SYSTEM_BLOCK, user_prompt, api_key=user_key_for_anthropic,
            )
        else:
            raw = _call_model(SYSTEM_BLOCK, user_prompt, provider)
    except Exception as e:
        # Deliberately broad: this is the UI boundary, and any provider
        # failure (timeout, rate limit, auth, network, local inference) must
        # become a readable message rather than a stack trace. Both the
        # exception class and its text are shown so failures are diagnosable
        # without server log access.
        model_label = {
            "anthropic": ANTHROPIC_MODEL_ID,
            "huggingface": HF_MODEL_ID,
            "zerogpu": ZEROGPU_MODEL_ID,
        }.get(provider, provider)
        # Defense-in-depth: scrub both the visitor's key and the server's own
        # key in case an SDK ever echoes one into its error text, and only
        # then cap the length.
        detail = _scrub_error_detail(
            str(e).strip() or "(no message)",
            [user_key_for_anthropic or "", env_key],
        )
        return (
            f"⚠ The diagnostic call to {provider} ({model_label}) failed.\n\n"
            f"**{type(e).__name__}:** {detail}\n\n"
            f"Try again in a moment, switch providers in the dropdown, "
            f"or shorten your description.",
            "",
        )

    try:
        parsed = parse_response(raw)
    except MalformedResponseError as e:
        return (
            f"⚠ The model returned malformed output. Try again with a different description "
            f"or shorten the existing one.\n\nDetail: {e}",
            "",
        )

    # Flatten the parsed result into plain dicts/lists so it can be shown as
    # JSON in the second tab.
    payload = {
        "constraint": parsed.constraint,
        "quadrant": parsed.quadrant,
        "closest_portrait": parsed.closest_portrait,
        "closest_portrait_paragraph": parsed.closest_portrait_paragraph,
        "scores": {
            k: {"score": v.score, "rationale": v.rationale, "quoted_span": v.quoted_span}
            for k, v in parsed.scores.items()
        },
        "warnings": [
            {"text": w.text, "citation_source": w.citation_source, "citation_url": w.citation_url}
            for w in parsed.warnings
        ],
    }
    return parsed.writeup, json.dumps(payload, indent=2)


# ---------------------------------------------------------------------------
# Gradio UI (built lazily so `import app` from tests does not require gradio)
# ---------------------------------------------------------------------------


def build_demo():
    """Build and return the Gradio Blocks UI.

    gradio is imported inside the function so that importing this module
    (for example from tests) does not require gradio to be installed.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    import gradio as gr

    # NOTE(review): sequences such as "Β·", "β€”" and "–" in the labels below
    # look like mis-decoded middle dots and dashes. Confirm the file's
    # encoding before changing them; they are reproduced exactly as found,
    # and the budget strings in the examples must keep matching the entries
    # of BUDGETS character for character.

    # Free option first, premium second, with plain-English labels.
    #
    # The "huggingface" provider is intentionally NOT offered here: per the
    # original note it needs billing set up by the Space owner. It stays
    # reachable through the MODEL_PROVIDER env override (see README.md).
    provider_choices = []
    if _zerogpu_available():
        provider_choices.append((
            "Free Β· Phi-4-mini-instruct (Microsoft) β€” runs on GPU",
            "zerogpu",
        ))
    provider_choices.append((
        "Premium Β· Claude Opus 4.7 (Anthropic) β€” paste your API key below",
        "anthropic",
    ))
    # The first entry is the default: the free option when it is available,
    # otherwise Premium, which is then the only choice.
    default_choice = provider_choices[0][1]

    with gr.Blocks(title="The Compounding Test") as demo:
        gr.Markdown(
            "# The Compounding Test\n\n"
            "A diagnostic for AI investments at non-technology companies. "
            "Describe your AI initiative β€” get a scored writeup in one of "
            "four outcomes: **compounder**, **one-shot win**, **compounding "
            "the wrong thing**, or **Roman Candle**.\n\n"
            "**The default is free** β€” runs an open model (Phi-4-mini) "
            "on this Space's GPU. Pick **Premium Β· Claude Opus** from "
            "the dropdown if you have an Anthropic API key and want the "
            "highest-quality writeup. Read the full framework at "
            "[mile-hi.ai/journal/the-berkshire-test]("
            "https://www.mile-hi.ai/journal/the-berkshire-test)."
        )
        with gr.Row():
            description = gr.Textbox(
                label=f"Describe your AI initiative ({MIN_DESCRIPTION_WORDS}–{MAX_DESCRIPTION_WORDS} words)",
                placeholder=(
                    "Describe the bottleneck of your operation, the AI "
                    "investment, what data feeds it, where the labels come "
                    "from, and how you expect competitors to respond. Be "
                    "specific about the workflow.\n\n"
                    "Or pick a sample below to see how it works."
                ),
                lines=12,
            )

        with gr.Row():
            industry = gr.Dropdown(INDUSTRIES, label="Industry (optional)", value=None)
            scale = gr.Dropdown(SCALES, label="Scale (optional)", value=None)
            budget = gr.Dropdown(BUDGETS, label="Budget tier (optional)", value=None)

        # Each row fills the description box and the three dropdowns.
        gr.Examples(
            examples=[
                [_SAMPLE_COMPOUNDER, "insurance", "business unit", "$1M–$10M"],
                [_SAMPLE_ONE_SHOT_WIN, "banking", "business unit", "$100K–$1M"],
                [_SAMPLE_WRONG_THING, "logistics", "enterprise", "$1M–$10M"],
                [_SAMPLE_ROMAN_CANDLE, "retail", "enterprise", "$100K–$1M"],
            ],
            inputs=[description, industry, scale, budget],
            label="Sample initiatives β€” click one to load it (then click Diagnose)",
            examples_per_page=4,
        )

        with gr.Row():
            provider = gr.Dropdown(
                choices=provider_choices,
                value=default_choice,
                label="Choose a model",
            )
        # The API-key field is shown only while Premium is selected. Its
        # initial visibility follows the default choice: the change handler
        # below only runs when the dropdown value changes, so when Premium is
        # the default (ZeroGPU unavailable) a hard-coded visible=False would
        # leave the user with no way to enter a key.
        api_key = gr.Textbox(
            label="Anthropic API key",
            placeholder="sk-ant-...",
            type="password",
            info=(
                "Used only for this request and never stored. "
                "Get a key at console.anthropic.com."
            ),
            visible=(default_choice == "anthropic"),
        )

        def _toggle_api_key(p):
            # Show the key field for Premium, hide it for everything else.
            return gr.update(visible=(p == "anthropic"))

        provider.change(_toggle_api_key, inputs=[provider], outputs=[api_key])

        submit = gr.Button("Diagnose", variant="primary")
        with gr.Tabs():
            with gr.Tab("Diagnosis"):
                writeup_out = gr.Markdown()
            with gr.Tab("Raw JSON"):
                json_out = gr.Code(language="json")
        # The input order matches diagnose()'s positional parameters.
        submit.click(
            diagnose,
            inputs=[description, industry, scale, budget, provider, api_key],
            outputs=[writeup_out, json_out],
        )

    return demo


if __name__ == "__main__":
    # Local dev: relies on .env (loaded by python-dotenv) for ANTHROPIC_API_KEY.
    # HF Spaces: relies on Space secrets.
    try:
        from dotenv import load_dotenv
    except ImportError:
        # python-dotenv is optional; without it the environment is used as-is.
        pass
    else:
        # Called outside the try body so that only a missing python-dotenv is
        # tolerated; an error raised while loading .env is not swallowed.
        load_dotenv()

    build_demo().launch()