Spaces:

AshwinP
/

compounding-test

Sleeping

File size: 37,654 Bytes

"""Sanity tests for parse_response() — the deterministic JSON-extraction
contract between the Claude response and the Gradio UI. The LLM call
itself is exempt from unit tests (Principle VII); these tests cover
only the parser surface.

Run: pytest gradio-apps/compounding-test/test_diagnose.py -v
"""
from __future__ import annotations

import pytest

import json
import os
from unittest.mock import MagicMock

from app import (
    ANTHROPIC_MODEL_ID,
    HF_MODEL_ID,
    MalformedResponseError,
    PROVIDERS,
    _call_anthropic,
    _call_huggingface,
    _call_model,
    _call_zerogpu,
    _detect_provider,
    _zerogpu_available,
    _zerogpu_invoke,
    diagnose,
    parse_response,
)
import app as app_module


# --- Fixtures ---------------------------------------------------------------

# Canonical well-formed model response: a fenced ```json block followed by a
# markdown writeup. The sad-path tests below derive their inputs from this
# string with str.replace(), so the exact spelling and spacing of each
# key/value pair matters — changing it here silently turns those replace()
# calls into no-ops.
VALID_JSON_BLOCK = """```json
{
  "constraint": "Underwriting at the quote screen.",
  "scores": {
    "proprietary_data":         { "score": 4, "rationale": "First-party policy data.", "quoted_span": "claim outcomes Progressive observes directly" },
    "self_labeling":            { "score": 4, "rationale": "Every policy term self-labels.", "quoted_span": "every policy term produces a claim" },
    "decreasing_marginal_cost": { "score": 3, "rationale": "Pipeline amortized.", "quoted_span": "17 years of amortized pipeline" },
    "defensible_asymmetry":     { "score": 3, "rationale": "Integration depth capped.", "quoted_span": "behavior data integration depth" }
  },
  "quadrant": "compounder",
  "closest_portrait": "progressive",
  "closest_portrait_paragraph": "Your case tracks Progressive most closely because the labeling loop runs through claim outcomes you directly observe.",
  "warnings": []
}
```

# The Verdict

Your initiative is a compounder. Here's why this lands cleanly on all
four conditions and what to do Monday morning to keep it compounding.
"""


# --- Happy path -------------------------------------------------------------


def test_valid_response_parses_top_level_fields():
    """Every top-level field of the fixture survives parse_response().

    The portrait paragraph is pinned to the fixture's exact text. The
    previous check (``"compounder" in ... or "Progressive" in ...``) had a
    disjunct that is never true for this fixture, so it only verified that
    one word was present.
    """
    r = parse_response(VALID_JSON_BLOCK)
    assert r.constraint == "Underwriting at the quote screen."
    assert r.quadrant == "compounder"
    assert r.closest_portrait == "progressive"
    assert r.closest_portrait_paragraph == (
        "Your case tracks Progressive most closely because the labeling "
        "loop runs through claim outcomes you directly observe."
    )
    assert r.warnings == []


def test_valid_response_captures_writeup_after_json_block():
    """The prose that follows the fenced JSON block is exposed as ``writeup``."""
    writeup = parse_response(VALID_JSON_BLOCK).writeup
    for fragment in ("# The Verdict", "Monday morning"):
        assert fragment in writeup


def test_valid_response_extracts_all_four_scores():
    """All four condition keys are present and carry the fixture's values."""
    expected_conditions = {
        "proprietary_data",
        "self_labeling",
        "decreasing_marginal_cost",
        "defensible_asymmetry",
    }
    parsed = parse_response(VALID_JSON_BLOCK)
    assert set(parsed.scores.keys()) == expected_conditions
    # Spot-check individual entries rather than the whole structure.
    assert parsed.scores["proprietary_data"].score == 4
    assert parsed.scores["decreasing_marginal_cost"].score == 3
    asymmetry = parsed.scores["defensible_asymmetry"]
    assert asymmetry.quoted_span == "behavior data integration depth"


# --- Sad path: missing or malformed JSON ------------------------------------


def test_no_json_block_raises():
    """Prose with no fenced JSON block is rejected with a message matching "json"."""
    prose_only = "Hi there, no JSON here at all, just prose explaining the verdict."
    with pytest.raises(MalformedResponseError, match="json"):
        parse_response(prose_only)


def test_invalid_json_inside_block_raises():
    """A fenced block whose body is not valid JSON is rejected."""
    broken_payload = "```json\n{ this is not valid json }\n```\n# Writeup"
    with pytest.raises(MalformedResponseError):
        parse_response(broken_payload)


def test_missing_required_top_level_field_raises():
    """A block lacking closest_portrait, closest_portrait_paragraph and
    warnings is rejected with a message matching "closest_portrait"."""
    incomplete = "\n".join(
        [
            "```json",
            "{",
            '  "constraint": "...",',
            '  "scores": {},',
            '  "quadrant": "compounder"',
            "}",
            "```",
        ]
    )
    with pytest.raises(MalformedResponseError, match="closest_portrait"):
        parse_response(incomplete)


# --- Sad path: enum validation ----------------------------------------------


def test_invalid_quadrant_raises():
    """An unrecognised quadrant value is rejected with a message matching "quadrant"."""
    valid_pair = '"quadrant": "compounder"'
    bogus_pair = '"quadrant": "bogus-quadrant"'
    payload = VALID_JSON_BLOCK.replace(valid_pair, bogus_pair)
    with pytest.raises(MalformedResponseError, match="quadrant"):
        parse_response(payload)


def test_invalid_closest_portrait_raises():
    """An unrecognised closest_portrait value is rejected, naming the field."""
    valid_pair = '"closest_portrait": "progressive"'
    bogus_pair = '"closest_portrait": "wells-fargo"'
    payload = VALID_JSON_BLOCK.replace(valid_pair, bogus_pair)
    with pytest.raises(MalformedResponseError, match="closest_portrait"):
        parse_response(payload)


# --- Sad path: score-range validation --------------------------------------


def test_score_below_zero_raises():
    """A negative score on proprietary_data is rejected with a message matching "score"."""
    # The rationale text is part of the needle so only the first score entry
    # in the fixture is rewritten.
    needle = '"score": 4, "rationale": "First-party policy data."'
    replacement = '"score": -1, "rationale": "First-party policy data."'
    payload = VALID_JSON_BLOCK.replace(needle, replacement)
    with pytest.raises(MalformedResponseError, match="score"):
        parse_response(payload)


def test_score_above_four_raises():
    """A score of 7 on proprietary_data is rejected with a message matching "score"."""
    needle = '"score": 4, "rationale": "First-party policy data."'
    replacement = '"score": 7, "rationale": "First-party policy data."'
    payload = VALID_JSON_BLOCK.replace(needle, replacement)
    with pytest.raises(MalformedResponseError, match="score"):
        parse_response(payload)


def test_score_not_integer_raises():
    """A string score ("high") is rejected with a message matching "score"."""
    needle = '"score": 4, "rationale": "First-party policy data."'
    replacement = '"score": "high", "rationale": "First-party policy data."'
    payload = VALID_JSON_BLOCK.replace(needle, replacement)
    with pytest.raises(MalformedResponseError, match="score"):
        parse_response(payload)


# --- Sad path: quoted_span validation --------------------------------------


def test_empty_quoted_span_raises():
    """An empty quoted_span is rejected with a message matching "quoted_span"."""
    original_span = '"quoted_span": "claim outcomes Progressive observes directly"'
    blank_span = '"quoted_span": ""'
    payload = VALID_JSON_BLOCK.replace(original_span, blank_span)
    with pytest.raises(MalformedResponseError, match="quoted_span"):
        parse_response(payload)


def test_quoted_span_over_400_chars_raises():
    """A 401-character quoted_span is rejected.

    The ceiling was raised from 200 to 400 because Phi-4-mini consistently
    emits ~200-220 character spans even when asked for 5-15 words; 400
    lets normal model output through while still guarding against
    runaway output.
    """
    too_long = 401 * "x"
    original_span = '"quoted_span": "claim outcomes Progressive observes directly"'
    payload = VALID_JSON_BLOCK.replace(original_span, f'"quoted_span": "{too_long}"')
    with pytest.raises(MalformedResponseError, match="quoted_span"):
        parse_response(payload)


def test_quoted_span_up_to_400_chars_accepted():
    """Spans above the prior 200-char cap, up to exactly 400, are accepted.

    250 represents typical Phi-4-mini output; 400 is the boundary itself.
    Previously only 250 was exercised (in a variable named ``at_limit``),
    so an off-by-one in the ceiling check would have gone unnoticed. The
    401-character rejection is covered by
    test_quoted_span_over_400_chars_raises.
    """
    for span_length in (250, 400):
        span = "x" * span_length
        raw = VALID_JSON_BLOCK.replace(
            '"quoted_span": "claim outcomes Progressive observes directly"',
            f'"quoted_span": "{span}"',
        )
        r = parse_response(raw)
        assert len(r.scores["proprietary_data"].quoted_span) == span_length


# --- Tolerance: forward-compat and whitespace ------------------------------


def test_extra_unknown_fields_tolerated():
    """Unknown extra top-level keys do not make parse_response() raise."""
    known_tail = '"warnings": []'
    extended_tail = '"warnings": [], "future_field": "ignored", "another": 42'
    payload = VALID_JSON_BLOCK.replace(known_tail, extended_tail)
    parsed = parse_response(payload)  # must not raise
    assert parsed.quadrant == "compounder"


# --- Provider auto-detection (multi-backend support) ----------------------


def test_detect_provider_explicit_anthropic_wins():
    """MODEL_PROVIDER=anthropic is honoured even when an HF token is also set."""
    environ = {"MODEL_PROVIDER": "anthropic", "HF_TOKEN": "hf-xxx"}
    detected = _detect_provider(environ)
    assert detected == "anthropic"


def test_detect_provider_explicit_huggingface_wins():
    """MODEL_PROVIDER=huggingface is honoured even when an Anthropic key is also set."""
    environ = {"MODEL_PROVIDER": "huggingface", "ANTHROPIC_API_KEY": "sk-xxx"}
    detected = _detect_provider(environ)
    assert detected == "huggingface"


def test_detect_provider_case_insensitive():
    """A mixed-case MODEL_PROVIDER value resolves to the lowercase provider key."""
    mixed_case_env = {"MODEL_PROVIDER": "HuggingFace"}
    assert _detect_provider(mixed_case_env) == "huggingface"


def test_detect_provider_invalid_explicit_falls_through():
    """An unrecognised MODEL_PROVIDER is ignored and credential auto-detect decides."""
    environ = {"MODEL_PROVIDER": "bogus", "ANTHROPIC_API_KEY": "sk-xxx"}
    detected = _detect_provider(environ)
    assert detected == "anthropic"


def test_detect_provider_anthropic_when_only_anthropic_key_set():
    """With only ANTHROPIC_API_KEY present, the anthropic provider is selected."""
    only_anthropic = {"ANTHROPIC_API_KEY": "sk-xxx"}
    assert _detect_provider(only_anthropic) == "anthropic"


def test_detect_provider_huggingface_when_only_hf_token_set():
    """With only HF_TOKEN present, the huggingface provider is selected."""
    only_hf = {"HF_TOKEN": "hf-xxx"}
    assert _detect_provider(only_hf) == "huggingface"


def test_detect_provider_huggingface_when_running_on_hf_space_without_zerogpu(monkeypatch):
    """On a Space (SPACE_ID set) where ZeroGPU deps are reported missing,
    detection falls back to the huggingface inference API."""
    monkeypatch.setattr(app_module, "_zerogpu_available", lambda: False)
    space_env = {"SPACE_ID": "mile-hi-ai/compounding-test"}
    assert _detect_provider(space_env) == "huggingface"


def test_detect_provider_prefers_zerogpu_on_pro_space_with_deps(monkeypatch):
    """On a Space where ZeroGPU deps (transformers + torch + spaces) are
    reported available, detection defaults to the free GPU backend rather
    than burning inference credits."""
    monkeypatch.setattr(app_module, "_zerogpu_available", lambda: True)
    space_env = {"SPACE_ID": "mile-hi-ai/compounding-test"}
    assert _detect_provider(space_env) == "zerogpu"


def test_detect_provider_explicit_anthropic_wins_over_zerogpu(monkeypatch):
    """An explicit MODEL_PROVIDER beats the zerogpu auto-detect, even on a
    Space where ZeroGPU deps are reported available."""
    monkeypatch.setattr(app_module, "_zerogpu_available", lambda: True)
    environ = {"MODEL_PROVIDER": "anthropic", "SPACE_ID": "mile-hi-ai/compounding-test"}
    detected = _detect_provider(environ)
    assert detected == "anthropic"


def test_detect_provider_explicit_zerogpu_wins():
    """MODEL_PROVIDER=zerogpu on its own selects the zerogpu provider."""
    explicit_env = {"MODEL_PROVIDER": "zerogpu"}
    assert _detect_provider(explicit_env) == "zerogpu"


def test_zerogpu_is_in_providers_dict():
    """The "zerogpu" key is registered in PROVIDERS.

    It must exist even when the deps aren't installed locally so the UI
    dropdown can show it (the stub raises a clear error if invoked).
    """
    is_registered = "zerogpu" in PROVIDERS
    assert is_registered


def test_detect_provider_alt_hf_token_var():
    """HUGGING_FACE_HUB_TOKEN (also recognised by HuggingFace SDKs) selects huggingface."""
    alt_var_env = {"HUGGING_FACE_HUB_TOKEN": "hf-xxx"}
    assert _detect_provider(alt_var_env) == "huggingface"


def test_detect_provider_default_when_nothing_set():
    """With no credentials at all, detection defaults to anthropic
    (clearest error at call time)."""
    empty_env = {}
    assert _detect_provider(empty_env) == "anthropic"


# --- Provider dispatch (_call_model routes to the right backend) -----------


def test_call_model_routes_to_anthropic_backend(monkeypatch):
    """_call_model(..., "anthropic") invokes PROVIDERS["anthropic"] once
    with the system and user text and returns its result."""
    seen = []

    def fake_backend(s, u):
        seen.append(("anthropic", s, u))
        return "anth-out"

    monkeypatch.setitem(PROVIDERS, "anthropic", fake_backend)
    result = _call_model("system-text", "user-text", "anthropic")
    assert result == "anth-out"
    assert seen == [("anthropic", "system-text", "user-text")]


def test_call_model_routes_to_huggingface_backend(monkeypatch):
    """_call_model(..., "huggingface") invokes PROVIDERS["huggingface"] once
    with the system and user text and returns its result."""
    seen = []

    def fake_backend(s, u):
        seen.append(("hf", s, u))
        return "hf-out"

    monkeypatch.setitem(PROVIDERS, "huggingface", fake_backend)
    result = _call_model("system-text", "user-text", "huggingface")
    assert result == "hf-out"
    assert seen == [("hf", "system-text", "user-text")]


def test_call_model_unknown_provider_raises():
    """An unregistered provider name raises ValueError matching "provider"."""
    unknown = "bogus-provider"
    with pytest.raises(ValueError, match="provider"):
        _call_model("s", "u", unknown)


# --- diagnose() input validation -------------------------------------------

# Shared by the diagnose() tests: 250 filler words, comfortably above the
# 200-word minimum. The content itself is irrelevant because those tests
# mock the backend.
_LONG_DESCRIPTION = " ".join("word" for _ in range(250))


def test_diagnose_empty_description_returns_friendly_error():
    """An empty description yields a "Please describe" message and an empty JSON string."""
    message, payload = diagnose("", None, None, None, provider="zerogpu")
    assert payload == ""
    assert "Please describe" in message


def test_diagnose_short_description_returns_word_count_error():
    """A 50-word description is rejected with the minimum and the current count."""
    fifty_words = " ".join(["word"] * 50)
    message, payload = diagnose(fifty_words, None, None, None, provider="zerogpu")
    for expected in ("at least 200 words", "50"):  # "50" is the current word count
        assert expected in message
    assert payload == ""


def test_diagnose_long_description_returns_word_count_error():
    """A 5001-word description is rejected with the "under 5000 words" message."""
    oversized = " ".join(["word"] * 5001)
    message, payload = diagnose(oversized, None, None, None, provider="zerogpu")
    assert "under 5000 words" in message
    assert payload == ""


def test_diagnose_unknown_provider_returns_friendly_error():
    """An unknown provider name is reported back to the user by name."""
    message, payload = diagnose(_LONG_DESCRIPTION, None, None, None, provider="bogus")
    for expected in ("Unknown model provider", "bogus"):
        assert expected in message
    assert payload == ""


# --- diagnose() Premium (Anthropic) path -----------------------------------


def test_diagnose_premium_without_any_key_returns_friendly_error(monkeypatch):
    """With no env key and no user-supplied key, the anthropic path returns
    a message mentioning "Premium" and "API key" and no JSON."""
    monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
    message, payload = diagnose(
        _LONG_DESCRIPTION,
        None,
        None,
        None,
        provider="anthropic",
        anthropic_api_key=None,
    )
    for expected in ("Premium", "API key"):
        assert expected in message
    assert payload == ""


def test_diagnose_premium_with_empty_string_key_returns_friendly_error(monkeypatch):
    """Neither an empty nor a whitespace-only key counts as supplied.

    The test name promises the empty-string case, but only the
    whitespace-only key was exercised before; both are covered now.
    """
    monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
    for blank_key in ("", "   "):
        writeup, _ = diagnose(
            _LONG_DESCRIPTION, None, None, None,
            provider="anthropic",
            anthropic_api_key=blank_key,
        )
        assert "Premium" in writeup
        assert "API key" in writeup


def test_diagnose_premium_with_env_key_dispatches_to_anthropic(monkeypatch):
    """With only an env-level key, diagnose() dispatches to the registered
    anthropic backend and the parsed response populates the JSON output."""
    monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-env-xxx")
    seen = {}

    def fake_anthropic(system, user):
        seen.update(
            system=system,
            user=user,
            env_key_at_call_time=os.environ.get("ANTHROPIC_API_KEY"),
        )
        return _VALID_BACKEND_RESPONSE

    monkeypatch.setitem(PROVIDERS, "anthropic", fake_anthropic)

    _writeup, json_str = diagnose(
        _LONG_DESCRIPTION,
        "insurance",
        "enterprise",
        "$1M–$10M",
        provider="anthropic",
        anthropic_api_key=None,
    )
    # The backend ran (dispatch worked) and saw the env key.
    assert seen.get("env_key_at_call_time") == "sk-env-xxx"
    # The response made it through the parser, so the JSON tab is populated.
    assert json_str
    assert json.loads(json_str)["quadrant"] == "compounder"


def test_diagnose_premium_user_key_passed_directly_not_via_env(monkeypatch):
    """A key typed into the page overrides the Space owner's env key and
    travels by keyword argument only.

    The visitor's key must reach _call_anthropic via ``api_key=`` and must
    never be written to os.environ; otherwise concurrent requests from
    other visitors could pick up the wrong key from the shared process
    environment. See the _call_anthropic docstring.
    """
    owner_key = "sk-space-owner-xxx"
    monkeypatch.setenv("ANTHROPIC_API_KEY", owner_key)
    observed = {}

    def fake_call_anthropic(system, user, *, api_key=None):
        observed.update(
            api_key_kwarg=api_key,
            env_at_call_time=os.environ.get("ANTHROPIC_API_KEY"),
        )
        return _VALID_BACKEND_RESPONSE

    monkeypatch.setattr(app_module, "_call_anthropic", fake_call_anthropic)

    diagnose(
        _LONG_DESCRIPTION,
        None,
        None,
        None,
        provider="anthropic",
        anthropic_api_key="sk-user-yyy",
    )
    # The user's key arrived through the kwarg (the override mechanism).
    assert observed["api_key_kwarg"] == "sk-user-yyy"
    # CRITICAL: the env still held the owner's key during the call, i.e. it
    # was not clobbered with the visitor's key.
    assert observed["env_at_call_time"] == owner_key


def test_diagnose_premium_does_not_mutate_env_with_user_key(monkeypatch):
    """Cross-tenant key-leak regression test.

    On a public Space two visitors may submit Premium requests
    concurrently. Each must use only their own key and never see the
    other's through os.environ, so user-supplied keys must never be
    written to the environment.
    """
    monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
    visitor_key = "sk-visitor-A-secret"
    observed = {}

    def fake_call_anthropic(system, user, *, api_key=None):
        observed.update(
            api_key_kwarg=api_key,
            env_at_call_time=os.environ.get("ANTHROPIC_API_KEY"),
        )
        return _VALID_BACKEND_RESPONSE

    monkeypatch.setattr(app_module, "_call_anthropic", fake_call_anthropic)

    diagnose(
        _LONG_DESCRIPTION,
        None,
        None,
        None,
        provider="anthropic",
        anthropic_api_key=visitor_key,
    )
    # The key travelled by kwarg, not via env.
    assert observed["api_key_kwarg"] == visitor_key
    # Env was unset while the backend ran ...
    assert observed["env_at_call_time"] is None
    # ... and is still unset afterwards, leaving no residue for the next
    # visitor's request to pick up.
    assert os.environ.get("ANTHROPIC_API_KEY") is None


def test_diagnose_redacts_user_key_from_error_messages(monkeypatch):
    """Defense-in-depth: a backend exception that echoes the user's key
    must have it redacted by the F14 wrapper before reaching the UI.

    Symmetric with redactKey() in src/lib/anthropic-direct.ts.
    """
    monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
    secret = "sk-ant-very-secret-12345"

    class _LeakyError(Exception):
        pass

    def leaky_anthropic(system, user, *, api_key=None):
        # Worst case: the SDK includes the key verbatim in its error text.
        message = f"auth fail with key {api_key} rejected"
        raise _LeakyError(message)

    monkeypatch.setattr(app_module, "_call_anthropic", leaky_anthropic)

    writeup, _unused_json = diagnose(
        _LONG_DESCRIPTION,
        None,
        None,
        None,
        provider="anthropic",
        anthropic_api_key=secret,
    )
    assert secret not in writeup
    assert "[redacted]" in writeup
    # The rest of the error information is still surfaced.
    assert "LeakyError" in writeup


def test_call_anthropic_passes_api_key_to_sdk_constructor(monkeypatch):
    """An explicit ``api_key=`` is forwarded to the Anthropic() constructor
    and os.environ is left untouched."""
    ctor_kwargs = {}

    class _Block:
        text = "ok"

    class _Reply:
        content = [_Block()]

    class _Client:
        class messages:  # noqa: N801
            @staticmethod
            def create(**kwargs):
                return _Reply()

    def fake_anthropic_ctor(**kwargs):
        ctor_kwargs.update(kwargs)
        return _Client()

    import anthropic as anthropic_module
    monkeypatch.setattr(anthropic_module, "Anthropic", fake_anthropic_ctor)
    monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)

    _call_anthropic("sys", "usr", api_key="sk-direct-yyy")

    forwarded_key = ctor_kwargs.get("api_key")
    assert forwarded_key == "sk-direct-yyy"
    # The environment was not used as a side channel.
    assert os.environ.get("ANTHROPIC_API_KEY") is None


def test_call_anthropic_without_api_key_uses_env_via_sdk(monkeypatch):
    """Without ``api_key``, the SDK constructor receives no api_key kwarg.

    That leaves the SDK to read ANTHROPIC_API_KEY from the environment,
    its normal default. ``api_key=None`` is deliberately not passed
    because the SDK treats that differently from "not supplied".
    """
    ctor_kwargs = {}

    class _Block:
        text = "ok"

    class _Reply:
        content = [_Block()]

    class _Client:
        class messages:  # noqa: N801
            @staticmethod
            def create(**kwargs):
                return _Reply()

    def fake_anthropic_ctor(**kwargs):
        ctor_kwargs.update(kwargs)
        return _Client()

    import anthropic as anthropic_module
    monkeypatch.setattr(anthropic_module, "Anthropic", fake_anthropic_ctor)
    monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-env-default")

    _call_anthropic("sys", "usr")  # deliberately no api_key kwarg

    # No api_key reached the constructor; the SDK falls back to env itself.
    assert "api_key" not in ctor_kwargs


def test_diagnose_premium_backend_exception_returns_friendly_error(monkeypatch):
    """When the Anthropic SDK raises (auth fail, rate limit, network),
    F14 should wrap it in a markdown message that names the provider,
    model, exception class, and exception detail — never a raw trace.

    A Python traceback starts with "Traceback (most recent call last)",
    which the original ``"stack"`` check cannot detect, so the leak guard
    also looks for "Traceback".
    """
    monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-test")

    class FakeAuthError(Exception):
        pass

    def failing_anthropic(system, user):
        raise FakeAuthError("invalid x-api-key header")

    monkeypatch.setitem(PROVIDERS, "anthropic", failing_anthropic)

    writeup, json_str = diagnose(
        _LONG_DESCRIPTION, None, None, None,
        provider="anthropic",
        anthropic_api_key=None,
    )
    assert "anthropic" in writeup
    assert ANTHROPIC_MODEL_ID in writeup
    assert "FakeAuthError" in writeup
    assert "invalid x-api-key header" in writeup
    # No stack trace leaked, in either spelling.
    assert "stack" not in writeup.lower()
    assert "Traceback" not in writeup
    assert json_str == ""


def test_diagnose_premium_backend_returns_malformed_response(monkeypatch):
    """A backend reply with no JSON block yields a "malformed output"
    message and an empty JSON string instead of crashing."""
    monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-test")

    def garbage_anthropic(system, user):
        refusal = "Sorry, I cannot help with that request."  # no JSON block
        return refusal

    monkeypatch.setitem(PROVIDERS, "anthropic", garbage_anthropic)

    message, payload = diagnose(
        _LONG_DESCRIPTION,
        None,
        None,
        None,
        provider="anthropic",
        anthropic_api_key=None,
    )
    assert "malformed output" in message
    assert payload == ""


# --- _call_anthropic: Anthropic SDK call shape -----------------------------
#
# Per Principle VII the actual API call is exempt from automated tests
# (the SDK and the remote API are not our code). But the SHAPE of the
# call we make IS our code: model id, system block, cache_control flag,
# messages structure, response-unwrap path. These are easy to typo and
# easy to miss in review. Shape tests catch that without hitting the
# network.


def test_call_anthropic_passes_system_block_with_cache_control(monkeypatch):
    """Pins the shape of the messages.create() call: model id, token
    budget, a one-element cached system block, the user message, and the
    content[0].text unwrap of the reply."""
    create_kwargs = {}

    class _Block:
        text = "raw response text"

    class _Reply:
        content = [_Block()]

    class _Client:
        class messages:  # noqa: N801 — mirroring SDK's nested .messages.create
            @staticmethod
            def create(**kwargs):
                create_kwargs.update(kwargs)
                return _Reply()

    def fake_ctor():
        return _Client()

    import anthropic as anthropic_module
    monkeypatch.setattr(anthropic_module, "Anthropic", fake_ctor)

    reply_text = _call_anthropic("MY SYSTEM BLOCK", "MY USER PROMPT")

    # The reply was unwrapped from content[0].text.
    assert reply_text == "raw response text"
    # Model and token budget.
    assert create_kwargs["model"] == ANTHROPIC_MODEL_ID
    assert create_kwargs["max_tokens"] == 2500
    # The system prompt is a list holding one text dict with cache_control.
    system_blocks = create_kwargs["system"]
    assert isinstance(system_blocks, list)
    assert len(system_blocks) == 1
    only_block = system_blocks[0]
    assert only_block["type"] == "text"
    assert only_block["text"] == "MY SYSTEM BLOCK"
    assert only_block["cache_control"] == {"type": "ephemeral"}
    # The user prompt is the sole entry in the messages array.
    assert create_kwargs["messages"] == [{"role": "user", "content": "MY USER PROMPT"}]


# --- _call_huggingface: token resolution + call shape ----------------------


def _install_fake_inference_client(monkeypatch, captured: dict, *,
                                    response_text: str = "hf response",
                                    raises: Exception | None = None):
    """Swap huggingface_hub.InferenceClient for a recording fake.

    The fake stores its constructor kwargs under ``captured["init_kwargs"]``
    and its chat_completion kwargs under ``captured["chat_kwargs"]``. When
    ``raises`` is given, chat_completion raises it (after recording the
    kwargs) instead of returning; otherwise it returns an object whose
    ``choices[0].message.content`` is ``response_text``.
    """
    import huggingface_hub

    class _StubMessage:
        content = response_text

    class _StubChoice:
        message = _StubMessage()

    class _StubResponse:
        choices = [_StubChoice()]

    class _RecordingClient:
        def __init__(self, **kwargs):
            captured["init_kwargs"] = kwargs

        def chat_completion(self, **kwargs):
            captured["chat_kwargs"] = kwargs
            if raises is None:
                return _StubResponse()
            raise raises

    monkeypatch.setattr(huggingface_hub, "InferenceClient", _RecordingClient)


def test_call_huggingface_no_token_anywhere_raises_actionable_error(monkeypatch):
    """With no env token and no cached login token, _call_huggingface
    raises a RuntimeError matching "No HuggingFace token".

    A fake InferenceClient is installed as a guard: if the token check
    ever regresses, the test fails locally instead of attempting a real
    network call with no credentials.
    """
    monkeypatch.delenv("HF_TOKEN", raising=False)
    monkeypatch.delenv("HUGGING_FACE_HUB_TOKEN", raising=False)
    import huggingface_hub
    monkeypatch.setattr(huggingface_hub, "get_token", lambda: None)
    captured = {}
    _install_fake_inference_client(monkeypatch, captured)

    with pytest.raises(RuntimeError, match="No HuggingFace token"):
        _call_huggingface("sys", "usr")
    # The request must never have been issued.
    assert "chat_kwargs" not in captured


def test_call_huggingface_uses_HF_TOKEN_env(monkeypatch):
    """The HF_TOKEN env var is passed to the client as ``token``."""
    recorded = {}
    monkeypatch.setenv("HF_TOKEN", "hf_from_env")
    _install_fake_inference_client(monkeypatch, recorded)
    _call_huggingface("sys", "usr")
    init_kwargs = recorded["init_kwargs"]
    assert init_kwargs["token"] == "hf_from_env"


def test_call_huggingface_uses_HUGGING_FACE_HUB_TOKEN_env_as_fallback(monkeypatch):
    """With HF_TOKEN unset, HUGGING_FACE_HUB_TOKEN supplies the client token."""
    recorded = {}
    monkeypatch.delenv("HF_TOKEN", raising=False)
    monkeypatch.setenv("HUGGING_FACE_HUB_TOKEN", "hf_legacy_var")
    _install_fake_inference_client(monkeypatch, recorded)
    _call_huggingface("sys", "usr")
    init_kwargs = recorded["init_kwargs"]
    assert init_kwargs["token"] == "hf_legacy_var"


def test_call_huggingface_uses_get_token_when_no_env(monkeypatch):
    """With neither env var set, huggingface_hub.get_token() supplies the token."""
    import huggingface_hub

    for env_name in ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
        monkeypatch.delenv(env_name, raising=False)
    monkeypatch.setattr(huggingface_hub, "get_token", lambda: "hf_from_cli_login")
    recorded = {}
    _install_fake_inference_client(monkeypatch, recorded)
    _call_huggingface("sys", "usr")
    init_kwargs = recorded["init_kwargs"]
    assert init_kwargs["token"] == "hf_from_cli_login"


def test_call_huggingface_HF_TOKEN_wins_over_other_sources(monkeypatch):
    """When all three token sources are populated, HF_TOKEN is the one used."""
    import huggingface_hub

    monkeypatch.setenv("HF_TOKEN", "hf_winner")
    monkeypatch.setenv("HUGGING_FACE_HUB_TOKEN", "hf_loser_1")
    monkeypatch.setattr(huggingface_hub, "get_token", lambda: "hf_loser_2")
    recorded = {}
    _install_fake_inference_client(monkeypatch, recorded)
    _call_huggingface("sys", "usr")
    init_kwargs = recorded["init_kwargs"]
    assert init_kwargs["token"] == "hf_winner"


def test_call_huggingface_init_shape_model_provider_timeout(monkeypatch):
    """Pins the InferenceClient constructor kwargs: model, provider, timeout."""
    recorded = {}
    monkeypatch.setenv("HF_TOKEN", "hf_test")
    _install_fake_inference_client(monkeypatch, recorded)
    _call_huggingface("sys", "usr")
    ctor_kwargs = recorded["init_kwargs"]
    assert ctor_kwargs["model"] == HF_MODEL_ID
    # provider="auto" is the critical setting that enables the modern HF
    # Inference Providers routing layer; without it the client falls back
    # to the legacy hf-inference-only path. Guard against its removal.
    assert ctor_kwargs["provider"] == "auto"
    assert ctor_kwargs["timeout"] == 120


def test_call_huggingface_chat_completion_call_shape(monkeypatch):
    """Pins the chat_completion() kwargs and the reply unwrap path."""
    recorded = {}
    monkeypatch.setenv("HF_TOKEN", "hf_test")
    _install_fake_inference_client(monkeypatch, recorded)
    reply_text = _call_huggingface("MY SYSTEM BLOCK", "MY USER PROMPT")
    call_kwargs = recorded["chat_kwargs"]
    expected_messages = [
        {"role": "system", "content": "MY SYSTEM BLOCK"},
        {"role": "user", "content": "MY USER PROMPT"},
    ]
    assert call_kwargs["messages"] == expected_messages
    assert call_kwargs["max_tokens"] == 2500
    # The low temperature is intentional: smaller open models can produce
    # looser JSON at higher temperatures. Guard against drift.
    assert call_kwargs["temperature"] == 0.2
    # The reply is unwrapped from choices[0].message.content.
    assert reply_text == "hf response"


def test_call_huggingface_model_not_supported_error_wrapped(monkeypatch):
    """A "model is not supported by any provider" failure from the client
    is re-raised as a RuntimeError matching "isn't available through any"."""
    monkeypatch.setenv("HF_TOKEN", "hf_test")
    upstream_message = (
        "Bad request: {'message': \"The requested model is not supported "
        "by any provider you have enabled.\", 'code': 'model_not_supported'}"
    )
    upstream_error = Exception(upstream_message)
    recorded = {}
    _install_fake_inference_client(monkeypatch, recorded, raises=upstream_error)
    with pytest.raises(RuntimeError, match="isn't available through any"):
        _call_huggingface("sys", "usr")


def test_call_huggingface_model_not_supported_alternate_phrasing_wrapped(monkeypatch):
    """An error carrying only the 'model_not_supported' code, without the
    long-form sentence, is wrapped in the same RuntimeError."""
    monkeypatch.setenv("HF_TOKEN", "hf_test")
    upstream_error = Exception("...'code': 'model_not_supported'...")
    recorded = {}
    _install_fake_inference_client(monkeypatch, recorded, raises=upstream_error)
    with pytest.raises(RuntimeError, match="isn't available through any"):
        _call_huggingface("sys", "usr")


def test_call_huggingface_other_exception_passes_through(monkeypatch):
    """Errors other than model_not_supported propagate unchanged.

    Auth failures, network timeouts and malformed responses should reach
    the F14 wrapper in diagnose() with their original class and detail.
    """
    monkeypatch.setenv("HF_TOKEN", "hf_test")
    unrelated_error = ValueError("Invalid API key")
    recorded = {}
    _install_fake_inference_client(monkeypatch, recorded, raises=unrelated_error)
    with pytest.raises(ValueError, match="Invalid API key"):
        _call_huggingface("sys", "usr")


# --- _call_zerogpu: stub path + invocation shape --------------------------


def test_call_zerogpu_stub_raises_clear_error_when_deps_unavailable():
    """When _zerogpu_available() is false, calling _call_zerogpu raises a
    RuntimeError matching "ZeroGPU backend requires".

    Skipped when the ZeroGPU deps are installed, since the stub path is
    then not the one in use.
    """
    deps_installed = _zerogpu_available()
    if deps_installed:
        pytest.skip("Test only meaningful when zerogpu deps are NOT installed")
    with pytest.raises(RuntimeError, match="ZeroGPU backend requires"):
        _call_zerogpu("sys", "usr")


def test_zerogpu_available_reflects_dep_state():
    """_zerogpu_available() is the sole gating function for the zerogpu
    branch in _detect_provider; it must return the cached import-time
    boolean rather than re-trying imports on every call.

    Uses the module-level ``app_module`` import; the former function-local
    ``import app as app_module`` merely shadowed it.
    """
    assert _zerogpu_available() is app_module._ZEROGPU_DEPS_AVAILABLE


def _install_fake_zerogpu_model(monkeypatch, captured: dict, *,
                                 prompt_len: int = 5,
                                 decoded_text: str = "model output"):
    """Replace the module-level _zerogpu_tokenizer and _zerogpu_model
    with fakes that record their calls into ``captured``.

    The fakes simulate the transformers types just enough for
    _zerogpu_invoke() to run end-to-end without torch installed.

    Args:
        monkeypatch: pytest's monkeypatch fixture, used to install the fakes.
        captured: dict the fakes write their recorded calls into. Keys
            written: "apply_chat_template", "decode",
            "inputs_moved_to_device", "model_moved_to_device",
            "generate_inputs", "generate_kwargs".
        prompt_len: second dimension of the fake inputs' ``shape``; the
            fake generate() output is ``prompt_len + 10`` token ids long.
        decoded_text: value the fake tokenizer's decode() returns.
    """
    # Relies on the module-level ``app_module`` import; a function-local
    # re-import here only shadowed it.

    class _FakeInputs:
        def __init__(self):
            self.shape = (1, prompt_len)

        def to(self, device):
            captured["inputs_moved_to_device"] = device
            return self  # chain .to() back into self for further use

    fake_inputs = _FakeInputs()
    fake_outputs = [list(range(prompt_len + 10))]  # prompt tokens + 10 new tokens

    class _FakeTokenizer:
        eos_token_id = 99

        def apply_chat_template(self, messages, **kwargs):
            captured["apply_chat_template"] = {
                "messages": messages,
                "kwargs": kwargs,
            }
            return fake_inputs

        def decode(self, token_ids, **kwargs):
            captured["decode"] = {"token_ids": list(token_ids), "kwargs": kwargs}
            return decoded_text

    class _FakeModel:
        device = "cpu"  # starts on CPU; _zerogpu_invoke moves to cuda

        def to(self, device):
            captured["model_moved_to_device"] = device
            self.device = device
            return self

        def generate(self, inputs, **kwargs):
            captured["generate_inputs"] = inputs
            captured["generate_kwargs"] = kwargs
            return fake_outputs

    monkeypatch.setattr(app_module, "_zerogpu_tokenizer", _FakeTokenizer())
    monkeypatch.setattr(app_module, "_zerogpu_model", _FakeModel())
    # Note: no _load_zerogpu_model to patch — after the pre-load refactor,
    # model load happens at module init, not lazily.


def test_zerogpu_invoke_builds_chat_template_with_system_and_user(monkeypatch):
    """The system block and user prompt must reach apply_chat_template()
    as a two-message chat (system first, then user), with
    return_tensors="pt" and add_generation_prompt=True."""
    calls = {}
    _install_fake_zerogpu_model(monkeypatch, calls)

    _zerogpu_invoke("MY SYSTEM BLOCK", "MY USER PROMPT")

    expected_messages = [
        {"role": "system", "content": "MY SYSTEM BLOCK"},
        {"role": "user", "content": "MY USER PROMPT"},
    ]
    template_call = calls["apply_chat_template"]
    assert template_call["messages"] == expected_messages

    template_kwargs = template_call["kwargs"]
    assert template_kwargs["return_tensors"] == "pt"
    assert template_kwargs["add_generation_prompt"] is True


def test_zerogpu_invoke_moves_model_and_inputs_to_cuda(monkeypatch):
    """With the pre-load pattern the model sits on CPU from module init,
    so _zerogpu_invoke has to move both the model and the tokenized
    inputs to cuda itself, inside the @spaces.GPU context."""
    recorded = {}
    _install_fake_zerogpu_model(monkeypatch, recorded)

    _zerogpu_invoke("sys", "usr")

    # First the model (CPU -> cuda), then the tokenized inputs.
    for moved_key in ("model_moved_to_device", "inputs_moved_to_device"):
        assert recorded[moved_key] == "cuda"


def test_zerogpu_invoke_generate_call_shape(monkeypatch):
    """Pin the keyword arguments passed to .generate().

    They are easy to mistype and each one matters:
      max_new_tokens=2500      caps the output length
      temperature=0.2          keeps JSON output stable for small models
      do_sample=True           required for a non-zero temperature to apply
      pad_token_id=eos id      avoids warning spam on short prompts
    """
    recorded = {}
    _install_fake_zerogpu_model(monkeypatch, recorded)

    _zerogpu_invoke("sys", "usr")

    generate_kwargs = recorded["generate_kwargs"]
    assert generate_kwargs["max_new_tokens"] == 2500
    assert generate_kwargs["temperature"] == 0.2
    assert generate_kwargs["do_sample"] is True
    # 99 is the eos_token_id exposed by the fake tokenizer.
    assert generate_kwargs["pad_token_id"] == 99


def test_zerogpu_invoke_strips_prompt_tokens_before_decode(monkeypatch):
    """Only the newly generated tokens may be decoded; the prompt must
    not be echoed back. _zerogpu_invoke is expected to drop the first
    prompt_len ids of outputs[0] before handing them to decode()."""
    prompt_len = 5
    recorded = {}
    # The fake model returns ids 0..14 (5 prompt ids + 10 generated ids),
    # so decode() should see exactly ids 5..14.
    _install_fake_zerogpu_model(monkeypatch, recorded, prompt_len=prompt_len)

    _zerogpu_invoke("sys", "usr")

    decode_call = recorded["decode"]
    assert decode_call["token_ids"] == list(range(prompt_len, prompt_len + 10))
    # skip_special_tokens keeps markers such as </s> out of the text.
    assert decode_call["kwargs"]["skip_special_tokens"] is True


def test_zerogpu_invoke_returns_decoded_text(monkeypatch):
    """Whatever the tokenizer's decode() produces is what
    _zerogpu_invoke() returns to its caller."""
    expected_text = "my generated answer"
    recorded = {}
    _install_fake_zerogpu_model(monkeypatch, recorded, decoded_text=expected_text)

    assert _zerogpu_invoke("sys", "usr") == expected_text


# --- Integration test (opt-in; hits the real Anthropic API) ----------------
#
# Skipped unless ANTHROPIC_API_KEY is set AND ANTHROPIC_INTEGRATION=1 is
# set. Costs money to run (~$0.05 per call to Opus 4.7). Use this when
# you want to verify end-to-end that the key works and the model is
# reachable; routine CI should leave this skipped.

import pytest as _pytest  # already imported above as pytest, but kept explicit


@_pytest.mark.skipif(
    not os.environ.get("ANTHROPIC_API_KEY")
    or os.environ.get("ANTHROPIC_INTEGRATION") != "1",
    reason="needs ANTHROPIC_API_KEY + ANTHROPIC_INTEGRATION=1 to hit the real API",
)
def test_call_anthropic_real_api_returns_text():
    """Opt-in smoke test: send a trivial prompt through _call_anthropic
    and check that a short, non-empty string comes back."""
    system_block = "You are a one-word echo. Reply with exactly one word."
    user_prompt = "Say hello."

    reply = _call_anthropic(system_block, user_prompt)

    assert isinstance(reply, str)
    assert len(reply) > 0
    # A one-word answer is requested; allow generous slack.
    word_count = len(reply.split())
    assert word_count < 20


# ---------------------------------------------------------------------------
# Fixture: a backend response that satisfies parse_response() so the
# diagnose-Premium happy-path tests can assert on parser output without
# duplicating the full JSON shape per test.
# ---------------------------------------------------------------------------

# Layout of the canned response: a ```json fenced object (constraint, four
# scored conditions each carrying score / rationale / quoted_span, quadrant,
# closest_portrait, closest_portrait_paragraph, and an empty warnings list)
# followed by a markdown "# The Verdict" section of free text.
_VALID_BACKEND_RESPONSE = """```json
{
  "constraint": "Underwriting at the quote screen.",
  "scores": {
    "proprietary_data":         { "score": 4, "rationale": "first-party.", "quoted_span": "12-year claims database" },
    "self_labeling":            { "score": 4, "rationale": "policies self-label.", "quoted_span": "Every policy we write" },
    "decreasing_marginal_cost": { "score": 3, "rationale": "amortized pipeline.", "quoted_span": "feed back into the next quarter" },
    "defensible_asymmetry":     { "score": 3, "rationale": "no data sharing.", "quoted_span": "we don't share data with" }
  },
  "quadrant": "compounder",
  "closest_portrait": "progressive",
  "closest_portrait_paragraph": "Your case tracks Progressive most closely.",
  "warnings": []
}
```

# The Verdict

Your initiative is a compounder. Here's why.
"""


def test_warnings_populated_for_failure_quadrant():
    """A response whose quadrant is "roman-candle" and which carries one
    warning entry must parse into a result exposing that warning's
    citation source and URL."""
    warning_field = (
        '"warnings": [{"text": "Wrong place, conditions weak.", "citation_source": "Buffett 2007", "citation_url": "https://www.berkshirehathaway.com/letters/2007ltr.pdf"}]'
    )
    # Derive the failure-quadrant payload from the happy-path fixture by
    # swapping the quadrant and filling in the empty warnings list.
    raw = VALID_JSON_BLOCK.replace('"quadrant": "compounder"', '"quadrant": "roman-candle"')
    raw = raw.replace('"warnings": []', warning_field)

    r = parse_response(raw)

    assert r.quadrant == "roman-candle"
    assert len(r.warnings) == 1
    first_warning = r.warnings[0]
    assert first_warning.citation_source == "Buffett 2007"
    assert "berkshirehathaway" in first_warning.citation_url