Spaces:

minhtudragon
/

headroom

Running

File size: 10,638 Bytes

"""Eval: Does the LLM invoke headroom_retrieve when summaries are present?

The REAL test — it's not enough for the LLM to know something is missing.
It must actually call the tool to fetch it.

Compares:
- WITH summary: LLM sees "2 failed, 1 error" → should call headroom_retrieve
- WITHOUT summary: LLM sees "[90 items compressed]" → likely does NOT call tool

Requires: ANTHROPIC_API_KEY in environment or .env file.

Run: python -m pytest tests/test_compression_summary_tool_eval.py -v -s
"""

from __future__ import annotations

import json
import os

import pytest

from tests._dotenv import autouse_apply_env, load_env_overrides

# Values supplied by the project-local dotenv helper. How they are discovered
# is defined in tests/_dotenv, which is not visible from this file.
_env_overrides = load_env_overrides()
# A non-empty ANTHROPIC_API_KEY in the process environment wins; otherwise fall
# back to the loaded overrides, and finally to "" (which triggers the skip below).
ANTHROPIC_KEY = os.environ.get("ANTHROPIC_API_KEY") or _env_overrides.get("ANTHROPIC_API_KEY", "")
# NOTE(review): presumably an autouse fixture that applies the overrides to the
# environment for each test — confirm against tests/_dotenv.autouse_apply_env.
apply_dotenv = autouse_apply_env(_env_overrides)

# Module-wide marker: every test in this file is skipped when no API key was
# found, because each test makes a real network call.
pytestmark = pytest.mark.skipif(
    not ANTHROPIC_KEY,
    reason="ANTHROPIC_API_KEY not set — skipping integration tests",
)

# The headroom_retrieve tool definition (same as what CCR injects).
# The tests in this module pass it as the only entry of the ``tools`` list in
# the request body sent to the Messages API.
HEADROOM_RETRIEVE_TOOL = {
    "name": "headroom_retrieve",
    # Natural-language guidance the model sees when deciding whether to call the tool.
    "description": (
        "Retrieve original uncompressed content from Headroom's compression cache. "
        "Use this when you need more details from compressed data. "
        "You can pass a query to search within the compressed content."
    ),
    # JSON Schema describing the arguments the model must supply.
    "input_schema": {
        "type": "object",
        "properties": {
            # Identifies which compressed payload to expand; the tests embed
            # the expected value in the compression marker text.
            "hash": {
                "type": "string",
                "description": "The hash key from the compression marker",
            },
            # Not listed under "required", so the model may omit it.
            "query": {
                "type": "string",
                "description": "Optional search query to find specific items within the compressed data",
            },
        },
        "required": ["hash"],
    },
}


def _call_claude_with_tools(
    messages: list[dict],
    tools: list[dict],
    max_tokens: int = 300,
    *,
    model: str = "claude-sonnet-4-5-20250929",
    timeout: float = 30,
) -> dict:
    """Make a real Anthropic Messages API call with tool use enabled.

    Args:
        messages: Conversation turns, sent as the ``messages`` field.
        tools: Tool definitions, sent as the ``tools`` field.
        max_tokens: Upper bound on generated tokens.
        model: Model identifier to query (keyword-only).
        timeout: Request timeout in seconds (keyword-only).

    Returns:
        The decoded JSON body of a successful response.

    Raises:
        RuntimeError: If the API answers with a 4xx/5xx status. The response
            body is included in the message so the cause (bad key, rate limit,
            unknown model, ...) is visible in the test output.
    """
    # Imported lazily so that merely collecting this module does not require httpx.
    import httpx

    resp = httpx.post(
        "https://api.anthropic.com/v1/messages",
        headers={
            "X-Api-Key": ANTHROPIC_KEY,
            "anthropic-version": "2023-06-01",
            "Content-Type": "application/json",
        },
        json={
            "model": model,
            "max_tokens": max_tokens,
            "messages": messages,
            "tools": tools,
        },
        timeout=timeout,
    )

    # Without this check an error payload would be handed back as if it were a
    # model reply; callers would then see no tool_use blocks and silently take
    # their "LLM did not call the tool" branch.
    try:
        resp.raise_for_status()
    except httpx.HTTPStatusError as exc:
        raise RuntimeError(
            f"Anthropic API call failed with HTTP {resp.status_code}: {resp.text}"
        ) from exc

    return resp.json()


def _make_test_results(n: int = 100) -> list[dict]:
    """Test suite output with hidden failures in the compressed portion."""
    results = []
    for i in range(n):
        result = {
            "test_name": f"test_module_{i // 10}.test_case_{i}",
            "status": "passed",
            "duration_ms": 50 + i * 3,
        }
        if i == 42:
            result["status"] = "failed"
            result["error"] = "AssertionError: expected 200, got 401 in auth_middleware"
            result["test_name"] = "test_auth.test_login_expired_token"
        if i == 67:
            result["status"] = "failed"
            result["error"] = "TimeoutError: database pool exhausted after 30s"
            result["test_name"] = "test_database.test_concurrent_connections"
        if i == 88:
            result["status"] = "error"
            result["error"] = "ImportError: cannot import 'NewFeature'"
            result["test_name"] = "test_features.test_new_feature_integration"
        results.append(result)
    return results


def _has_tool_use(response: dict) -> bool:
    """Check if the response contains a tool_use block."""
    for block in response.get("content", []):
        if block.get("type") == "tool_use":
            return True
    return False


def _get_tool_calls(response: dict) -> list[dict]:
    """Extract all tool_use blocks from response."""
    calls = []
    for block in response.get("content", []):
        if block.get("type") == "tool_use":
            calls.append(
                {
                    "name": block.get("name"),
                    "input": block.get("input", {}),
                }
            )
    return calls


class TestToolInvocationWithSummary:
    """The real eval: does the LLM call headroom_retrieve?

    Each test sends a compressed payload together with the
    ``headroom_retrieve`` tool definition to the live API, then inspects the
    response's ``stop_reason`` and tool_use blocks. Diagnostic output is
    printed, so run with ``-s`` to see it.
    """

    @staticmethod
    def _response_text(resp: dict) -> str:
        """Concatenate the text of every ``text`` content block in ``resp``."""
        return "".join(
            block.get("text", "")
            for block in resp.get("content", [])
            if block.get("type") == "text"
        )

    @staticmethod
    def _fail_on_api_error(resp: dict) -> None:
        """Fail the test when ``resp`` is an API error payload, not a model reply.

        An error body has no content blocks, so without this check it would be
        indistinguishable from "the LLM chose not to call the tool".
        """
        if resp.get("type") == "error":
            pytest.fail(f"Anthropic API returned an error: {resp.get('error')}")

    def test_with_summary_triggers_tool_call(self):
        """WITH compression summary → LLM should call headroom_retrieve."""
        test_results = _make_test_results(100)
        kept = test_results[:10]  # All passing

        from headroom.transforms.compression_summary import summarize_dropped_items

        summary = summarize_dropped_items(test_results, kept)

        compressed = json.dumps(kept, indent=2)
        compressed += (
            f"\n[90 items compressed to 10. Omitted: {summary}."
            f' Retrieve specific items: headroom_retrieve(hash="ccr_test_abc123", query="your search")]'
        )

        messages = [
            {
                "role": "user",
                "content": (
                    "Here are the test results from our CI pipeline:\n\n"
                    f"{compressed}\n\n"
                    "Tell me about any test failures. What went wrong?"
                ),
            },
        ]

        resp = _call_claude_with_tools(messages, [HEADROOM_RETRIEVE_TOOL])
        self._fail_on_api_error(resp)

        tool_calls = _get_tool_calls(resp)
        stop_reason = resp.get("stop_reason", "")

        print(f"\n  Summary: {summary}")
        print(f"  Stop reason: {stop_reason}")
        print(f"  Tool calls: {tool_calls}")

        # With a summary showing failures, the LLM SHOULD call the tool
        if stop_reason == "tool_use":
            assert len(tool_calls) > 0
            call = tool_calls[0]
            assert call["name"] == "headroom_retrieve"
            assert call["input"].get("hash") == "ccr_test_abc123"
            # The query should be about failures/errors
            query = call["input"].get("query", "").lower()
            print(f"  Query used: {query}")
            has_relevant_query = any(
                term in query for term in ["fail", "error", "issue", "problem", "broken", "test"]
            )
            assert has_relevant_query, f"Tool was called but query isn't relevant: {query}"
            print("  RESULT: LLM invoked headroom_retrieve with relevant query ✓")
        else:
            # LLM responded with text — check if it at least mentions the failures
            text = self._response_text(resp)
            print(f"  LLM text response: {text[:200]}")
            # It's acceptable if the LLM mentions it WANTS to retrieve
            lowered = text.lower()
            mentions_retrieval = any(
                term in lowered
                for term in ["retrieve", "headroom_retrieve", "fetch", "see more", "compressed"]
            )
            print(f"  Mentions retrieval: {mentions_retrieval}")
            # Enforce the criterion above; otherwise this branch could never fail.
            assert mentions_retrieval, (
                "LLM neither called headroom_retrieve nor mentioned retrieving "
                f"the omitted items: {text[:200]}"
            )

    def test_without_summary_baseline(self):
        """WITHOUT compression summary → LLM likely does NOT call tool.

        Observational baseline: either outcome is reported, neither is asserted.
        """
        test_results = _make_test_results(100)
        kept = test_results[:10]  # All passing

        compressed = json.dumps(kept, indent=2)
        compressed += "\n[90 items compressed to 10. Retrieve more: hash=ccr_test_abc123]"

        messages = [
            {
                "role": "user",
                "content": (
                    "Here are the test results from our CI pipeline:\n\n"
                    f"{compressed}\n\n"
                    "Tell me about any test failures. What went wrong?"
                ),
            },
        ]

        resp = _call_claude_with_tools(messages, [HEADROOM_RETRIEVE_TOOL])
        self._fail_on_api_error(resp)

        tool_calls = _get_tool_calls(resp)
        stop_reason = resp.get("stop_reason", "")

        print(f"\n  Stop reason: {stop_reason}")
        print(f"  Tool calls: {tool_calls}")

        if stop_reason == "tool_use":
            call = tool_calls[0]
            print(f"  Query used: {call['input'].get('query', 'none')}")
            print("  RESULT: LLM DID invoke tool (may check proactively)")
        else:
            text = self._response_text(resp)
            print(f"  LLM text response: {text[:200]}")
            print("  RESULT: LLM did NOT invoke tool — assumed all tests passed")

    def test_code_summary_triggers_retrieval(self):
        """Code compression summary → LLM should retrieve specific function."""
        compressed_code = '''class PaymentProcessor:
    """Processes payments via Stripe."""

    def __init__(self, api_key: str):
        # [2 lines omitted]
        pass

    def charge(self, amount: float, currency: str, token: str) -> dict:
        # [8 lines omitted]
        pass

    def refund(self, charge_id: str, amount: float = None) -> dict:
        # [3 lines omitted]
        pass

    def get_balance(self) -> float:
        # [2 lines omitted]
        pass
# [180 tokens compressed. removed: def charge (12 lines), def refund (6 lines). Retrieve full code: headroom_retrieve(hash="ccr_code_xyz", query="function name")]'''

        messages = [
            {
                "role": "user",
                "content": (
                    "Here's the payment processor code:\n\n"
                    f"```python\n{compressed_code}\n```\n\n"
                    "There's a bug in the retry logic for failed charges. "
                    "Can you find and fix it?"
                ),
            },
        ]

        resp = _call_claude_with_tools(messages, [HEADROOM_RETRIEVE_TOOL])
        self._fail_on_api_error(resp)

        tool_calls = _get_tool_calls(resp)
        stop_reason = resp.get("stop_reason", "")

        print(f"\n  Stop reason: {stop_reason}")
        print(f"  Tool calls: {tool_calls}")

        if stop_reason == "tool_use":
            call = tool_calls[0]
            assert call["name"] == "headroom_retrieve"
            query = call["input"].get("query", "").lower()
            print(f"  Query: {query}")
            # Should be asking for the charge function specifically.
            # Reported only, not asserted: the query wording is up to the model.
            has_charge = any(term in query for term in ["charge", "retry", "payment", "stripe"])
            print(f"  Targets charge/retry: {has_charge}")
            print("  RESULT: LLM invoked tool to get the charge() implementation ✓")
        else:
            text = self._response_text(resp)
            print(f"  LLM text: {text[:200]}")
            print("  RESULT: LLM did NOT invoke tool")