Spaces:
Running
Running
| """Eval: Does the LLM invoke headroom_retrieve when summaries are present? | |
| The REAL test — it's not enough for the LLM to know something is missing. | |
| It must actually call the tool to fetch it. | |
| Compares: | |
| - WITH summary: LLM sees "2 failed, 1 error" → should call headroom_retrieve | |
| - WITHOUT summary: LLM sees "[90 items compressed]" → likely does NOT call tool | |
| Requires: ANTHROPIC_API_KEY in environment or .env file. | |
| Run: python -m pytest tests/test_compression_summary_tool_eval.py -v -s | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import pytest | |
| from tests._dotenv import autouse_apply_env, load_env_overrides | |
# Fallback settings supplied by the project's dotenv helper
# (presumably parsed from a .env file — confirm in tests/_dotenv).
_env_overrides = load_env_overrides()

# A key exported in the real environment wins; otherwise use the fallback.
# An empty string means no key was found anywhere.
ANTHROPIC_KEY = os.getenv("ANTHROPIC_API_KEY") or _env_overrides.get("ANTHROPIC_API_KEY", "")

# NOTE(review): the name suggests this builds an autouse fixture that applies
# the overrides for each test — confirm against tests/_dotenv.
apply_dotenv = autouse_apply_env(_env_overrides)

# Module-wide marker: every test here needs a live API key, so skip them all
# when none is available.
pytestmark = pytest.mark.skipif(
    not ANTHROPIC_KEY,
    reason="ANTHROPIC_API_KEY not set — skipping integration tests",
)
# Tool definition handed to the model (same as what CCR injects).
# It is assembled from named parts so each piece can be read on its own.
_RETRIEVE_DESCRIPTION = " ".join(
    [
        "Retrieve original uncompressed content from Headroom's compression cache.",
        "Use this when you need more details from compressed data.",
        "You can pass a query to search within the compressed content.",
    ]
)

_RETRIEVE_PROPERTIES = {
    "hash": {
        "type": "string",
        "description": "The hash key from the compression marker",
    },
    "query": {
        "type": "string",
        "description": "Optional search query to find specific items within the compressed data",
    },
}

HEADROOM_RETRIEVE_TOOL = {
    "name": "headroom_retrieve",
    "description": _RETRIEVE_DESCRIPTION,
    "input_schema": {
        "type": "object",
        "properties": _RETRIEVE_PROPERTIES,
        # Only the hash is mandatory; the query merely narrows the lookup.
        "required": ["hash"],
    },
}
def _call_claude_with_tools(messages: list[dict], tools: list[dict], max_tokens: int = 300) -> dict:
    """Make a real Anthropic Messages API call with tool use enabled.

    Args:
        messages: Conversation turns, sent as the request's ``messages`` field.
        tools: Tool definitions, sent as the request's ``tools`` field.
        max_tokens: Upper bound on generated tokens for the reply.

    Returns:
        The decoded JSON body of a successful response.

    Raises:
        httpx.HTTPStatusError: The API answered with a non-success status
            (bad key, rate limit, overload, ...).
        httpx.TimeoutException: No response arrived within 30 seconds.
    """
    import httpx

    resp = httpx.post(
        "https://api.anthropic.com/v1/messages",
        headers={
            "X-Api-Key": ANTHROPIC_KEY,
            "anthropic-version": "2023-06-01",
            "Content-Type": "application/json",
        },
        json={
            "model": "claude-sonnet-4-5-20250929",
            "max_tokens": max_tokens,
            "messages": messages,
            "tools": tools,
        },
        timeout=30,
    )
    # Without this check an error body is returned as if it were a model
    # reply. It has no "content" or "stop_reason", so callers would record
    # "the model did not call the tool" when the request actually failed.
    resp.raise_for_status()
    return resp.json()
def _make_test_results(n: int = 100) -> list[dict]:
    """Build ``n`` fake test-suite records, mostly passing.

    Records at indices 42, 67 and 88 (when ``n`` reaches them) are replaced
    by two failures and one error. They sit well past the first few rows, so
    a caller that keeps only the head of the list sees passing tests only.
    """
    # index -> (status, error message, test name) for the non-passing records
    planted = {
        42: (
            "failed",
            "AssertionError: expected 200, got 401 in auth_middleware",
            "test_auth.test_login_expired_token",
        ),
        67: (
            "failed",
            "TimeoutError: database pool exhausted after 30s",
            "test_database.test_concurrent_connections",
        ),
        88: (
            "error",
            "ImportError: cannot import 'NewFeature'",
            "test_features.test_new_feature_integration",
        ),
    }

    records = []
    for idx in range(n):
        record = {
            "test_name": f"test_module_{idx // 10}.test_case_{idx}",
            "status": "passed",
            "duration_ms": 50 + idx * 3,
        }
        override = planted.get(idx)
        if override is not None:
            status, error, name = override
            record["status"] = status
            record["error"] = error
            record["test_name"] = name
        records.append(record)
    return records
def _has_tool_use(response: dict) -> bool:
    """Return True when any content block of ``response`` is a ``tool_use`` block.

    A response without a ``content`` key counts as having no blocks.
    """
    return any(block.get("type") == "tool_use" for block in response.get("content", []))
def _get_tool_calls(response: dict) -> list[dict]:
    """Collect the ``tool_use`` blocks of ``response`` as name/input pairs.

    Each entry is ``{"name": ..., "input": ...}`` in the order the blocks
    appear. A block without a name yields ``None``; one without input yields
    an empty dict. A response without ``content`` yields an empty list.
    """
    return [
        {"name": block.get("name"), "input": block.get("input", {})}
        for block in response.get("content", [])
        if block.get("type") == "tool_use"
    ]
class TestToolInvocationWithSummary:
    """The real eval: does the LLM call headroom_retrieve?

    Each test sends one live request with the retrieve tool attached and
    prints what came back (run pytest with ``-s`` to see it). Only the
    tool-call paths carry assertions; the text-only paths are reported but
    never fail a test.
    """

    @staticmethod
    def _ask(prompt: str) -> dict:
        """Send ``prompt`` as a single user turn with the retrieve tool attached.

        Fails the test when the API answers with an error payload, so that a
        bad key or an overloaded endpoint is not reported as "the model chose
        not to call the tool".
        """
        resp = _call_claude_with_tools(
            [{"role": "user", "content": prompt}],
            [HEADROOM_RETRIEVE_TOOL],
        )
        # Anthropic error bodies have the shape {"type": "error", "error": {...}}.
        assert resp.get("type") != "error", f"Anthropic API returned an error: {resp.get('error')}"
        return resp

    @staticmethod
    def _text_of(resp: dict) -> str:
        """Concatenate the text of every ``text`` content block in ``resp``."""
        return "".join(
            block.get("text", "")
            for block in resp.get("content", [])
            if block.get("type") == "text"
        )

    @staticmethod
    def _ci_prompt(compressed: str) -> str:
        """Wrap compressed CI output in the question used by the CI tests."""
        return (
            "Here are the test results from our CI pipeline:\n\n"
            f"{compressed}\n\n"
            "Tell me about any test failures. What went wrong?"
        )

    def test_with_summary_triggers_tool_call(self):
        """WITH compression summary → LLM should call headroom_retrieve."""
        from headroom.transforms.compression_summary import summarize_dropped_items

        test_results = _make_test_results(100)
        kept = test_results[:10]  # All passing

        summary = summarize_dropped_items(test_results, kept)
        compressed = json.dumps(kept, indent=2)
        compressed += (
            f"\n[90 items compressed to 10. Omitted: {summary}."
            f' Retrieve specific items: headroom_retrieve(hash="ccr_test_abc123", query="your search")]'
        )

        resp = self._ask(self._ci_prompt(compressed))
        tool_calls = _get_tool_calls(resp)
        stop_reason = resp.get("stop_reason", "")

        print(f"\n Summary: {summary}")
        print(f" Stop reason: {stop_reason}")
        print(f" Tool calls: {tool_calls}")

        # With a summary showing failures, the LLM SHOULD call the tool
        if stop_reason == "tool_use":
            assert len(tool_calls) > 0
            call = tool_calls[0]
            assert call["name"] == "headroom_retrieve"
            assert call["input"].get("hash") == "ccr_test_abc123"
            # The query should be about failures/errors
            query = call["input"].get("query", "").lower()
            print(f" Query used: {query}")
            has_relevant_query = any(
                term in query for term in ["fail", "error", "issue", "problem", "broken", "test"]
            )
            assert has_relevant_query, f"Tool was called but query isn't relevant: {query}"
            print(" RESULT: LLM invoked headroom_retrieve with relevant query ✓")
        else:
            # LLM responded with text — check if it at least mentions the failures
            text = self._text_of(resp)
            print(f" LLM text response: {text[:200]}")
            # It's acceptable if the LLM mentions it WANTS to retrieve
            # NOTE(review): this is only reported, not asserted, so a text-only
            # answer never fails the test.
            mentions_retrieval = any(
                term in text.lower()
                for term in ["retrieve", "headroom_retrieve", "fetch", "see more", "compressed"]
            )
            print(f" Mentions retrieval: {mentions_retrieval}")

    def test_without_summary_baseline(self):
        """WITHOUT compression summary → LLM likely does NOT call tool."""
        test_results = _make_test_results(100)
        kept = test_results[:10]  # All passing

        compressed = json.dumps(kept, indent=2)
        compressed += "\n[90 items compressed to 10. Retrieve more: hash=ccr_test_abc123]"

        resp = self._ask(self._ci_prompt(compressed))
        tool_calls = _get_tool_calls(resp)
        stop_reason = resp.get("stop_reason", "")

        print(f"\n Stop reason: {stop_reason}")
        print(f" Tool calls: {tool_calls}")

        # Purely observational baseline: either outcome is reported, neither fails.
        if stop_reason == "tool_use":
            assert tool_calls, "stop_reason was tool_use but no tool_use block was returned"
            call = tool_calls[0]
            print(f" Query used: {call['input'].get('query', 'none')}")
            print(" RESULT: LLM DID invoke tool (may check proactively)")
        else:
            text = self._text_of(resp)
            print(f" LLM text response: {text[:200]}")
            print(" RESULT: LLM did NOT invoke tool — assumed all tests passed")

    def test_code_summary_triggers_retrieval(self):
        """Code compression summary → LLM should retrieve specific function."""
        # Skeleton of a compressed class: every body is replaced by an omission
        # marker, followed by the compression marker naming what was removed.
        compressed_code = "\n".join(
            [
                "class PaymentProcessor:",
                '    """Processes payments via Stripe."""',
                "    def __init__(self, api_key: str):",
                "        # [2 lines omitted]",
                "        pass",
                "    def charge(self, amount: float, currency: str, token: str) -> dict:",
                "        # [8 lines omitted]",
                "        pass",
                "    def refund(self, charge_id: str, amount: float = None) -> dict:",
                "        # [3 lines omitted]",
                "        pass",
                "    def get_balance(self) -> float:",
                "        # [2 lines omitted]",
                "        pass",
                "# [180 tokens compressed. removed: def charge (12 lines), def refund (6 lines)."
                ' Retrieve full code: headroom_retrieve(hash="ccr_code_xyz", query="function name")]',
            ]
        )

        prompt = (
            "Here's the payment processor code:\n\n"
            f"```python\n{compressed_code}\n```\n\n"
            "There's a bug in the retry logic for failed charges. "
            "Can you find and fix it?"
        )
        resp = self._ask(prompt)
        tool_calls = _get_tool_calls(resp)
        stop_reason = resp.get("stop_reason", "")

        print(f"\n Stop reason: {stop_reason}")
        print(f" Tool calls: {tool_calls}")

        if stop_reason == "tool_use":
            assert tool_calls, "stop_reason was tool_use but no tool_use block was returned"
            call = tool_calls[0]
            assert call["name"] == "headroom_retrieve"
            query = call["input"].get("query", "").lower()
            print(f" Query: {query}")
            # Should be asking for the charge function specifically
            # (reported only — the query is optional in the tool schema).
            has_charge = any(term in query for term in ["charge", "retry", "payment", "stripe"])
            print(f" Targets charge/retry: {has_charge}")
            print(" RESULT: LLM invoked tool to get the charge() implementation ✓")
        else:
            text = self._text_of(resp)
            print(f" LLM text: {text[:200]}")
            print(" RESULT: LLM did NOT invoke tool")