Spaces:

minhtudragon
/

headroom

Running

headroom / tests /test_compression /test_compression_summary_tool_eval.py

tudragon154203

fix: route count_tokens to api.anthropic.com, not proxy base_url

0adb431 24 days ago

10.6 kB

	"""Eval: Does the LLM invoke headroom_retrieve when summaries are present?

	The REAL test — it's not enough for the LLM to know something is missing.
	It must actually call the tool to fetch it.

	Compares:
	- WITH summary: LLM sees "2 failed, 1 error" → should call headroom_retrieve
	- WITHOUT summary: LLM sees "[90 items compressed]" → likely does NOT call tool

	Requires: ANTHROPIC_API_KEY in environment or .env file.

	Run: python -m pytest tests/test_compression/test_compression_summary_tool_eval.py -v -s
	"""

	from __future__ import annotations

	import json
	import os

	import pytest

	from tests._dotenv import autouse_apply_env, load_env_overrides

	# Resolve the API key once, at import time. A non-empty value already present
	# in the process environment wins; otherwise fall back to whatever
	# load_env_overrides() returned (empty string when neither has the key).
	_env_overrides = load_env_overrides()
	ANTHROPIC_KEY = os.environ.get("ANTHROPIC_API_KEY") or _env_overrides.get("ANTHROPIC_API_KEY", "")
	# NOTE(review): presumably an autouse pytest fixture that applies the loaded
	# overrides for each test; it must stay bound to a module-level name to be
	# discovered — confirm against tests/_dotenv.
	apply_dotenv = autouse_apply_env(_env_overrides)

	# Module-wide marker: every test in this file is skipped when no API key
	# could be resolved above, because each test makes a real network call.
	pytestmark = pytest.mark.skipif(
	not ANTHROPIC_KEY,
	reason="ANTHROPIC_API_KEY not set — skipping integration tests",
	)

	# Tool schema offered to the model in every eval request. Per the original
	# note, this mirrors the headroom_retrieve definition that CCR injects.
	HEADROOM_RETRIEVE_TOOL = {
	    "name": "headroom_retrieve",
	    # Three adjacent literals are concatenated into one description string.
	    "description": (
	        "Retrieve original uncompressed content from Headroom's compression cache. "
	        "Use this when you need more details from compressed data. "
	        "You can pass a query to search within the compressed content."
	    ),
	    "input_schema": {
	        "type": "object",
	        "properties": {
	            # Identifies which compressed payload to expand.
	            "hash": {
	                "type": "string",
	                "description": "The hash key from the compression marker",
	            },
	            # Free-text filter; not listed under "required" below.
	            "query": {
	                "type": "string",
	                "description": "Optional search query to find specific items within the compressed data",
	            },
	        },
	        "required": ["hash"],
	    },
	}


	def _call_claude_with_tools(messages: list[dict], tools: list[dict], max_tokens: int = 300) -> dict:
	    """Send one real Anthropic Messages API request with tools attached.

	    Args:
	        messages: Conversation turns, passed through as the ``messages`` field.
	        tools: Tool definitions, passed through as the ``tools`` field.
	        max_tokens: Upper bound on generated tokens for this request.

	    Returns:
	        The decoded JSON body of a successful response.

	    Raises:
	        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
	        httpx.HTTPError: On transport failures or when the 30 s timeout expires.
	    """
	    # Imported lazily so merely collecting this module does not require httpx.
	    import httpx

	    resp = httpx.post(
	        "https://api.anthropic.com/v1/messages",
	        headers={
	            "X-Api-Key": ANTHROPIC_KEY,
	            "anthropic-version": "2023-06-01",
	            "Content-Type": "application/json",
	        },
	        json={
	            "model": "claude-sonnet-4-5-20250929",
	            "max_tokens": max_tokens,
	            "messages": messages,
	            "tools": tools,
	        },
	        timeout=30,
	    )
	    # Fail loudly on HTTP errors. An error payload carries no "content" or
	    # "stop_reason", so callers would otherwise read it as "the model chose
	    # not to call a tool" and the eval would pass without testing anything.
	    resp.raise_for_status()
	    return resp.json()


	def _make_test_results(n: int = 100) -> list[dict]:
	"""Test suite output with hidden failures in the compressed portion."""
	results = []
	for i in range(n):
	result = {
	"test_name": f"test_module_{i // 10}.test_case_{i}",
	"status": "passed",
	"duration_ms": 50 + i * 3,
	}
	if i == 42:
	result["status"] = "failed"
	result["error"] = "AssertionError: expected 200, got 401 in auth_middleware"
	result["test_name"] = "test_auth.test_login_expired_token"
	if i == 67:
	result["status"] = "failed"
	result["error"] = "TimeoutError: database pool exhausted after 30s"
	result["test_name"] = "test_database.test_concurrent_connections"
	if i == 88:
	result["status"] = "error"
	result["error"] = "ImportError: cannot import 'NewFeature'"
	result["test_name"] = "test_features.test_new_feature_integration"
	results.append(result)
	return results


	def _has_tool_use(response: dict) -> bool:
	"""Check if the response contains a tool_use block."""
	for block in response.get("content", []):
	if block.get("type") == "tool_use":
	return True
	return False


	def _get_tool_calls(response: dict) -> list[dict]:
	"""Extract all tool_use blocks from response."""
	calls = []
	for block in response.get("content", []):
	if block.get("type") == "tool_use":
	calls.append(
	{
	"name": block.get("name"),
	"input": block.get("input", {}),
	}
	)
	return calls


	class TestToolInvocationWithSummary:
	    """Live evals checking whether the model calls ``headroom_retrieve``.

	    Every test sends one real request through ``_call_claude_with_tools`` with
	    ``HEADROOM_RETRIEVE_TOOL`` attached and prints what the model did (run
	    pytest with ``-s`` to see it). Model behaviour is not deterministic, so
	    assertions are made only on the tool-call branch; a plain-text answer is
	    reported but not failed.
	    """

	    @staticmethod
	    def _require_model_reply(resp: dict) -> None:
	        """Fail the test when the API returned an error payload, not a message."""
	        # Without this check an error body (no "stop_reason", no "content")
	        # falls into the "model answered with text" branch and passes silently.
	        assert resp.get("type") != "error", f"Anthropic API returned an error: {resp.get('error')}"

	    @staticmethod
	    def _response_text(resp: dict) -> str:
	        """Concatenate the text of every ``text`` block in the response."""
	        return "".join(
	            block.get("text", "")
	            for block in resp.get("content", [])
	            if block.get("type") == "text"
	        )

	    def test_with_summary_triggers_tool_call(self):
	        """WITH a compression summary, the model should call headroom_retrieve."""
	        test_results = _make_test_results(100)
	        kept = test_results[:10]  # All passing

	        from headroom.transforms.compression_summary import summarize_dropped_items

	        summary = summarize_dropped_items(test_results, kept)

	        compressed = json.dumps(kept, indent=2)
	        compressed += (
	            f"\n[90 items compressed to 10. Omitted: {summary}."
	            f' Retrieve specific items: headroom_retrieve(hash="ccr_test_abc123", query="your search")]'
	        )

	        messages = [
	            {
	                "role": "user",
	                "content": (
	                    "Here are the test results from our CI pipeline:\n\n"
	                    f"{compressed}\n\n"
	                    "Tell me about any test failures. What went wrong?"
	                ),
	            },
	        ]

	        resp = _call_claude_with_tools(messages, [HEADROOM_RETRIEVE_TOOL])
	        self._require_model_reply(resp)

	        tool_calls = _get_tool_calls(resp)
	        stop_reason = resp.get("stop_reason", "")

	        print(f"\n Summary: {summary}")
	        print(f" Stop reason: {stop_reason}")
	        print(f" Tool calls: {tool_calls}")

	        # With a summary showing failures, the LLM SHOULD call the tool
	        if stop_reason == "tool_use":
	            assert tool_calls, "stop_reason is tool_use but no tool_use block was returned"
	            call = tool_calls[0]
	            assert call["name"] == "headroom_retrieve"
	            assert call["input"].get("hash") == "ccr_test_abc123"
	            # The query should be about failures/errors
	            query = call["input"].get("query", "").lower()
	            print(f" Query used: {query}")
	            has_relevant_query = any(
	                term in query for term in ["fail", "error", "issue", "problem", "broken", "test"]
	            )
	            assert has_relevant_query, f"Tool was called but query isn't relevant: {query}"
	            print(" RESULT: LLM invoked headroom_retrieve with relevant query ✓")
	        else:
	            # LLM responded with text — check if it at least mentions the failures
	            text = self._response_text(resp)
	            print(f" LLM text response: {text[:200]}")
	            # It's acceptable if the LLM mentions it WANTS to retrieve;
	            # this is reported only, not asserted.
	            mentions_retrieval = any(
	                term in text.lower()
	                for term in ["retrieve", "headroom_retrieve", "fetch", "see more", "compressed"]
	            )
	            print(f" Mentions retrieval: {mentions_retrieval}")

	    def test_without_summary_baseline(self):
	        """WITHOUT a compression summary, the model likely does NOT call the tool.

	        Baseline for comparison: either outcome is only reported, not asserted.
	        """
	        test_results = _make_test_results(100)
	        kept = test_results[:10]  # All passing

	        compressed = json.dumps(kept, indent=2)
	        compressed += "\n[90 items compressed to 10. Retrieve more: hash=ccr_test_abc123]"

	        messages = [
	            {
	                "role": "user",
	                "content": (
	                    "Here are the test results from our CI pipeline:\n\n"
	                    f"{compressed}\n\n"
	                    "Tell me about any test failures. What went wrong?"
	                ),
	            },
	        ]

	        resp = _call_claude_with_tools(messages, [HEADROOM_RETRIEVE_TOOL])
	        self._require_model_reply(resp)

	        tool_calls = _get_tool_calls(resp)
	        stop_reason = resp.get("stop_reason", "")

	        print(f"\n Stop reason: {stop_reason}")
	        print(f" Tool calls: {tool_calls}")

	        if stop_reason == "tool_use":
	            # Guard the index below so a malformed reply fails with a clear message.
	            assert tool_calls, "stop_reason is tool_use but no tool_use block was returned"
	            call = tool_calls[0]
	            print(f" Query used: {call['input'].get('query', 'none')}")
	            print(" RESULT: LLM DID invoke tool (may check proactively)")
	        else:
	            text = self._response_text(resp)
	            print(f" LLM text response: {text[:200]}")
	            print(" RESULT: LLM did NOT invoke tool — assumed all tests passed")

	    def test_code_summary_triggers_retrieval(self):
	        """With a code-compression summary, the model should retrieve a function."""
	        # NOTE(review): this snippet is sent to the model verbatim, so its lines
	        # are reproduced exactly as found rather than re-indented.
	        compressed_code = '''class PaymentProcessor:
	"""Processes payments via Stripe."""

	def __init__(self, api_key: str):
	# [2 lines omitted]
	pass

	def charge(self, amount: float, currency: str, token: str) -> dict:
	# [8 lines omitted]
	pass

	def refund(self, charge_id: str, amount: float = None) -> dict:
	# [3 lines omitted]
	pass

	def get_balance(self) -> float:
	# [2 lines omitted]
	pass
	# [180 tokens compressed. removed: def charge (12 lines), def refund (6 lines). Retrieve full code: headroom_retrieve(hash="ccr_code_xyz", query="function name")]'''

	        messages = [
	            {
	                "role": "user",
	                "content": (
	                    "Here's the payment processor code:\n\n"
	                    f"```python\n{compressed_code}\n```\n\n"
	                    "There's a bug in the retry logic for failed charges. "
	                    "Can you find and fix it?"
	                ),
	            },
	        ]

	        resp = _call_claude_with_tools(messages, [HEADROOM_RETRIEVE_TOOL])
	        self._require_model_reply(resp)

	        tool_calls = _get_tool_calls(resp)
	        stop_reason = resp.get("stop_reason", "")

	        print(f"\n Stop reason: {stop_reason}")
	        print(f" Tool calls: {tool_calls}")

	        if stop_reason == "tool_use":
	            # Guard the index below so a malformed reply fails with a clear message.
	            assert tool_calls, "stop_reason is tool_use but no tool_use block was returned"
	            call = tool_calls[0]
	            assert call["name"] == "headroom_retrieve"
	            query = call["input"].get("query", "").lower()
	            print(f" Query: {query}")
	            # Should be asking for the charge function specifically
	            # (reported only; the query wording is not asserted).
	            has_charge = any(term in query for term in ["charge", "retry", "payment", "stripe"])
	            print(f" Targets charge/retry: {has_charge}")
	            print(" RESULT: LLM invoked tool to get the charge() implementation ✓")
	        else:
	            text = self._response_text(resp)
	            print(f" LLM text: {text[:200]}")
	            print(" RESULT: LLM did NOT invoke tool")