Spaces:

minhtudragon
/

headroom

Running

headroom / tests /test_compression /test_compression_summary_integration.py

tudragon154203

fix: route count_tokens to api.anthropic.com, not proxy base_url

0adb431 26 days ago

7.65 kB

	"""Integration eval: Compression summaries with real LLM calls.

	Tests whether compression summaries actually help the LLM find information
	in compressed data. Compares behavior with and without summaries.

	Requires: ANTHROPIC_API_KEY in environment or .env file.

	Run: python -m pytest tests/test_compression/test_compression_summary_integration.py -v -s
	"""

	from __future__ import annotations

	import json
	import os

	import pytest

	from tests._dotenv import autouse_apply_env, load_env_overrides

	# Overrides returned by the project's dotenv helper.
	# NOTE(review): presumably parsed from a .env file (per the module docstring) — confirm in tests/_dotenv.py.
	_env_overrides = load_env_overrides()
	# A non-empty key in the process environment wins; otherwise fall back to the loaded
	# overrides, and finally to "" (falsy) so that the skip condition below triggers.
	ANTHROPIC_KEY = os.environ.get("ANTHROPIC_API_KEY") or _env_overrides.get("ANTHROPIC_API_KEY", "")
	# NOTE(review): the helper name suggests this is an autouse pytest fixture that applies the
	# overrides for each test — confirm against tests/_dotenv.py.
	apply_dotenv = autouse_apply_env(_env_overrides)

	# Module-level pytest marker: every test in this file is skipped when no API key was found.
	pytestmark = pytest.mark.skipif(
	not ANTHROPIC_KEY,
	reason="ANTHROPIC_API_KEY not set — skipping integration tests",
	)


	def _call_claude(messages: list[dict], max_tokens: int = 200) -> dict:
	    """Send ``messages`` to the Anthropic Messages API and return the decoded JSON body.

	    Args:
	        messages: Conversation turns; the callers in this file pass dicts with
	            ``role`` and ``content`` keys.
	        max_tokens: Forwarded unchanged as the request's ``max_tokens`` field.

	    Returns:
	        The parsed JSON body of a successful response.

	    Raises:
	        httpx.HTTPStatusError: If the API answers with a 4xx or 5xx status.
	    """
	    # Imported lazily so the module can still be collected (and skipped) without httpx.
	    import httpx

	    resp = httpx.post(
	        "https://api.anthropic.com/v1/messages",
	        headers={
	            "X-Api-Key": ANTHROPIC_KEY,
	            "anthropic-version": "2023-06-01",
	            "Content-Type": "application/json",
	        },
	        json={
	            "model": "claude-sonnet-4-5-20250929",
	            "max_tokens": max_tokens,
	            "messages": messages,
	        },
	        timeout=30,
	    )
	    # Surface auth / rate-limit / server errors here. Without this check the error
	    # payload is returned like a normal reply and callers end up reporting an
	    # empty LLM answer instead of the real cause.
	    resp.raise_for_status()
	    return resp.json()


	# ============================================================================
	# Test data: realistic tool output that gets compressed
	# ============================================================================


	def _make_test_suite_output(n: int = 100) -> list[dict]:
	"""Simulate a large test suite result (like from a CI/CD tool)."""
	results = []
	for i in range(n):
	result = {
	"test_name": f"test_module_{i // 10}.test_case_{i}",
	"status": "passed",
	"duration_ms": 50 + i * 3,
	"file": f"tests/test_module_{i // 10}.py",
	}
	# Inject specific failures that the LLM should find
	if i == 42:
	result["status"] = "failed"
	result["error"] = "AssertionError: expected status 200, got 401 in auth_middleware"
	result["test_name"] = "test_auth.test_login_with_expired_token"
	if i == 67:
	result["status"] = "failed"
	result["error"] = "TimeoutError: database connection pool exhausted after 30s"
	result["test_name"] = "test_database.test_concurrent_connections"
	if i == 88:
	result["status"] = "error"
	result["error"] = "ImportError: cannot import name 'NewFeature' from 'app.features'"
	result["test_name"] = "test_features.test_new_feature_integration"
	results.append(result)
	return results


	class TestSummaryHelpfulness:
	    """Compare LLM accuracy with vs without compression summaries."""

	    # Substrings whose presence in the lower-cased reply counts as "failure info".
	    # NOTE(review): "fail" also matches negative answers such as "no failures", so
	    # this check is a weak signal — consider tightening once the summary format is fixed.
	    _FAILURE_KEYWORDS = ("fail", "error", "timeout", "assert", "import")

	    @staticmethod
	    def _response_text(resp: dict) -> str:
	        """Return the lower-cased ``text`` of the first content block in ``resp``.

	        Raises:
	            AssertionError: If ``resp`` has no content blocks (a missing or empty
	                ``content`` field). The payload is included in the message, so an
	                API error is reported as such rather than as an empty LLM answer
	                or a bare ``IndexError``.
	        """
	        blocks = resp.get("content") or []
	        if not blocks:
	            raise AssertionError(f"Anthropic response has no content blocks: {resp!r}")
	        return blocks[0].get("text", "").lower()

	    @staticmethod
	    def _ci_question(compressed_output: str) -> list[dict]:
	        """Build the single-turn conversation asking about failures in ``compressed_output``."""
	        return [
	            {
	                "role": "user",
	                "content": (
	                    "Here are the test results from CI:\n\n"
	                    f"{compressed_output}\n\n"
	                    "Are there any test failures? What types of failures are there? "
	                    "Answer concisely."
	                ),
	            },
	        ]

	    @classmethod
	    def _mentions_failure(cls, text: str) -> bool:
	        """Return True when ``text`` contains any of the failure keywords."""
	        return any(word in text for word in cls._FAILURE_KEYWORDS)

	    def test_find_failures_with_summary(self):
	        """LLM can identify failure types from the summary alone."""
	        test_results = _make_test_suite_output(100)

	        # Simulate compression: keep first 10, compress rest with summary
	        kept = test_results[:10]
	        from headroom.transforms.compression_summary import summarize_dropped_items

	        summary = summarize_dropped_items(test_results, kept)

	        compressed_output = json.dumps(kept, indent=2)
	        compressed_output += f"\n[90 items compressed to 10. Omitted: {summary}. "
	        compressed_output += (
	            'Retrieve specific items: headroom_retrieve(hash="abc123", query="your search")]'
	        )

	        resp = _call_claude(self._ci_question(compressed_output))
	        text = self._response_text(resp)

	        # The LLM should mention failures (from the summary info)
	        has_failure_info = self._mentions_failure(text)
	        print(f"\n Summary: {summary}")
	        print(f" LLM response: {text[:200]}")
	        print(f" Detected failure info: {has_failure_info}")

	        assert has_failure_info, f"LLM didn't detect failures from summary. Response: {text[:300]}"

	    def test_find_failures_without_summary(self):
	        """Baseline: LLM with NO summary — just '[90 items compressed]'."""
	        test_results = _make_test_suite_output(100)

	        kept = test_results[:10]
	        compressed_output = json.dumps(kept, indent=2)
	        compressed_output += "\n[90 items compressed to 10. Retrieve more: hash=abc123]"

	        resp = _call_claude(self._ci_question(compressed_output))
	        text = self._response_text(resp)

	        # The LLM may or may not detect failures (it only sees 10 passing tests)
	        has_failure_info = self._mentions_failure(text)
	        print(f"\n LLM response (no summary): {text[:200]}")
	        print(f" Detected failure info: {has_failure_info}")

	        # We're NOT asserting on the answer here — this is the baseline.
	        # We expect this to often MISS failures since the marker is generic.

	    def test_code_summary_helps_identify_functions(self):
	        """LLM can identify which functions were removed from compressed code."""
	        # NOTE(review): the sample text below is kept exactly as it appears in this
	        # view; its nested indentation looks flattened — confirm against upstream.
	        compressed_code = '''
	class PaymentProcessor:
	"""Processes payments via Stripe."""

	def __init__(self, api_key: str):
	# [2 lines omitted]
	pass

	def charge(self, amount: float, currency: str, token: str) -> dict:
	# [8 lines omitted]
	pass

	def refund(self, charge_id: str, amount: float = None) -> dict:
	# [3 lines omitted]
	pass

	def get_balance(self) -> float:
	# [2 lines omitted]
	pass
	'''
	        from headroom.transforms.compression_summary import summarize_compressed_code

	        # One tuple per omitted function body.
	        # NOTE(review): presumably (signature, body, line number) — verify against
	        # summarize_compressed_code, including the meaning of the second argument.
	        bodies = [
	            ("def charge(self, amount: float, currency: str, token: str) -> dict:", "...", 10),
	            ("def refund(self, charge_id: str, amount: float = None) -> dict:", "...", 20),
	            ("def get_balance(self) -> float:", "...", 30),
	        ]
	        code_summary = summarize_compressed_code(bodies, 3)

	        prompt = f"Here is a compressed Python file:\n\n```python\n{compressed_code}\n```\n\n"
	        if code_summary:
	            prompt += f"[Compression info: {code_summary}]\n\n"
	        prompt += "I need to understand the retry logic. Which function should I look at? Answer in one sentence."

	        messages = [{"role": "user", "content": prompt}]
	        resp = _call_claude(messages, max_tokens=100)
	        text = self._response_text(resp)

	        print(f"\n Code summary: {code_summary}")
	        print(f" LLM response: {text[:200]}")

	        # The LLM should identify the charge() function.
	        # NOTE(review): "charge" is also a substring of "charge_id" (refund's parameter),
	        # so a reply that points at refund(charge_id) would satisfy this check too.
	        assert "charge" in text, f"LLM didn't identify charge() function. Response: {text}"