"""Integration eval: Compression summaries with real LLM calls. Tests whether compression summaries actually help the LLM find information in compressed data. Compares behavior with and without summaries. Requires: ANTHROPIC_API_KEY in environment or .env file. Run: python -m pytest tests/test_compression_summary_integration.py -v -s """ from __future__ import annotations import json import os import pytest from tests._dotenv import autouse_apply_env, load_env_overrides _env_overrides = load_env_overrides() ANTHROPIC_KEY = os.environ.get("ANTHROPIC_API_KEY") or _env_overrides.get("ANTHROPIC_API_KEY", "") apply_dotenv = autouse_apply_env(_env_overrides) pytestmark = pytest.mark.skipif( not ANTHROPIC_KEY, reason="ANTHROPIC_API_KEY not set — skipping integration tests", ) def _call_claude(messages: list[dict], max_tokens: int = 200) -> dict: """Make a real Anthropic API call.""" import httpx resp = httpx.post( "https://api.anthropic.com/v1/messages", headers={ "X-Api-Key": ANTHROPIC_KEY, "anthropic-version": "2023-06-01", "Content-Type": "application/json", }, json={ "model": "claude-sonnet-4-5-20250929", "max_tokens": max_tokens, "messages": messages, }, timeout=30, ) return resp.json() # ============================================================================ # Test data: realistic tool output that gets compressed # ============================================================================ def _make_test_suite_output(n: int = 100) -> list[dict]: """Simulate a large test suite result (like from a CI/CD tool).""" results = [] for i in range(n): result = { "test_name": f"test_module_{i // 10}.test_case_{i}", "status": "passed", "duration_ms": 50 + i * 3, "file": f"tests/test_module_{i // 10}.py", } # Inject specific failures that the LLM should find if i == 42: result["status"] = "failed" result["error"] = "AssertionError: expected status 200, got 401 in auth_middleware" result["test_name"] = "test_auth.test_login_with_expired_token" if i == 67: result["status"] = "failed" result["error"] = "TimeoutError: database connection pool exhausted after 30s" result["test_name"] = "test_database.test_concurrent_connections" if i == 88: result["status"] = "error" result["error"] = "ImportError: cannot import name 'NewFeature' from 'app.features'" result["test_name"] = "test_features.test_new_feature_integration" results.append(result) return results class TestSummaryHelpfulness: """Compare LLM accuracy with vs without compression summaries.""" def test_find_failures_with_summary(self): """LLM can identify failure types from the summary alone.""" test_results = _make_test_suite_output(100) # Simulate compression: keep first 10, compress rest with summary kept = test_results[:10] from headroom.transforms.compression_summary import summarize_dropped_items summary = summarize_dropped_items(test_results, kept) compressed_output = json.dumps(kept, indent=2) compressed_output += f"\n[90 items compressed to 10. Omitted: {summary}. " compressed_output += ( 'Retrieve specific items: headroom_retrieve(hash="abc123", query="your search")]' ) messages = [ { "role": "user", "content": ( "Here are the test results from CI:\n\n" f"{compressed_output}\n\n" "Are there any test failures? What types of failures are there? " "Answer concisely." ), }, ] resp = _call_claude(messages) text = resp.get("content", [{}])[0].get("text", "").lower() # The LLM should mention failures (from the summary info) has_failure_info = any( word in text for word in ["fail", "error", "timeout", "assert", "import"] ) print(f"\n Summary: {summary}") print(f" LLM response: {text[:200]}") print(f" Detected failure info: {has_failure_info}") assert has_failure_info, f"LLM didn't detect failures from summary. Response: {text[:300]}" def test_find_failures_without_summary(self): """Baseline: LLM with NO summary — just '[90 items compressed]'.""" test_results = _make_test_suite_output(100) kept = test_results[:10] compressed_output = json.dumps(kept, indent=2) compressed_output += "\n[90 items compressed to 10. Retrieve more: hash=abc123]" messages = [ { "role": "user", "content": ( "Here are the test results from CI:\n\n" f"{compressed_output}\n\n" "Are there any test failures? What types of failures are there? " "Answer concisely." ), }, ] resp = _call_claude(messages) text = resp.get("content", [{}])[0].get("text", "").lower() # The LLM may or may not detect failures (it only sees 10 passing tests) has_failure_info = any( word in text for word in ["fail", "error", "timeout", "assert", "import"] ) print(f"\n LLM response (no summary): {text[:200]}") print(f" Detected failure info: {has_failure_info}") # We're NOT asserting here — this is the baseline. # We expect this to often MISS failures since the summary is generic. def test_code_summary_helps_identify_functions(self): """LLM can identify which functions were removed from compressed code.""" compressed_code = ''' class PaymentProcessor: """Processes payments via Stripe.""" def __init__(self, api_key: str): # [2 lines omitted] pass def charge(self, amount: float, currency: str, token: str) -> dict: # [8 lines omitted] pass def refund(self, charge_id: str, amount: float = None) -> dict: # [3 lines omitted] pass def get_balance(self) -> float: # [2 lines omitted] pass ''' from headroom.transforms.compression_summary import summarize_compressed_code # Use AST-based summary (language-agnostic) bodies = [ ("def charge(self, amount: float, currency: str, token: str) -> dict:", "...", 10), ("def refund(self, charge_id: str, amount: float = None) -> dict:", "...", 20), ("def get_balance(self) -> float:", "...", 30), ] code_summary = summarize_compressed_code(bodies, 3) prompt = f"Here is a compressed Python file:\n\n```python\n{compressed_code}\n```\n\n" if code_summary: prompt += f"[Compression info: {code_summary}]\n\n" prompt += "I need to understand the retry logic. Which function should I look at? Answer in one sentence." messages = [{"role": "user", "content": prompt}] resp = _call_claude(messages, max_tokens=100) text = resp.get("content", [{}])[0].get("text", "").lower() print(f"\n Code summary: {code_summary}") print(f" LLM response: {text[:200]}") # The LLM should identify the charge() function assert "charge" in text, f"LLM didn't identify charge() function. Response: {text}"