Spaces:
Running
Running
| """Integration eval: Compression summaries with real LLM calls. | |
| Tests whether compression summaries actually help the LLM find information | |
| in compressed data. Compares behavior with and without summaries. | |
| Requires: ANTHROPIC_API_KEY in environment or .env file. | |
| Run: python -m pytest tests/test_compression_summary_integration.py -v -s | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import pytest | |
| from tests._dotenv import autouse_apply_env, load_env_overrides | |
# Overrides returned by the project's dotenv helper — presumably parsed from a
# local .env file (the module docstring mentions one); confirm in tests/_dotenv.py.
_env_overrides = load_env_overrides()
# The process environment wins; the overrides are only consulted as a fallback.
# Falls back to "" so the skip condition below is a plain truthiness check.
ANTHROPIC_KEY = os.environ.get("ANTHROPIC_API_KEY") or _env_overrides.get("ANTHROPIC_API_KEY", "")
# NOTE(review): looks like an autouse fixture that applies the overrides around
# each test — confirm against autouse_apply_env in tests/_dotenv.py.
apply_dotenv = autouse_apply_env(_env_overrides)
# Module-wide marker: every test in this file is skipped when no API key is found.
pytestmark = pytest.mark.skipif(
    not ANTHROPIC_KEY,
    reason="ANTHROPIC_API_KEY not set — skipping integration tests",
)
def _call_claude(messages: list[dict], max_tokens: int = 200) -> dict:
    """Make a real Anthropic Messages API call and return the decoded JSON body.

    Args:
        messages: Conversation turns, passed through unchanged as the
            ``messages`` field of the request payload.
        max_tokens: Value sent as the request's ``max_tokens`` field.

    Returns:
        The parsed JSON response body of a successful call.

    Raises:
        RuntimeError: If the API answers with a non-success HTTP status
            (bad key, rate limit, overload, ...). The response body is
            included so the failure is attributable to the API rather than
            to the behavior under test.
    """
    # Function-scope import, as in the original — presumably so the module can
    # still be collected (and skipped) when httpx is not installed; confirm.
    import httpx

    resp = httpx.post(
        "https://api.anthropic.com/v1/messages",
        headers={
            "X-Api-Key": ANTHROPIC_KEY,
            "anthropic-version": "2023-06-01",
            "Content-Type": "application/json",
        },
        json={
            "model": "claude-sonnet-4-5-20250929",
            "max_tokens": max_tokens,
            "messages": messages,
        },
        timeout=30,
    )
    # Without this check an error payload would be returned like a normal
    # reply and callers would read it as "the LLM said nothing".
    if resp.status_code >= 400:
        raise RuntimeError(
            f"Anthropic API returned HTTP {resp.status_code}: {resp.text[:500]}"
        )
    return resp.json()
| # ============================================================================ | |
| # Test data: realistic tool output that gets compressed | |
| # ============================================================================ | |
def _make_test_suite_output(n: int = 100) -> list[dict]:
    """Build ``n`` fake CI test records, mostly passing, with three planted problems.

    Records at indices 42, 67 and 88 (when ``n`` is large enough to reach
    them) are overwritten with a failed/error status, an error message and
    a distinctive test name. These are the items the LLM is expected to find.
    """
    # Index -> fields to overlay on the otherwise-passing record.
    planted = {
        42: {
            "status": "failed",
            "error": "AssertionError: expected status 200, got 401 in auth_middleware",
            "test_name": "test_auth.test_login_with_expired_token",
        },
        67: {
            "status": "failed",
            "error": "TimeoutError: database connection pool exhausted after 30s",
            "test_name": "test_database.test_concurrent_connections",
        },
        88: {
            "status": "error",
            "error": "ImportError: cannot import name 'NewFeature' from 'app.features'",
            "test_name": "test_features.test_new_feature_integration",
        },
    }

    suite = []
    for idx in range(n):
        module_no = idx // 10  # ten cases per simulated module
        record = {
            "test_name": f"test_module_{module_no}.test_case_{idx}",
            "status": "passed",
            "duration_ms": 50 + idx * 3,
            "file": f"tests/test_module_{module_no}.py",
        }
        # update() keeps existing keys in place and appends "error" last,
        # so the serialized key order matches a field-by-field assignment.
        record.update(planted.get(idx, {}))
        suite.append(record)
    return suite
class TestSummaryHelpfulness:
    """Compare LLM accuracy with vs without compression summaries."""

    # Substrings whose presence in a (lower-cased) reply counts as the LLM
    # having mentioned failures.
    _FAILURE_KEYWORDS = ("fail", "error", "timeout", "assert", "import")

    @staticmethod
    def _response_text(resp: dict) -> str:
        """Return the lower-cased text of an API response.

        Raises:
            AssertionError: If ``resp`` is an API error payload, so a failed
                call is reported as such instead of as an empty LLM answer.
        """
        if resp.get("type") == "error":
            raise AssertionError(f"Anthropic API call failed: {resp.get('error')}")
        # `or []` also covers a present-but-empty/None "content", which would
        # otherwise raise IndexError when indexing the first block.
        blocks = resp.get("content") or []
        return "".join(block.get("text", "") for block in blocks).lower()

    @staticmethod
    def _failure_question(compressed_output: str) -> list[dict]:
        """Build the single-turn prompt asking about failures in ``compressed_output``."""
        return [
            {
                "role": "user",
                "content": (
                    "Here are the test results from CI:\n\n"
                    f"{compressed_output}\n\n"
                    "Are there any test failures? What types of failures are there? "
                    "Answer concisely."
                ),
            },
        ]

    @classmethod
    def _mentions_failure(cls, text: str) -> bool:
        """Return True when ``text`` contains any of the failure keywords."""
        return any(word in text for word in cls._FAILURE_KEYWORDS)

    def test_find_failures_with_summary(self):
        """LLM can identify failure types from the summary alone."""
        from headroom.transforms.compression_summary import summarize_dropped_items

        test_results = _make_test_suite_output(100)
        # Simulate compression: keep first 10, compress rest with summary.
        # All planted failures sit beyond index 10, so only the summary can
        # reveal them.
        kept = test_results[:10]
        summary = summarize_dropped_items(test_results, kept)

        compressed_output = json.dumps(kept, indent=2)
        compressed_output += f"\n[90 items compressed to 10. Omitted: {summary}. "
        compressed_output += (
            'Retrieve specific items: headroom_retrieve(hash="abc123", query="your search")]'
        )

        resp = _call_claude(self._failure_question(compressed_output))
        text = self._response_text(resp)

        # The LLM should mention failures (from the summary info).
        # NOTE(review): "fail"/"error" also occur in negative answers such as
        # "no failures", so this check is lenient — consider tightening it
        # once the summary format is pinned down.
        has_failure_info = self._mentions_failure(text)

        print(f"\n Summary: {summary}")
        print(f" LLM response: {text[:200]}")
        print(f" Detected failure info: {has_failure_info}")
        assert has_failure_info, f"LLM didn't detect failures from summary. Response: {text[:300]}"

    def test_find_failures_without_summary(self):
        """Baseline: LLM with NO summary — just '[90 items compressed]'."""
        test_results = _make_test_suite_output(100)
        kept = test_results[:10]

        compressed_output = json.dumps(kept, indent=2)
        compressed_output += "\n[90 items compressed to 10. Retrieve more: hash=abc123]"

        resp = _call_claude(self._failure_question(compressed_output))
        text = self._response_text(resp)

        # The LLM may or may not detect failures (it only sees 10 passing tests).
        has_failure_info = self._mentions_failure(text)

        print(f"\n LLM response (no summary): {text[:200]}")
        print(f" Detected failure info: {has_failure_info}")
        # Deliberately no assertion on the reply — this is the baseline.
        # We expect it to often MISS failures, since the compression marker
        # carries no summary of what was dropped.

    def test_code_summary_helps_identify_functions(self):
        """LLM can identify which functions were removed from compressed code."""
        from headroom.transforms.compression_summary import summarize_compressed_code

        compressed_code = '''
class PaymentProcessor:
    """Processes payments via Stripe."""

    def __init__(self, api_key: str):
        # [2 lines omitted]
        pass

    def charge(self, amount: float, currency: str, token: str) -> dict:
        # [8 lines omitted]
        pass

    def refund(self, charge_id: str, amount: float = None) -> dict:
        # [3 lines omitted]
        pass

    def get_balance(self) -> float:
        # [2 lines omitted]
        pass
'''

        # One (signature, placeholder body, number) tuple per compressed
        # function — presumably the shape summarize_compressed_code expects;
        # verify against headroom.transforms.compression_summary.
        bodies = [
            ("def charge(self, amount: float, currency: str, token: str) -> dict:", "...", 10),
            ("def refund(self, charge_id: str, amount: float = None) -> dict:", "...", 20),
            ("def get_balance(self) -> float:", "...", 30),
        ]
        code_summary = summarize_compressed_code(bodies, 3)

        prompt = f"Here is a compressed Python file:\n\n```python\n{compressed_code}\n```\n\n"
        if code_summary:
            prompt += f"[Compression info: {code_summary}]\n\n"
        prompt += "I need to understand the retry logic. Which function should I look at? Answer in one sentence."

        resp = _call_claude([{"role": "user", "content": prompt}], max_tokens=100)
        text = self._response_text(resp)

        print(f"\n Code summary: {code_summary}")
        print(f" LLM response: {text[:200]}")
        # The LLM should identify the charge() function
        assert "charge" in text, f"LLM didn't identify charge() function. Response: {text}"