Spaces:
Running
Running
| """Eval: Compression summary quality — generic, unbiased. | |
| Tests that compression summaries are: | |
| 1. Accurate (categories match actual dropped items) | |
| 2. Useful (contain information that would help retrieval) | |
| 3. Not misleading (don't hallucinate categories) | |
| These are NOT skewed to show summaries as amazing — they test | |
| real-world data patterns and verify correctness. | |
| """ | |
| from headroom.transforms.compression_summary import ( | |
| summarize_compressed_code, | |
| summarize_dropped_items, | |
| ) | |
| # ============================================================================ | |
| # Realistic test data (modeled on actual tool outputs) | |
| # ============================================================================ | |
| def _make_github_issues(n: int) -> list[dict]: | |
| """Realistic GitHub issues list.""" | |
| statuses = ["open"] * (n // 2) + ["closed"] * (n // 4) + ["in_progress"] * (n // 4) | |
| issues = [] | |
| for i in range(n): | |
| issue = { | |
| "id": i + 1, | |
| "title": f"Issue #{i + 1}: {'Fix auth bug' if i == 42 else 'General issue'}", | |
| "status": statuses[i % len(statuses)], | |
| "labels": ["bug"] if i % 10 == 0 else ["enhancement"], | |
| "assignee": f"user-{i % 5}", | |
| } | |
| if i in (42, 87): | |
| issue["status"] = "open" | |
| issue["labels"] = ["critical", "bug"] | |
| issue["title"] = f"CRITICAL: Auth failure in production (issue #{i + 1})" | |
| issues.append(issue) | |
| return issues | |
| def _make_test_results(n: int) -> list[dict]: | |
| """Realistic test suite results.""" | |
| results = [] | |
| for i in range(n): | |
| result = { | |
| "name": f"test_{'auth' if i < 10 else 'general'}_{i}", | |
| "status": "pass", | |
| "duration_ms": 50 + i * 2, | |
| } | |
| if i in (3, 7, 45, 88): | |
| result["status"] = "fail" | |
| result["error"] = "AssertionError: expected True, got False" | |
| if i in (12, 67): | |
| result["status"] = "error" | |
| result["error"] = "TimeoutError: test exceeded 30s limit" | |
| results.append(result) | |
| return results | |
| def _make_log_entries(n: int) -> list[dict]: | |
| """Realistic log entries.""" | |
| entries = [] | |
| for i in range(n): | |
| entry = { | |
| "timestamp": f"2024-01-15T10:{i:02d}:00Z", | |
| "level": "info", | |
| "message": f"Request processed in {10 + i}ms", | |
| "service": "api-gateway", | |
| } | |
| if i in (15, 45, 89): | |
| entry["level"] = "error" | |
| entry["message"] = "Connection refused: database pool exhausted" | |
| if i in (20, 50): | |
| entry["level"] = "warning" | |
| entry["message"] = "High memory usage: 85% threshold exceeded" | |
| entries.append(entry) | |
| return entries | |
| def _make_api_endpoints(n: int) -> list[dict]: | |
| """Realistic API endpoint list.""" | |
| return [ | |
| { | |
| "path": f"/api/v1/{'users' if i < n // 3 else 'orders' if i < 2 * n // 3 else 'products'}/{i}", | |
| "method": "GET" if i % 3 else "POST", | |
| "status_code": 200 if i % 20 else 500, | |
| "latency_ms": 50 + i, | |
| } | |
| for i in range(n) | |
| ] | |
| # ============================================================================ | |
| # Eval: Summary accuracy | |
| # ============================================================================ | |
| class TestSummaryAccuracy: | |
| """Verify summaries accurately reflect what was dropped.""" | |
| def test_github_issues_categories_correct(self): | |
| """Summary mentions actual status values from dropped items.""" | |
| issues = _make_github_issues(100) | |
| kept = issues[:5] | |
| summary = summarize_dropped_items(issues, kept) | |
| # Should mention the status values present in dropped items | |
| assert summary # Non-empty | |
| # At minimum, should contain some status category info | |
| has_category = any(s in summary.lower() for s in ["open", "closed", "in_progress"]) | |
| assert has_category, f"Summary missing status categories: {summary}" | |
| def test_test_results_mentions_failures(self): | |
| """Summary mentions failures when test results are compressed.""" | |
| results = _make_test_results(100) | |
| kept = results[:5] | |
| summary = summarize_dropped_items(results, kept) | |
| assert summary | |
| # Should mention pass/fail somewhere | |
| has_result = any(s in summary.lower() for s in ["pass", "fail", "error"]) | |
| assert has_result, f"Summary missing test result info: {summary}" | |
| def test_log_entries_mentions_errors(self): | |
| """Summary mentions error log entries.""" | |
| logs = _make_log_entries(100) | |
| kept = logs[:3] | |
| summary = summarize_dropped_items(logs, kept) | |
| assert summary | |
| # Should categorize by log level | |
| has_level = any(s in summary.lower() for s in ["info", "error", "warning"]) | |
| assert has_level, f"Summary missing log level info: {summary}" | |
| def test_no_hallucinated_categories(self): | |
| """Summary should NOT mention categories that don't exist.""" | |
| items = [{"status": "active", "id": i} for i in range(50)] | |
| kept = items[:3] | |
| summary = summarize_dropped_items(items, kept) | |
| # Should NOT mention statuses that don't exist in the data | |
| assert "error" not in summary.lower() or "notable" in summary.lower() | |
| assert "fail" not in summary.lower() | |
| assert "critical" not in summary.lower() | |
| def test_summary_proportional_to_data(self): | |
| """Category counts in summary should roughly match actual data.""" | |
| items = ( | |
| [{"type": "log", "data": "x"}] * 100 | |
| + [{"type": "metric", "data": "y"}] * 50 | |
| + [{"type": "alert", "data": "z"}] * 10 | |
| ) | |
| kept = items[:3] | |
| summary = summarize_dropped_items(items, kept) | |
| # "log" should appear with a higher count than "alert" | |
| # (We can't verify exact counts from the summary string, | |
| # but we verify the summary is non-empty and reasonable) | |
| assert summary | |
| assert len(summary) < 300 | |
| class TestSummaryUsefulness: | |
| """Verify summaries contain information useful for retrieval.""" | |
| def test_enough_info_to_search(self): | |
| """Summary should contain terms the LLM could use as search queries.""" | |
| results = _make_test_results(100) | |
| kept = results[:5] | |
| summary = summarize_dropped_items(results, kept) | |
| # The LLM should be able to extract search terms from the summary | |
| # At minimum, it should know WHAT KIND of items are in the compressed data | |
| assert len(summary) > 10, "Summary too short to be useful" | |
| def test_notable_items_actionable(self): | |
| """Notable items should contain enough info to act on.""" | |
| logs = _make_log_entries(100) | |
| kept = logs[:2] | |
| summary = summarize_dropped_items(logs, kept) | |
| # If there are errors in the logs, the summary should help | |
| # the LLM decide to retrieve them | |
| assert summary | |
| # Just verify it's substantive enough | |
| assert len(summary.split()) > 3 | |
| def test_api_endpoints_described(self): | |
| """API endpoint data should produce some useful description.""" | |
| endpoints = _make_api_endpoints(60) | |
| kept = endpoints[:5] | |
| summary = summarize_dropped_items(endpoints, kept) | |
| assert summary # Should produce SOMETHING, even without type/status fields | |
| class TestCodeSummaryAccuracy: | |
| """Verify code summaries accurately describe removed sections.""" | |
| def test_real_python_module(self): | |
| """Summary of a realistic Python module compression.""" | |
| # Use AST-based summary (language-agnostic) | |
| bodies = [ | |
| ("def __init__(self, url: str, pool_size: int = 10):", "...", 8), | |
| ("def connect(self) -> Any:", "...", 15), | |
| ("def _create_new(self) -> Any:", "...", 22), | |
| ("def release(self, conn: Any) -> None:", "...", 28), | |
| ("def close_all(self) -> None:", "...", 33), | |
| ("def create_engine(url: str) -> DatabaseConnection:", "...", 38), | |
| ] | |
| summary = summarize_compressed_code(bodies, 6) | |
| assert "6 bodies compressed" in summary | |
| has_names = any( | |
| name in summary for name in ["connect()", "release()", "close_all()", "create_engine()"] | |
| ) | |
| assert has_names, f"Summary missing function names: {summary}" | |