Spaces:

minhtudragon
/

headroom

Running

headroom / tests /test_compression /test_compression_summary_eval.py

tudragon154203

fix: route count_tokens to api.anthropic.com, not proxy base_url

0adb431 24 days ago

8.42 kB

	"""Eval: Compression summary quality — generic, unbiased.

	Tests that compression summaries are:
	1. Accurate (categories match actual dropped items)
	2. Useful (contain information that would help retrieval)
	3. Not misleading (don't hallucinate categories)

	These are NOT skewed to show summaries as amazing — they test
	real-world data patterns and verify correctness.
	"""

	from headroom.transforms.compression_summary import (
	summarize_compressed_code,
	summarize_dropped_items,
	)

	# ============================================================================
	# Realistic test data (modeled on actual tool outputs)
	# ============================================================================


	def _make_github_issues(n: int) -> list[dict]:
	"""Realistic GitHub issues list."""
	statuses = ["open"] * (n // 2) + ["closed"] * (n // 4) + ["in_progress"] * (n // 4)
	issues = []
	for i in range(n):
	issue = {
	"id": i + 1,
	"title": f"Issue #{i + 1}: {'Fix auth bug' if i == 42 else 'General issue'}",
	"status": statuses[i % len(statuses)],
	"labels": ["bug"] if i % 10 == 0 else ["enhancement"],
	"assignee": f"user-{i % 5}",
	}
	if i in (42, 87):
	issue["status"] = "open"
	issue["labels"] = ["critical", "bug"]
	issue["title"] = f"CRITICAL: Auth failure in production (issue #{i + 1})"
	issues.append(issue)
	return issues


	def _make_test_results(n: int) -> list[dict]:
	"""Realistic test suite results."""
	results = []
	for i in range(n):
	result = {
	"name": f"test_{'auth' if i < 10 else 'general'}_{i}",
	"status": "pass",
	"duration_ms": 50 + i * 2,
	}
	if i in (3, 7, 45, 88):
	result["status"] = "fail"
	result["error"] = "AssertionError: expected True, got False"
	if i in (12, 67):
	result["status"] = "error"
	result["error"] = "TimeoutError: test exceeded 30s limit"
	results.append(result)
	return results


	def _make_log_entries(n: int) -> list[dict]:
	"""Realistic log entries."""
	entries = []
	for i in range(n):
	entry = {
	"timestamp": f"2024-01-15T10:{i:02d}:00Z",
	"level": "info",
	"message": f"Request processed in {10 + i}ms",
	"service": "api-gateway",
	}
	if i in (15, 45, 89):
	entry["level"] = "error"
	entry["message"] = "Connection refused: database pool exhausted"
	if i in (20, 50):
	entry["level"] = "warning"
	entry["message"] = "High memory usage: 85% threshold exceeded"
	entries.append(entry)
	return entries


	def _make_api_endpoints(n: int) -> list[dict]:
	"""Realistic API endpoint list."""
	return [
	{
	"path": f"/api/v1/{'users' if i < n // 3 else 'orders' if i < 2 * n // 3 else 'products'}/{i}",
	"method": "GET" if i % 3 else "POST",
	"status_code": 200 if i % 20 else 500,
	"latency_ms": 50 + i,
	}
	for i in range(n)
	]


	# ============================================================================
	# Eval: Summary accuracy
	# ============================================================================


	class TestSummaryAccuracy:
	    """Checks that a dropped-items summary reflects the data it was built from."""

	    def test_github_issues_categories_correct(self):
	        """The issue summary names at least one of the generated status values."""
	        all_issues = _make_github_issues(100)
	        summary = summarize_dropped_items(all_issues, all_issues[:5])

	        # The summary must exist before its content is inspected.
	        assert summary  # Non-empty
	        lowered = summary.lower()
	        # Any one of the generated status values is enough.
	        has_category = "open" in lowered or "closed" in lowered or "in_progress" in lowered
	        assert has_category, f"Summary missing status categories: {summary}"

	    def test_test_results_mentions_failures(self):
	        """The test-result summary names at least one outcome (pass/fail/error)."""
	        all_results = _make_test_results(100)
	        summary = summarize_dropped_items(all_results, all_results[:5])

	        assert summary
	        lowered = summary.lower()
	        has_result = "pass" in lowered or "fail" in lowered or "error" in lowered
	        assert has_result, f"Summary missing test result info: {summary}"

	    def test_log_entries_mentions_errors(self):
	        """The log summary names at least one log level."""
	        all_logs = _make_log_entries(100)
	        summary = summarize_dropped_items(all_logs, all_logs[:3])

	        assert summary
	        lowered = summary.lower()
	        has_level = "info" in lowered or "error" in lowered or "warning" in lowered
	        assert has_level, f"Summary missing log level info: {summary}"

	    def test_no_hallucinated_categories(self):
	        """Categories absent from the data must not show up in the summary."""
	        items = [{"status": "active", "id": i} for i in range(50)]
	        summary = summarize_dropped_items(items, items[:3])
	        lowered = summary.lower()

	        # Every item is "active", so failure-flavoured words have no source
	        # in the data. "error" is tolerated only alongside "notable".
	        assert "error" not in lowered or "notable" in lowered
	        assert "fail" not in lowered
	        assert "critical" not in lowered

	    def test_summary_proportional_to_data(self):
	        """A skewed category mix yields a non-empty summary under 300 characters.

	        Exact counts cannot be read back out of the summary string here, so
	        this only checks that the summary exists and stays compact.
	        """
	        # List repetition is kept on purpose: each category is one dict
	        # object referenced many times, exactly as in the input under test.
	        logs = [{"type": "log", "data": "x"}] * 100
	        metrics = [{"type": "metric", "data": "y"}] * 50
	        alerts = [{"type": "alert", "data": "z"}] * 10
	        items = logs + metrics + alerts
	        summary = summarize_dropped_items(items, items[:3])

	        assert summary
	        assert len(summary) < 300


	class TestSummaryUsefulness:
	    """Checks that summaries carry enough content to guide a later retrieval."""

	    def test_enough_info_to_search(self):
	        """The test-result summary is longer than ten characters."""
	        all_results = _make_test_results(100)
	        summary = summarize_dropped_items(all_results, all_results[:5])

	        # A reader needs at least a hint of what kind of items were
	        # compressed; a near-empty string cannot provide that.
	        assert len(summary) > 10, "Summary too short to be useful"

	    def test_notable_items_actionable(self):
	        """The log summary is non-empty and has more than three words."""
	        all_logs = _make_log_entries(100)
	        summary = summarize_dropped_items(all_logs, all_logs[:2])

	        assert summary
	        # Word count is a coarse proxy for "substantive enough to act on".
	        word_count = len(summary.split())
	        assert word_count > 3

	    def test_api_endpoints_described(self):
	        """Endpoint records, which have no type/status field, still get a summary."""
	        all_endpoints = _make_api_endpoints(60)
	        summary = summarize_dropped_items(all_endpoints, all_endpoints[:5])

	        assert summary  # Should produce SOMETHING, even without type/status fields


	class TestCodeSummaryAccuracy:
	    """Checks that code summaries describe the sections that were removed."""

	    def test_real_python_module(self):
	        """The summary reports the body count and names a compressed function."""
	        # Each tuple is (signature, "...", int) -- presumably (signature,
	        # body placeholder, line number); confirm against
	        # summarize_compressed_code.
	        compressed_bodies = [
	            ("def __init__(self, url: str, pool_size: int = 10):", "...", 8),
	            ("def connect(self) -> Any:", "...", 15),
	            ("def _create_new(self) -> Any:", "...", 22),
	            ("def release(self, conn: Any) -> None:", "...", 28),
	            ("def close_all(self) -> None:", "...", 33),
	            ("def create_engine(url: str) -> DatabaseConnection:", "...", 38),
	        ]
	        summary = summarize_compressed_code(compressed_bodies, 6)

	        assert "6 bodies compressed" in summary

	        # Seeing any one of these call-style names is sufficient.
	        expected_names = ("connect()", "release()", "close_all()", "create_engine()")
	        has_names = any(candidate in summary for candidate in expected_names)
	        assert has_names, f"Summary missing function names: {summary}"