Spaces:
Running
Running
| """Tests for diversity-aware compute_optimal_k in adaptive_sizer.""" | |
| from __future__ import annotations | |
| import json | |
| from headroom.transforms.adaptive_sizer import compute_optimal_k | |
def _make_unique_items(n: int) -> list[str]:
    """Build ``n`` JSON-encoded records intended as a high-diversity fixture.

    Every record carries its own index in ``id``, ``title``, ``content`` and
    ``source``, so no two serialized items are identical.

    Args:
        n: Number of records to generate.

    Returns:
        A list of ``n`` JSON strings, in index order.
    """
    serialized: list[str] = []
    for idx in range(n):
        # Cycle through the uppercase letters A-Z to label the subject area.
        letter = chr(65 + idx % 26)
        body = (
            f"This is document {idx} discussing a completely different subject. "
            f"It covers concepts like {letter}-theory, "
            f"methodology-{idx * 7 % 100}, and framework-{idx * 13 % 50}. "
            f"The key finding is result-{idx} which has implications for field-{idx % 10}."
        )
        record = {
            "id": idx,
            "title": f"Unique topic number {idx} about subject area {letter}",
            "content": body,
            "source": f"source_{idx}.pdf",
            # Score decreases by 0.03 per item, starting from 0.99.
            "score": round(0.99 - idx * 0.03, 2),
        }
        serialized.append(json.dumps(record))
    return serialized
def _make_repetitive_items(n: int, templates: int = 3) -> list[str]:
    """Create n items from a few templates (low diversity).

    Items cycle through the first ``templates`` built-in health-check
    payloads and otherwise differ only in their ``timestamp`` field.

    Args:
        n: Number of JSON strings to produce.
        templates: How many of the built-in payloads to cycle through.
            Values larger than the number of built-in payloads are capped
            so that indexing never runs past the end of the list.

    Returns:
        A list of ``n`` JSON-encoded strings.

    Raises:
        ValueError: If ``templates`` is less than 1.
    """
    base_templates = [
        {
            "status": "ok",
            "message": "Health check passed",
            "latency_ms": 12,
            "service": "api-gateway",
        },
        {
            "status": "ok",
            "message": "Health check passed",
            "latency_ms": 15,
            "service": "auth-service",
        },
        {
            "status": "ok",
            "message": "Health check passed",
            "latency_ms": 8,
            "service": "db-proxy",
        },
    ]
    if templates < 1:
        raise ValueError(f"templates must be >= 1, got {templates}")
    # Cap the cycle length: `i % templates` would otherwise index past the
    # end of base_templates whenever templates > len(base_templates).
    cycle = min(templates, len(base_templates))
    # NOTE: the minute field is the raw item index, so n > 60 yields
    # timestamps whose minute component is not a valid clock value.
    return [
        json.dumps({**base_templates[i % cycle], "timestamp": f"2026-03-25T10:{i:02d}:00Z"})
        for i in range(n)
    ]
def _make_mixed_items(n: int, unique_fraction: float) -> list[str]:
    """Build a list mixing unique records with identical filler items.

    The first ``int(n * unique_fraction)`` entries come from
    ``_make_unique_items``; the remaining ``n - int(n * unique_fraction)``
    entries are copies of a single health-check payload.

    Args:
        n: Total number of items wanted (for fractions within 0..1).
        unique_fraction: Share of ``n`` that should be unique records.

    Returns:
        The unique items followed by the duplicated filler items.
    """
    n_unique = int(n * unique_fraction)
    n_filler = n - n_unique
    mixed = _make_unique_items(n_unique)
    if n_filler <= 0:
        # Nothing to pad with: every requested item is unique.
        return mixed
    filler = json.dumps(
        {
            "status": "ok",
            "message": "Routine health check passed successfully",
            "latency_ms": 10,
        }
    )
    mixed += [filler] * n_filler
    return mixed
class TestSmallArrays:
    """Small inputs (n <= 8) are expected to come back with k == n."""

    def test_small_array_returns_n(self):
        """Five unique items should yield k == 5."""
        assert compute_optimal_k(_make_unique_items(5)) == 5

    def test_eight_items_returns_eight(self):
        """Eight unique items should yield k == 8."""
        assert compute_optimal_k(_make_unique_items(8)) == 8
class TestNearTotalRedundancy:
    """Inputs made of one or two repeated payloads should yield a small k."""

    def test_identical_items_returns_min(self):
        """Twenty copies of one payload should give k of at most 3."""
        payload = json.dumps({"status": "ok", "msg": "healthy"})
        chosen = compute_optimal_k([payload] * 20)
        assert chosen <= 3

    def test_two_groups_returns_small_k(self):
        """Ten copies each of two payloads should give k of at most 5."""
        group_a = json.dumps({"type": "A", "val": 1})
        group_b = json.dumps({"type": "B", "val": 2})
        chosen = compute_optimal_k([group_a] * 10 + [group_b] * 10)
        assert chosen <= 5
class TestHighDiversity:
    """Fully unique inputs are expected to retain most of their items."""

    def test_all_unique_keeps_most(self):
        """Fifteen unique items should keep at least 10."""
        chosen = compute_optimal_k(_make_unique_items(15))
        assert chosen >= 10, f"Expected k >= 10 for 15 unique items, got k={chosen}"

    def test_twenty_unique_keeps_most(self):
        """Twenty unique items should keep at least 14."""
        chosen = compute_optimal_k(_make_unique_items(20))
        assert chosen >= 14, f"Expected k >= 14 for 20 unique items, got k={chosen}"

    def test_twelve_unique_rag_chunks(self):
        """Twelve unique RAG-style chunks should keep at least 8."""
        chosen = compute_optimal_k(_make_unique_items(12))
        assert chosen >= 8, f"Expected k >= 8 for 12 unique RAG chunks, got k={chosen}"
class TestLowDiversity:
    """Template-driven (repetitive) inputs are expected to keep k small."""

    def test_repetitive_items_unchanged(self):
        """Fifteen items cycling through 3 templates should give k <= 8."""
        repetitive = _make_repetitive_items(15, templates=3)
        chosen = compute_optimal_k(repetitive)
        assert chosen <= 8, f"Expected k <= 8 for repetitive items, got k={chosen}"

    def test_twenty_repetitive_stays_small(self):
        """Twenty items cycling through 3 templates should give k <= 10."""
        repetitive = _make_repetitive_items(20, templates=3)
        chosen = compute_optimal_k(repetitive)
        assert chosen <= 10, f"Expected k <= 10 for 20 repetitive items, got k={chosen}"
class TestModerateDiversity:
    """Half-unique inputs are expected to land between the two extremes."""

    def test_half_unique_scales(self):
        """Twenty items, half unique and half duplicates, should give 6 <= k <= 16."""
        half_and_half = _make_mixed_items(20, unique_fraction=0.5)
        chosen = compute_optimal_k(half_and_half)
        assert 6 <= chosen <= 16, f"Expected 6 <= k <= 16 for 50% unique, got k={chosen}"
class TestKneeInteraction:
    """Checks how the knee-based estimate is expected to combine with diversity.

    NOTE(review): the knee / diversity-floor mechanics live inside
    ``compute_optimal_k`` and are not visible from this file; these tests
    only pin the resulting k.
    """

    def test_knee_with_high_diversity_gets_floor(self):
        """Highly diverse input should end up with k >= 10 regardless of the knee."""
        # Presumably these items produce a weak knee while all being distinct;
        # confirm against the sizer implementation.
        diverse = _make_unique_items(15)
        chosen = compute_optimal_k(diverse)
        # Expectation: with near-total diversity, k is lifted to at least 10.
        assert chosen >= 10, f"Expected k >= 10 with high diversity floor, got k={chosen}"

    def test_knee_with_low_diversity_stays(self):
        """Repetitive input should keep k at a small, knee-derived value (<= 8)."""
        repetitive = _make_repetitive_items(15, templates=3)
        chosen = compute_optimal_k(repetitive)
        assert chosen <= 8, f"Expected knee-derived k <= 8 for low diversity, got k={chosen}"
class TestBiasAndCaps:
    """Checks the ``bias``, ``max_k`` and ``min_k`` keyword arguments."""

    def test_bias_increases_k(self):
        """A bias above 1 should not produce a smaller k than bias 1.0."""
        diverse = _make_unique_items(15)
        baseline = compute_optimal_k(diverse, bias=1.0)
        boosted = compute_optimal_k(diverse, bias=1.5)
        assert boosted >= baseline

    def test_bias_decreases_k(self):
        """A bias below 1 should not produce a larger k than bias 1.0."""
        diverse = _make_unique_items(15)
        baseline = compute_optimal_k(diverse, bias=1.0)
        reduced = compute_optimal_k(diverse, bias=0.5)
        assert reduced <= baseline

    def test_max_k_cap_respected(self):
        """With ``max_k=5``, even fully unique input should give k <= 5."""
        capped = compute_optimal_k(_make_unique_items(20), max_k=5)
        assert capped <= 5

    def test_min_k_floor_respected(self):
        """With ``min_k=3``, even fully identical input should give k >= 3."""
        payload = json.dumps({"x": 1})
        floored = compute_optimal_k([payload] * 20, min_k=3)
        assert floored >= 3