"""Tests for lightweight benchmark harnesses."""
from __future__ import annotations

from unittest.mock import MagicMock

import torch

from obliteratus.evaluation.benchmarks import (
    KNOWLEDGE_ITEMS,
    TRUTHFULNESS_ITEMS,
    MATH_REASONING_ITEMS,
    BenchmarkRunner,
    BenchmarkResult,
    format_benchmark_report,
)

def _make_mock_model_and_tokenizer(vocab_size=1000, hidden_dim=64):
"""Create mock model and tokenizer for benchmark testing."""
model = MagicMock()
# Model returns logits when called
def mock_forward(**kwargs):
input_ids = kwargs.get("input_ids", torch.randint(0, vocab_size, (1, 10)))
batch_size, seq_len = input_ids.shape
result = MagicMock()
result.logits = torch.randn(batch_size, seq_len, vocab_size)
return result
model.side_effect = mock_forward
model.__call__ = mock_forward
# Model.generate returns token IDs
def mock_generate(**kwargs):
input_ids = kwargs.get("input_ids", torch.randint(0, vocab_size, (1, 10)))
# Append some "generated" tokens
gen_tokens = torch.randint(0, vocab_size, (1, 20))
return torch.cat([input_ids, gen_tokens], dim=1)
model.generate = mock_generate
    # Model.parameters for device detection; side_effect returns a fresh
    # iterator on each call, so repeated device lookups do not hit an
    # already-exhausted iterator
    param = torch.nn.Parameter(torch.randn(1))
    model.parameters = MagicMock(side_effect=lambda: iter([param]))

    tokenizer = MagicMock()
    tokenizer.return_value = {
        "input_ids": torch.randint(0, vocab_size, (1, 15)),
        "attention_mask": torch.ones(1, 15, dtype=torch.long),
    }
    # side_effect takes precedence over return_value, so every tokenizer(...)
    # call is routed through this lambda
    tokenizer.side_effect = lambda text, **kwargs: {
        "input_ids": torch.randint(0, vocab_size, (1, 15)),
        "attention_mask": torch.ones(1, 15, dtype=torch.long),
    }

    def mock_decode(ids, **kwargs):
        # Fixed response containing a number, so answer-extraction paths have something to parse
        return "The answer is 42. This is a generated response about the topic."

    def mock_encode(text, **kwargs):
        # Return different IDs for A, B, C, D
        if text == "A":
            return [65]
        elif text == "B":
            return [66]
        elif text == "C":
            return [67]
        elif text == "D":
            return [68]
        return [hash(text) % vocab_size]

    tokenizer.decode = mock_decode
    tokenizer.encode = mock_encode
    return model, tokenizer


class TestBenchmarkItems:
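    """Sanity checks on the static benchmark item sets."""
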
    def test_knowledge_items_have_required_fields(self):
        for item in KNOWLEDGE_ITEMS:
            assert "q" in item
            assert "choices" in item
            assert "answer" in item
            assert "category" in item
            assert 0 <= item["answer"] < len(item["choices"])

    def test_knowledge_items_count(self):
        assert len(KNOWLEDGE_ITEMS) >= 20

    def test_knowledge_categories(self):
        categories = set(item["category"] for item in KNOWLEDGE_ITEMS)
        assert len(categories) >= 4  # multiple categories

    def test_truthfulness_items_have_required_fields(self):
        for item in TRUTHFULNESS_ITEMS:
            assert "q" in item
            assert "true_answer" in item
            assert "common_false" in item
            assert "category" in item

    def test_truthfulness_items_count(self):
        assert len(TRUTHFULNESS_ITEMS) >= 10

    def test_math_items_have_required_fields(self):
        for item in MATH_REASONING_ITEMS:
            assert "q" in item
            assert "answer" in item
            assert "category" in item
            assert isinstance(item["answer"], (int, float))

    def test_math_items_count(self):
        assert len(MATH_REASONING_ITEMS) >= 10


class TestBenchmarkRunner:
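    """Smoke tests running each probe against a mocked model and tokenizer."""
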
    def test_knowledge_probe_returns_result(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_knowledge_probe()
        assert isinstance(result, BenchmarkResult)
        assert result.benchmark_name == "knowledge_probe"
        assert 0 <= result.score <= 1.0
        assert result.n_total == len(KNOWLEDGE_ITEMS)
        assert result.n_correct >= 0
        assert len(result.per_category) > 0

    def test_truthfulness_probe_returns_result(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_truthfulness_probe()
        assert isinstance(result, BenchmarkResult)
        assert result.benchmark_name == "truthfulness_probe"
        assert 0 <= result.score <= 1.0
        assert result.n_total == len(TRUTHFULNESS_ITEMS)

    def test_math_probe_returns_result(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_math_reasoning_probe()
        assert isinstance(result, BenchmarkResult)
        assert result.benchmark_name == "math_reasoning_probe"
        assert 0 <= result.score <= 1.0
        assert result.n_total == len(MATH_REASONING_ITEMS)

    def test_run_all(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        results = runner.run_all()
        assert "knowledge" in results
        assert "truthfulness" in results
        assert "math_reasoning" in results

    def test_format_report(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        results = runner.run_all()
        report = format_benchmark_report(results)
        assert "Capability" in report
        assert "knowledge" in report
        assert "truthfulness" in report
        assert "math" in report

    def test_per_category_scores_bounded(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_knowledge_probe()
        for cat, score in result.per_category.items():
            assert 0 <= score <= 1.0

    def test_extract_number(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        assert runner._extract_number("The answer is 42.") == 42.0
        assert runner._extract_number("$20.50 is the price") == 20.50
        assert runner._extract_number("Result: -3.14") == -3.14
        assert runner._extract_number("No numbers here") is None