"""Tests for lightweight benchmark harnesses."""
from __future__ import annotations
from unittest.mock import MagicMock
import torch
from obliteratus.evaluation.benchmarks import (
KNOWLEDGE_ITEMS,
TRUTHFULNESS_ITEMS,
MATH_REASONING_ITEMS,
BenchmarkRunner,
BenchmarkResult,
format_benchmark_report,
)
def _make_mock_model_and_tokenizer(vocab_size=1000, hidden_dim=64):
    """Create a mock model and tokenizer for benchmark testing."""
    model = MagicMock()

    # Calling the model returns an object with random logits shaped
    # (batch, seq_len, vocab_size), mimicking a causal-LM forward pass.
    def mock_forward(**kwargs):
        input_ids = kwargs.get("input_ids", torch.randint(0, vocab_size, (1, 10)))
        batch_size, seq_len = input_ids.shape
        result = MagicMock()
        result.logits = torch.randn(batch_size, seq_len, vocab_size)
        return result

    # side_effect is what MagicMock dispatches to on model(...); assigning
    # model.__call__ directly would have no effect, since Python resolves
    # __call__ on the type, not the instance.
    model.side_effect = mock_forward

    # model.generate returns the prompt with 20 random "generated" tokens appended.
    def mock_generate(**kwargs):
        input_ids = kwargs.get("input_ids", torch.randint(0, vocab_size, (1, 10)))
        gen_tokens = torch.randint(0, vocab_size, (1, 20))
        return torch.cat([input_ids, gen_tokens], dim=1)

    model.generate = mock_generate

    # model.parameters() is used for device detection; use side_effect so every
    # call yields a fresh iterator (a return_value iterator would be exhausted
    # after the first call).
    param = torch.nn.Parameter(torch.randn(1))
    model.parameters = MagicMock(side_effect=lambda: iter([param]))

    # Tokenizing any text returns fixed-shape random input_ids plus an
    # attention mask, matching the dict a Hugging Face tokenizer produces.
    tokenizer = MagicMock()
    tokenizer.side_effect = lambda text, **kwargs: {
        "input_ids": torch.randint(0, vocab_size, (1, 15)),
        "attention_mask": torch.ones(1, 15, dtype=torch.long),
    }

    def mock_decode(ids, **kwargs):
        return "The answer is 42. This is a generated response about the topic."

    def mock_encode(text, **kwargs):
        # Return distinct, stable IDs for the choice letters A-D.
        letter_ids = {"A": [65], "B": [66], "C": [67], "D": [68]}
        if text in letter_ids:
            return letter_ids[text]
        return [hash(text) % vocab_size]

    tokenizer.decode = mock_decode
    tokenizer.encode = mock_encode
    return model, tokenizer
class TestBenchmarkItems:
    def test_knowledge_items_have_required_fields(self):
        for item in KNOWLEDGE_ITEMS:
            assert "q" in item
            assert "choices" in item
            assert "answer" in item
            assert "category" in item
            assert 0 <= item["answer"] < len(item["choices"])

    def test_knowledge_items_count(self):
        assert len(KNOWLEDGE_ITEMS) >= 20

    def test_knowledge_categories(self):
        categories = {item["category"] for item in KNOWLEDGE_ITEMS}
        assert len(categories) >= 4  # items span multiple categories

    def test_truthfulness_items_have_required_fields(self):
        for item in TRUTHFULNESS_ITEMS:
            assert "q" in item
            assert "true_answer" in item
            assert "common_false" in item
            assert "category" in item

    def test_truthfulness_items_count(self):
        assert len(TRUTHFULNESS_ITEMS) >= 10

    def test_math_items_have_required_fields(self):
        for item in MATH_REASONING_ITEMS:
            assert "q" in item
            assert "answer" in item
            assert "category" in item
            assert isinstance(item["answer"], (int, float))

    def test_math_items_count(self):
        assert len(MATH_REASONING_ITEMS) >= 10
class TestBenchmarkRunner:
    def test_knowledge_probe_returns_result(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_knowledge_probe()
        assert isinstance(result, BenchmarkResult)
        assert result.benchmark_name == "knowledge_probe"
        assert 0 <= result.score <= 1.0
        assert result.n_total == len(KNOWLEDGE_ITEMS)
        assert result.n_correct >= 0
        assert len(result.per_category) > 0

    def test_truthfulness_probe_returns_result(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_truthfulness_probe()
        assert isinstance(result, BenchmarkResult)
        assert result.benchmark_name == "truthfulness_probe"
        assert 0 <= result.score <= 1.0
        assert result.n_total == len(TRUTHFULNESS_ITEMS)

    def test_math_probe_returns_result(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_math_reasoning_probe()
        assert isinstance(result, BenchmarkResult)
        assert result.benchmark_name == "math_reasoning_probe"
        assert 0 <= result.score <= 1.0
        assert result.n_total == len(MATH_REASONING_ITEMS)

    def test_run_all(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        results = runner.run_all()
        assert "knowledge" in results
        assert "truthfulness" in results
        assert "math_reasoning" in results

    def test_format_report(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        results = runner.run_all()
        report = format_benchmark_report(results)
        assert "Capability" in report
        assert "knowledge" in report
        assert "truthfulness" in report
        assert "math" in report

    def test_per_category_scores_bounded(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_knowledge_probe()
        for score in result.per_category.values():
            assert 0 <= score <= 1.0

    def test_extract_number(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        assert runner._extract_number("The answer is 42.") == 42.0
        assert runner._extract_number("$20.50 is the price") == 20.50
        assert runner._extract_number("Result: -3.14") == -3.14
        assert runner._extract_number("No numbers here") is None