"""Tests for lightweight benchmark harnesses.""" from __future__ import annotations from unittest.mock import MagicMock import torch from obliteratus.evaluation.benchmarks import ( KNOWLEDGE_ITEMS, TRUTHFULNESS_ITEMS, MATH_REASONING_ITEMS, BenchmarkRunner, BenchmarkResult, format_benchmark_report, ) def _make_mock_model_and_tokenizer(vocab_size=1000, hidden_dim=64): """Create mock model and tokenizer for benchmark testing.""" model = MagicMock() # Model returns logits when called def mock_forward(**kwargs): input_ids = kwargs.get("input_ids", torch.randint(0, vocab_size, (1, 10))) batch_size, seq_len = input_ids.shape result = MagicMock() result.logits = torch.randn(batch_size, seq_len, vocab_size) return result model.side_effect = mock_forward model.__call__ = mock_forward # Model.generate returns token IDs def mock_generate(**kwargs): input_ids = kwargs.get("input_ids", torch.randint(0, vocab_size, (1, 10))) # Append some "generated" tokens gen_tokens = torch.randint(0, vocab_size, (1, 20)) return torch.cat([input_ids, gen_tokens], dim=1) model.generate = mock_generate # Model.parameters for device detection param = torch.nn.Parameter(torch.randn(1)) model.parameters = MagicMock(return_value=iter([param])) tokenizer = MagicMock() tokenizer.return_value = { "input_ids": torch.randint(0, vocab_size, (1, 15)), "attention_mask": torch.ones(1, 15, dtype=torch.long), } tokenizer.side_effect = lambda text, **kwargs: { "input_ids": torch.randint(0, vocab_size, (1, 15)), "attention_mask": torch.ones(1, 15, dtype=torch.long), } def mock_decode(ids, **kwargs): return "The answer is 42. This is a generated response about the topic." def mock_encode(text, **kwargs): # Return different IDs for A, B, C, D if text == "A": return [65] elif text == "B": return [66] elif text == "C": return [67] elif text == "D": return [68] return [hash(text) % vocab_size] tokenizer.decode = mock_decode tokenizer.encode = mock_encode return model, tokenizer class TestBenchmarkItems: def test_knowledge_items_have_required_fields(self): for item in KNOWLEDGE_ITEMS: assert "q" in item assert "choices" in item assert "answer" in item assert "category" in item assert 0 <= item["answer"] < len(item["choices"]) def test_knowledge_items_count(self): assert len(KNOWLEDGE_ITEMS) >= 20 def test_knowledge_categories(self): categories = set(item["category"] for item in KNOWLEDGE_ITEMS) assert len(categories) >= 4 # multiple categories def test_truthfulness_items_have_required_fields(self): for item in TRUTHFULNESS_ITEMS: assert "q" in item assert "true_answer" in item assert "common_false" in item assert "category" in item def test_truthfulness_items_count(self): assert len(TRUTHFULNESS_ITEMS) >= 10 def test_math_items_have_required_fields(self): for item in MATH_REASONING_ITEMS: assert "q" in item assert "answer" in item assert "category" in item assert isinstance(item["answer"], (int, float)) def test_math_items_count(self): assert len(MATH_REASONING_ITEMS) >= 10 class TestBenchmarkRunner: def test_knowledge_probe_returns_result(self): model, tokenizer = _make_mock_model_and_tokenizer() runner = BenchmarkRunner(model, tokenizer, device="cpu") result = runner.run_knowledge_probe() assert isinstance(result, BenchmarkResult) assert result.benchmark_name == "knowledge_probe" assert 0 <= result.score <= 1.0 assert result.n_total == len(KNOWLEDGE_ITEMS) assert result.n_correct >= 0 assert len(result.per_category) > 0 def test_truthfulness_probe_returns_result(self): model, tokenizer = _make_mock_model_and_tokenizer() runner = 
class TestBenchmarkRunner:
    def test_knowledge_probe_returns_result(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_knowledge_probe()
        assert isinstance(result, BenchmarkResult)
        assert result.benchmark_name == "knowledge_probe"
        assert 0 <= result.score <= 1.0
        assert result.n_total == len(KNOWLEDGE_ITEMS)
        assert result.n_correct >= 0
        assert len(result.per_category) > 0

    def test_truthfulness_probe_returns_result(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_truthfulness_probe()
        assert isinstance(result, BenchmarkResult)
        assert result.benchmark_name == "truthfulness_probe"
        assert 0 <= result.score <= 1.0
        assert result.n_total == len(TRUTHFULNESS_ITEMS)

    def test_math_probe_returns_result(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_math_reasoning_probe()
        assert isinstance(result, BenchmarkResult)
        assert result.benchmark_name == "math_reasoning_probe"
        assert 0 <= result.score <= 1.0
        assert result.n_total == len(MATH_REASONING_ITEMS)

    def test_run_all(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        results = runner.run_all()
        assert "knowledge" in results
        assert "truthfulness" in results
        assert "math_reasoning" in results

    def test_format_report(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        results = runner.run_all()
        report = format_benchmark_report(results)
        assert "Capability" in report
        assert "knowledge" in report
        assert "truthfulness" in report
        assert "math" in report

    def test_per_category_scores_bounded(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_knowledge_probe()
        for score in result.per_category.values():
            assert 0 <= score <= 1.0

    def test_extract_number(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        assert runner._extract_number("The answer is 42.") == 42.0
        assert runner._extract_number("$20.50 is the price") == 20.50
        assert runner._extract_number("Result: -3.14") == -3.14
        assert runner._extract_number("No numbers here") is None
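
# Regression sketch for the mock factory itself: device detection may call
# model.parameters() more than once per run, so the factory must hand back a
# fresh iterator on every call (a bare iter([...]) as return_value would be
# exhausted after the first probe and break subsequent ones).
class TestMockFactory:
    def test_parameters_survives_repeated_calls(self):
        model, _ = _make_mock_model_and_tokenizer()
        first = next(model.parameters())
        second = next(model.parameters())
        assert first is second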