| """Tests for lightweight benchmark harnesses.""" | |
| from __future__ import annotations | |
| from unittest.mock import MagicMock | |
| import torch | |
| from obliteratus.evaluation.benchmarks import ( | |
| KNOWLEDGE_ITEMS, | |
| TRUTHFULNESS_ITEMS, | |
| MATH_REASONING_ITEMS, | |
| BenchmarkRunner, | |
| BenchmarkResult, | |
| format_benchmark_report, | |
| ) | |
def _make_mock_model_and_tokenizer(vocab_size=1000):
    """Create a mock model and tokenizer for benchmark testing."""
    model = MagicMock()

    # Calling the model returns an object with random logits of shape
    # (batch, seq_len, vocab_size), mimicking a causal LM forward pass.
    def mock_forward(**kwargs):
        input_ids = kwargs.get("input_ids", torch.randint(0, vocab_size, (1, 10)))
        batch_size, seq_len = input_ids.shape
        result = MagicMock()
        result.logits = torch.randn(batch_size, seq_len, vocab_size)
        return result

    # side_effect is what MagicMock invokes when the mock is called;
    # assigning a function to model.__call__ would have no effect, since
    # special methods are looked up on the type, so side_effect alone suffices.
    model.side_effect = mock_forward

    # model.generate returns the prompt IDs with 20 "generated" tokens appended.
    def mock_generate(**kwargs):
        input_ids = kwargs.get("input_ids", torch.randint(0, vocab_size, (1, 10)))
        gen_tokens = torch.randint(0, vocab_size, (1, 20))
        return torch.cat([input_ids, gen_tokens], dim=1)

    model.generate = mock_generate

    # model.parameters is used for device detection; build a fresh iterator
    # on every call so a second call does not see an exhausted iterator.
    param = torch.nn.Parameter(torch.randn(1))
    model.parameters = MagicMock(side_effect=lambda: iter([param]))

    tokenizer = MagicMock()
    # side_effect takes precedence over return_value, so configuring only
    # side_effect is enough for tokenizer(...) calls.
    tokenizer.side_effect = lambda text, **kwargs: {
        "input_ids": torch.randint(0, vocab_size, (1, 15)),
        "attention_mask": torch.ones(1, 15, dtype=torch.long),
    }

    def mock_decode(ids, **kwargs):
        # Fixed response regardless of input IDs; it contains a number so the
        # math-reasoning extraction path has something to find.
        return "The answer is 42. This is a generated response about the topic."

    def mock_encode(text, **kwargs):
        # Distinct single-token IDs for the choice letters A-D.
        letter_ids = {"A": [65], "B": [66], "C": [67], "D": [68]}
        return letter_ids.get(text, [hash(text) % vocab_size])

    tokenizer.decode = mock_decode
    tokenizer.encode = mock_encode

    return model, tokenizer
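

# A minimal sanity check for the mock factory above (a sketch added here;
# the class name TestMockHarness is illustrative). It asserts only behavior
# that _make_mock_model_and_tokenizer itself defines -- logits shape,
# generate() appending 20 tokens, and distinct encodings for the choice
# letters -- so it does not depend on BenchmarkRunner internals.
class TestMockHarness:
    def test_mock_model_shapes_and_encodings(self):
        model, tokenizer = _make_mock_model_and_tokenizer(vocab_size=1000)
        input_ids = torch.randint(0, 1000, (2, 7))
        out = model(input_ids=input_ids)
        assert out.logits.shape == (2, 7, 1000)
        generated = model.generate(input_ids=input_ids[:1])
        assert generated.shape == (1, 7 + 20)
        letter_ids = [tokenizer.encode(c)[0] for c in "ABCD"]
        assert len(set(letter_ids)) == 4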
class TestBenchmarkItems:
    def test_knowledge_items_have_required_fields(self):
        for item in KNOWLEDGE_ITEMS:
            assert "q" in item
            assert "choices" in item
            assert "answer" in item
            assert "category" in item
            assert 0 <= item["answer"] < len(item["choices"])

    def test_knowledge_items_count(self):
        assert len(KNOWLEDGE_ITEMS) >= 20

    def test_knowledge_categories(self):
        categories = {item["category"] for item in KNOWLEDGE_ITEMS}
        assert len(categories) >= 4  # items span multiple categories

    def test_truthfulness_items_have_required_fields(self):
        for item in TRUTHFULNESS_ITEMS:
            assert "q" in item
            assert "true_answer" in item
            assert "common_false" in item
            assert "category" in item

    def test_truthfulness_items_count(self):
        assert len(TRUTHFULNESS_ITEMS) >= 10

    def test_math_items_have_required_fields(self):
        for item in MATH_REASONING_ITEMS:
            assert "q" in item
            assert "answer" in item
            assert "category" in item
            assert isinstance(item["answer"], (int, float))

    def test_math_items_count(self):
        assert len(MATH_REASONING_ITEMS) >= 10
class TestBenchmarkRunner:
    def test_knowledge_probe_returns_result(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_knowledge_probe()
        assert isinstance(result, BenchmarkResult)
        assert result.benchmark_name == "knowledge_probe"
        assert 0 <= result.score <= 1.0
        assert result.n_total == len(KNOWLEDGE_ITEMS)
        assert result.n_correct >= 0
        assert len(result.per_category) > 0

    def test_truthfulness_probe_returns_result(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_truthfulness_probe()
        assert isinstance(result, BenchmarkResult)
        assert result.benchmark_name == "truthfulness_probe"
        assert 0 <= result.score <= 1.0
        assert result.n_total == len(TRUTHFULNESS_ITEMS)

    def test_math_probe_returns_result(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_math_reasoning_probe()
        assert isinstance(result, BenchmarkResult)
        assert result.benchmark_name == "math_reasoning_probe"
        assert 0 <= result.score <= 1.0
        assert result.n_total == len(MATH_REASONING_ITEMS)

    def test_run_all(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        results = runner.run_all()
        assert "knowledge" in results
        assert "truthfulness" in results
        assert "math_reasoning" in results

    def test_format_report(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        results = runner.run_all()
        report = format_benchmark_report(results)
        assert "Capability" in report
        assert "knowledge" in report
        assert "truthfulness" in report
        assert "math" in report

    def test_per_category_scores_bounded(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        result = runner.run_knowledge_probe()
        for score in result.per_category.values():
            assert 0 <= score <= 1.0

    def test_extract_number(self):
        model, tokenizer = _make_mock_model_and_tokenizer()
        runner = BenchmarkRunner(model, tokenizer, device="cpu")
        assert runner._extract_number("The answer is 42.") == 42.0
        assert runner._extract_number("$20.50 is the price") == 20.50
        assert runner._extract_number("Result: -3.14") == -3.14
        assert runner._extract_number("No numbers here") is None