| """ |
| Tests for Meta-Harness core components. |
| Run with: python tests/test_core.py |
| """ |
|
|
| import json |
| import os |
| import sys |
| import tempfile |
| import shutil |
|
|
| |
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
|
| from meta_harness.filesystem import HarnessFilesystem, HarnessRecord |
| from meta_harness.evaluator import OnlineEvaluator, QuickValidator, HarnessLoader |
| from meta_harness.proposer import validate_harness_code, extract_code_from_response |
| from meta_harness.baselines import BASELINE_HARNESSES |
|
|
|
|
def test_filesystem():
    """Exercise HarnessFilesystem end to end inside a throwaway workspace.

    Adds three HarnessRecord entries, then checks the listing count, the
    Pareto frontier membership, the summary text, three CLI queries
    ("pareto", "top 2", "scores") and the files written to disk.

    Raises:
        AssertionError: if any of the checks fails.
    """
    print("Testing Filesystem...")
    # The context manager removes the workspace on exit, including when an
    # assertion fails part-way through.
    with tempfile.TemporaryDirectory() as workspace:
        fs = HarnessFilesystem(workspace)

        # First record: accuracy 0.5 at 100 context tokens.
        record1 = HarnessRecord(
            harness_id="test_001",
            iteration=0,
            source_code="class Harness: pass",
            scores={"accuracy": 0.5, "context_tokens": 100},
            per_example_scores=[{"correct": True}],
            traces=[{"step": 0, "action": "predict"}],
        )
        fs.add_harness(record1)

        # Second record: highest accuracy (0.8) and highest token count (500);
        # it is the only one that names a parent.
        record2 = HarnessRecord(
            harness_id="test_002",
            iteration=1,
            source_code="class Harness: pass # v2",
            scores={"accuracy": 0.8, "context_tokens": 500},
            per_example_scores=[],
            traces=[],
            parent_id="test_001",
        )
        fs.add_harness(record2)

        # Third record: lowest accuracy (0.4) and lowest token count (50).
        record3 = HarnessRecord(
            harness_id="test_003",
            iteration=1,
            source_code="class Harness: pass # v3",
            scores={"accuracy": 0.4, "context_tokens": 50},
            per_example_scores=[],
            traces=[],
        )
        fs.add_harness(record3)

        # Every added record must be listed.
        assert len(fs.list_harnesses()) == 3

        # NOTE(review): presumably the frontier maximises accuracy and
        # minimises context_tokens -- confirm against HarnessFilesystem.
        # Only the two extreme records are asserted here.
        frontier = fs.get_pareto_frontier()
        frontier_ids = {r.harness_id for r in frontier}
        assert "test_002" in frontier_ids
        assert "test_003" in frontier_ids

        # The summary names the most accurate harness and reports the count.
        summary = fs.get_summary_text()
        assert "test_002" in summary
        assert "Total harnesses evaluated: 3" in summary

        # CLI-style queries.
        pareto_output = fs.get_cli_output("pareto")
        assert "test_002" in pareto_output

        top_output = fs.get_cli_output("top 2")
        assert "test_002" in top_output

        scores_output = fs.get_cli_output("scores")
        assert "test_001" in scores_output

        # On-disk layout: one directory per harness plus a top-level index.
        assert os.path.exists(os.path.join(workspace, "harness_test_001", "harness.py"))
        assert os.path.exists(os.path.join(workspace, "harness_test_001", "scores.json"))
        assert os.path.exists(os.path.join(workspace, "index.json"))

        print(" Filesystem: OK ✓")
|
|
|
|
def test_harness_loader():
    """Check that HarnessLoader turns harness source text into a usable object.

    The loaded harness must expose ``predict`` and ``update``, and
    ``predict`` must return whatever the stub model function returns.
    """
    print("Testing HarnessLoader...")

    def always_cat(prompt):
        # Stub model: answers "cat" regardless of the prompt.
        return "cat"

    class_names = ["cat", "dog", "bird"]

    harness_source = '''
class Harness:
    def __init__(self, model_fn, labels):
        self.model_fn = model_fn
        self.labels = labels
        self.memory = []

    def update(self, x, y):
        self.memory.append((x, y))

    def predict(self, x):
        return self.model_fn(f"Classify: {x}")

    def reset(self):
        self.memory = []
'''
    loaded = HarnessLoader.load_from_code(harness_source, always_cat, class_names, "test")
    for method_name in ("predict", "update"):
        assert hasattr(loaded, method_name)

    # Feed one training example, then predict through the stub model.
    loaded.update("furry animal", "cat")
    prediction = loaded.predict("small furry thing")
    assert prediction == "cat"

    print(" HarnessLoader: OK ✓")
|
|
|
|
def test_evaluator():
    """Run OnlineEvaluator on a tiny dataset and check the result structure.

    The stub model always answers "cat", and exactly one of the three test
    examples is labelled "cat", so the expected accuracy is one third.

    Raises:
        AssertionError: if a result key is missing or the accuracy is off.
    """
    import math

    print("Testing OnlineEvaluator...")

    labels = ["cat", "dog", "bird"]

    def model_fn(prompt):
        # Stub model: answers "cat" regardless of the prompt.
        return "cat"

    evaluator = OnlineEvaluator(model_fn, labels, verbose=False)

    code = '''
class Harness:
    def __init__(self, model_fn, labels):
        self.model_fn = model_fn
        self.labels = labels
        self.memory = []

    def update(self, x, y):
        self.memory.append((x, y))

    def predict(self, x):
        return self.model_fn(f"Classify: {x}")

    def reset(self):
        self.memory = []
'''

    train_data = [("meow", "cat"), ("bark", "dog"), ("tweet", "bird")]
    test_data = [("purr", "cat"), ("woof", "dog"), ("chirp", "bird")]

    results = evaluator.evaluate(code, train_data, test_data, "test_eval")

    assert "scores" in results
    assert "per_example_scores" in results
    assert "traces" in results

    # Compare with a tolerance: the exact float depends on how the evaluator
    # computes the ratio (e.g. a running mean instead of correct / total), so
    # strict equality with 1 / 3 is brittle.
    accuracy = results["scores"]["accuracy"]
    assert math.isclose(accuracy, 1 / 3), f"Expected accuracy of 1/3, got {accuracy}"

    print(" OnlineEvaluator: OK ✓")
|
|
|
|
def test_quick_validator():
    """Check QuickValidator accepts a complete harness and rejects an incomplete one.

    The rejected harness is identical in shape to the accepted one except
    that it defines no ``predict`` method.
    """
    print("Testing QuickValidator...")

    def always_cat(prompt):
        # Stub model: answers "cat" regardless of the prompt.
        return "cat"

    class_names = ["cat", "dog"]

    complete_source = '''
class Harness:
    def __init__(self, model_fn, labels):
        self.model_fn = model_fn
        self.labels = labels

    def update(self, x, y):
        pass

    def predict(self, x):
        return self.labels[0]

    def reset(self):
        pass
'''
    accepted, reason = QuickValidator.validate(complete_source, always_cat, class_names)
    assert accepted, f"Should be valid: {reason}"

    # Same skeleton, but without predict().
    incomplete_source = '''
class Harness:
    def __init__(self, model_fn, labels):
        pass

    def update(self, x, y):
        pass

    def reset(self):
        pass
'''
    accepted, reason = QuickValidator.validate(incomplete_source, always_cat, class_names)
    assert not accepted

    print(" QuickValidator: OK ✓")
|
|
|
|
def test_code_validation():
    """Check validate_harness_code on one accepted and two rejected inputs.

    Accepted: a Harness class with __init__, update, predict and reset.
    Rejected: source with no Harness class, and source that mentions all the
    expected names but is not syntactically valid Python.
    """
    print("Testing code validation...")

    well_formed = '''
class Harness:
    def __init__(self, model_fn, labels):
        pass
    def update(self, x, y):
        pass
    def predict(self, x):
        return "label"
    def reset(self):
        pass
'''
    ok, reason = validate_harness_code(well_formed)
    assert ok, reason

    # A bare function: there is no Harness class at all.
    ok, reason = validate_harness_code("def foo(): pass")
    assert not ok

    # All the expected names appear, but the text does not parse.
    broken_syntax = "class Harness:\ndef __init__ def update def predict def reset"
    ok, reason = validate_harness_code(broken_syntax)
    assert not ok

    print(" Code validation: OK ✓")
|
|
|
|
def test_code_extraction():
    """Check that a fenced Python block is pulled out of a chat-style reply.

    The reply wraps the harness in a ```python fence with prose before and
    after it; the extracted code must contain the class and its predict method.
    """
    print("Testing code extraction...")

    llm_reply = '''
Here is my improved harness:

```python
class Harness:
    def __init__(self, model_fn, labels):
        self.model_fn = model_fn

    def update(self, x, y):
        pass

    def predict(self, x):
        return self.model_fn(x)

    def reset(self):
        pass
```

This harness improves by doing X.
'''
    extracted = extract_code_from_response(llm_reply)
    assert extracted is not None
    for fragment in ("class Harness", "def predict"):
        assert fragment in extracted

    print(" Code extraction: OK ✓")
|
|
|
|
def test_baselines():
    """Run every entry of BASELINE_HARNESSES through validation and evaluation.

    Each baseline must pass validate_harness_code, pass
    QuickValidator.validate, and produce a non-negative accuracy when
    evaluated on a two-example train set and a two-example test set.
    """
    print("Testing baseline harnesses...")

    def always_disease_a(prompt):
        # Stub model: answers "disease_a" regardless of the prompt.
        return "disease_a"

    class_names = ["disease_a", "disease_b", "disease_c"]

    for name, code in BASELINE_HARNESSES.items():
        # Static check of the source text.
        ok, reason = validate_harness_code(code)
        assert ok, f"Baseline {name} invalid: {reason}"

        # Quick check against the stub model.
        ok, reason = QuickValidator.validate(code, always_disease_a, class_names)
        assert ok, f"Baseline {name} failed quick validation: {reason}"

        # Full evaluation; a fresh evaluator and fresh data lists are built
        # for every baseline so nothing is shared between iterations.
        evaluator = OnlineEvaluator(always_disease_a, class_names, verbose=False)
        train_data = [("symptoms a", "disease_a"), ("symptoms b", "disease_b")]
        test_data = [("symptoms c", "disease_a"), ("symptoms d", "disease_b")]

        results = evaluator.evaluate(code, train_data, test_data, f"test_{name}")
        assert "scores" in results
        accuracy = results["scores"]["accuracy"]
        assert accuracy >= 0

        print(f" {name}: OK ✓ (accuracy={accuracy:.2f})")

    print(" All baselines: OK ✓")
|
|
|
|
def test_pareto_dominance():
    """Check which of five harnesses HarnessFilesystem puts on the Pareto frontier.

    Expected: h5 and h3 are on the frontier; h2, h1 and h4 are not.

    Raises:
        AssertionError: if the frontier membership differs from the above.
    """
    print("Testing Pareto dominance...")

    # The context manager removes the workspace on exit, including when an
    # assertion fails part-way through.
    with tempfile.TemporaryDirectory() as workspace:
        fs = HarnessFilesystem(workspace)

        # NOTE(review): the expectations below presume the frontier maximises
        # accuracy and minimises context_tokens -- confirm against
        # HarnessFilesystem.get_pareto_frontier.
        configs = [
            ("h1", {"accuracy": 0.5, "context_tokens": 100}),
            ("h2", {"accuracy": 0.8, "context_tokens": 50}),
            ("h3", {"accuracy": 0.9, "context_tokens": 200}),
            ("h4", {"accuracy": 0.3, "context_tokens": 300}),
            ("h5", {"accuracy": 0.85, "context_tokens": 50}),
        ]

        for hid, scores in configs:
            record = HarnessRecord(
                harness_id=hid,
                iteration=0,
                source_code="class Harness: pass",
                scores=scores,
                per_example_scores=[],
                traces=[],
            )
            fs.add_harness(record)

        frontier = fs.get_pareto_frontier()
        frontier_ids = {r.harness_id for r in frontier}

        # h5 has the fewest tokens (tied with h2) at higher accuracy than h2;
        # h3 has the highest accuracy of all.
        assert "h5" in frontier_ids
        assert "h3" in frontier_ids
        # h2 uses the same tokens as h5 with lower accuracy; h1 and h4 have
        # both lower accuracy and more tokens than h5.
        assert "h2" not in frontier_ids
        assert "h1" not in frontier_ids
        assert "h4" not in frontier_ids

        print(" Pareto dominance: OK ✓")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: print a banner, run every test in a fixed order,
    # then print a success footer. Nothing catches AssertionError here, so the
    # first failing test stops the run before the footer is printed.
    banner = "=" * 50
    print(banner)
    print("Meta-Harness Core Tests")
    print(banner)

    for run_test in (
        test_filesystem,
        test_harness_loader,
        test_evaluator,
        test_quick_validator,
        test_code_validation,
        test_code_extraction,
        test_baselines,
        test_pareto_dominance,
    ):
        run_test()

    print("\n" + banner)
    print("ALL TESTS PASSED ✓")
    print(banner)
|
|