""" Tests for Meta-Harness core components. Run with: python tests/test_core.py """ import json import os import sys import tempfile import shutil # Add parent to path sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from meta_harness.filesystem import HarnessFilesystem, HarnessRecord from meta_harness.evaluator import OnlineEvaluator, QuickValidator, HarnessLoader from meta_harness.proposer import validate_harness_code, extract_code_from_response from meta_harness.baselines import BASELINE_HARNESSES def test_filesystem(): """Test HarnessFilesystem operations.""" print("Testing Filesystem...") workspace = tempfile.mkdtemp() try: fs = HarnessFilesystem(workspace) # Add a harness record1 = HarnessRecord( harness_id="test_001", iteration=0, source_code="class Harness: pass", scores={"accuracy": 0.5, "context_tokens": 100}, per_example_scores=[{"correct": True}], traces=[{"step": 0, "action": "predict"}], ) fs.add_harness(record1) # Add another (better accuracy, more tokens) record2 = HarnessRecord( harness_id="test_002", iteration=1, source_code="class Harness: pass # v2", scores={"accuracy": 0.8, "context_tokens": 500}, per_example_scores=[], traces=[], parent_id="test_001", ) fs.add_harness(record2) # Add another (lower accuracy, fewer tokens) record3 = HarnessRecord( harness_id="test_003", iteration=1, source_code="class Harness: pass # v3", scores={"accuracy": 0.4, "context_tokens": 50}, per_example_scores=[], traces=[], ) fs.add_harness(record3) # Test listing assert len(fs.list_harnesses()) == 3 # Test Pareto frontier frontier = fs.get_pareto_frontier() frontier_ids = {r.harness_id for r in frontier} # test_002 (high acc) and test_003 (low ctx) should be on frontier # test_001 is dominated by test_002 (lower acc AND higher ctx) assert "test_002" in frontier_ids assert "test_003" in frontier_ids # Test summary summary = fs.get_summary_text() assert "test_002" in summary assert "Total harnesses evaluated: 3" in summary # Test CLI pareto_output = fs.get_cli_output("pareto") assert "test_002" in pareto_output top_output = fs.get_cli_output("top 2") assert "test_002" in top_output scores_output = fs.get_cli_output("scores") assert "test_001" in scores_output # Verify files on disk assert os.path.exists(os.path.join(workspace, "harness_test_001", "harness.py")) assert os.path.exists(os.path.join(workspace, "harness_test_001", "scores.json")) assert os.path.exists(os.path.join(workspace, "index.json")) print(" Filesystem: OK ✓") finally: shutil.rmtree(workspace) def test_harness_loader(): """Test loading harnesses from code.""" print("Testing HarnessLoader...") labels = ["cat", "dog", "bird"] mock_fn = lambda prompt: "cat" # Test with a simple valid harness code = ''' class Harness: def __init__(self, model_fn, labels): self.model_fn = model_fn self.labels = labels self.memory = [] def update(self, x, y): self.memory.append((x, y)) def predict(self, x): return self.model_fn(f"Classify: {x}") def reset(self): self.memory = [] ''' harness = HarnessLoader.load_from_code(code, mock_fn, labels, "test") assert hasattr(harness, "predict") assert hasattr(harness, "update") harness.update("furry animal", "cat") result = harness.predict("small furry thing") assert result == "cat" print(" HarnessLoader: OK ✓") def test_evaluator(): """Test the online evaluator.""" print("Testing OnlineEvaluator...") labels = ["cat", "dog", "bird"] # Model that always returns "cat" model_fn = lambda prompt: "cat" evaluator = OnlineEvaluator(model_fn, labels, verbose=False) code = ''' class Harness: def __init__(self, model_fn, labels): self.model_fn = model_fn self.labels = labels self.memory = [] def update(self, x, y): self.memory.append((x, y)) def predict(self, x): return self.model_fn(f"Classify: {x}") def reset(self): self.memory = [] ''' train_data = [("meow", "cat"), ("bark", "dog"), ("tweet", "bird")] test_data = [("purr", "cat"), ("woof", "dog"), ("chirp", "bird")] results = evaluator.evaluate(code, train_data, test_data, "test_eval") assert "scores" in results assert "per_example_scores" in results assert "traces" in results # Model always says "cat", so 1/3 correct assert results["scores"]["accuracy"] == 1 / 3 print(" OnlineEvaluator: OK ✓") def test_quick_validator(): """Test the quick validation.""" print("Testing QuickValidator...") labels = ["cat", "dog"] model_fn = lambda prompt: "cat" # Valid harness valid_code = ''' class Harness: def __init__(self, model_fn, labels): self.model_fn = model_fn self.labels = labels def update(self, x, y): pass def predict(self, x): return self.labels[0] def reset(self): pass ''' is_valid, msg = QuickValidator.validate(valid_code, model_fn, labels) assert is_valid, f"Should be valid: {msg}" # Invalid harness (missing predict) invalid_code = ''' class Harness: def __init__(self, model_fn, labels): pass def update(self, x, y): pass def reset(self): pass ''' # This will fail because predict is missing -> the loader will succeed # but calling predict will fail is_valid, msg = QuickValidator.validate(invalid_code, model_fn, labels) assert not is_valid print(" QuickValidator: OK ✓") def test_code_validation(): """Test code validation and extraction.""" print("Testing code validation...") # Valid code code = ''' class Harness: def __init__(self, model_fn, labels): pass def update(self, x, y): pass def predict(self, x): return "label" def reset(self): pass ''' is_valid, msg = validate_harness_code(code) assert is_valid, msg # Missing class is_valid, msg = validate_harness_code("def foo(): pass") assert not is_valid # Syntax error is_valid, msg = validate_harness_code("class Harness:\ndef __init__ def update def predict def reset") assert not is_valid print(" Code validation: OK ✓") def test_code_extraction(): """Test extracting code from LLM responses.""" print("Testing code extraction...") response = ''' Here is my improved harness: ```python class Harness: def __init__(self, model_fn, labels): self.model_fn = model_fn def update(self, x, y): pass def predict(self, x): return self.model_fn(x) def reset(self): pass ``` This harness improves by doing X. ''' code = extract_code_from_response(response) assert code is not None assert "class Harness" in code assert "def predict" in code print(" Code extraction: OK ✓") def test_baselines(): """Test that all baseline harnesses are valid and executable.""" print("Testing baseline harnesses...") labels = ["disease_a", "disease_b", "disease_c"] model_fn = lambda prompt: "disease_a" for name, code in BASELINE_HARNESSES.items(): # Validate code is_valid, msg = validate_harness_code(code) assert is_valid, f"Baseline {name} invalid: {msg}" # Quick validation is_valid, msg = QuickValidator.validate(code, model_fn, labels) assert is_valid, f"Baseline {name} failed quick validation: {msg}" # Full evaluation evaluator = OnlineEvaluator(model_fn, labels, verbose=False) train_data = [("symptoms a", "disease_a"), ("symptoms b", "disease_b")] test_data = [("symptoms c", "disease_a"), ("symptoms d", "disease_b")] results = evaluator.evaluate(code, train_data, test_data, f"test_{name}") assert "scores" in results assert results["scores"]["accuracy"] >= 0 print(f" {name}: OK ✓ (accuracy={results['scores']['accuracy']:.2f})") print(" All baselines: OK ✓") def test_pareto_dominance(): """Test Pareto frontier computation.""" print("Testing Pareto dominance...") workspace = tempfile.mkdtemp() try: fs = HarnessFilesystem(workspace) # Create harnesses with known dominance relationships configs = [ ("h1", {"accuracy": 0.5, "context_tokens": 100}), # Dominated by h2 ("h2", {"accuracy": 0.8, "context_tokens": 50}), # Pareto-optimal ("h3", {"accuracy": 0.9, "context_tokens": 200}), # Pareto-optimal ("h4", {"accuracy": 0.3, "context_tokens": 300}), # Dominated by h1 ("h5", {"accuracy": 0.85, "context_tokens": 50}), # Dominates h2 ] for hid, scores in configs: record = HarnessRecord( harness_id=hid, iteration=0, source_code="class Harness: pass", scores=scores, per_example_scores=[], traces=[], ) fs.add_harness(record) frontier = fs.get_pareto_frontier() frontier_ids = {r.harness_id for r in frontier} # h5 dominates h2 (higher acc, same ctx) # h3 is Pareto-optimal (highest acc) # h5 is Pareto-optimal (best acc/ctx tradeoff) assert "h5" in frontier_ids assert "h3" in frontier_ids assert "h2" not in frontier_ids # dominated by h5 assert "h1" not in frontier_ids # dominated by h5 assert "h4" not in frontier_ids # dominated by everything print(" Pareto dominance: OK ✓") finally: shutil.rmtree(workspace) if __name__ == "__main__": print("=" * 50) print("Meta-Harness Core Tests") print("=" * 50) test_filesystem() test_harness_loader() test_evaluator() test_quick_validator() test_code_validation() test_code_extraction() test_baselines() test_pareto_dominance() print("\n" + "=" * 50) print("ALL TESTS PASSED ✓") print("=" * 50)