# meta-harness/tests/test_core.py
# dkhanal's picture
# Add unit tests
# c78456c verified
"""
Tests for Meta-Harness core components.
Run with: python tests/test_core.py
"""
import json
import os
import sys
import tempfile
import shutil
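# Add the parent of this file's directory to sys.path so that the
# `meta_harness` package can be imported when this file is run as a script.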
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from meta_harness.filesystem import HarnessFilesystem, HarnessRecord
from meta_harness.evaluator import OnlineEvaluator, QuickValidator, HarnessLoader
from meta_harness.proposer import validate_harness_code, extract_code_from_response
from meta_harness.baselines import BASELINE_HARNESSES
def test_filesystem():
    """Test HarnessFilesystem operations.

    Stores three harness records in a temporary workspace and checks
    listing, the Pareto frontier, the summary text, the CLI-style queries
    and the files written to disk. The workspace is removed when the test
    finishes, whether it passes or fails.
    """
    print("Testing Filesystem...")
    # The context manager deletes the workspace even if an assertion fails.
    with tempfile.TemporaryDirectory() as workspace:
        fs = HarnessFilesystem(workspace)

        # Add a harness
        record1 = HarnessRecord(
            harness_id="test_001",
            iteration=0,
            source_code="class Harness: pass",
            scores={"accuracy": 0.5, "context_tokens": 100},
            per_example_scores=[{"correct": True}],
            traces=[{"step": 0, "action": "predict"}],
        )
        fs.add_harness(record1)

        # Add another (better accuracy, more tokens)
        record2 = HarnessRecord(
            harness_id="test_002",
            iteration=1,
            source_code="class Harness: pass # v2",
            scores={"accuracy": 0.8, "context_tokens": 500},
            per_example_scores=[],
            traces=[],
            parent_id="test_001",
        )
        fs.add_harness(record2)

        # Add another (lower accuracy, fewer tokens)
        record3 = HarnessRecord(
            harness_id="test_003",
            iteration=1,
            source_code="class Harness: pass # v3",
            scores={"accuracy": 0.4, "context_tokens": 50},
            per_example_scores=[],
            traces=[],
        )
        fs.add_harness(record3)

        # Test listing
        assert len(fs.list_harnesses()) == 3

        # Test Pareto frontier
        frontier = fs.get_pareto_frontier()
        frontier_ids = {r.harness_id for r in frontier}
        # test_002 (highest accuracy) and test_003 (fewest context tokens)
        # should be on the frontier.
        # NOTE: test_001 is NOT dominated by test_002: it has lower accuracy
        # but also uses fewer context tokens (100 vs 500). Its frontier
        # membership is therefore intentionally not asserted here.
        assert "test_002" in frontier_ids
        assert "test_003" in frontier_ids

        # Test summary
        summary = fs.get_summary_text()
        assert "test_002" in summary
        assert "Total harnesses evaluated: 3" in summary

        # Test CLI
        pareto_output = fs.get_cli_output("pareto")
        assert "test_002" in pareto_output
        top_output = fs.get_cli_output("top 2")
        assert "test_002" in top_output
        scores_output = fs.get_cli_output("scores")
        assert "test_001" in scores_output

        # Verify files on disk
        assert os.path.exists(os.path.join(workspace, "harness_test_001", "harness.py"))
        assert os.path.exists(os.path.join(workspace, "harness_test_001", "scores.json"))
        assert os.path.exists(os.path.join(workspace, "index.json"))

        print(" Filesystem: OK ✓")
def test_harness_loader():
    """Test loading harnesses from code.

    Loads a minimal harness from source text via
    ``HarnessLoader.load_from_code`` and checks that the resulting object
    exposes ``predict``/``update`` and that ``predict`` forwards to the
    supplied model function.
    """
    print("Testing HarnessLoader...")
    labels = ["cat", "dog", "bird"]
    # Stub model: ignores the prompt and always answers "cat".
    mock_fn = lambda prompt: "cat"

    # Test with a simple valid harness. The embedded source starts at
    # column 0 so it is valid Python on its own.
    code = '''
class Harness:
    def __init__(self, model_fn, labels):
        self.model_fn = model_fn
        self.labels = labels
        self.memory = []
    def update(self, x, y):
        self.memory.append((x, y))
    def predict(self, x):
        return self.model_fn(f"Classify: {x}")
    def reset(self):
        self.memory = []
'''
    harness = HarnessLoader.load_from_code(code, mock_fn, labels, "test")
    assert hasattr(harness, "predict"), "loaded harness lacks predict()"
    assert hasattr(harness, "update"), "loaded harness lacks update()"

    harness.update("furry animal", "cat")
    result = harness.predict("small furry thing")
    assert result == "cat", f"unexpected prediction: {result!r}"
    print(" HarnessLoader: OK ✓")
def test_evaluator():
    """Test the online evaluator.

    Evaluates a pass-through harness against a stub model that always
    answers "cat" and checks the shape of the result dictionary and the
    reported accuracy (one of three test examples is labelled "cat").
    """
    print("Testing OnlineEvaluator...")
    labels = ["cat", "dog", "bird"]
    # Model that always returns "cat"
    model_fn = lambda prompt: "cat"
    evaluator = OnlineEvaluator(model_fn, labels, verbose=False)

    # The embedded source starts at column 0 so it is valid Python on its own.
    code = '''
class Harness:
    def __init__(self, model_fn, labels):
        self.model_fn = model_fn
        self.labels = labels
        self.memory = []
    def update(self, x, y):
        self.memory.append((x, y))
    def predict(self, x):
        return self.model_fn(f"Classify: {x}")
    def reset(self):
        self.memory = []
'''
    train_data = [("meow", "cat"), ("bark", "dog"), ("tweet", "bird")]
    test_data = [("purr", "cat"), ("woof", "dog"), ("chirp", "bird")]
    results = evaluator.evaluate(code, train_data, test_data, "test_eval")

    assert "scores" in results
    assert "per_example_scores" in results
    assert "traces" in results

    # Model always says "cat", so 1/3 correct.
    # Compare with a tolerance: exact float equality would break if the
    # evaluator computes the ratio in a slightly different way.
    accuracy = results["scores"]["accuracy"]
    assert abs(accuracy - 1 / 3) < 1e-9, f"expected accuracy 1/3, got {accuracy!r}"
    print(" OnlineEvaluator: OK ✓")
def test_quick_validator():
    """Test the quick validation.

    ``QuickValidator.validate`` must accept a harness that defines
    ``__init__``, ``update``, ``predict`` and ``reset``, and must reject
    one that has no ``predict`` method.
    """
    print("Testing QuickValidator...")
    labels = ["cat", "dog"]
    # Stub model: ignores the prompt and always answers "cat".
    model_fn = lambda prompt: "cat"

    # Valid harness
    valid_code = '''
class Harness:
    def __init__(self, model_fn, labels):
        self.model_fn = model_fn
        self.labels = labels
    def update(self, x, y):
        pass
    def predict(self, x):
        return self.labels[0]
    def reset(self):
        pass
'''
    is_valid, msg = QuickValidator.validate(valid_code, model_fn, labels)
    assert is_valid, f"Should be valid: {msg}"

    # Invalid harness (missing predict)
    invalid_code = '''
class Harness:
    def __init__(self, model_fn, labels):
        pass
    def update(self, x, y):
        pass
    def reset(self):
        pass
'''
    # Expected to be rejected because predict is missing. Per the original
    # author's note, the loader is presumed to succeed and the failure to
    # surface only when predict is called -- TODO confirm against
    # QuickValidator's implementation.
    is_valid, msg = QuickValidator.validate(invalid_code, model_fn, labels)
    assert not is_valid, "Harness without predict() should be rejected"
    print(" QuickValidator: OK ✓")
def test_code_validation():
    """Test static validation of harness source code.

    ``validate_harness_code`` must accept a complete ``Harness`` class and
    reject both source without that class and source that does not parse.
    """
    print("Testing code validation...")

    # Valid code (starts at column 0 so it is valid Python on its own)
    code = '''
class Harness:
    def __init__(self, model_fn, labels):
        pass
    def update(self, x, y):
        pass
    def predict(self, x):
        return "label"
    def reset(self):
        pass
'''
    is_valid, msg = validate_harness_code(code)
    assert is_valid, msg

    # Missing class
    is_valid, msg = validate_harness_code("def foo(): pass")
    assert not is_valid, "Source without a Harness class should be rejected"

    # Syntax error: the required names appear in the text, but the source
    # is not parseable Python.
    is_valid, msg = validate_harness_code("class Harness:\ndef __init__ def update def predict def reset")
    assert not is_valid, "Source with a syntax error should be rejected"
    print(" Code validation: OK ✓")
def test_code_extraction():
    """Test extracting code from LLM responses.

    Feeds ``extract_code_from_response`` a mock reply containing prose
    around a fenced ``python`` block and checks that the harness class and
    its ``predict`` method are present in the extracted code.
    """
    print("Testing code extraction...")
    # Mock LLM reply: explanatory text before and after a fenced code block.
    response = '''
Here is my improved harness:
```python
class Harness:
    def __init__(self, model_fn, labels):
        self.model_fn = model_fn
    def update(self, x, y):
        pass
    def predict(self, x):
        return self.model_fn(x)
    def reset(self):
        pass
```
This harness improves by doing X.
'''
    code = extract_code_from_response(response)
    assert code is not None, "no code was extracted from the response"
    assert "class Harness" in code
    assert "def predict" in code
    print(" Code extraction: OK ✓")
def test_baselines():
    """Test that all baseline harnesses are valid and executable.

    Every entry of ``BASELINE_HARNESSES`` must pass static validation and
    quick validation, and a full evaluation on a tiny dataset must report
    an accuracy within [0, 1].
    """
    print("Testing baseline harnesses...")
    labels = ["disease_a", "disease_b", "disease_c"]
    # Stub model: ignores the prompt and always answers "disease_a".
    model_fn = lambda prompt: "disease_a"

    # The datasets are identical for every baseline, so build them once.
    train_data = [("symptoms a", "disease_a"), ("symptoms b", "disease_b")]
    test_data = [("symptoms c", "disease_a"), ("symptoms d", "disease_b")]

    for name, code in BASELINE_HARNESSES.items():
        # Validate code
        is_valid, msg = validate_harness_code(code)
        assert is_valid, f"Baseline {name} invalid: {msg}"

        # Quick validation
        is_valid, msg = QuickValidator.validate(code, model_fn, labels)
        assert is_valid, f"Baseline {name} failed quick validation: {msg}"

        # Full evaluation. A fresh evaluator is created per baseline, as in
        # the original test, so no evaluator object is shared across runs.
        evaluator = OnlineEvaluator(model_fn, labels, verbose=False)
        results = evaluator.evaluate(code, train_data, test_data, f"test_{name}")
        assert "scores" in results
        accuracy = results["scores"]["accuracy"]
        # Accuracy is a fraction (test_evaluator expects 1/3), so it must
        # lie in [0, 1].
        assert 0 <= accuracy <= 1, f"Baseline {name} accuracy out of range: {accuracy!r}"
        print(f" {name}: OK ✓ (accuracy={accuracy:.2f})")

    print(" All baselines: OK ✓")
def test_pareto_dominance():
    """Test Pareto frontier computation.

    Stores five harnesses with known dominance relationships (higher
    accuracy and fewer context tokens are treated as better) and checks
    that exactly the non-dominated ones, h3 and h5, are returned by
    ``get_pareto_frontier``.
    """
    print("Testing Pareto dominance...")
    # The context manager deletes the workspace even if an assertion fails.
    with tempfile.TemporaryDirectory() as workspace:
        fs = HarnessFilesystem(workspace)

        # Create harnesses with known dominance relationships
        configs = [
            ("h1", {"accuracy": 0.5, "context_tokens": 100}),  # Dominated by h2
            ("h2", {"accuracy": 0.8, "context_tokens": 50}),   # Dominated by h5
            ("h3", {"accuracy": 0.9, "context_tokens": 200}),  # Pareto-optimal
            ("h4", {"accuracy": 0.3, "context_tokens": 300}),  # Dominated by h1
            ("h5", {"accuracy": 0.85, "context_tokens": 50}),  # Pareto-optimal; dominates h2
        ]
        for hid, scores in configs:
            record = HarnessRecord(
                harness_id=hid, iteration=0,
                source_code="class Harness: pass",
                scores=scores, per_example_scores=[], traces=[],
            )
            fs.add_harness(record)

        frontier = fs.get_pareto_frontier()
        frontier_ids = {r.harness_id for r in frontier}

        # h5 dominates h2 (higher acc, same ctx)
        # h3 is Pareto-optimal (highest acc)
        # h5 is Pareto-optimal (best acc/ctx tradeoff)
        assert "h5" in frontier_ids
        assert "h3" in frontier_ids
        assert "h2" not in frontier_ids  # dominated by h5
        assert "h1" not in frontier_ids  # dominated by h5
        assert "h4" not in frontier_ids  # dominated by everything
        print(" Pareto dominance: OK ✓")
if __name__ == "__main__":
    # Script entry point: run every test in order. A failing assertion
    # raises and aborts the run, so the final banner is printed only when
    # all tests have passed.
    print("=" * 50)
    print("Meta-Harness Core Tests")
    print("=" * 50)

    test_filesystem()
    test_harness_loader()
    test_evaluator()
    test_quick_validator()
    test_code_validation()
    test_code_extraction()
    test_baselines()
    test_pareto_dominance()

    print("\n" + "=" * 50)
    print("ALL TESTS PASSED ✓")
    print("=" * 50)