"""Tests for HYDRA harness components.

Covers:
- eval_agent: parse_run_log, check_secondary_alarms, should_keep
- search_strategy: diagnose, should_explore
- meta_agent: generate_directive, _strip_previous_directive, run_meta_iteration

All tests are CPU-only and create/destroy temp files as needed.

Run:
    uv run pytest tests/test_harness.py -v
"""
import os
import sys
import tempfile

import pytest

# Put the parent of this file's directory on sys.path so the in-function
# `harness.*` imports used by the tests below can be resolved.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
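# ---------------------------------------------------------------------------
# eval_agent
# ---------------------------------------------------------------------------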
|
|
class TestParseRunLog:
    """Tests for ``harness.eval_agent.parse_run_log``."""

    def _write_log(self, content: str) -> str:
        """Write ``content`` to a new temporary ``.log`` file and return its path.

        The file is created with ``delete=False``, so it outlives the handle;
        the calling test is responsible for removing it (each one does so in a
        ``finally`` clause).
        """
        # The context manager closes (and therefore flushes) the handle even
        # when the write raises, so no explicit flush()/close() is needed.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".log", delete=False
        ) as fh:
            fh.write(content)
            return fh.name

    def test_parse_valid_summary_block(self):
        """All fields are extracted correctly from a well-formed log."""
        from harness.eval_agent import parse_run_log

        log = (
            "step 00100 (50.0%) | loss: 3.123456\n"
            "---\n"
            "val_bpb: 1.234567\n"
            "training_seconds: 300.100\n"
            "total_seconds: 325.000\n"
            "peak_vram_mb: 2048.000\n"
            "mfu_percent: 12.500\n"
            "total_tokens_M: 100.000\n"
            "num_steps: 200\n"
            "num_params_M: 7.900\n"
            "n_layer: 4\n"
            "d_model: 256\n"
            "mhc_spectral_norm: 1.2300\n"
            "engram_hit_rate: 0.4500\n"
            "sr_bypass_rate: 1.0000\n"
        )
        path = self._write_log(log)
        try:
            result = parse_run_log(path)
            assert result.val_bpb == pytest.approx(1.234567)
            assert result.training_seconds == pytest.approx(300.1)
            assert result.total_seconds == pytest.approx(325.0)
            assert result.peak_vram_mb == pytest.approx(2048.0)
            assert result.mfu_percent == pytest.approx(12.5)
            assert result.total_tokens_m == pytest.approx(100.0)
            assert result.num_steps == 200
            assert result.num_params_m == pytest.approx(7.9)
            assert result.n_layer == 4
            assert result.d_model == 256
            assert result.mhc_spectral_norm == pytest.approx(1.23)
            assert result.engram_hit_rate == pytest.approx(0.45)
            assert result.sr_bypass_rate == pytest.approx(1.0)
            assert not result.crashed
            assert result.error_message == ""
        finally:
            os.unlink(path)

    def test_parse_crash_traceback(self):
        """Crashed run sets crashed=True and captures error_message."""
        from harness.eval_agent import parse_run_log

        log = (
            "Traceback (most recent call last):\n"
            "  File 'train.py', line 100, in <module>\n"
            "RuntimeError: CUDA out of memory\n"
        )
        path = self._write_log(log)
        try:
            result = parse_run_log(path)
            assert result.crashed
            assert "CUDA out of memory" in result.error_message
        finally:
            os.unlink(path)

    def test_parse_missing_file(self):
        """Non-existent log file sets crashed=True with a non-empty message."""
        from harness.eval_agent import parse_run_log

        result = parse_run_log("/nonexistent/path/run.log")
        assert result.crashed
        assert result.error_message != ""

    def test_parse_empty_file(self):
        """Empty log file leaves val_bpb and num_steps at their zero defaults."""
        from harness.eval_agent import parse_run_log

        path = self._write_log("")
        try:
            result = parse_run_log(path)
            # NOTE(review): `crashed` is deliberately not asserted here;
            # confirm the intended value for an empty log before pinning it.
            assert result.val_bpb == 0.0
            assert result.num_steps == 0
        finally:
            os.unlink(path)

    def test_parse_partial_log(self):
        """Partial log (only some fields) populates only those fields."""
        from harness.eval_agent import parse_run_log

        log = "val_bpb: 0.987654\npeak_vram_mb: 1500.0\n"
        path = self._write_log(log)
        try:
            result = parse_run_log(path)
            assert result.val_bpb == pytest.approx(0.987654)
            assert result.peak_vram_mb == pytest.approx(1500.0)
            # A field that is absent from the log keeps its default.
            assert result.num_steps == 0
        finally:
            os.unlink(path)

    def test_int_fields_parsed_as_int(self):
        """num_steps, n_layer, d_model are ints, not floats."""
        from harness.eval_agent import parse_run_log

        log = "num_steps: 500\nn_layer: 4\nd_model: 256\n"
        path = self._write_log(log)
        try:
            result = parse_run_log(path)
            assert isinstance(result.num_steps, int)
            assert isinstance(result.n_layer, int)
            assert isinstance(result.d_model, int)
        finally:
            os.unlink(path)
|
|
|
|
class TestCheckSecondaryAlarms:
    """Tests for ``harness.eval_agent.check_secondary_alarms``."""

    def test_all_clear_no_alarms(self):
        """Metrics inside every threshold produce an empty alarm list."""
        from harness.eval_agent import ExperimentResult, check_secondary_alarms

        healthy = ExperimentResult(
            mhc_spectral_norm=1.5, engram_hit_rate=0.5, mfu_percent=25.0
        )
        assert check_secondary_alarms(healthy) == []

    def test_mhc_spectral_norm_alarm(self):
        """An mhc_spectral_norm of 2.5 raises an alarm naming that metric."""
        from harness.eval_agent import ExperimentResult, check_secondary_alarms

        messages = check_secondary_alarms(ExperimentResult(mhc_spectral_norm=2.5))
        assert any("mhc_spectral_norm" in msg for msg in messages)

    def test_engram_hit_rate_alarm(self):
        """An engram_hit_rate of 0.05 raises an alarm naming that metric."""
        from harness.eval_agent import ExperimentResult, check_secondary_alarms

        messages = check_secondary_alarms(ExperimentResult(engram_hit_rate=0.05))
        assert any("engram_hit_rate" in msg for msg in messages)

    def test_engram_hit_rate_zero_no_alarm(self):
        """An engram_hit_rate of exactly zero must not raise that alarm."""
        from harness.eval_agent import ExperimentResult, check_secondary_alarms

        messages = check_secondary_alarms(ExperimentResult(engram_hit_rate=0.0))
        # Equivalent to "no message mentions the metric".
        assert all("engram_hit_rate" not in msg for msg in messages)

    def test_mfu_alarm(self):
        """An mfu_percent of 5.0 raises an alarm naming that metric."""
        from harness.eval_agent import ExperimentResult, check_secondary_alarms

        messages = check_secondary_alarms(ExperimentResult(mfu_percent=5.0))
        assert any("mfu_percent" in msg for msg in messages)

    def test_three_alarms_simultaneously(self):
        """Three out-of-range metrics yield exactly three alarms."""
        from harness.eval_agent import ExperimentResult, check_secondary_alarms

        unhealthy = ExperimentResult(
            mhc_spectral_norm=2.5, engram_hit_rate=0.05, mfu_percent=5.0
        )
        messages = check_secondary_alarms(unhealthy)
        assert len(messages) == 3
|
|
|
|
class TestShouldKeep:
    """Tests for ``harness.eval_agent.should_keep``."""

    def test_improved_bpb_keeps(self):
        """A val_bpb below best_bpb is kept with reason 'keep'."""
        from harness.eval_agent import ExperimentResult, should_keep

        keep, reason = should_keep(ExperimentResult(val_bpb=0.95), best_bpb=1.0)
        assert keep is True
        assert reason == "keep"

    def test_worse_bpb_discards(self):
        """A val_bpb above best_bpb is discarded with reason 'discard'."""
        from harness.eval_agent import ExperimentResult, should_keep

        keep, reason = should_keep(ExperimentResult(val_bpb=1.05), best_bpb=1.0)
        assert keep is False
        assert reason == "discard"

    def test_equal_bpb_discards(self):
        """A val_bpb equal to best_bpb is discarded (strict improvement required)."""
        from harness.eval_agent import ExperimentResult, should_keep

        # Only the decision matters here; the reason string is not checked.
        keep, _ = should_keep(ExperimentResult(val_bpb=1.0), best_bpb=1.0)
        assert keep is False

    def test_crashed_discards(self):
        """A crashed result is discarded with reason 'crash' despite a lower bpb."""
        from harness.eval_agent import ExperimentResult, should_keep

        crashed_run = ExperimentResult(val_bpb=0.5, crashed=True)
        keep, reason = should_keep(crashed_run, best_bpb=1.0)
        assert keep is False
        assert reason == "crash"

    def test_zero_bpb_discards(self):
        """A val_bpb of zero is discarded."""
        from harness.eval_agent import ExperimentResult, should_keep

        keep, _ = should_keep(ExperimentResult(val_bpb=0.0), best_bpb=1.0)
        assert keep is False

    def test_secondary_gate_mhc_rejects(self):
        """An mhc_spectral_norm gate rejects an otherwise improving result."""
        from harness.eval_agent import ExperimentResult, should_keep

        improving = ExperimentResult(val_bpb=0.9, mhc_spectral_norm=3.0)
        keep, reason = should_keep(
            improving,
            best_bpb=1.0,
            gates={"mhc_spectral_norm": {"max": 2.0}},
        )
        assert keep is False
        assert "mhc_spectral_norm" in reason

    def test_secondary_gate_engram_rejects(self):
        """An engram_hit_rate gate rejects an otherwise improving result."""
        from harness.eval_agent import ExperimentResult, should_keep

        improving = ExperimentResult(val_bpb=0.9, engram_hit_rate=0.01)
        keep, reason = should_keep(
            improving,
            best_bpb=1.0,
            gates={"engram_hit_rate": {"min": 0.05}},
        )
        assert keep is False
        assert "engram_hit_rate" in reason

    def test_no_gates_passed(self):
        """With gates=None an improving result is kept even at a high spectral norm."""
        from harness.eval_agent import ExperimentResult, should_keep

        improving = ExperimentResult(val_bpb=0.8, mhc_spectral_norm=5.0)
        keep, _ = should_keep(improving, best_bpb=1.0, gates=None)
        assert keep is True
|
|
|
|
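# ---------------------------------------------------------------------------
# search_strategy
# ---------------------------------------------------------------------------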
|
|
class TestDiagnose:
    """Tests for ``harness.search_strategy.diagnose``."""

    # Header line shared by every results.tsv fixture in this class.
    _HEADER = "commit\tval_bpb\tmemory_gb\tstatus\tdescription\n"

    def _write_results(self, rows=()) -> str:
        """Write a results TSV (header followed by ``rows``) and return its path.

        ``rows`` is an iterable of already newline-terminated, tab-separated
        lines. The file is created with ``delete=False``; the calling test
        must remove it.
        """
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".tsv", delete=False
        ) as fh:
            fh.write(self._HEADER)
            fh.writelines(rows)
            return fh.name

    def test_missing_file_returns_exploring(self):
        """Non-existent results.tsv returns EXPLORING state."""
        from harness.search_strategy import diagnose

        state = diagnose("/nonexistent/results.tsv")
        assert state.label == "EXPLORING"
        assert state.total_experiments == 0
        assert state.best_bpb == float("inf")

    def test_empty_file_returns_exploring(self):
        """results.tsv with only a header returns EXPLORING."""
        from harness.search_strategy import diagnose

        path = self._write_results()
        try:
            state = diagnose(path)
            assert state.label == "EXPLORING"
            assert state.total_experiments == 0
        finally:
            os.unlink(path)

    def test_improving_trend_is_exploring(self):
        """Steadily decreasing val_bpb -> EXPLORING or EXPLOITING, never STUCK/BROKEN."""
        from harness.search_strategy import diagnose

        # Twelve kept experiments, each 0.01 bpb better than the previous one.
        rows = []
        for i in range(12):
            bpb = 1.0 - i * 0.01
            rows.append(
                f"abc{i:04d}\t{bpb:.6f}\t2.0\tkeep\texperiment_{i:02d}_arch\n"
            )
        path = self._write_results(rows)
        try:
            state = diagnose(path, stuck_threshold=20)
            assert state.total_experiments == 12
            assert state.best_bpb == pytest.approx(1.0 - 11 * 0.01)
            assert state.label in ("EXPLORING", "EXPLOITING")
        finally:
            os.unlink(path)

    def test_stuck_state_after_no_improvement(self):
        """10+ experiments without improvement -> STUCK."""
        from harness.search_strategy import diagnose

        # One early best result followed by fifteen worse, identical ones.
        rows = ["best0001\t0.800000\t2.0\tkeep\texperiment 0\n"]
        rows.extend(
            f"abc{i:04d}\t1.000000\t2.0\tkeep\texperiment {i}\n"
            for i in range(1, 16)
        )
        path = self._write_results(rows)
        try:
            state = diagnose(path, stuck_threshold=10)
            assert state.label == "STUCK"
            assert state.best_bpb == pytest.approx(0.8)
        finally:
            os.unlink(path)

    def test_broken_state_high_crash_rate(self):
        """Crash rate > 0.5 -> BROKEN."""
        from harness.search_strategy import diagnose

        # Seven crashes (bpb 0.0) followed by three kept runs (bpb 1.0).
        rows = []
        for i in range(10):
            status = "crash" if i < 7 else "keep"
            bpb = "0.0" if i < 7 else "1.0"
            rows.append(f"abc{i:04d}\t{bpb}\t2.0\t{status}\texperiment {i}\n")
        path = self._write_results(rows)
        try:
            state = diagnose(path)
            assert state.label == "BROKEN"
            assert state.crash_rate > 0.5
        finally:
            os.unlink(path)

    def test_best_bpb_tracked_correctly(self):
        """best_bpb is the global minimum across all experiments."""
        from harness.search_strategy import diagnose

        bpbs = [1.0, 0.9, 0.85, 0.95, 1.1, 0.87]
        rows = [
            f"abc{i:04d}\t{bpb:.6f}\t2.0\tkeep\texperiment {i}\n"
            for i, bpb in enumerate(bpbs)
        ]
        path = self._write_results(rows)
        try:
            state = diagnose(path)
            assert state.best_bpb == pytest.approx(0.85)
        finally:
            os.unlink(path)
|
|
|
|
class TestShouldExplore:
    """Tests for ``harness.search_strategy.should_explore``."""

    def test_no_improvement_returns_true(self):
        """should_explore is True after a long run of non-improving experiments."""
        from harness.search_strategy import should_explore

        # One early best result, then twelve worse experiments in a row.
        lines = [
            "commit\tval_bpb\tmemory_gb\tstatus\tdescription\n",
            "best0001\t0.800000\t2.0\tkeep\texperiment 0\n",
        ]
        lines += [
            f"abc{i:04d}\t1.000000\t2.0\tkeep\texperiment {i}\n"
            for i in range(1, 13)
        ]
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".tsv", delete=False
        ) as tsv:
            tsv.writelines(lines)
            tsv_path = tsv.name
        try:
            assert should_explore(tsv_path, n=10) is True
        finally:
            os.unlink(tsv_path)

    def test_active_improvement_returns_false(self):
        """should_explore is False while each experiment improves on the last."""
        from harness.search_strategy import should_explore

        # Five experiments, each 0.05 bpb better than the previous one.
        lines = ["commit\tval_bpb\tmemory_gb\tstatus\tdescription\n"]
        for i in range(5):
            bpb = 1.0 - i * 0.05
            lines.append(f"abc{i:04d}\t{bpb:.6f}\t2.0\tkeep\texperiment {i}\n")
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".tsv", delete=False
        ) as tsv:
            tsv.writelines(lines)
            tsv_path = tsv.name
        try:
            assert should_explore(tsv_path, n=10) is False
        finally:
            os.unlink(tsv_path)
|
|
|
|
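# ---------------------------------------------------------------------------
# meta_agent
# ---------------------------------------------------------------------------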
|
|
class TestGenerateDirective:
    """Tests for ``harness.meta_agent.generate_directive``."""

    def test_exploring_returns_none(self):
        """EXPLORING state produces no directive."""
        from harness.meta_agent import generate_directive
        from harness.search_strategy import ResearchState

        state = ResearchState(
            label="EXPLORING",
            trend_improving=True,
            experiment_diversity=0.8,
            crash_rate=0.0,
            best_bpb=0.9,
            last_improvement_at=10,
            total_experiments=10,
        )
        assert generate_directive(state) is None

    def test_stuck_returns_bold_directive(self):
        """STUCK state returns a directive containing 'bold' (any letter case)."""
        from harness.meta_agent import generate_directive
        from harness.search_strategy import ResearchState

        state = ResearchState(
            label="STUCK",
            trend_improving=False,
            experiment_diversity=0.2,
            crash_rate=0.0,
            best_bpb=1.0,
            last_improvement_at=1,
            total_experiments=20,
        )
        directive = generate_directive(state)
        assert directive is not None
        # A case-insensitive check already covers the upper-case "BOLD"
        # spelling, so a separate `"BOLD" in directive` test is unnecessary.
        assert "bold" in directive.lower(), (
            f"Expected 'BOLD' in directive, got: {directive}"
        )

    def test_broken_returns_alert_directive(self):
        """BROKEN state returns a directive containing 'ALERT'."""
        from harness.meta_agent import generate_directive
        from harness.search_strategy import ResearchState

        state = ResearchState(
            label="BROKEN",
            trend_improving=False,
            experiment_diversity=0.0,
            crash_rate=0.75,
            best_bpb=float("inf"),
            last_improvement_at=0,
            total_experiments=8,
        )
        directive = generate_directive(state)
        assert directive is not None
        # NOTE(review): the crash rate is not asserted because its rendering
        # in the directive is not visible from this file; pin it once the
        # format is confirmed.
        assert "ALERT" in directive

    def test_exploiting_returns_diversity_directive(self):
        """EXPLOITING state returns a directive mentioning diversity or 'Search'."""
        from harness.meta_agent import generate_directive
        from harness.search_strategy import ResearchState

        state = ResearchState(
            label="EXPLOITING",
            trend_improving=False,
            experiment_diversity=0.1,
            crash_rate=0.0,
            best_bpb=0.9,
            last_improvement_at=8,
            total_experiments=10,
        )
        directive = generate_directive(state)
        assert directive is not None
        assert "divers" in directive.lower() or "Search" in directive
|
|
|
|
class TestStripPreviousDirective:
    """Tests for ``harness.meta_agent._strip_previous_directive``."""

    def test_strips_marker_block(self):
        """The marker is removed while the text preceding it survives."""
        from harness.meta_agent import _DIRECTIVE_MARKER, _strip_previous_directive

        original = f"Some content\n\n{_DIRECTIVE_MARKER}\nOld directive text.\n"
        stripped = _strip_previous_directive(original)
        assert "Some content" in stripped
        assert _DIRECTIVE_MARKER not in stripped

    def test_no_marker_unchanged(self):
        """Text without a marker keeps both of its lines."""
        from harness.meta_agent import _strip_previous_directive

        stripped = _strip_previous_directive(
            "Normal program.md content\nNo directive here.\n"
        )
        for expected in ("Normal program.md content", "No directive here"):
            assert expected in stripped
|
|
|
|
class TestRunMetaIteration:
    """Tests for ``harness.meta_agent.run_meta_iteration``."""

    def test_run_on_empty_results(self, tmp_path):
        """With no results file the summary reports EXPLORING and changed=False."""
        from harness.meta_agent import run_meta_iteration

        # Neither path exists on disk; only their locations are handed over.
        summary = run_meta_iteration(
            program_path=str(tmp_path / "program.md"),
            results_path=str(tmp_path / "results.tsv"),
        )
        assert summary["state"] == "EXPLORING"
        assert summary["changed"] is False

    def test_run_writes_directive_when_stuck(self, tmp_path):
        """A stuck results history makes run_meta_iteration write a directive."""
        from harness.meta_agent import run_meta_iteration

        # One early best result followed by fifteen worse experiments.
        tsv_lines = [
            "commit\tval_bpb\tmemory_gb\tstatus\tdescription\n",
            "best0001\t0.800000\t2.0\tkeep\texperiment 0\n",
        ]
        tsv_lines.extend(
            f"abc{i:04d}\t1.000000\t2.0\tkeep\texperiment {i}\n"
            for i in range(1, 16)
        )
        results_file = tmp_path / "results.tsv"
        results_file.write_text("".join(tsv_lines))

        program_file = tmp_path / "program.md"
        program_file.write_text("# Program\n")

        summary = run_meta_iteration(
            program_path=str(program_file), results_path=str(results_file)
        )
        assert summary["changed"] is True
        assert "directive" in summary
        assert "Meta-Agent Directive" in program_file.read_text()
|
|