"""Tests for HYDRA harness components. Covers: - eval_agent: parse_run_log, check_secondary_alarms, should_keep - search_strategy: diagnose, should_explore - meta_agent: generate_directive, _strip_previous_directive All tests are CPU-only and create/destroy temp files as needed. Run: uv run pytest tests/test_harness.py -v """ import os import tempfile import pytest import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # --------------------------------------------------------------------------- # eval_agent tests # --------------------------------------------------------------------------- class TestParseRunLog: def _write_log(self, content: str) -> str: """Write content to a temp log file and return its path.""" fh = tempfile.NamedTemporaryFile( mode="w", suffix=".log", delete=False ) fh.write(content) fh.flush() fh.close() return fh.name def test_parse_valid_summary_block(self): """All fields are extracted correctly from a well-formed log.""" from harness.eval_agent import parse_run_log log = ( "step 00100 (50.0%) | loss: 3.123456\n" "---\n" "val_bpb: 1.234567\n" "training_seconds: 300.100\n" "total_seconds: 325.000\n" "peak_vram_mb: 2048.000\n" "mfu_percent: 12.500\n" "total_tokens_M: 100.000\n" "num_steps: 200\n" "num_params_M: 7.900\n" "n_layer: 4\n" "d_model: 256\n" "mhc_spectral_norm: 1.2300\n" "engram_hit_rate: 0.4500\n" "sr_bypass_rate: 1.0000\n" ) path = self._write_log(log) try: result = parse_run_log(path) assert result.val_bpb == pytest.approx(1.234567) assert result.training_seconds == pytest.approx(300.1) assert result.total_seconds == pytest.approx(325.0) assert result.peak_vram_mb == pytest.approx(2048.0) assert result.mfu_percent == pytest.approx(12.5) assert result.total_tokens_m == pytest.approx(100.0) assert result.num_steps == 200 assert result.num_params_m == pytest.approx(7.9) assert result.n_layer == 4 assert result.d_model == 256 assert result.mhc_spectral_norm == pytest.approx(1.23) assert result.engram_hit_rate == pytest.approx(0.45) assert result.sr_bypass_rate == pytest.approx(1.0) assert not result.crashed assert result.error_message == "" finally: os.unlink(path) def test_parse_crash_traceback(self): """Crashed run sets crashed=True and captures error_message.""" from harness.eval_agent import parse_run_log log = ( "Traceback (most recent call last):\n" " File 'train.py', line 100, in \n" "RuntimeError: CUDA out of memory\n" ) path = self._write_log(log) try: result = parse_run_log(path) assert result.crashed assert "CUDA out of memory" in result.error_message finally: os.unlink(path) def test_parse_missing_file(self): """Non-existent log file sets crashed=True.""" from harness.eval_agent import parse_run_log result = parse_run_log("/nonexistent/path/run.log") assert result.crashed assert result.error_message != "" def test_parse_empty_file(self): """Empty log file returns crashed=False with all defaults.""" from harness.eval_agent import parse_run_log path = self._write_log("") try: result = parse_run_log(path) assert result.val_bpb == 0.0 assert result.num_steps == 0 finally: os.unlink(path) def test_parse_partial_log(self): """Partial log (only some fields) populates only those fields.""" from harness.eval_agent import parse_run_log log = "val_bpb: 0.987654\npeak_vram_mb: 1500.0\n" path = self._write_log(log) try: result = parse_run_log(path) assert result.val_bpb == pytest.approx(0.987654) assert result.peak_vram_mb == pytest.approx(1500.0) assert result.num_steps == 0 # not present, stays default finally: os.unlink(path) def test_int_fields_parsed_as_int(self): """num_steps, n_layer, d_model are ints, not floats.""" from harness.eval_agent import parse_run_log log = "num_steps: 500\nn_layer: 4\nd_model: 256\n" path = self._write_log(log) try: result = parse_run_log(path) assert isinstance(result.num_steps, int) assert isinstance(result.n_layer, int) assert isinstance(result.d_model, int) finally: os.unlink(path) class TestCheckSecondaryAlarms: def test_all_clear_no_alarms(self): """No alarms when all metrics are within thresholds.""" from harness.eval_agent import ExperimentResult, check_secondary_alarms result = ExperimentResult(mhc_spectral_norm=1.5, engram_hit_rate=0.5, mfu_percent=25.0) alarms = check_secondary_alarms(result) assert alarms == [] def test_mhc_spectral_norm_alarm(self): """Alarm fires when mhc_spectral_norm > 2.0.""" from harness.eval_agent import ExperimentResult, check_secondary_alarms result = ExperimentResult(mhc_spectral_norm=2.5) alarms = check_secondary_alarms(result) assert any("mhc_spectral_norm" in a for a in alarms) def test_engram_hit_rate_alarm(self): """Alarm fires when engram_hit_rate is in (0, 0.1).""" from harness.eval_agent import ExperimentResult, check_secondary_alarms result = ExperimentResult(engram_hit_rate=0.05) alarms = check_secondary_alarms(result) assert any("engram_hit_rate" in a for a in alarms) def test_engram_hit_rate_zero_no_alarm(self): """Zero engram_hit_rate does NOT fire alarm (gated off).""" from harness.eval_agent import ExperimentResult, check_secondary_alarms result = ExperimentResult(engram_hit_rate=0.0) alarms = check_secondary_alarms(result) assert not any("engram_hit_rate" in a for a in alarms) def test_mfu_alarm(self): """Alarm fires when mfu_percent is in (0, 10).""" from harness.eval_agent import ExperimentResult, check_secondary_alarms result = ExperimentResult(mfu_percent=5.0) alarms = check_secondary_alarms(result) assert any("mfu_percent" in a for a in alarms) def test_three_alarms_simultaneously(self): """All three alarms fire when all thresholds are exceeded.""" from harness.eval_agent import ExperimentResult, check_secondary_alarms result = ExperimentResult(mhc_spectral_norm=2.5, engram_hit_rate=0.05, mfu_percent=5.0) alarms = check_secondary_alarms(result) assert len(alarms) == 3 class TestShouldKeep: def test_improved_bpb_keeps(self): """val_bpb strictly lower than best_bpb -> keep.""" from harness.eval_agent import ExperimentResult, should_keep result = ExperimentResult(val_bpb=0.95) keep, reason = should_keep(result, best_bpb=1.0) assert keep is True assert reason == "keep" def test_worse_bpb_discards(self): """val_bpb >= best_bpb -> discard.""" from harness.eval_agent import ExperimentResult, should_keep result = ExperimentResult(val_bpb=1.05) keep, reason = should_keep(result, best_bpb=1.0) assert keep is False assert reason == "discard" def test_equal_bpb_discards(self): """val_bpb == best_bpb -> discard (strict improvement required).""" from harness.eval_agent import ExperimentResult, should_keep result = ExperimentResult(val_bpb=1.0) keep, reason = should_keep(result, best_bpb=1.0) assert keep is False def test_crashed_discards(self): """Crashed result is always discarded regardless of bpb.""" from harness.eval_agent import ExperimentResult, should_keep result = ExperimentResult(val_bpb=0.5, crashed=True) keep, reason = should_keep(result, best_bpb=1.0) assert keep is False assert reason == "crash" def test_zero_bpb_discards(self): """val_bpb <= 0 is treated as invalid and discarded.""" from harness.eval_agent import ExperimentResult, should_keep result = ExperimentResult(val_bpb=0.0) keep, reason = should_keep(result, best_bpb=1.0) assert keep is False def test_secondary_gate_mhc_rejects(self): """mhc_spectral_norm gate rejects even an improving result.""" from harness.eval_agent import ExperimentResult, should_keep result = ExperimentResult(val_bpb=0.9, mhc_spectral_norm=3.0) gates = {"mhc_spectral_norm": {"max": 2.0}} keep, reason = should_keep(result, best_bpb=1.0, gates=gates) assert keep is False assert "mhc_spectral_norm" in reason def test_secondary_gate_engram_rejects(self): """engram_hit_rate gate rejects even an improving result.""" from harness.eval_agent import ExperimentResult, should_keep result = ExperimentResult(val_bpb=0.9, engram_hit_rate=0.01) gates = {"engram_hit_rate": {"min": 0.05}} keep, reason = should_keep(result, best_bpb=1.0, gates=gates) assert keep is False assert "engram_hit_rate" in reason def test_no_gates_passed(self): """No gates argument keeps an improving result.""" from harness.eval_agent import ExperimentResult, should_keep result = ExperimentResult(val_bpb=0.8, mhc_spectral_norm=5.0) keep, reason = should_keep(result, best_bpb=1.0, gates=None) assert keep is True # --------------------------------------------------------------------------- # search_strategy tests # --------------------------------------------------------------------------- class TestDiagnose: def test_missing_file_returns_exploring(self): """Non-existent results.tsv returns EXPLORING state.""" from harness.search_strategy import diagnose state = diagnose("/nonexistent/results.tsv") assert state.label == "EXPLORING" assert state.total_experiments == 0 assert state.best_bpb == float("inf") def test_empty_file_returns_exploring(self): """results.tsv with only a header returns EXPLORING.""" from harness.search_strategy import diagnose with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as fh: fh.write("commit\tval_bpb\tmemory_gb\tstatus\tdescription\n") path = fh.name try: state = diagnose(path) assert state.label == "EXPLORING" assert state.total_experiments == 0 finally: os.unlink(path) def test_improving_trend_is_exploring(self): """Steadily decreasing val_bpb trend -> EXPLORING.""" from harness.search_strategy import diagnose with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as fh: fh.write("commit\tval_bpb\tmemory_gb\tstatus\tdescription\n") # 12 rows with improving BPB (each unique description for diversity) for i in range(12): bpb = 1.0 - i * 0.01 fh.write(f"abc{i:04d}\t{bpb:.6f}\t2.0\tkeep\texperiment_{i:02d}_arch\n") path = fh.name try: state = diagnose(path, stuck_threshold=20) assert state.total_experiments == 12 assert state.best_bpb == pytest.approx(1.0 - 11 * 0.01) assert state.label in ("EXPLORING", "EXPLOITING") finally: os.unlink(path) def test_stuck_state_after_no_improvement(self): """10+ experiments without improvement -> STUCK.""" from harness.search_strategy import diagnose with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as fh: fh.write("commit\tval_bpb\tmemory_gb\tstatus\tdescription\n") # First row is the best, then 15 rows that are worse fh.write("best0001\t0.800000\t2.0\tkeep\texperiment 0\n") for i in range(1, 16): fh.write(f"abc{i:04d}\t1.000000\t2.0\tkeep\texperiment {i}\n") path = fh.name try: state = diagnose(path, stuck_threshold=10) assert state.label == "STUCK" assert state.best_bpb == pytest.approx(0.8) finally: os.unlink(path) def test_broken_state_high_crash_rate(self): """Crash rate > 0.5 -> BROKEN.""" from harness.search_strategy import diagnose with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as fh: fh.write("commit\tval_bpb\tmemory_gb\tstatus\tdescription\n") for i in range(10): status = "crash" if i < 7 else "keep" bpb = "0.0" if i < 7 else "1.0" fh.write(f"abc{i:04d}\t{bpb}\t2.0\t{status}\texperiment {i}\n") path = fh.name try: state = diagnose(path) assert state.label == "BROKEN" assert state.crash_rate > 0.5 finally: os.unlink(path) def test_best_bpb_tracked_correctly(self): """best_bpb is the global minimum across all experiments.""" from harness.search_strategy import diagnose with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as fh: fh.write("commit\tval_bpb\tmemory_gb\tstatus\tdescription\n") bpbs = [1.0, 0.9, 0.85, 0.95, 1.1, 0.87] for i, bpb in enumerate(bpbs): fh.write(f"abc{i:04d}\t{bpb:.6f}\t2.0\tkeep\texperiment {i}\n") path = fh.name try: state = diagnose(path) assert state.best_bpb == pytest.approx(0.85) finally: os.unlink(path) class TestShouldExplore: def test_no_improvement_returns_true(self): """should_explore returns True when stuck for N experiments.""" from harness.search_strategy import should_explore with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as fh: fh.write("commit\tval_bpb\tmemory_gb\tstatus\tdescription\n") # Best is first row, then 12 rows with no improvement fh.write("best0001\t0.800000\t2.0\tkeep\texperiment 0\n") for i in range(1, 13): fh.write(f"abc{i:04d}\t1.000000\t2.0\tkeep\texperiment {i}\n") path = fh.name try: assert should_explore(path, n=10) is True finally: os.unlink(path) def test_active_improvement_returns_false(self): """should_explore returns False when improvement is happening.""" from harness.search_strategy import should_explore with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as fh: fh.write("commit\tval_bpb\tmemory_gb\tstatus\tdescription\n") # Steady improvement for i in range(5): bpb = 1.0 - i * 0.05 fh.write(f"abc{i:04d}\t{bpb:.6f}\t2.0\tkeep\texperiment {i}\n") path = fh.name try: assert should_explore(path, n=10) is False finally: os.unlink(path) # --------------------------------------------------------------------------- # meta_agent tests # --------------------------------------------------------------------------- class TestGenerateDirective: def test_exploring_returns_none(self): """EXPLORING state produces no directive.""" from harness.meta_agent import generate_directive from harness.search_strategy import ResearchState state = ResearchState( label="EXPLORING", trend_improving=True, experiment_diversity=0.8, crash_rate=0.0, best_bpb=0.9, last_improvement_at=10, total_experiments=10, ) assert generate_directive(state) is None def test_stuck_returns_bold_directive(self): """STUCK state returns a directive containing 'BOLD' or 'bold'.""" from harness.meta_agent import generate_directive from harness.search_strategy import ResearchState state = ResearchState( label="STUCK", trend_improving=False, experiment_diversity=0.2, crash_rate=0.0, best_bpb=1.0, last_improvement_at=1, total_experiments=20, ) directive = generate_directive(state) assert directive is not None assert "BOLD" in directive or "bold" in directive.lower(), ( f"Expected 'BOLD' in directive, got: {directive}" ) def test_broken_returns_alert_directive(self): """BROKEN state returns a directive containing 'ALERT' and crash rate.""" from harness.meta_agent import generate_directive from harness.search_strategy import ResearchState state = ResearchState( label="BROKEN", trend_improving=False, experiment_diversity=0.0, crash_rate=0.75, best_bpb=float("inf"), last_improvement_at=0, total_experiments=8, ) directive = generate_directive(state) assert directive is not None assert "ALERT" in directive def test_exploiting_returns_diversity_directive(self): """EXPLOITING state returns a directive mentioning diversity.""" from harness.meta_agent import generate_directive from harness.search_strategy import ResearchState state = ResearchState( label="EXPLOITING", trend_improving=False, experiment_diversity=0.1, crash_rate=0.0, best_bpb=0.9, last_improvement_at=8, total_experiments=10, ) directive = generate_directive(state) assert directive is not None assert "divers" in directive.lower() or "Search" in directive class TestStripPreviousDirective: def test_strips_marker_block(self): """_strip_previous_directive removes the auto-generated section.""" from harness.meta_agent import _strip_previous_directive, _DIRECTIVE_MARKER content = f"Some content\n\n{_DIRECTIVE_MARKER}\nOld directive text.\n" result = _strip_previous_directive(content) assert _DIRECTIVE_MARKER not in result assert "Some content" in result def test_no_marker_unchanged(self): """Content without a marker is returned unchanged (modulo trailing space).""" from harness.meta_agent import _strip_previous_directive content = "Normal program.md content\nNo directive here.\n" result = _strip_previous_directive(content) assert "Normal program.md content" in result assert "No directive here" in result class TestRunMetaIteration: def test_run_on_empty_results(self, tmp_path): """run_meta_iteration with no results returns state=EXPLORING, changed=False.""" from harness.meta_agent import run_meta_iteration results = str(tmp_path / "results.tsv") program = str(tmp_path / "program.md") summary = run_meta_iteration(program_path=program, results_path=results) assert summary["state"] == "EXPLORING" assert summary["changed"] is False def test_run_writes_directive_when_stuck(self, tmp_path): """run_meta_iteration writes a directive to program.md when STUCK.""" from harness.meta_agent import run_meta_iteration results = tmp_path / "results.tsv" results.write_text( "commit\tval_bpb\tmemory_gb\tstatus\tdescription\n" + "best0001\t0.800000\t2.0\tkeep\texperiment 0\n" + "".join( f"abc{i:04d}\t1.000000\t2.0\tkeep\texperiment {i}\n" for i in range(1, 16) ) ) program = tmp_path / "program.md" program.write_text("# Program\n") summary = run_meta_iteration( program_path=str(program), results_path=str(results) ) assert summary["changed"] is True assert "directive" in summary written = program.read_text() assert "Meta-Agent Directive" in written