# icarus112's picture
# Update Feather a10g-large training runtime image
# c475135 verified
"""Tests for HYDRA harness components.
Covers:
- eval_agent: parse_run_log, check_secondary_alarms, should_keep
- search_strategy: diagnose, should_explore
- meta_agent: generate_directive, _strip_previous_directive, run_meta_iteration
All tests are CPU-only and create/destroy temp files as needed.
Run:
uv run pytest tests/test_harness.py -v
"""
import os
import tempfile
import pytest
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# ---------------------------------------------------------------------------
# eval_agent tests
# ---------------------------------------------------------------------------
class TestParseRunLog:
def _write_log(self, content: str) -> str:
"""Write content to a temp log file and return its path."""
fh = tempfile.NamedTemporaryFile(
mode="w", suffix=".log", delete=False
)
fh.write(content)
fh.flush()
fh.close()
return fh.name
def test_parse_valid_summary_block(self):
"""All fields are extracted correctly from a well-formed log."""
from harness.eval_agent import parse_run_log
log = (
"step 00100 (50.0%) | loss: 3.123456\n"
"---\n"
"val_bpb: 1.234567\n"
"training_seconds: 300.100\n"
"total_seconds: 325.000\n"
"peak_vram_mb: 2048.000\n"
"mfu_percent: 12.500\n"
"total_tokens_M: 100.000\n"
"num_steps: 200\n"
"num_params_M: 7.900\n"
"n_layer: 4\n"
"d_model: 256\n"
"mhc_spectral_norm: 1.2300\n"
"engram_hit_rate: 0.4500\n"
"sr_bypass_rate: 1.0000\n"
)
path = self._write_log(log)
try:
result = parse_run_log(path)
assert result.val_bpb == pytest.approx(1.234567)
assert result.training_seconds == pytest.approx(300.1)
assert result.total_seconds == pytest.approx(325.0)
assert result.peak_vram_mb == pytest.approx(2048.0)
assert result.mfu_percent == pytest.approx(12.5)
assert result.total_tokens_m == pytest.approx(100.0)
assert result.num_steps == 200
assert result.num_params_m == pytest.approx(7.9)
assert result.n_layer == 4
assert result.d_model == 256
assert result.mhc_spectral_norm == pytest.approx(1.23)
assert result.engram_hit_rate == pytest.approx(0.45)
assert result.sr_bypass_rate == pytest.approx(1.0)
assert not result.crashed
assert result.error_message == ""
finally:
os.unlink(path)
def test_parse_crash_traceback(self):
"""Crashed run sets crashed=True and captures error_message."""
from harness.eval_agent import parse_run_log
log = (
"Traceback (most recent call last):\n"
" File 'train.py', line 100, in <module>\n"
"RuntimeError: CUDA out of memory\n"
)
path = self._write_log(log)
try:
result = parse_run_log(path)
assert result.crashed
assert "CUDA out of memory" in result.error_message
finally:
os.unlink(path)
def test_parse_missing_file(self):
"""Non-existent log file sets crashed=True."""
from harness.eval_agent import parse_run_log
result = parse_run_log("/nonexistent/path/run.log")
assert result.crashed
assert result.error_message != ""
def test_parse_empty_file(self):
"""Empty log file returns crashed=False with all defaults."""
from harness.eval_agent import parse_run_log
path = self._write_log("")
try:
result = parse_run_log(path)
assert result.val_bpb == 0.0
assert result.num_steps == 0
finally:
os.unlink(path)
def test_parse_partial_log(self):
"""Partial log (only some fields) populates only those fields."""
from harness.eval_agent import parse_run_log
log = "val_bpb: 0.987654\npeak_vram_mb: 1500.0\n"
path = self._write_log(log)
try:
result = parse_run_log(path)
assert result.val_bpb == pytest.approx(0.987654)
assert result.peak_vram_mb == pytest.approx(1500.0)
assert result.num_steps == 0 # not present, stays default
finally:
os.unlink(path)
def test_int_fields_parsed_as_int(self):
"""num_steps, n_layer, d_model are ints, not floats."""
from harness.eval_agent import parse_run_log
log = "num_steps: 500\nn_layer: 4\nd_model: 256\n"
path = self._write_log(log)
try:
result = parse_run_log(path)
assert isinstance(result.num_steps, int)
assert isinstance(result.n_layer, int)
assert isinstance(result.d_model, int)
finally:
os.unlink(path)
class TestCheckSecondaryAlarms:
    """Threshold checks performed by ``check_secondary_alarms``."""

    def test_all_clear_no_alarms(self):
        """Metrics inside every threshold yield an empty alarm list."""
        from harness.eval_agent import ExperimentResult, check_secondary_alarms

        healthy = ExperimentResult(
            mhc_spectral_norm=1.5,
            engram_hit_rate=0.5,
            mfu_percent=25.0,
        )
        assert check_secondary_alarms(healthy) == []

    def test_mhc_spectral_norm_alarm(self):
        """An mhc_spectral_norm above 2.0 (here 2.5) raises an alarm."""
        from harness.eval_agent import ExperimentResult, check_secondary_alarms

        raised = check_secondary_alarms(ExperimentResult(mhc_spectral_norm=2.5))
        matching = [msg for msg in raised if "mhc_spectral_norm" in msg]
        assert matching

    def test_engram_hit_rate_alarm(self):
        """An engram_hit_rate inside (0, 0.1) (here 0.05) raises an alarm."""
        from harness.eval_agent import ExperimentResult, check_secondary_alarms

        raised = check_secondary_alarms(ExperimentResult(engram_hit_rate=0.05))
        matching = [msg for msg in raised if "engram_hit_rate" in msg]
        assert matching

    def test_engram_hit_rate_zero_no_alarm(self):
        """An engram_hit_rate of exactly 0.0 produces no engram alarm."""
        from harness.eval_agent import ExperimentResult, check_secondary_alarms

        raised = check_secondary_alarms(ExperimentResult(engram_hit_rate=0.0))
        # Zero is treated as "gated off" by the original test note, not as a
        # low hit rate.
        assert all("engram_hit_rate" not in msg for msg in raised)

    def test_mfu_alarm(self):
        """An mfu_percent inside (0, 10) (here 5.0) raises an alarm."""
        from harness.eval_agent import ExperimentResult, check_secondary_alarms

        raised = check_secondary_alarms(ExperimentResult(mfu_percent=5.0))
        matching = [msg for msg in raised if "mfu_percent" in msg]
        assert matching

    def test_three_alarms_simultaneously(self):
        """Violating all three thresholds at once yields exactly three alarms."""
        from harness.eval_agent import ExperimentResult, check_secondary_alarms

        unhealthy = ExperimentResult(
            mhc_spectral_norm=2.5,
            engram_hit_rate=0.05,
            mfu_percent=5.0,
        )
        raised = check_secondary_alarms(unhealthy)
        assert len(raised) == 3
class TestShouldKeep:
    """Keep/discard decisions returned by ``should_keep`` as ``(keep, reason)``."""

    def test_improved_bpb_keeps(self):
        """A val_bpb strictly below best_bpb is kept with reason "keep"."""
        from harness.eval_agent import ExperimentResult, should_keep

        candidate = ExperimentResult(val_bpb=0.95)
        kept, why = should_keep(candidate, best_bpb=1.0)
        assert kept is True
        assert why == "keep"

    def test_worse_bpb_discards(self):
        """A val_bpb above best_bpb is discarded with reason "discard"."""
        from harness.eval_agent import ExperimentResult, should_keep

        candidate = ExperimentResult(val_bpb=1.05)
        kept, why = should_keep(candidate, best_bpb=1.0)
        assert kept is False
        assert why == "discard"

    def test_equal_bpb_discards(self):
        """A val_bpb equal to best_bpb is discarded (strict improvement required)."""
        from harness.eval_agent import ExperimentResult, should_keep

        candidate = ExperimentResult(val_bpb=1.0)
        kept, _ = should_keep(candidate, best_bpb=1.0)
        assert kept is False

    def test_crashed_discards(self):
        """A crashed result is discarded with reason "crash" even if bpb improved."""
        from harness.eval_agent import ExperimentResult, should_keep

        candidate = ExperimentResult(val_bpb=0.5, crashed=True)
        kept, why = should_keep(candidate, best_bpb=1.0)
        assert kept is False
        assert why == "crash"

    def test_zero_bpb_discards(self):
        """A val_bpb of 0.0 is treated as invalid and discarded."""
        from harness.eval_agent import ExperimentResult, should_keep

        candidate = ExperimentResult(val_bpb=0.0)
        kept, _ = should_keep(candidate, best_bpb=1.0)
        assert kept is False

    def test_secondary_gate_mhc_rejects(self):
        """The mhc_spectral_norm max gate rejects an otherwise improving result."""
        from harness.eval_agent import ExperimentResult, should_keep

        limits = {"mhc_spectral_norm": {"max": 2.0}}
        candidate = ExperimentResult(val_bpb=0.9, mhc_spectral_norm=3.0)
        kept, why = should_keep(candidate, best_bpb=1.0, gates=limits)
        assert kept is False
        assert "mhc_spectral_norm" in why

    def test_secondary_gate_engram_rejects(self):
        """The engram_hit_rate min gate rejects an otherwise improving result."""
        from harness.eval_agent import ExperimentResult, should_keep

        limits = {"engram_hit_rate": {"min": 0.05}}
        candidate = ExperimentResult(val_bpb=0.9, engram_hit_rate=0.01)
        kept, why = should_keep(candidate, best_bpb=1.0, gates=limits)
        assert kept is False
        assert "engram_hit_rate" in why

    def test_no_gates_passed(self):
        """With gates=None an improving result is kept despite a high spectral norm."""
        from harness.eval_agent import ExperimentResult, should_keep

        candidate = ExperimentResult(val_bpb=0.8, mhc_spectral_norm=5.0)
        kept, _ = should_keep(candidate, best_bpb=1.0, gates=None)
        assert kept is True
# ---------------------------------------------------------------------------
# search_strategy tests
# ---------------------------------------------------------------------------
class TestDiagnose:
def test_missing_file_returns_exploring(self):
"""Non-existent results.tsv returns EXPLORING state."""
from harness.search_strategy import diagnose
state = diagnose("/nonexistent/results.tsv")
assert state.label == "EXPLORING"
assert state.total_experiments == 0
assert state.best_bpb == float("inf")
def test_empty_file_returns_exploring(self):
"""results.tsv with only a header returns EXPLORING."""
from harness.search_strategy import diagnose
with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as fh:
fh.write("commit\tval_bpb\tmemory_gb\tstatus\tdescription\n")
path = fh.name
try:
state = diagnose(path)
assert state.label == "EXPLORING"
assert state.total_experiments == 0
finally:
os.unlink(path)
def test_improving_trend_is_exploring(self):
"""Steadily decreasing val_bpb trend -> EXPLORING."""
from harness.search_strategy import diagnose
with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as fh:
fh.write("commit\tval_bpb\tmemory_gb\tstatus\tdescription\n")
# 12 rows with improving BPB (each unique description for diversity)
for i in range(12):
bpb = 1.0 - i * 0.01
fh.write(f"abc{i:04d}\t{bpb:.6f}\t2.0\tkeep\texperiment_{i:02d}_arch\n")
path = fh.name
try:
state = diagnose(path, stuck_threshold=20)
assert state.total_experiments == 12
assert state.best_bpb == pytest.approx(1.0 - 11 * 0.01)
assert state.label in ("EXPLORING", "EXPLOITING")
finally:
os.unlink(path)
def test_stuck_state_after_no_improvement(self):
"""10+ experiments without improvement -> STUCK."""
from harness.search_strategy import diagnose
with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as fh:
fh.write("commit\tval_bpb\tmemory_gb\tstatus\tdescription\n")
# First row is the best, then 15 rows that are worse
fh.write("best0001\t0.800000\t2.0\tkeep\texperiment 0\n")
for i in range(1, 16):
fh.write(f"abc{i:04d}\t1.000000\t2.0\tkeep\texperiment {i}\n")
path = fh.name
try:
state = diagnose(path, stuck_threshold=10)
assert state.label == "STUCK"
assert state.best_bpb == pytest.approx(0.8)
finally:
os.unlink(path)
def test_broken_state_high_crash_rate(self):
"""Crash rate > 0.5 -> BROKEN."""
from harness.search_strategy import diagnose
with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as fh:
fh.write("commit\tval_bpb\tmemory_gb\tstatus\tdescription\n")
for i in range(10):
status = "crash" if i < 7 else "keep"
bpb = "0.0" if i < 7 else "1.0"
fh.write(f"abc{i:04d}\t{bpb}\t2.0\t{status}\texperiment {i}\n")
path = fh.name
try:
state = diagnose(path)
assert state.label == "BROKEN"
assert state.crash_rate > 0.5
finally:
os.unlink(path)
def test_best_bpb_tracked_correctly(self):
"""best_bpb is the global minimum across all experiments."""
from harness.search_strategy import diagnose
with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as fh:
fh.write("commit\tval_bpb\tmemory_gb\tstatus\tdescription\n")
bpbs = [1.0, 0.9, 0.85, 0.95, 1.1, 0.87]
for i, bpb in enumerate(bpbs):
fh.write(f"abc{i:04d}\t{bpb:.6f}\t2.0\tkeep\texperiment {i}\n")
path = fh.name
try:
state = diagnose(path)
assert state.best_bpb == pytest.approx(0.85)
finally:
os.unlink(path)
class TestShouldExplore:
    """Behaviour of ``should_explore`` on hand-built results.tsv files."""

    def test_no_improvement_returns_true(self):
        """should_explore is True after 12 non-improving rows with n=10."""
        from harness.search_strategy import should_explore

        # Best result comes first, followed by 12 rows that never beat it.
        lines = [
            "commit\tval_bpb\tmemory_gb\tstatus\tdescription\n",
            "best0001\t0.800000\t2.0\tkeep\texperiment 0\n",
        ]
        lines.extend(
            f"abc{k:04d}\t1.000000\t2.0\tkeep\texperiment {k}\n"
            for k in range(1, 13)
        )
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".tsv", delete=False
        ) as handle:
            handle.writelines(lines)
            tsv_path = handle.name
        try:
            assert should_explore(tsv_path, n=10) is True
        finally:
            os.unlink(tsv_path)

    def test_active_improvement_returns_false(self):
        """should_explore is False while every row improves on the last."""
        from harness.search_strategy import should_explore

        # Five rows, each 0.05 bpb better than the previous one.
        lines = ["commit\tval_bpb\tmemory_gb\tstatus\tdescription\n"]
        for k in range(5):
            score = 1.0 - k * 0.05
            lines.append(f"abc{k:04d}\t{score:.6f}\t2.0\tkeep\texperiment {k}\n")
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".tsv", delete=False
        ) as handle:
            handle.writelines(lines)
            tsv_path = handle.name
        try:
            assert should_explore(tsv_path, n=10) is False
        finally:
            os.unlink(tsv_path)
# ---------------------------------------------------------------------------
# meta_agent tests
# ---------------------------------------------------------------------------
class TestGenerateDirective:
    """Directive text produced by ``generate_directive`` for each state label."""

    def test_exploring_returns_none(self):
        """An EXPLORING state yields no directive (None)."""
        from harness.meta_agent import generate_directive
        from harness.search_strategy import ResearchState

        fields = {
            "label": "EXPLORING",
            "trend_improving": True,
            "experiment_diversity": 0.8,
            "crash_rate": 0.0,
            "best_bpb": 0.9,
            "last_improvement_at": 10,
            "total_experiments": 10,
        }
        assert generate_directive(ResearchState(**fields)) is None

    def test_stuck_returns_bold_directive(self):
        """A STUCK state yields a directive containing "bold" in any casing."""
        from harness.meta_agent import generate_directive
        from harness.search_strategy import ResearchState

        fields = {
            "label": "STUCK",
            "trend_improving": False,
            "experiment_diversity": 0.2,
            "crash_rate": 0.0,
            "best_bpb": 1.0,
            "last_improvement_at": 1,
            "total_experiments": 20,
        }
        directive = generate_directive(ResearchState(**fields))
        assert directive is not None
        # A case-insensitive match already covers the upper-case "BOLD" form.
        assert "bold" in directive.lower(), (
            f"Expected 'BOLD' in directive, got: {directive}"
        )

    def test_broken_returns_alert_directive(self):
        """A BROKEN state yields a directive containing "ALERT"."""
        from harness.meta_agent import generate_directive
        from harness.search_strategy import ResearchState

        fields = {
            "label": "BROKEN",
            "trend_improving": False,
            "experiment_diversity": 0.0,
            "crash_rate": 0.75,
            "best_bpb": float("inf"),
            "last_improvement_at": 0,
            "total_experiments": 8,
        }
        directive = generate_directive(ResearchState(**fields))
        assert directive is not None
        assert "ALERT" in directive

    def test_exploiting_returns_diversity_directive(self):
        """An EXPLOITING state yields a directive mentioning diversity or "Search"."""
        from harness.meta_agent import generate_directive
        from harness.search_strategy import ResearchState

        fields = {
            "label": "EXPLOITING",
            "trend_improving": False,
            "experiment_diversity": 0.1,
            "crash_rate": 0.0,
            "best_bpb": 0.9,
            "last_improvement_at": 8,
            "total_experiments": 10,
        }
        directive = generate_directive(ResearchState(**fields))
        assert directive is not None
        mentions_diversity = "divers" in directive.lower()
        mentions_search = "Search" in directive
        assert mentions_diversity or mentions_search
class TestStripPreviousDirective:
    """Behaviour of the private ``_strip_previous_directive`` helper."""

    def test_strips_marker_block(self):
        """The marker and what follows it are removed; earlier text survives."""
        from harness.meta_agent import _DIRECTIVE_MARKER, _strip_previous_directive

        original = f"Some content\n\n{_DIRECTIVE_MARKER}\nOld directive text.\n"
        stripped = _strip_previous_directive(original)
        assert "Some content" in stripped
        assert _DIRECTIVE_MARKER not in stripped

    def test_no_marker_unchanged(self):
        """Text without a marker keeps both of its lines."""
        from harness.meta_agent import _strip_previous_directive

        stripped = _strip_previous_directive(
            "Normal program.md content\nNo directive here.\n"
        )
        # Only substring checks: trailing whitespace may legitimately differ.
        for expected in ("Normal program.md content", "No directive here"):
            assert expected in stripped
class TestRunMetaIteration:
    """End-to-end checks of ``run_meta_iteration`` using pytest's ``tmp_path``."""

    def test_run_on_empty_results(self, tmp_path):
        """With neither file present the summary is EXPLORING and unchanged."""
        from harness.meta_agent import run_meta_iteration

        # Neither path is created on disk before the call.
        outcome = run_meta_iteration(
            program_path=str(tmp_path / "program.md"),
            results_path=str(tmp_path / "results.tsv"),
        )
        assert outcome["state"] == "EXPLORING"
        assert outcome["changed"] is False

    def test_run_writes_directive_when_stuck(self, tmp_path):
        """A stuck history makes run_meta_iteration write a directive to program.md."""
        from harness.meta_agent import run_meta_iteration

        # Best result first, followed by 15 rows that never beat it.
        rows = [
            "commit\tval_bpb\tmemory_gb\tstatus\tdescription\n",
            "best0001\t0.800000\t2.0\tkeep\texperiment 0\n",
        ]
        rows += [
            f"abc{k:04d}\t1.000000\t2.0\tkeep\texperiment {k}\n"
            for k in range(1, 16)
        ]
        results_file = tmp_path / "results.tsv"
        results_file.write_text("".join(rows))

        program_file = tmp_path / "program.md"
        program_file.write_text("# Program\n")

        outcome = run_meta_iteration(
            program_path=str(program_file), results_path=str(results_file)
        )
        assert outcome["changed"] is True
        assert "directive" in outcome
        assert "Meta-Agent Directive" in program_file.read_text()