icarus112 commited on
Commit
2c30a29
·
verified ·
1 Parent(s): 2d94172

Upload folder using huggingface_hub

Browse files
overlay/harness/__init__.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """HYDRA harness package: orchestration infrastructure for autoresearch."""
2
+ from harness.eval_agent import ExperimentResult, parse_run_log, should_keep
3
+ from harness.git_utils import current_branch, current_commit_short
4
+ from harness.health_monitor import check_health, get_gpu_stats
5
+ from harness.meta_agent import run_meta_iteration
6
+ from harness.orchestrator import run_loop
7
+ from harness.search_strategy import ResearchState, diagnose
8
+
9
+ __all__ = [
10
+ "run_loop",
11
+ "parse_run_log",
12
+ "ExperimentResult",
13
+ "should_keep",
14
+ "run_meta_iteration",
15
+ "diagnose",
16
+ "ResearchState",
17
+ "check_health",
18
+ "get_gpu_stats",
19
+ "current_branch",
20
+ "current_commit_short",
21
+ ]
overlay/harness/eval_agent.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Eval agent: parse run.log and extract metrics from training runs."""
2
+ import re
3
+ from dataclasses import dataclass, field
4
+
5
+
6
+ @dataclass
7
+ class ExperimentResult:
8
+ """Parsed result from a single experiment run.
9
+
10
+ All float fields default to 0.0; integer fields default to 0.
11
+ The ``crashed`` flag is set when the log indicates a failure or the
12
+ log file is missing entirely.
13
+ """
14
+
15
+ # Primary metric
16
+ val_bpb: float = 0.0
17
+
18
+ # Timing
19
+ training_seconds: float = 0.0
20
+ total_seconds: float = 0.0
21
+
22
+ # Hardware
23
+ peak_vram_mb: float = 0.0
24
+ mfu_percent: float = 0.0
25
+
26
+ # Throughput
27
+ total_tokens_m: float = 0.0
28
+ num_steps: int = 0
29
+
30
+ # Model shape (echoed by train.py summary block)
31
+ num_params_m: float = 0.0
32
+ n_layer: int = 0
33
+ d_model: int = 0
34
+
35
+ # Secondary health metrics
36
+ mhc_spectral_norm: float = 0.0
37
+ engram_hit_rate: float = 0.0
38
+ sr_bypass_rate: float = 0.0
39
+
40
+ # Status
41
+ crashed: bool = False
42
+ error_message: str = ""
43
+
44
+
45
+ # Regex patterns keyed by ExperimentResult attribute name.
46
+ # Format must match the ``--- Summary ---`` block printed by train.py.
47
+ _PATTERNS: dict[str, str] = {
48
+ "val_bpb": r"^val_bpb:\s+([\d.]+)",
49
+ "training_seconds": r"^training_seconds:\s+([\d.]+)",
50
+ "total_seconds": r"^total_seconds:\s+([\d.]+)",
51
+ "peak_vram_mb": r"^peak_vram_mb:\s+([\d.]+)",
52
+ "mfu_percent": r"^mfu_percent:\s+([\d.]+)",
53
+ "total_tokens_m": r"^total_tokens_M:\s+([\d.]+)",
54
+ "num_steps": r"^num_steps:\s+(\d+)",
55
+ "num_params_m": r"^num_params_M:\s+([\d.]+)",
56
+ "n_layer": r"^n_layer:\s+(\d+)",
57
+ "d_model": r"^d_model:\s+(\d+)",
58
+ "mhc_spectral_norm": r"^mhc_spectral_norm:\s+([\d.]+)",
59
+ "engram_hit_rate": r"^engram_hit_rate:\s+([\d.]+)",
60
+ "sr_bypass_rate": r"^sr_bypass_rate:\s+([\d.]+)",
61
+ }
62
+
63
+ # Attributes that should be parsed as int rather than float.
64
+ _INT_ATTRS: frozenset[str] = frozenset({"num_steps", "n_layer", "d_model"})
65
+
66
+
67
+ def parse_run_log(log_path: str) -> ExperimentResult:
68
+ """Parse a run.log file and extract all training metrics.
69
+
70
+ Args:
71
+ log_path: Absolute path to the run.log file.
72
+
73
+ Returns:
74
+ Populated ExperimentResult; sets ``crashed=True`` when the log
75
+ contains a traceback or the file is missing.
76
+ """
77
+ result = ExperimentResult()
78
+
79
+ try:
80
+ with open(log_path) as fh:
81
+ content = fh.read()
82
+ except FileNotFoundError:
83
+ result.crashed = True
84
+ result.error_message = f"Log file not found: {log_path}"
85
+ return result
86
+
87
+ # Detect crash signals in output.
88
+ if "Traceback" in content or "FAIL" in content or "Error" in content:
89
+ result.crashed = True
90
+ lines = content.strip().splitlines()
91
+ result.error_message = "\n".join(lines[-20:])
92
+
93
+ for attr, pattern in _PATTERNS.items():
94
+ match = re.search(pattern, content, re.MULTILINE)
95
+ if match:
96
+ raw = match.group(1)
97
+ setattr(result, attr, int(raw) if attr in _INT_ATTRS else float(raw))
98
+
99
+ return result
100
+
101
+
102
+ def check_secondary_alarms(result: ExperimentResult) -> list[str]:
103
+ """Check secondary metrics against fixed alarm thresholds.
104
+
105
+ Args:
106
+ result: Parsed experiment result.
107
+
108
+ Returns:
109
+ List of human-readable alarm strings (empty if all clear).
110
+ """
111
+ alarms: list[str] = []
112
+
113
+ if result.mhc_spectral_norm > 2.0:
114
+ alarms.append(
115
+ f"mhc_spectral_norm={result.mhc_spectral_norm:.4f} > 2.0 (ALARM)"
116
+ )
117
+ if 0 < result.engram_hit_rate < 0.1:
118
+ alarms.append(
119
+ f"engram_hit_rate={result.engram_hit_rate:.4f} < 0.1 (memory underused)"
120
+ )
121
+ if 0 < result.mfu_percent < 10:
122
+ alarms.append(
123
+ f"mfu_percent={result.mfu_percent:.2f}% < 10% (GPU underutilized)"
124
+ )
125
+
126
+ return alarms
127
+
128
+
129
+ def should_keep(
130
+ result: ExperimentResult,
131
+ best_bpb: float,
132
+ gates: dict | None = None,
133
+ ) -> tuple[bool, str]:
134
+ """Decide whether to keep or discard an experiment.
135
+
136
+ The primary criterion is strictly lower val_bpb than the current best.
137
+ Optional secondary gates (passed from HarnessConfig.secondary_metrics)
138
+ can reject an otherwise-improving result.
139
+
140
+ Args:
141
+ result: Parsed experiment result.
142
+ best_bpb: Current best val_bpb across all experiments.
143
+ gates: Optional dict mapping metric name to threshold dict with
144
+ ``"max"`` or ``"min"`` keys, e.g.
145
+ ``{"mhc_spectral_norm": {"max": 2.0}}``.
146
+
147
+ Returns:
148
+ Tuple of (keep: bool, reason: str).
149
+ """
150
+ if result.crashed:
151
+ return False, "crash"
152
+ if result.val_bpb <= 0:
153
+ return False, "invalid val_bpb"
154
+ if result.val_bpb >= best_bpb:
155
+ return False, "discard"
156
+
157
+ # Secondary gate checks.
158
+ if gates:
159
+ gate_mhc = gates.get("mhc_spectral_norm", {}).get("max")
160
+ if gate_mhc is not None and result.mhc_spectral_norm > gate_mhc:
161
+ return (
162
+ False,
163
+ f"mhc_spectral_norm {result.mhc_spectral_norm:.4f} > gate {gate_mhc}",
164
+ )
165
+ gate_engram = gates.get("engram_hit_rate", {}).get("min")
166
+ if gate_engram is not None and result.engram_hit_rate < gate_engram:
167
+ return (
168
+ False,
169
+ f"engram_hit_rate {result.engram_hit_rate:.4f} < gate {gate_engram}",
170
+ )
171
+
172
+ return True, "keep"
overlay/harness/git_utils.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Git utilities for HYDRA autoresearch branch management."""
2
+ import os
3
+ import subprocess
4
+
5
+ REPO_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
6
+
7
+
8
+ def run_git(*args: str, check: bool = True) -> subprocess.CompletedProcess:
9
+ """Run a git command in the repo directory.
10
+
11
+ Args:
12
+ *args: Git command arguments.
13
+ check: Whether to raise on non-zero exit code.
14
+
15
+ Returns:
16
+ Completed process with stdout/stderr captured.
17
+ """
18
+ return subprocess.run(
19
+ ["git"] + list(args),
20
+ cwd=REPO_DIR,
21
+ capture_output=True,
22
+ text=True,
23
+ check=check,
24
+ )
25
+
26
+
27
+ def current_branch() -> str:
28
+ """Return the current git branch name.
29
+
30
+ Returns:
31
+ Branch name string.
32
+ """
33
+ result = run_git("rev-parse", "--abbrev-ref", "HEAD")
34
+ return result.stdout.strip()
35
+
36
+
37
+ def current_commit_short() -> str:
38
+ """Return the current HEAD commit short hash (7 chars).
39
+
40
+ Returns:
41
+ 7-character commit hash.
42
+ """
43
+ result = run_git("rev-parse", "--short=7", "HEAD")
44
+ return result.stdout.strip()
45
+
46
+
47
+ def create_branch(name: str) -> None:
48
+ """Create and switch to a new branch.
49
+
50
+ Args:
51
+ name: Branch name to create.
52
+ """
53
+ run_git("checkout", "-b", name)
54
+
55
+
56
+ def commit_all(message: str) -> str:
57
+ """Stage all changes, commit, and return short hash.
58
+
59
+ Args:
60
+ message: Commit message.
61
+
62
+ Returns:
63
+ Short commit hash after committing.
64
+ """
65
+ run_git("add", "-A")
66
+ run_git("commit", "-m", message, check=False)
67
+ return current_commit_short()
68
+
69
+
70
+ def reset_to(commit: str) -> None:
71
+ """Hard reset to a specific commit, discarding all changes.
72
+
73
+ Args:
74
+ commit: Commit hash (short or full) to reset to.
75
+ """
76
+ run_git("reset", "--hard", commit)
77
+
78
+
79
+ def get_last_n_diffs(n: int = 3) -> list[str]:
80
+ """Get the last N commit diffs (--stat format) for meta-agent context.
81
+
82
+ Args:
83
+ n: Number of recent commits to retrieve.
84
+
85
+ Returns:
86
+ List of diff stat strings, one per commit (truncated to 500 chars).
87
+ """
88
+ result = run_git("log", f"-{n}", "--format=%H", check=False)
89
+ hashes = [h for h in result.stdout.strip().split("\n") if h]
90
+ diffs: list[str] = []
91
+ for h in hashes:
92
+ diff_result = run_git("show", "--stat", h, check=False)
93
+ diffs.append(diff_result.stdout[:500])
94
+ return diffs
overlay/harness/health_monitor.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Hardware health monitoring for HYDRA experiments.
2
+
3
+ Provides lightweight checks that the orchestrator runs before each
4
+ experiment to avoid launching training into a degraded GPU state.
5
+ """
6
+ import os
7
+
8
+ import torch
9
+
10
+
11
+ def get_gpu_stats() -> dict:
12
+ """Return current GPU memory statistics.
13
+
14
+ Returns:
15
+ Dict with keys: available (bool), and when available:
16
+ name, memory_allocated_mb, memory_reserved_mb,
17
+ max_memory_allocated_mb, memory_total_mb.
18
+ """
19
+ if not torch.cuda.is_available():
20
+ return {"available": False}
21
+
22
+ props = torch.cuda.get_device_properties(0)
23
+ return {
24
+ "available": True,
25
+ "name": torch.cuda.get_device_name(0),
26
+ "memory_allocated_mb": torch.cuda.memory_allocated(0) / (1024 * 1024),
27
+ "memory_reserved_mb": torch.cuda.memory_reserved(0) / (1024 * 1024),
28
+ "max_memory_allocated_mb": torch.cuda.max_memory_allocated(0) / (1024 * 1024),
29
+ "memory_total_mb": props.total_mem / (1024 * 1024),
30
+ }
31
+
32
+
33
+ def check_health(
34
+ vram_pressure_pct: float = 90.0,
35
+ min_free_disk_gb: float = 1.0,
36
+ ) -> tuple[bool, list[str]]:
37
+ """Check GPU and disk health before launching an experiment.
38
+
39
+ Args:
40
+ vram_pressure_pct: Warn when GPU memory allocation exceeds this
41
+ percentage of total VRAM.
42
+ min_free_disk_gb: Warn when free disk space falls below this.
43
+
44
+ Returns:
45
+ Tuple of (healthy: bool, warnings: list[str]).
46
+ ``healthy`` is True when there are no warnings.
47
+ """
48
+ warnings: list[str] = []
49
+ stats = get_gpu_stats()
50
+
51
+ if not stats["available"]:
52
+ return False, ["No CUDA GPU available"]
53
+
54
+ # Memory pressure check.
55
+ used_pct = (
56
+ stats["memory_allocated_mb"] / stats["memory_total_mb"] * 100
57
+ if stats["memory_total_mb"] > 0
58
+ else 0.0
59
+ )
60
+ if used_pct > vram_pressure_pct:
61
+ warnings.append(
62
+ f"GPU memory pressure: {used_pct:.1f}% allocated "
63
+ f"({stats['memory_allocated_mb']:.0f} / {stats['memory_total_mb']:.0f} MB)"
64
+ )
65
+
66
+ # Disk space check.
67
+ try:
68
+ statvfs = os.statvfs(os.path.dirname(os.path.abspath(__file__)))
69
+ free_gb = (statvfs.f_bavail * statvfs.f_frsize) / (1024**3)
70
+ if free_gb < min_free_disk_gb:
71
+ warnings.append(f"Low disk space: {free_gb:.2f} GB free")
72
+ except (AttributeError, OSError):
73
+ # os.statvfs not available on all platforms (e.g. Windows).
74
+ pass
75
+
76
+ return len(warnings) == 0, warnings
77
+
78
+
79
+ def reset_peak_stats() -> None:
80
+ """Reset GPU peak memory tracking for the next experiment.
81
+
82
+ Should be called immediately before launching each training run so
83
+ that peak_vram_mb reported in run.log reflects only that experiment.
84
+ """
85
+ if torch.cuda.is_available():
86
+ torch.cuda.reset_peak_memory_stats()
overlay/harness/meta_agent.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Meta-agent: evolves program.md based on experiment history.
2
+
3
+ Runs every ``meta_interval`` inner-loop experiments (configured in
4
+ HarnessConfig). Reads the current research state from results.tsv,
5
+ decides whether guidance is needed, and appends a directive to
6
+ program.md. Any previous auto-generated directive is replaced so
7
+ the file stays clean.
8
+ """
9
+ import os
10
+
11
+ from harness.git_utils import REPO_DIR
12
+ from harness.search_strategy import ResearchState, diagnose
13
+
14
+ PROGRAM_PATH = os.path.join(REPO_DIR, "program.md")
15
+ RESULTS_PATH = os.path.join(REPO_DIR, "results.tsv")
16
+
17
+ # Sentinel that marks auto-generated content so it can be cleanly replaced.
18
+ _DIRECTIVE_MARKER = "## Meta-Agent Directive (auto-generated)"
19
+
20
+
21
+ def generate_directive(state: ResearchState) -> str | None:
22
+ """Generate a directive string to append to program.md, or None.
23
+
24
+ A directive is only produced when the research state is not EXPLORING
25
+ (i.e., something needs to change).
26
+
27
+ Args:
28
+ state: Current ResearchState diagnosis.
29
+
30
+ Returns:
31
+ Formatted directive string, or None when no change is needed.
32
+ """
33
+ if state.label == "EXPLORING":
34
+ return None
35
+
36
+ if state.label == "BROKEN":
37
+ return (
38
+ f"\n{_DIRECTIVE_MARKER}\n"
39
+ f"ALERT: Crash rate is {state.crash_rate:.0%} in the recent window. "
40
+ "Revert to the last stable commit. Reduce model complexity before "
41
+ "proposing further changes. Suggested actions:\n"
42
+ "- Reduce d_model or n_layer\n"
43
+ "- Reduce batch_size\n"
44
+ "- Disable experimental modules (Engram, mHC, Hestia) one at a time\n"
45
+ )
46
+
47
+ if state.label == "STUCK":
48
+ stale = state.total_experiments - state.last_improvement_at
49
+ return (
50
+ f"\n{_DIRECTIVE_MARKER}\n"
51
+ f"ALERT: No improvement for {stale} experiments "
52
+ f"(best_bpb={state.best_bpb:.6f}). "
53
+ "Apply BOLD changes for the next 5 experiments:\n"
54
+ "- Dramatically change d_model or n_layer (2× or ½)\n"
55
+ "- Toggle Engram or mHC on/off entirely\n"
56
+ "- Change optimizer hyperparameters by 3–5×\n"
57
+ "- Temporarily accept results within 0.5% of baseline\n"
58
+ )
59
+
60
+ if state.label == "EXPLOITING":
61
+ return (
62
+ f"\n{_DIRECTIVE_MARKER}\n"
63
+ "Search is converging too early. Inject diversity:\n"
64
+ "- If recent experiments tune LR, try architecture changes instead\n"
65
+ "- If tuning architecture, try optimizer or regularisation changes\n"
66
+ "- Try removing complexity (simplification wins are valuable)\n"
67
+ "- Explore a subsystem not touched in the last 10 experiments\n"
68
+ )
69
+
70
+ return None
71
+
72
+
73
+ def _strip_previous_directive(content: str) -> str:
74
+ """Remove any prior auto-generated directive block from content.
75
+
76
+ Args:
77
+ content: Full text of program.md.
78
+
79
+ Returns:
80
+ Content with any previous directive stripped and trailing
81
+ whitespace normalised.
82
+ """
83
+ if _DIRECTIVE_MARKER in content:
84
+ content = content[: content.index(_DIRECTIVE_MARKER)].rstrip() + "\n"
85
+ return content
86
+
87
+
88
+ def run_meta_iteration(
89
+ program_path: str = PROGRAM_PATH,
90
+ results_path: str = RESULTS_PATH,
91
+ ) -> dict:
92
+ """Run one meta-agent iteration.
93
+
94
+ Diagnoses the current research state and optionally rewrites
95
+ program.md with a new directive.
96
+
97
+ Args:
98
+ program_path: Path to program.md.
99
+ results_path: Path to results.tsv.
100
+
101
+ Returns:
102
+ Summary dict with keys: state, total_experiments, best_bpb,
103
+ crash_rate, changed, and optionally directive.
104
+ """
105
+ state = diagnose(results_path)
106
+
107
+ summary: dict = {
108
+ "state": state.label,
109
+ "total_experiments": state.total_experiments,
110
+ "best_bpb": state.best_bpb,
111
+ "crash_rate": state.crash_rate,
112
+ "changed": False,
113
+ }
114
+
115
+ directive = generate_directive(state)
116
+ if directive is None:
117
+ return summary
118
+
119
+ try:
120
+ with open(program_path) as fh:
121
+ content = fh.read()
122
+ except FileNotFoundError:
123
+ content = ""
124
+
125
+ content = _strip_previous_directive(content)
126
+ content = content + "\n" + directive
127
+
128
+ tmp_path = program_path + ".tmp"
129
+ try:
130
+ with open(tmp_path, "w") as fh:
131
+ fh.write(content)
132
+ os.replace(tmp_path, program_path) # atomic on POSIX
133
+ finally:
134
+ if os.path.exists(tmp_path):
135
+ os.unlink(tmp_path)
136
+
137
+ summary["changed"] = True
138
+ summary["directive"] = directive.strip()
139
+ return summary
overlay/harness/orchestrator.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """HYDRA Orchestrator: main loop for autonomous research.
2
+
3
+ Usage::
4
+
5
+ python -m harness.orchestrator [--meta-interval N] [--max-experiments N]
6
+
7
+ Loop:
8
+ 1. Read current state (branch, results.tsv, program.md)
9
+ 2. [Architect Agent] proposes and applies changes to train.py (external)
10
+ 3. Git commit the changes
11
+ 4. Run training: ``uv run train.py`` captured to run.log
12
+ 5. [Eval Agent] extract metrics from run.log
13
+ 6. Keep or discard based on val_bpb + secondary metric gates
14
+ 7. Log to results.tsv
15
+ 8. Every ``meta_interval`` experiments: [Meta Agent] evolves program.md
16
+ 9. Repeat
17
+
18
+ The orchestrator intentionally does NOT modify train.py itself -- it
19
+ provides the infrastructure ("rails") that the autoresearch loop runs on.
20
+ """
21
+ import argparse
22
+ import csv
23
+ import os
24
+ import subprocess
25
+ import time
26
+
27
+ from harness.eval_agent import ExperimentResult, check_secondary_alarms, parse_run_log, should_keep
28
+ from harness.git_utils import REPO_DIR, commit_all, current_commit_short, reset_to
29
+ from harness.health_monitor import check_health, reset_peak_stats
30
+ from harness.meta_agent import run_meta_iteration
31
+ from harness.search_strategy import diagnose
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # Paths
35
+ # ---------------------------------------------------------------------------
36
+
37
+ RESULTS_FILE = os.path.join(REPO_DIR, "results.tsv")
38
+ RUN_LOG = os.path.join(REPO_DIR, "run.log")
39
+
40
+ _TSV_HEADER = "commit\tval_bpb\tmemory_gb\tstatus\tdescription\n"
41
+
42
+
43
+ # ---------------------------------------------------------------------------
44
+ # TSV helpers
45
+ # ---------------------------------------------------------------------------
46
+
47
+
48
+ def init_results_tsv() -> None:
49
+ """Create results.tsv with header row if it does not yet exist."""
50
+ if not os.path.exists(RESULTS_FILE):
51
+ with open(RESULTS_FILE, "w") as fh:
52
+ fh.write(_TSV_HEADER)
53
+
54
+
55
+ def log_result(
56
+ commit: str,
57
+ val_bpb: float,
58
+ memory_gb: float,
59
+ status: str,
60
+ description: str,
61
+ ) -> None:
62
+ """Append one row to results.tsv.
63
+
64
+ Args:
65
+ commit: Short git hash for this experiment.
66
+ val_bpb: Validation bits-per-byte (0.0 for crashes).
67
+ memory_gb: Peak VRAM usage in gigabytes.
68
+ status: One of keep / discard / crash / timeout.
69
+ description: Short human-readable description.
70
+ """
71
+ with open(RESULTS_FILE, "a") as fh:
72
+ fh.write(
73
+ f"{commit}\t{val_bpb:.6f}\t{memory_gb:.2f}\t{status}\t{description}\n"
74
+ )
75
+
76
+
77
+ def count_experiments() -> int:
78
+ """Count the number of experiment rows in results.tsv.
79
+
80
+ Returns:
81
+ Row count excluding the header line (0 when file does not exist).
82
+ """
83
+ if not os.path.exists(RESULTS_FILE):
84
+ return 0
85
+ with open(RESULTS_FILE) as fh:
86
+ return max(0, sum(1 for _ in fh) - 1)
87
+
88
+
89
+ def _load_best_bpb() -> float:
90
+ """Scan results.tsv for the best (lowest positive) val_bpb seen so far.
91
+
92
+ Returns:
93
+ Best val_bpb, or ``float("inf")`` when no valid result exists.
94
+ """
95
+ if not os.path.exists(RESULTS_FILE):
96
+ return float("inf")
97
+ best = float("inf")
98
+ with open(RESULTS_FILE) as fh:
99
+ reader = csv.DictReader(fh, delimiter="\t")
100
+ for row in reader:
101
+ try:
102
+ bpb = float(row.get("val_bpb", "0") or "0")
103
+ except ValueError:
104
+ continue
105
+ if 0 < bpb < best:
106
+ best = bpb
107
+ return best
108
+
109
+
110
+ # ---------------------------------------------------------------------------
111
+ # Experiment execution
112
+ # ---------------------------------------------------------------------------
113
+
114
+
115
+ def run_experiment(timeout: int = 600) -> str:
116
+ """Launch ``uv run train.py`` and capture all output to run.log.
117
+
118
+ Args:
119
+ timeout: Kill the process after this many seconds.
120
+
121
+ Returns:
122
+ One of ``"ok"``, ``"timeout"``, or ``"error"``.
123
+ """
124
+ try:
125
+ with open(RUN_LOG, "w") as log_file:
126
+ proc = subprocess.run(
127
+ ["uv", "run", "train.py"],
128
+ cwd=REPO_DIR,
129
+ stdout=log_file,
130
+ stderr=subprocess.STDOUT,
131
+ timeout=timeout,
132
+ )
133
+ return "ok" if proc.returncode == 0 else "error"
134
+ except subprocess.TimeoutExpired:
135
+ return "timeout"
136
+ except Exception as exc: # noqa: BLE001
137
+ with open(RUN_LOG, "a") as log_file:
138
+ log_file.write(f"\nOrchestrator error: {exc}\n")
139
+ return "error"
140
+
141
+
142
+ # ---------------------------------------------------------------------------
143
+ # Main loop
144
+ # ---------------------------------------------------------------------------
145
+
146
+
147
+ def run_loop(
148
+ meta_interval: int = 20,
149
+ max_experiments: int | None = None,
150
+ experiment_timeout: int = 600,
151
+ secondary_gates: dict | None = None,
152
+ ) -> None:
153
+ """Run the HYDRA autoresearch loop.
154
+
155
+ This function runs indefinitely (or until ``max_experiments`` is reached
156
+ or the user interrupts with Ctrl-C).
157
+
158
+ Args:
159
+ meta_interval: Run the meta-agent every N experiments.
160
+ max_experiments: Hard stop after this many experiments (None = infinite).
161
+ experiment_timeout: Seconds before a training run is killed.
162
+ secondary_gates: Optional gate thresholds forwarded to
163
+ :func:`~harness.eval_agent.should_keep`.
164
+ """
165
+ init_results_tsv()
166
+ best_bpb = _load_best_bpb()
167
+ experiment_num = count_experiments()
168
+
169
+ print(
170
+ f"HYDRA Orchestrator starting. "
171
+ f"Experiments so far: {experiment_num}, Best BPB: {best_bpb:.6f}"
172
+ )
173
+
174
+ while max_experiments is None or experiment_num < max_experiments:
175
+ experiment_num += 1
176
+
177
+ # ------------------------------------------------------------------
178
+ # Pre-flight health check
179
+ # ------------------------------------------------------------------
180
+ healthy, hw_warnings = check_health()
181
+ if hw_warnings:
182
+ print(f" [health] {hw_warnings}")
183
+
184
+ # ------------------------------------------------------------------
185
+ # Periodic meta-agent update
186
+ # ------------------------------------------------------------------
187
+ if experiment_num > 1 and experiment_num % meta_interval == 0:
188
+ print(f"\n=== Meta-agent iteration at experiment {experiment_num} ===")
189
+ meta_result = run_meta_iteration()
190
+ print(
191
+ f" state={meta_result['state']} "
192
+ f"best_bpb={meta_result['best_bpb']:.6f} "
193
+ f"changed={meta_result['changed']}"
194
+ )
195
+ if meta_result.get("directive"):
196
+ print(f" directive: {meta_result['directive'][:120]}")
197
+
198
+ # ------------------------------------------------------------------
199
+ # Record baseline commit so we can reset on failure / discard
200
+ # ------------------------------------------------------------------
201
+ pre_commit = current_commit_short()
202
+
203
+ # ------------------------------------------------------------------
204
+ # Run experiment
205
+ # ------------------------------------------------------------------
206
+ print(f"\n--- Experiment {experiment_num} ---")
207
+ reset_peak_stats()
208
+ t0 = time.time()
209
+ run_status = run_experiment(timeout=experiment_timeout)
210
+ elapsed = time.time() - t0
211
+ print(f" run_status={run_status} elapsed={elapsed:.1f}s")
212
+
213
+ # ------------------------------------------------------------------
214
+ # Parse results
215
+ # ------------------------------------------------------------------
216
+ result: ExperimentResult = parse_run_log(RUN_LOG)
217
+
218
+ if result.crashed or run_status != "ok":
219
+ commit = current_commit_short()
220
+ err_short = (
221
+ "timeout"
222
+ if run_status == "timeout"
223
+ else result.error_message[:80].replace("\n", " ")
224
+ )
225
+ log_result(commit, 0.0, 0.0, "crash", err_short)
226
+ print(f" CRASH: {err_short}")
227
+ reset_to(pre_commit)
228
+ continue
229
+
230
+ # ------------------------------------------------------------------
231
+ # Secondary alarms (non-blocking -- logged but do not abort)
232
+ # ------------------------------------------------------------------
233
+ alarms = check_secondary_alarms(result)
234
+ if alarms:
235
+ for alarm in alarms:
236
+ print(f" [alarm] {alarm}")
237
+
238
+ # ------------------------------------------------------------------
239
+ # Keep / discard
240
+ # ------------------------------------------------------------------
241
+ keep, reason = should_keep(result, best_bpb, gates=secondary_gates)
242
+ commit = current_commit_short()
243
+ memory_gb = result.peak_vram_mb / 1024.0
244
+
245
+ if keep:
246
+ best_bpb = result.val_bpb
247
+ description = f"val_bpb improved to {result.val_bpb:.6f}"
248
+ log_result(commit, result.val_bpb, memory_gb, "keep", description)
249
+ print(f" KEEP: val_bpb={result.val_bpb:.6f} (new best)")
250
+ else:
251
+ description = f"{reason} val_bpb={result.val_bpb:.6f}"
252
+ log_result(commit, result.val_bpb, memory_gb, "discard", description)
253
+ print(f" DISCARD: val_bpb={result.val_bpb:.6f} ({reason})")
254
+ reset_to(pre_commit)
255
+
256
+ print(f"\nHYDRA finished after {experiment_num} experiments. Best BPB: {best_bpb:.6f}")
257
+
258
+
259
+ # ---------------------------------------------------------------------------
260
+ # CLI entry point
261
+ # ---------------------------------------------------------------------------
262
+
263
+
264
+ if __name__ == "__main__":
265
+ parser = argparse.ArgumentParser(description="HYDRA Autoresearch Orchestrator")
266
+ parser.add_argument(
267
+ "--meta-interval",
268
+ type=int,
269
+ default=20,
270
+ help="Run meta-agent every N experiments (default: 20)",
271
+ )
272
+ parser.add_argument(
273
+ "--max-experiments",
274
+ type=int,
275
+ default=None,
276
+ help="Stop after N experiments; omit for infinite (default: infinite)",
277
+ )
278
+ parser.add_argument(
279
+ "--experiment-timeout",
280
+ type=int,
281
+ default=600,
282
+ help="Kill training run after N seconds (default: 600)",
283
+ )
284
+ args = parser.parse_args()
285
+
286
+ try:
287
+ run_loop(
288
+ meta_interval=args.meta_interval,
289
+ max_experiments=args.max_experiments,
290
+ experiment_timeout=args.experiment_timeout,
291
+ )
292
+ except KeyboardInterrupt:
293
+ print("\nOrchestrator stopped by user.")
overlay/harness/search_strategy.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Search strategy for HYDRA's meta-evolution loop.
2
+
3
+ Reads results.tsv and diagnoses the current research state as one of:
4
+ EXPLORING -- active improvement trend with diverse experiments
5
+ EXPLOITING -- narrowing in on a local optimum (low diversity)
6
+ STUCK -- no improvement for >= stuck_threshold experiments
7
+ BROKEN -- crash rate exceeds crash_threshold
8
+ """
9
+ import csv
10
+ import os
11
+ from dataclasses import dataclass
12
+
13
+
14
+ @dataclass
15
+ class ResearchState:
16
+ """Diagnosis of the current research trajectory.
17
+
18
+ Attributes:
19
+ label: One of EXPLORING, EXPLOITING, STUCK, BROKEN.
20
+ trend_improving: True when the second half of the recent window is
21
+ better (lower BPB) than the first half.
22
+ experiment_diversity: Rough 0–1 score based on unique description
23
+ prefixes in the recent window.
24
+ crash_rate: Fraction of recent experiments that crashed.
25
+ best_bpb: Lowest val_bpb seen across all experiments.
26
+ last_improvement_at: Ordinal of the experiment that set best_bpb.
27
+ total_experiments: Total rows in results.tsv (excluding header).
28
+ """
29
+
30
+ label: str
31
+ trend_improving: bool
32
+ experiment_diversity: float
33
+ crash_rate: float
34
+ best_bpb: float
35
+ last_improvement_at: int
36
+ total_experiments: int
37
+
38
+
39
+ def diagnose(
40
+ results_path: str,
41
+ window: int = 20,
42
+ stuck_threshold: int = 10,
43
+ crash_threshold: float = 0.5,
44
+ ) -> ResearchState:
45
+ """Diagnose current research state from results.tsv.
46
+
47
+ Args:
48
+ results_path: Path to the tab-separated results file.
49
+ window: Number of recent experiments to consider for trend/diversity.
50
+ stuck_threshold: Experiments without improvement before labelling STUCK.
51
+ crash_threshold: Crash fraction above which state becomes BROKEN.
52
+
53
+ Returns:
54
+ ResearchState with diagnosis label and supporting statistics.
55
+ """
56
+ if not os.path.exists(results_path):
57
+ return ResearchState(
58
+ label="EXPLORING",
59
+ trend_improving=False,
60
+ experiment_diversity=0.0,
61
+ crash_rate=0.0,
62
+ best_bpb=float("inf"),
63
+ last_improvement_at=0,
64
+ total_experiments=0,
65
+ )
66
+
67
+ rows: list[dict] = []
68
+ with open(results_path) as fh:
69
+ reader = csv.DictReader(fh, delimiter="\t")
70
+ for row in reader:
71
+ rows.append(row)
72
+
73
+ if not rows:
74
+ return ResearchState(
75
+ label="EXPLORING",
76
+ trend_improving=False,
77
+ experiment_diversity=0.0,
78
+ crash_rate=0.0,
79
+ best_bpb=float("inf"),
80
+ last_improvement_at=0,
81
+ total_experiments=0,
82
+ )
83
+
84
+ total = len(rows)
85
+ recent = rows[-window:]
86
+
87
+ # Crash rate in the recent window.
88
+ crashes = sum(1 for r in recent if r.get("status") == "crash")
89
+ crash_rate = crashes / len(recent) if recent else 0.0
90
+
91
+ # Best BPB overall and which experiment achieved it.
92
+ best_bpb = float("inf")
93
+ last_improvement_at = 0
94
+ for i, row in enumerate(rows):
95
+ try:
96
+ bpb = float(row.get("val_bpb", "0") or "0")
97
+ except ValueError:
98
+ continue
99
+ if bpb > 0 and bpb < best_bpb:
100
+ best_bpb = bpb
101
+ last_improvement_at = i + 1
102
+
103
+ # Trend: is the second half of the recent window better than the first?
104
+ valid_bpbs = [
105
+ float(r.get("val_bpb", "0") or "0")
106
+ for r in recent
107
+ if float(r.get("val_bpb", "0") or "0") > 0
108
+ ]
109
+ trend_improving = False
110
+ if len(valid_bpbs) >= 4:
111
+ mid = len(valid_bpbs) // 2
112
+ first_half_mean = sum(valid_bpbs[:mid]) / mid
113
+ second_half_mean = sum(valid_bpbs[mid:]) / (len(valid_bpbs) - mid)
114
+ trend_improving = second_half_mean < first_half_mean
115
+
116
+ # Diversity: fraction of unique description prefixes (first 20 chars).
117
+ descriptions = {r.get("description", "")[:20] for r in recent}
118
+ diversity = min(1.0, len(descriptions) / max(1, len(recent)))
119
+
120
+ # Classify state.
121
+ stale = total - last_improvement_at
122
+ if crash_rate > crash_threshold:
123
+ label = "BROKEN"
124
+ elif stale >= stuck_threshold:
125
+ label = "STUCK"
126
+ elif trend_improving and diversity > 0.3:
127
+ label = "EXPLORING"
128
+ else:
129
+ label = "EXPLOITING"
130
+
131
+ return ResearchState(
132
+ label=label,
133
+ trend_improving=trend_improving,
134
+ experiment_diversity=diversity,
135
+ crash_rate=crash_rate,
136
+ best_bpb=best_bpb,
137
+ last_improvement_at=last_improvement_at,
138
+ total_experiments=total,
139
+ )
140
+
141
+
142
+ def should_explore(results_path: str, n: int = 10) -> bool:
143
+ """Return True when no improvement has been seen in the last N experiments.
144
+
145
+ Args:
146
+ results_path: Path to results.tsv.
147
+ n: Look-back window for improvement check.
148
+
149
+ Returns:
150
+ True if the research loop should try bolder mutations.
151
+ """
152
+ state = diagnose(results_path, window=n, stuck_threshold=n)
153
+ return state.label in ("STUCK", "BROKEN")