Spaces:

GAInTech
/

feather-a10g-large-runtime

Paused

App Files Files Community

icarus112 committed 20 days ago

Commit

2c30a29

verified ·

1 Parent(s): 2d94172

Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

overlay/harness/__init__.py +21 -0
overlay/harness/eval_agent.py +172 -0
overlay/harness/git_utils.py +94 -0
overlay/harness/health_monitor.py +86 -0
overlay/harness/meta_agent.py +139 -0
overlay/harness/orchestrator.py +293 -0
overlay/harness/search_strategy.py +153 -0

overlay/harness/__init__.py ADDED Viewed

	@@ -0,0 +1,21 @@

+"""HYDRA harness package: orchestration infrastructure for autoresearch."""
+from harness.eval_agent import ExperimentResult, parse_run_log, should_keep
+from harness.git_utils import current_branch, current_commit_short
+from harness.health_monitor import check_health, get_gpu_stats
+from harness.meta_agent import run_meta_iteration
+from harness.orchestrator import run_loop
+from harness.search_strategy import ResearchState, diagnose
# Public API of the harness package, grouped by the module each name is
# imported from. Order is kept as originally published.
__all__ = [
    # harness.orchestrator
    "run_loop",
    # harness.eval_agent
    "parse_run_log",
    "ExperimentResult",
    "should_keep",
    # harness.meta_agent
    "run_meta_iteration",
    # harness.search_strategy
    "diagnose",
    "ResearchState",
    # harness.health_monitor
    "check_health",
    "get_gpu_stats",
    # harness.git_utils
    "current_branch",
    "current_commit_short",
]

overlay/harness/eval_agent.py ADDED Viewed

	@@ -0,0 +1,172 @@

+"""Eval agent: parse run.log and extract metrics from training runs."""
+import re
+from dataclasses import dataclass, field
@dataclass
class ExperimentResult:
    """Metrics pulled out of the log of one training run.

    Every numeric field starts at zero (``0.0`` for floats, ``0`` for
    ints), so a metric that never shows up in the log keeps that value.
    ``crashed`` is switched on when the log signals a failure or the log
    file cannot be found; ``error_message`` then holds the detail.
    """

    # --- objective -----------------------------------------------------
    val_bpb: float = 0.0  # validation bits-per-byte

    # --- wall-clock ----------------------------------------------------
    training_seconds: float = 0.0
    total_seconds: float = 0.0

    # --- hardware utilisation --------------------------------------------
    peak_vram_mb: float = 0.0
    mfu_percent: float = 0.0

    # --- throughput ------------------------------------------------------
    total_tokens_m: float = 0.0
    num_steps: int = 0

    # --- model shape, as echoed in the train.py summary block -----------
    num_params_m: float = 0.0
    n_layer: int = 0
    d_model: int = 0

    # --- secondary health metrics ---------------------------------------
    mhc_spectral_norm: float = 0.0
    engram_hit_rate: float = 0.0
    sr_bypass_rate: float = 0.0

    # --- run status ------------------------------------------------------
    crashed: bool = False
    error_message: str = ""
# Numeric literal as Python prints floats: optional sign, digits with an
# optional fraction, optional exponent (``0.98``, ``3e-05``, ``-1.5E+3``).
# A bare ``[\d.]+`` would read ``3e-05`` as ``3.0`` and could capture
# ``1.2.3``, which ``float()`` then rejects.
_FLOAT = r"([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"
_INT = r"(\d+)"
# Regex patterns keyed by ExperimentResult attribute name.
# Format must match the ``--- Summary ---`` block printed by train.py.
_PATTERNS: dict[str, str] = {
    "val_bpb": rf"^val_bpb:\s+{_FLOAT}",
    "training_seconds": rf"^training_seconds:\s+{_FLOAT}",
    "total_seconds": rf"^total_seconds:\s+{_FLOAT}",
    "peak_vram_mb": rf"^peak_vram_mb:\s+{_FLOAT}",
    "mfu_percent": rf"^mfu_percent:\s+{_FLOAT}",
    "total_tokens_m": rf"^total_tokens_M:\s+{_FLOAT}",
    "num_steps": rf"^num_steps:\s+{_INT}",
    "num_params_m": rf"^num_params_M:\s+{_FLOAT}",
    "n_layer": rf"^n_layer:\s+{_INT}",
    "d_model": rf"^d_model:\s+{_INT}",
    "mhc_spectral_norm": rf"^mhc_spectral_norm:\s+{_FLOAT}",
    "engram_hit_rate": rf"^engram_hit_rate:\s+{_FLOAT}",
    "sr_bypass_rate": rf"^sr_bypass_rate:\s+{_FLOAT}",
}
# Attributes that should be parsed as int rather than float.
_INT_ATTRS: frozenset[str] = frozenset({"num_steps", "n_layer", "d_model"})


def parse_run_log(log_path: str) -> ExperimentResult:
    """Parse a run.log file and extract all training metrics.

    Args:
        log_path: Path to the run.log file.

    Returns:
        Populated ExperimentResult. ``crashed`` is True when the log
        contains one of the crash markers ("Traceback", "FAIL", "Error")
        or when the file is missing or unreadable; ``error_message`` then
        holds the last 20 log lines or the reason the file could not be
        read. Metrics that do not appear in the log keep their defaults.
    """
    result = ExperimentResult()
    try:
        # Training output can contain stray non-UTF-8 bytes (progress bars,
        # native library messages); replace them instead of aborting.
        with open(log_path, encoding="utf-8", errors="replace") as fh:
            content = fh.read()
    except FileNotFoundError:
        result.crashed = True
        result.error_message = f"Log file not found: {log_path}"
        return result
    except OSError as exc:
        # Permission problems, a directory in place of the file, etc.
        result.crashed = True
        result.error_message = f"Log file unreadable: {log_path} ({exc})"
        return result

    # Detect crash signals in output. Metrics are still extracted below so
    # a partially successful run keeps whatever it managed to report.
    if "Traceback" in content or "FAIL" in content or "Error" in content:
        result.crashed = True
        lines = content.strip().splitlines()
        result.error_message = "\n".join(lines[-20:])

    for attr, pattern in _PATTERNS.items():
        match = re.search(pattern, content, re.MULTILINE)
        if match is None:
            continue
        raw = match.group(1)
        setattr(result, attr, int(raw) if attr in _INT_ATTRS else float(raw))
    return result
def check_secondary_alarms(result: ExperimentResult) -> list[str]:
    """Compare the secondary metrics with their fixed alarm thresholds.

    Args:
        result: Parsed experiment result.

    Returns:
        Human-readable alarm strings in a fixed order (spectral norm,
        engram hit rate, MFU); an empty list when nothing tripped.
    """
    spectral = result.mhc_spectral_norm
    hit_rate = result.engram_hit_rate
    mfu = result.mfu_percent

    # The two low-side checks require a strictly positive reading, so the
    # field default of 0 never raises an alarm.
    checks = (
        (
            spectral > 2.0,
            f"mhc_spectral_norm={spectral:.4f} > 2.0 (ALARM)",
        ),
        (
            0 < hit_rate < 0.1,
            f"engram_hit_rate={hit_rate:.4f} < 0.1 (memory underused)",
        ),
        (
            0 < mfu < 10,
            f"mfu_percent={mfu:.2f}% < 10% (GPU underutilized)",
        ),
    )
    return [message for tripped, message in checks if tripped]
def should_keep(
    result: ExperimentResult,
    best_bpb: float,
    gates: dict | None = None,
) -> tuple[bool, str]:
    """Decide whether to keep or discard an experiment.

    The primary criterion is strictly lower val_bpb than the current best.
    Optional secondary gates (passed from HarnessConfig.secondary_metrics)
    can reject an otherwise-improving result.

    Args:
        result: Parsed experiment result.
        best_bpb: Current best val_bpb across all experiments.
        gates: Optional dict mapping metric name to a threshold dict.
            Two gates are honoured: ``mhc_spectral_norm`` with a ``"max"``
            key and ``engram_hit_rate`` with a ``"min"`` key, e.g.
            ``{"mhc_spectral_norm": {"max": 2.0}}``. A missing or ``None``
            entry disables that gate.

    Returns:
        Tuple of (keep: bool, reason: str). ``reason`` is ``"keep"``,
        ``"crash"``, ``"invalid val_bpb"``, ``"discard"``, or a message
        naming the gate that rejected the result.
    """
    if result.crashed:
        return False, "crash"
    # Written as ``not (x > 0)`` rather than ``x <= 0`` so that NaN, which
    # fails every comparison, is rejected instead of becoming the new best.
    if not (result.val_bpb > 0):
        return False, "invalid val_bpb"
    if result.val_bpb >= best_bpb:
        return False, "discard"

    # Secondary gate checks.
    if gates:
        # ``or {}`` tolerates a gate that is present but set to None.
        gate_mhc = (gates.get("mhc_spectral_norm") or {}).get("max")
        if gate_mhc is not None and result.mhc_spectral_norm > gate_mhc:
            return (
                False,
                f"mhc_spectral_norm {result.mhc_spectral_norm:.4f} > gate {gate_mhc}",
            )
        gate_engram = (gates.get("engram_hit_rate") or {}).get("min")
        if gate_engram is not None and result.engram_hit_rate < gate_engram:
            return (
                False,
                f"engram_hit_rate {result.engram_hit_rate:.4f} < gate {gate_engram}",
            )
    return True, "keep"

overlay/harness/git_utils.py ADDED Viewed

	@@ -0,0 +1,94 @@

+"""Git utilities for HYDRA autoresearch branch management."""
+import os
+import subprocess
# Repository root: the directory one level above the one holding this module.
REPO_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


def run_git(*args: str, check: bool = True) -> subprocess.CompletedProcess:
    """Invoke ``git`` with the given arguments inside ``REPO_DIR``.

    Args:
        *args: Arguments placed after ``git`` on the command line.
        check: When True, a non-zero exit status raises
            ``subprocess.CalledProcessError``.

    Returns:
        The completed process; stdout and stderr are captured as text.
    """
    command = ["git", *args]
    return subprocess.run(
        command,
        cwd=REPO_DIR,
        capture_output=True,
        text=True,
        check=check,
    )
def current_branch() -> str:
    """Report the name of the branch HEAD currently points at.

    Returns:
        Output of ``git rev-parse --abbrev-ref HEAD`` with surrounding
        whitespace removed.
    """
    return run_git("rev-parse", "--abbrev-ref", "HEAD").stdout.strip()
def current_commit_short() -> str:
    """Report the abbreviated hash of the current HEAD commit.

    Returns:
        Output of ``git rev-parse --short=7 HEAD`` with surrounding
        whitespace removed.
    """
    return run_git("rev-parse", "--short=7", "HEAD").stdout.strip()
def create_branch(name: str) -> None:
    """Create branch ``name`` and check it out (``git checkout -b``).

    Args:
        name: Name of the branch to create.
    """
    checkout_args = ("checkout", "-b", name)
    run_git(*checkout_args)
def commit_all(message: str) -> str:
    """Stage everything, attempt a commit, and report the resulting HEAD.

    Args:
        message: Commit message.

    Returns:
        Short hash of HEAD after the commit attempt.
    """
    run_git("add", "-A")
    # check=False: a failing ``git commit`` is not treated as an error here;
    # the hash returned below is then simply the unchanged HEAD.
    run_git("commit", "-m", message, check=False)
    head = current_commit_short()
    return head
def reset_to(commit: str) -> None:
    """Run ``git reset --hard`` to move the branch back to ``commit``.

    Args:
        commit: Commit hash (short or full) to reset to.
    """
    reset_args = ("reset", "--hard", commit)
    run_git(*reset_args)
def get_last_n_diffs(n: int = 3) -> list[str]:
    """Collect ``git show --stat`` output for the most recent commits.

    Args:
        n: How many recent commits to look at.

    Returns:
        One string per commit, newest first, each cut off at 500
        characters. Git failures are not raised (``check=False``).
    """
    log = run_git("log", f"-{n}", "--format=%H", check=False)
    commit_ids = [line for line in log.stdout.strip().split("\n") if line]
    return [
        run_git("show", "--stat", commit_id, check=False).stdout[:500]
        for commit_id in commit_ids
    ]

overlay/harness/health_monitor.py ADDED Viewed

	@@ -0,0 +1,86 @@

+"""Hardware health monitoring for HYDRA experiments.
+Provides lightweight checks that the orchestrator runs before each
+experiment to avoid launching training into a degraded GPU state.
+"""
+import os
+import torch
def get_gpu_stats() -> dict:
    """Return current memory statistics for CUDA device 0.

    Returns:
        ``{"available": False}`` when CUDA is not available. Otherwise a
        dict with ``available`` (True), ``name``, and the MiB-valued keys
        ``memory_allocated_mb``, ``memory_reserved_mb``,
        ``max_memory_allocated_mb`` and ``memory_total_mb``.
    """
    if not torch.cuda.is_available():
        return {"available": False}

    bytes_per_mb = 1024 * 1024
    props = torch.cuda.get_device_properties(0)
    return {
        "available": True,
        "name": torch.cuda.get_device_name(0),
        "memory_allocated_mb": torch.cuda.memory_allocated(0) / bytes_per_mb,
        "memory_reserved_mb": torch.cuda.memory_reserved(0) / bytes_per_mb,
        "max_memory_allocated_mb": torch.cuda.max_memory_allocated(0) / bytes_per_mb,
        # The device-properties attribute is ``total_memory``; there is no
        # ``total_mem``, which raised AttributeError on every GPU machine.
        "memory_total_mb": props.total_memory / bytes_per_mb,
    }
def check_health(
    vram_pressure_pct: float = 90.0,
    min_free_disk_gb: float = 1.0,
) -> tuple[bool, list[str]]:
    """Run the pre-flight GPU and disk checks.

    Args:
        vram_pressure_pct: Percentage of total VRAM above which the
            allocated amount reported by ``get_gpu_stats`` is flagged.
        min_free_disk_gb: Free-space floor, in GiB, for the filesystem
            holding this module.

    Returns:
        ``(healthy, warnings)``; ``healthy`` is True exactly when
        ``warnings`` is empty. With no CUDA device the result is
        ``(False, ["No CUDA GPU available"])`` and the disk is not checked.
    """
    gpu = get_gpu_stats()
    if not gpu["available"]:
        return False, ["No CUDA GPU available"]

    problems: list[str] = []

    # Memory pressure check.
    # NOTE(review): memory_allocated_mb comes from torch.cuda.memory_allocated,
    # which presumably covers only this process's allocations; confirm that
    # is the intended signal.
    allocated_mb = gpu["memory_allocated_mb"]
    total_mb = gpu["memory_total_mb"]
    if total_mb > 0:
        used_pct = allocated_mb / total_mb * 100
    else:
        used_pct = 0.0
    if used_pct > vram_pressure_pct:
        problems.append(
            f"GPU memory pressure: {used_pct:.1f}% allocated "
            f"({allocated_mb:.0f} / {total_mb:.0f} MB)"
        )

    # Disk space check.
    try:
        here = os.path.dirname(os.path.abspath(__file__))
        fs = os.statvfs(here)
        free_gb = (fs.f_bavail * fs.f_frsize) / (1024**3)
        if free_gb < min_free_disk_gb:
            problems.append(f"Low disk space: {free_gb:.2f} GB free")
    except (AttributeError, OSError):
        # os.statvfs not available on all platforms (e.g. Windows).
        pass

    return not problems, problems
def reset_peak_stats() -> None:
    """Clear torch's peak-memory counters ahead of the next experiment.

    Intended to be called right before each training run is launched.
    Does nothing when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        return
    torch.cuda.reset_peak_memory_stats()

overlay/harness/meta_agent.py ADDED Viewed

	@@ -0,0 +1,139 @@

+"""Meta-agent: evolves program.md based on experiment history.
+Runs every ``meta_interval`` inner-loop experiments (configured in
+HarnessConfig).  Reads the current research state from results.tsv,
+decides whether guidance is needed, and appends a directive to
+program.md.  Any previous auto-generated directive is replaced so
+the file stays clean.
+"""
+import os
+from harness.git_utils import REPO_DIR
+from harness.search_strategy import ResearchState, diagnose
+PROGRAM_PATH = os.path.join(REPO_DIR, "program.md")
+RESULTS_PATH = os.path.join(REPO_DIR, "results.tsv")
# Sentinel that marks auto-generated content so it can be cleanly replaced.
_DIRECTIVE_MARKER = "## Meta-Agent Directive (auto-generated)"


def generate_directive(state: ResearchState) -> str | None:
    """Build the directive text for program.md, if one is warranted.

    Only the BROKEN, STUCK and EXPLOITING labels produce text; EXPLORING
    (and any label not recognised here) yields None.

    Args:
        state: Current ResearchState diagnosis.

    Returns:
        Directive beginning with a newline and the marker heading, or
        None when nothing needs to change.
    """
    label = state.label
    heading = f"\n{_DIRECTIVE_MARKER}\n"

    if label == "BROKEN":
        advice = (
            f"ALERT: Crash rate is {state.crash_rate:.0%} in the recent window. "
            "Revert to the last stable commit. Reduce model complexity before "
            "proposing further changes. Suggested actions:\n"
            "- Reduce d_model or n_layer\n"
            "- Reduce batch_size\n"
            "- Disable experimental modules (Engram, mHC, Hestia) one at a time\n"
        )
    elif label == "STUCK":
        # Number of experiments logged since the one that set the best BPB.
        stale = state.total_experiments - state.last_improvement_at
        advice = (
            f"ALERT: No improvement for {stale} experiments "
            f"(best_bpb={state.best_bpb:.6f}). "
            "Apply BOLD changes for the next 5 experiments:\n"
            "- Dramatically change d_model or n_layer (2× or ½)\n"
            "- Toggle Engram or mHC on/off entirely\n"
            "- Change optimizer hyperparameters by 3–5×\n"
            "- Temporarily accept results within 0.5% of baseline\n"
        )
    elif label == "EXPLOITING":
        advice = (
            "Search is converging too early. Inject diversity:\n"
            "- If recent experiments tune LR, try architecture changes instead\n"
            "- If tuning architecture, try optimizer or regularisation changes\n"
            "- Try removing complexity (simplification wins are valuable)\n"
            "- Explore a subsystem not touched in the last 10 experiments\n"
        )
    else:
        return None

    return heading + advice
def _strip_previous_directive(content: str) -> str:
    """Drop an earlier auto-generated directive from program.md text.

    Args:
        content: Full text of program.md.

    Returns:
        Everything before the first directive marker, right-stripped and
        terminated by one newline; ``content`` unchanged when no marker
        is present.
    """
    kept, marker, _discarded = content.partition(_DIRECTIVE_MARKER)
    if not marker:
        return content
    return kept.rstrip() + "\n"
def run_meta_iteration(
    program_path: str = PROGRAM_PATH,
    results_path: str = RESULTS_PATH,
) -> dict:
    """Run one meta-agent iteration.

    Diagnoses the current research state and, when a directive is
    warranted, rewrites program.md with any previous auto-generated
    directive replaced by the new one.

    Args:
        program_path: Path to program.md.
        results_path: Path to results.tsv.

    Returns:
        Summary dict with keys: state, total_experiments, best_bpb,
        crash_rate, changed, and (only when the file was rewritten)
        directive.
    """
    state = diagnose(results_path)
    summary: dict = {
        "state": state.label,
        "total_experiments": state.total_experiments,
        "best_bpb": state.best_bpb,
        "crash_rate": state.crash_rate,
        "changed": False,
    }
    directive = generate_directive(state)
    if directive is None:
        return summary

    # The directive text contains non-ASCII characters, so the file is read
    # and written explicitly as UTF-8 rather than in the locale encoding.
    try:
        with open(program_path, encoding="utf-8") as fh:
            content = fh.read()
    except FileNotFoundError:
        content = ""
    content = _strip_previous_directive(content)
    content = content + "\n" + directive

    # Write to a sibling temp file, then swap it in, so a failed write
    # never leaves program.md half-written.
    tmp_path = program_path + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as fh:
            fh.write(content)
        os.replace(tmp_path, program_path)  # atomic on POSIX
    finally:
        # Only still present when the write or the replace failed.
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)

    summary["changed"] = True
    summary["directive"] = directive.strip()
    return summary

overlay/harness/orchestrator.py ADDED Viewed

	@@ -0,0 +1,293 @@

+"""HYDRA Orchestrator: main loop for autonomous research.
+Usage::
+    python -m harness.orchestrator [--meta-interval N] [--max-experiments N]
+Loop:
+    1. Read current state (branch, results.tsv, program.md)
+    2. [Architect Agent] proposes and applies changes to train.py (external)
+    3. Git commit the changes
+    4. Run training: ``uv run train.py`` captured to run.log
+    5. [Eval Agent] extract metrics from run.log
+    6. Keep or discard based on val_bpb + secondary metric gates
+    7. Log to results.tsv
+    8. Every ``meta_interval`` experiments: [Meta Agent] evolves program.md
+    9. Repeat
+The orchestrator intentionally does NOT modify train.py itself -- it
+provides the infrastructure ("rails") that the autoresearch loop runs on.
+"""
+import argparse
+import csv
+import os
+import subprocess
+import time
+from harness.eval_agent import ExperimentResult, check_secondary_alarms, parse_run_log, should_keep
+from harness.git_utils import REPO_DIR, commit_all, current_commit_short, reset_to
+from harness.health_monitor import check_health, reset_peak_stats
+from harness.meta_agent import run_meta_iteration
+from harness.search_strategy import diagnose
+# ---------------------------------------------------------------------------
+# Paths
+# ---------------------------------------------------------------------------
# Experiment ledger and the captured output of the latest training run,
# both located in the repository root.
RESULTS_FILE = os.path.join(REPO_DIR, "results.tsv")
RUN_LOG = os.path.join(REPO_DIR, "run.log")
# Column names written as the first line of a fresh results.tsv.
_TSV_HEADER = "commit\tval_bpb\tmemory_gb\tstatus\tdescription\n"
# ---------------------------------------------------------------------------
# TSV helpers
# ---------------------------------------------------------------------------
def init_results_tsv() -> None:
    """Write a header-only results.tsv unless the file is already there."""
    if os.path.exists(RESULTS_FILE):
        return
    with open(RESULTS_FILE, "w") as ledger:
        ledger.write(_TSV_HEADER)
def log_result(
    commit: str,
    val_bpb: float,
    memory_gb: float,
    status: str,
    description: str,
) -> None:
    """Append one row to results.tsv.

    Tabs, carriage returns and newlines inside the text fields are
    replaced with spaces so that every call produces exactly one
    five-column row.

    Args:
        commit: Short git hash for this experiment.
        val_bpb: Validation bits-per-byte (0.0 for crashes).
        memory_gb: Peak VRAM usage in gigabytes.
        status: One of keep / discard / crash / timeout.
        description: Short human-readable description.
    """
    # Characters that would break the one-row-per-line, tab-separated layout.
    unsafe = str.maketrans({"\t": " ", "\r": " ", "\n": " "})
    commit_field = str(commit).translate(unsafe)
    status_field = str(status).translate(unsafe)
    description_field = str(description).translate(unsafe)
    with open(RESULTS_FILE, "a") as fh:
        fh.write(
            f"{commit_field}\t{val_bpb:.6f}\t{memory_gb:.2f}\t"
            f"{status_field}\t{description_field}\n"
        )
def count_experiments() -> int:
    """Report how many experiment rows results.tsv holds.

    Returns:
        Number of lines in the file minus one for the header, never
        below zero; 0 when the file does not exist.
    """
    if not os.path.exists(RESULTS_FILE):
        return 0
    with open(RESULTS_FILE) as ledger:
        line_total = len(ledger.readlines())
    return max(0, line_total - 1)
def _load_best_bpb() -> float:
    """Scan results.tsv for the best (lowest positive) val_bpb accepted so far.

    Rows whose status is ``discard``, ``crash`` or ``timeout`` are ignored.
    A run rejected by a secondary gate is logged as ``discard`` with a
    val_bpb below the best, and the main loop resets its commit; counting
    it here would set a bar after restart that no kept commit ever reached.

    Returns:
        Best val_bpb, or ``float("inf")`` when no valid result exists.
    """
    if not os.path.exists(RESULTS_FILE):
        return float("inf")

    rejected = {"discard", "crash", "timeout"}
    best = float("inf")
    with open(RESULTS_FILE, newline="") as fh:
        for row in csv.DictReader(fh, delimiter="\t"):
            # Short rows yield None for missing columns, hence ``or``.
            if (row.get("status") or "").strip() in rejected:
                continue
            try:
                bpb = float(row.get("val_bpb") or "0")
            except (TypeError, ValueError):
                continue
            if 0 < bpb < best:
                best = bpb
    return best
+# ---------------------------------------------------------------------------
+# Experiment execution
+# ---------------------------------------------------------------------------
def run_experiment(timeout: int = 600) -> str:
    """Run ``uv run train.py`` in the repo root, sending its output to run.log.

    run.log is truncated first; stdout and stderr are both written to it.

    Args:
        timeout: Seconds allowed before ``subprocess.run`` gives up.

    Returns:
        ``"ok"`` for exit status 0, ``"timeout"`` when the time limit was
        hit, ``"error"`` for a non-zero exit status or any other exception
        (whose text is appended to run.log).
    """
    command = ["uv", "run", "train.py"]
    try:
        with open(RUN_LOG, "w") as sink:
            completed = subprocess.run(
                command,
                cwd=REPO_DIR,
                stdout=sink,
                stderr=subprocess.STDOUT,
                timeout=timeout,
            )
    except subprocess.TimeoutExpired:
        return "timeout"
    except Exception as exc:  # noqa: BLE001
        # Keep the loop alive: record what went wrong next to the run output.
        with open(RUN_LOG, "a") as sink:
            sink.write(f"\nOrchestrator error: {exc}\n")
        return "error"
    else:
        if completed.returncode == 0:
            return "ok"
        return "error"
+# ---------------------------------------------------------------------------
+# Main loop
+# ---------------------------------------------------------------------------
def run_loop(
    meta_interval: int = 20,
    max_experiments: int | None = None,
    experiment_timeout: int = 600,
    secondary_gates: dict | None = None,
) -> None:
    """Run the HYDRA autoresearch loop.

    This function runs indefinitely (or until ``max_experiments`` is reached
    or the user interrupts with Ctrl-C). Experiment numbering continues from
    the number of rows already present in results.tsv.

    Args:
        meta_interval: Run the meta-agent every N experiments. A value of
            zero or less disables the meta-agent.
        max_experiments: Hard stop after this many experiments (None = infinite).
        experiment_timeout: Seconds before a training run is killed.
        secondary_gates: Optional gate thresholds forwarded to
            :func:`~harness.eval_agent.should_keep`.
    """
    init_results_tsv()
    best_bpb = _load_best_bpb()
    experiment_num = count_experiments()
    print(
        f"HYDRA Orchestrator starting. "
        f"Experiments so far: {experiment_num}, Best BPB: {best_bpb:.6f}"
    )
    while max_experiments is None or experiment_num < max_experiments:
        experiment_num += 1
        # ------------------------------------------------------------------
        # Pre-flight health check (advisory: warnings are printed, the run
        # is launched regardless)
        # ------------------------------------------------------------------
        _healthy, hw_warnings = check_health()
        if hw_warnings:
            print(f"  [health] {hw_warnings}")
        # ------------------------------------------------------------------
        # Periodic meta-agent update
        # ------------------------------------------------------------------
        # ``meta_interval > 0`` is tested first so that 0 disables the
        # meta-agent instead of raising ZeroDivisionError on the modulo.
        meta_due = (
            meta_interval > 0
            and experiment_num > 1
            and experiment_num % meta_interval == 0
        )
        if meta_due:
            print(f"\n=== Meta-agent iteration at experiment {experiment_num} ===")
            meta_result = run_meta_iteration()
            print(
                f"  state={meta_result['state']}  "
                f"best_bpb={meta_result['best_bpb']:.6f}  "
                f"changed={meta_result['changed']}"
            )
            if meta_result.get("directive"):
                print(f"  directive: {meta_result['directive'][:120]}")
        # ------------------------------------------------------------------
        # Record baseline commit so we can reset on failure / discard
        # ------------------------------------------------------------------
        # NOTE(review): nothing in this loop commits the candidate change
        # (commit_all is never called here); presumably the external
        # architect step does so. Confirm, because ``reset_to(pre_commit)``
        # below can only roll back to whatever HEAD is at this point.
        pre_commit = current_commit_short()
        # ------------------------------------------------------------------
        # Run experiment
        # ------------------------------------------------------------------
        print(f"\n--- Experiment {experiment_num} ---")
        reset_peak_stats()
        t0 = time.time()
        run_status = run_experiment(timeout=experiment_timeout)
        elapsed = time.time() - t0
        print(f"  run_status={run_status}  elapsed={elapsed:.1f}s")
        # ------------------------------------------------------------------
        # Parse results
        # ------------------------------------------------------------------
        result: ExperimentResult = parse_run_log(RUN_LOG)
        if result.crashed or run_status != "ok":
            commit = current_commit_short()
            if run_status == "timeout":
                err_short = "timeout"
            else:
                # A non-zero exit without any crash marker in the log leaves
                # error_message empty; fall back to the run status so the
                # results row still says why it was rejected.
                err_short = (
                    result.error_message[:80].replace("\n", " ")
                    or f"run_status={run_status}"
                )
            log_result(commit, 0.0, 0.0, "crash", err_short)
            print(f"  CRASH: {err_short}")
            reset_to(pre_commit)
            continue
        # ------------------------------------------------------------------
        # Secondary alarms (non-blocking -- logged but do not abort)
        # ------------------------------------------------------------------
        for alarm in check_secondary_alarms(result):
            print(f"  [alarm] {alarm}")
        # ------------------------------------------------------------------
        # Keep / discard
        # ------------------------------------------------------------------
        keep, reason = should_keep(result, best_bpb, gates=secondary_gates)
        commit = current_commit_short()
        memory_gb = result.peak_vram_mb / 1024.0
        if keep:
            best_bpb = result.val_bpb
            description = f"val_bpb improved to {result.val_bpb:.6f}"
            log_result(commit, result.val_bpb, memory_gb, "keep", description)
            print(f"  KEEP: val_bpb={result.val_bpb:.6f}  (new best)")
        else:
            description = f"{reason} val_bpb={result.val_bpb:.6f}"
            log_result(commit, result.val_bpb, memory_gb, "discard", description)
            print(f"  DISCARD: val_bpb={result.val_bpb:.6f}  ({reason})")
            reset_to(pre_commit)
    print(f"\nHYDRA finished after {experiment_num} experiments. Best BPB: {best_bpb:.6f}")
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
def _build_arg_parser() -> argparse.ArgumentParser:
    """Assemble the command-line parser for the orchestrator script."""
    cli = argparse.ArgumentParser(description="HYDRA Autoresearch Orchestrator")
    # (flag, default, help text); every option takes an integer value.
    options = (
        (
            "--meta-interval",
            20,
            "Run meta-agent every N experiments (default: 20)",
        ),
        (
            "--max-experiments",
            None,
            "Stop after N experiments; omit for infinite (default: infinite)",
        ),
        (
            "--experiment-timeout",
            600,
            "Kill training run after N seconds (default: 600)",
        ),
    )
    for flag, default, help_text in options:
        cli.add_argument(flag, type=int, default=default, help=help_text)
    return cli


if __name__ == "__main__":
    args = _build_arg_parser().parse_args()
    try:
        run_loop(
            meta_interval=args.meta_interval,
            max_experiments=args.max_experiments,
            experiment_timeout=args.experiment_timeout,
        )
    except KeyboardInterrupt:
        # Ctrl-C is the documented way to stop an unbounded loop.
        print("\nOrchestrator stopped by user.")

overlay/harness/search_strategy.py ADDED Viewed

	@@ -0,0 +1,153 @@

+"""Search strategy for HYDRA's meta-evolution loop.
+Reads results.tsv and diagnoses the current research state as one of:
+  EXPLORING  -- active improvement trend with diverse experiments
+  EXPLOITING -- narrowing in on a local optimum (low diversity)
+  STUCK      -- no improvement for >= stuck_threshold experiments
+  BROKEN     -- crash rate exceeds crash_threshold
+"""
+import csv
+import os
+from dataclasses import dataclass
@dataclass
class ResearchState:
    """Snapshot of where the research loop stands.

    Attributes:
        label: EXPLORING, EXPLOITING, STUCK or BROKEN.
        trend_improving: Whether the later half of the recent window has
            a lower mean BPB than the earlier half.
        experiment_diversity: Score between 0 and 1 derived from how many
            distinct description prefixes occur in the recent window.
        crash_rate: Share of recent experiments whose status is a crash.
        best_bpb: Lowest val_bpb found over all experiments.
        last_improvement_at: 1-based position of the experiment that set
            ``best_bpb`` (0 when there is none).
        total_experiments: Number of data rows in results.tsv.
    """

    # Classification
    label: str
    # Supporting statistics for the recent window
    trend_improving: bool
    experiment_diversity: float
    crash_rate: float
    # Whole-history statistics
    best_bpb: float
    last_improvement_at: int
    total_experiments: int
# Statuses of runs whose changes were thrown away; their val_bpb must not
# count as an achieved best.
_REJECTED_STATUSES = frozenset({"discard", "crash", "timeout"})


def _row_bpb(row: dict) -> float:
    """Return the row's val_bpb as a float, or 0.0 when missing or malformed."""
    try:
        return float(row.get("val_bpb") or "0")
    except (TypeError, ValueError):
        return 0.0


def _no_data_state() -> ResearchState:
    """Return the EXPLORING state reported when there are no results yet."""
    return ResearchState(
        label="EXPLORING",
        trend_improving=False,
        experiment_diversity=0.0,
        crash_rate=0.0,
        best_bpb=float("inf"),
        last_improvement_at=0,
        total_experiments=0,
    )


def diagnose(
    results_path: str,
    window: int = 20,
    stuck_threshold: int = 10,
    crash_threshold: float = 0.5,
) -> ResearchState:
    """Diagnose current research state from results.tsv.

    Malformed rows (non-numeric val_bpb, missing columns) are tolerated:
    they count as experiments but contribute no BPB value.

    Args:
        results_path: Path to the tab-separated results file.
        window: Number of recent experiments to consider for trend/diversity.
        stuck_threshold: Experiments without improvement before labelling STUCK.
        crash_threshold: Crash fraction above which state becomes BROKEN.

    Returns:
        ResearchState with diagnosis label and supporting statistics.
        A missing or header-only file yields EXPLORING with zeroed stats.
    """
    if not os.path.exists(results_path):
        return _no_data_state()
    with open(results_path, newline="") as fh:
        rows = list(csv.DictReader(fh, delimiter="\t"))
    if not rows:
        return _no_data_state()

    total = len(rows)
    recent = rows[-window:]

    # Crash rate in the recent window.
    crashes = sum(1 for r in recent if r.get("status") == "crash")
    crash_rate = crashes / len(recent) if recent else 0.0

    # Best BPB overall and which experiment achieved it. Rejected runs are
    # skipped: a gate-rejected run can carry a val_bpb below the best, but
    # its change was discarded, so it is not an improvement.
    best_bpb = float("inf")
    last_improvement_at = 0
    for ordinal, row in enumerate(rows, start=1):
        if (row.get("status") or "").strip() in _REJECTED_STATUSES:
            continue
        bpb = _row_bpb(row)
        if 0 < bpb < best_bpb:
            best_bpb = bpb
            last_improvement_at = ordinal

    # Trend: is the second half of the recent window better than the first?
    # Every run with a positive BPB takes part, kept or not.
    valid_bpbs = [bpb for bpb in map(_row_bpb, recent) if bpb > 0]
    trend_improving = False
    if len(valid_bpbs) >= 4:
        mid = len(valid_bpbs) // 2
        first_half_mean = sum(valid_bpbs[:mid]) / mid
        second_half_mean = sum(valid_bpbs[mid:]) / (len(valid_bpbs) - mid)
        trend_improving = second_half_mean < first_half_mean

    # Diversity: fraction of unique description prefixes (first 20 chars).
    # ``or ""`` covers short rows, where DictReader fills the column with None.
    descriptions = {(r.get("description") or "")[:20] for r in recent}
    diversity = min(1.0, len(descriptions) / max(1, len(recent)))

    # Classify state; earlier branches take precedence.
    stale = total - last_improvement_at
    if crash_rate > crash_threshold:
        label = "BROKEN"
    elif stale >= stuck_threshold:
        label = "STUCK"
    elif trend_improving and diversity > 0.3:
        label = "EXPLORING"
    else:
        label = "EXPLOITING"

    return ResearchState(
        label=label,
        trend_improving=trend_improving,
        experiment_diversity=diversity,
        crash_rate=crash_rate,
        best_bpb=best_bpb,
        last_improvement_at=last_improvement_at,
        total_experiments=total,
    )
def should_explore(results_path: str, n: int = 10) -> bool:
    """Tell whether the loop should switch to bolder mutations.

    Runs ``diagnose`` with both the window and the stuck threshold set
    to ``n``.

    Args:
        results_path: Path to results.tsv.
        n: Look-back window for the improvement check.

    Returns:
        True when the diagnosis is STUCK or BROKEN.
    """
    verdict = diagnose(results_path, window=n, stuck_threshold=n).label
    return verdict == "STUCK" or verdict == "BROKEN"