icarus112's picture
Upload folder using huggingface_hub
22741d9 verified
#!/usr/bin/env python3
"""HYDRA Autoresearch Mutation Loop.
Runs baseline training -> evaluates -> picks ONE mutation at a time ->
trains -> evaluates -> keeps if quality improves AND tps >= floor.
Repeats until all mutations exhausted or Ctrl+C.
State persisted in .omc/autoresearch_config.json for resume support.
Usage:
python scripts/autoresearch.py # run full loop
python scripts/autoresearch.py --dry-run # show plan, don't train
python scripts/autoresearch.py --baseline # only run baseline eval
"""
from __future__ import annotations
import argparse
import json
import math
import os
import re
import signal
import subprocess
import sys
import time
from pathlib import Path
# Repository root: the directory two levels above this file. It is placed at
# the front of sys.path unless an equal entry is already present.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if not any(entry == _PROJECT_ROOT for entry in sys.path):
    sys.path.insert(0, _PROJECT_ROOT)
# ---------------------------------------------------------------------------
# Mutation catalog (ordered by expected impact)
# ---------------------------------------------------------------------------
# Each entry is a dict with two string fields:
#   "name" - identifier stored in the state file's "mutations_tested" and
#            "mutations_kept" lists; entries whose name is already in
#            "mutations_tested" are skipped when the loop resumes.
#   "env"  - a single "KEY=VALUE" assignment handed to the training and eval
#            subprocesses as an environment override.
# Entries are tried in list order, one per generation. The trailing comments
# record the default each mutation deviates from, as noted by the author.
MUTATIONS = [
# Learning dynamics — env vars verified in hydra/config.py
{"name": "lr_matrix_0.012", "env": "HYDRA_MATRIX_LR=0.012"}, # default 0.12
{"name": "lr_matrix_0.06", "env": "HYDRA_MATRIX_LR=0.06"}, # half default
{"name": "lr_matrix_0.24", "env": "HYDRA_MATRIX_LR=0.24"}, # double default
{"name": "lr_floor_50pct", "env": "HYDRA_LR_MIN_MULT=0.5"}, # default 0.0
{"name": "lr_floor_20pct", "env": "HYDRA_LR_MIN_MULT=0.2"}, # default 0.0
{"name": "embed_lr_0.5", "env": "HYDRA_EMBED_LR=0.5"}, # default 1.0
{"name": "embed_lr_2.0", "env": "HYDRA_EMBED_LR=2.0"}, # default 1.0
{"name": "unembed_lr_0.01", "env": "HYDRA_UNEMBED_LR=0.01"}, # default 0.005
# Architecture — env vars verified in hydra/config.py
{"name": "d_model_384", "env": "HYDRA_D_MODEL=384"}, # default 256
{"name": "d_model_192", "env": "HYDRA_D_MODEL=192"}, # smaller
{"name": "d_state_128", "env": "HYDRA_D_STATE=128"}, # default 64
{"name": "d_state_32", "env": "HYDRA_D_STATE=32"}, # smaller
{"name": "n_layer_6", "env": "HYDRA_N_LAYER=6"}, # default 4
{"name": "n_layer_3", "env": "HYDRA_N_LAYER=3"}, # fewer
{"name": "headdim_16", "env": "HYDRA_HEADDIM=16"}, # default 32 -> more heads
{"name": "headdim_64", "env": "HYDRA_HEADDIM=64"}, # default 32 -> fewer heads
{"name": "expand_3", "env": "HYDRA_EXPAND=3"}, # default 2
{"name": "engram_2048", "env": "HYDRA_ENGRAM_N_COLUMNS=2048"}, # default 1024
{"name": "engram_4096", "env": "HYDRA_ENGRAM_N_COLUMNS=4096"}, # default 1024
{"name": "engram_512", "env": "HYDRA_ENGRAM_N_COLUMNS=512"}, # smaller
# Batch size
# NOTE(review): the comment on batch_32k marks 32768 as the (unverified)
# default; if that is right, this entry re-runs the baseline configuration
# and spends a full time budget on a no-op — confirm against hydra/config.py.
{"name": "batch_32k", "env": "HYDRA_TOTAL_BATCH=32768"}, # default 32768 (verify)
{"name": "batch_16k", "env": "HYDRA_TOTAL_BATCH=16384"}, # smaller batch
{"name": "batch_65k", "env": "HYDRA_TOTAL_BATCH=65536"}, # larger batch
# Regularization — env vars verified in hydra/model.py + hydra/config.py
{"name": "dropout_0.05", "env": "HYDRA_DROPOUT=0.05"}, # default 0.2
{"name": "dropout_0.1", "env": "HYDRA_DROPOUT=0.1"}, # default 0.2
{"name": "dropout_0.3", "env": "HYDRA_DROPOUT=0.3"}, # higher
]
# ---------------------------------------------------------------------------
# State management
# ---------------------------------------------------------------------------
# Directory and JSON file holding the loop state between invocations.
STATE_DIR = os.path.join(_PROJECT_ROOT, ".omc")
STATE_FILE = os.path.join(STATE_DIR, "autoresearch_config.json")
# Template for a fresh run. load_state() returns it when no state file exists
# and uses it to backfill keys missing from an existing file.
DEFAULT_STATE = {
# Eval "quality_score" of the baseline run; None means the baseline has not
# been run yet (main() checks this to decide whether to run generation 0).
"baseline_quality": None,
# Last reported tps of the baseline training run.
"baseline_tps": None,
# Number of the most recently recorded generation (0 = baseline).
"current_gen": 0,
# Names of MUTATIONS entries already processed, whether kept or not.
"mutations_tested": [],
# Names of MUTATIONS entries that improved quality and were kept.
"mutations_kept": [],
# Minimum acceptable tps; overwritten from --tps-floor on every start.
"tps_floor": 62000,
# Seconds per training run; overwritten from --time-budget on every start.
"time_budget": 600,
# One dict per recorded generation (metrics, delta string, kept flag).
"history": [],
}
def load_state() -> dict:
    """Load the persisted loop state, or a fresh copy of the defaults.

    Returns:
        The dict read from ``STATE_FILE`` with any key it lacks backfilled
        from ``DEFAULT_STATE``; or a copy of ``DEFAULT_STATE`` when the file
        does not exist.

    Raises:
        json.JSONDecodeError: If the state file exists but is not valid JSON.
    """
    import copy  # local import: nothing else in the file needs it

    # Deep copies matter here: DEFAULT_STATE holds list values, and the caller
    # appends to them. A shallow copy would leak those appends back into the
    # module-level template.
    if not os.path.exists(STATE_FILE):
        return copy.deepcopy(DEFAULT_STATE)

    with open(STATE_FILE, "r", encoding="utf-8") as f:
        state = json.load(f)

    # Backfill keys introduced after the file was written.
    for key, default in DEFAULT_STATE.items():
        if key not in state:
            state[key] = copy.deepcopy(default)
    return state
def save_state(state: dict) -> None:
    """Persist ``state`` to ``STATE_FILE`` as indented JSON.

    The data is written to a sibling temp file and then moved over the real
    file with ``os.replace``, so an interrupt in the middle of a write cannot
    leave a truncated state file that would break the next resume.

    Args:
        state: JSON-serialisable loop state.

    Raises:
        TypeError: If ``state`` contains values ``json`` cannot serialise.
        OSError: If the directory or file cannot be written.
    """
    os.makedirs(STATE_DIR, exist_ok=True)
    tmp_path = STATE_FILE + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(state, f, indent=2)
            f.flush()
            os.fsync(f.fileno())
        # Same directory as the target, so the rename stays on one filesystem.
        os.replace(tmp_path, STATE_FILE)
    except BaseException:
        # The previous state file is still intact; just drop the partial temp.
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # temp file was never created or is already gone
        raise
# ---------------------------------------------------------------------------
# Training subprocess
# ---------------------------------------------------------------------------
def build_env(extra_env: str | None = None) -> dict[str, str]:
"""Build environment for training subprocess."""
env = os.environ.copy()
# Ensure CUDA paths
ld_paths = ["/usr/lib/wsl/lib", "/usr/local/cuda/lib64"]
existing = env.get("LD_LIBRARY_PATH", "")
for p in ld_paths:
if p not in existing:
existing = p + ":" + existing
env["LD_LIBRARY_PATH"] = existing
# Apply mutation env var
if extra_env:
key, val = extra_env.split("=", 1)
env[key] = val
return env
def run_training(time_budget: int, extra_env: str | None = None) -> dict | None:
    """Run ``train.py`` in a subprocess and return its parsed metrics.

    The child's combined stdout/stderr is read line by line. Every ``step=``
    line whose step number is a multiple of 50, and every other line that
    mentions ``val_bpb`` or ``factual_english_score``, is echoed as progress.

    Args:
        time_budget: Value exported to the child as ``HYDRA_TIME_BUDGET``.
        extra_env: Optional ``"KEY=VALUE"`` override passed to ``build_env``.

    Returns:
        The dict produced by ``_parse_training_output``, or ``None`` when the
        process could not be started or exited with a non-zero status.
    """
    env = build_env(extra_env)
    env["HYDRA_TIME_BUDGET"] = str(time_budget)
    cmd = [os.path.join(_PROJECT_ROOT, ".venv", "bin", "python"), "-u", "train.py"]
    try:
        proc = subprocess.Popen(
            cmd,
            cwd=_PROJECT_ROOT,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print(f" [ERROR] Failed to start training: {e}")
        return None

    output_lines: list[str] = []
    try:
        for line in proc.stdout:
            line = line.rstrip()
            output_lines.append(line)
            if line.startswith("step="):
                # Print progress every 50 steps
                m = re.search(r"step=(\d+)", line)
                if m and int(m.group(1)) % 50 == 0:
                    tps_m = re.search(r"tps=(\d+)", line)
                    bpb_m = re.search(r"bpb=([\d.]+)", line)
                    tps = tps_m.group(1) if tps_m else "?"
                    bpb = bpb_m.group(1) if bpb_m else "?"
                    print(f" step={m.group(1)} tps={tps} bpb={bpb}", flush=True)
            elif "val_bpb" in line or "factual_english_score" in line:
                print(f" {line}", flush=True)
    except BaseException:
        # Whatever interrupted the read (KeyboardInterrupt, a decode error,
        # ...), do not leave the training process running behind us.
        proc.terminate()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
        raise
    finally:
        proc.stdout.close()

    proc.wait()
    if proc.returncode != 0:
        print(f" [ERROR] Training exited with code {proc.returncode}")
        # Print last 10 lines for debugging
        for line in output_lines[-10:]:
            print(f" {line}")
        return None
    return _parse_training_output(output_lines)
def _parse_training_output(lines: list[str]) -> dict:
"""Extract metrics from training output lines."""
metrics: dict[str, float] = {}
for line in lines:
# Key=value pairs from summary block
for key in ["val_bpb", "training_seconds", "peak_vram_mb", "mfu_percent",
"total_tokens_M", "num_steps", "factual_english_score",
"factual_english_hits"]:
m = re.match(rf"^{key}:\s+([\d.]+)", line.strip())
if m:
metrics[key] = float(m.group(1))
# TPS from last step line
if line.startswith("step="):
tps_m = re.search(r"tps=(\d+)", line)
if tps_m:
metrics["tps"] = float(tps_m.group(1))
return metrics
# ---------------------------------------------------------------------------
# Eval integration
# ---------------------------------------------------------------------------
def run_eval_after_training(extra_env: str | None = None,
                            timeout: int = 120) -> dict | None:
    """Run ``scripts/eval_quality.py`` and parse the metrics it prints.

    Args:
        extra_env: Optional ``"KEY=VALUE"`` override passed to ``build_env``.
        timeout: Maximum number of seconds the eval process may run.

    Returns:
        A dict built from every stdout line of the exact form
        ``name=number``, or ``None`` if the process could not be run, timed
        out, exited non-zero, or printed no such line.
    """
    env = build_env(extra_env)
    cmd = [
        os.path.join(_PROJECT_ROOT, ".venv", "bin", "python"),
        os.path.join(_PROJECT_ROOT, "scripts", "eval_quality.py"),
    ]
    try:
        result = subprocess.run(
            cmd,
            cwd=_PROJECT_ROOT,
            env=env,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        print(f" [ERROR] Eval timed out ({timeout}s)")
        return None
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        print(f" [ERROR] Eval failed: {e}")
        return None

    if result.returncode != 0:
        print(f" [ERROR] Eval exited with code {result.returncode}")
        # splitlines() keeps the trailing newline from using up one of the
        # tail slots with an empty line.
        for line in result.stdout.splitlines()[-10:]:
            print(f" {line}")
        for line in result.stderr.splitlines()[-5:]:
            print(f" {line}")
        return None

    # Parse key=value output
    pair_re = re.compile(r"^([\w]+)=([\d.eE+-]+)$")
    metrics: dict[str, float] = {}
    for line in result.stdout.splitlines():
        m = pair_re.match(line.strip())
        if not m:
            continue
        try:
            metrics[m.group(1)] = float(m.group(2))
        except ValueError:
            # The character class also admits non-numbers such as "1-2";
            # such lines are not metrics, so skip them.
            continue
    return metrics if metrics else None
# ---------------------------------------------------------------------------
# Git operations
# ---------------------------------------------------------------------------
def git_commit(message: str) -> bool:
    """Stage all changes in the project root and commit them.

    This is best-effort: any failure is reported as a warning and turned
    into ``False`` so the caller's loop keeps running.

    Args:
        message: Commit message.

    Returns:
        ``True`` if both ``git add -A`` and ``git commit`` succeeded,
        ``False`` if either command failed, timed out, or could not be
        started (for example when git is not installed).
    """
    try:
        subprocess.run(["git", "add", "-A"], cwd=_PROJECT_ROOT, check=True,
                       capture_output=True, timeout=30)
        subprocess.run(
            ["git", "commit", "-m", message],
            cwd=_PROJECT_ROOT, check=True, capture_output=True, timeout=30,
        )
    except subprocess.CalledProcessError as e:
        # Output was captured as bytes; include git's own explanation
        # (e.g. "nothing to commit") in the warning.
        detail = (e.stderr or e.stdout or b"").decode(errors="replace").strip()
        suffix = f" ({detail})" if detail else ""
        print(f" [WARN] Git commit failed: {e}{suffix}")
        return False
    except (subprocess.TimeoutExpired, OSError) as e:
        print(f" [WARN] Git commit failed: {e}")
        return False
    return True
# ---------------------------------------------------------------------------
# Main loop
# ---------------------------------------------------------------------------
# Becomes True after the first Ctrl+C; main() checks it before each generation.
_SHUTDOWN = False


def _handle_sigint(signum, frame):
    """SIGINT handler: request a graceful stop first, force-exit on repeat.

    The first call sets ``_SHUTDOWN`` and returns. A call made while
    ``_SHUTDOWN`` is already set exits the process with status 1. Both
    arguments are the standard signal-handler arguments and are unused.
    """
    global _SHUTDOWN
    if not _SHUTDOWN:
        _SHUTDOWN = True
        print("\n[AUTORESEARCH] Ctrl+C received — finishing current gen then saving state...")
        return
    print("\n[AUTORESEARCH] Double Ctrl+C — force exit")
    sys.exit(1)
def _failure_entry(gen: int, name: str, baseline_score: float, delta: str,
                   tps: float = 0, bpb: float = 0) -> dict:
    """Build the history record for a generation that produced no quality score."""
    return {
        "gen": gen, "mutation": name,
        "quality_score": 0, "baseline_score": baseline_score,
        "delta": delta, "tps": tps, "ppl": 0, "bleu4": 0,
        "rouge_l": 0, "factual": 0, "bpb": bpb, "repetition_rate": 0,
        "kept": False,
    }


def _scored_entry(gen: int, name: str, quality: float, baseline_score: float,
                  delta: str, tps: float, eval_metrics: dict, kept: bool) -> dict:
    """Build the history record for a generation that was fully evaluated."""
    return {
        "gen": gen,
        "mutation": name,
        "quality_score": quality,
        "baseline_score": baseline_score,
        "delta": delta,
        "tps": tps,
        "ppl": eval_metrics.get("ppl", 0),
        "bleu4": eval_metrics.get("bleu4", 0),
        "rouge_l": eval_metrics.get("rouge_l", 0),
        "factual": eval_metrics.get("factual", 0),
        "bpb": eval_metrics.get("bpb", 0),
        "repetition_rate": eval_metrics.get("repetition_rate", 0),
        "kept": kept,
    }


def _record_generation(state: dict, gen: int, name: str, entry: dict) -> None:
    """Mark mutation ``name`` as tested, append its history entry, and save."""
    state["mutations_tested"].append(name)
    state["current_gen"] = gen
    state["history"].append(entry)
    save_state(state)


def _print_summary(state: dict, current_quality: float) -> None:
    """Print the end-of-run totals and the per-generation history table."""
    print("\n" + "=" * 70)
    print("AUTORESEARCH COMPLETE")
    print("=" * 70)
    print(f"Total generations: {state['current_gen']}")
    print(f"Mutations kept: {state['mutations_kept']}")
    print(f"Final quality: {current_quality:.4f}")
    if state["baseline_quality"]:
        total_delta = ((current_quality - state["baseline_quality"]) /
                       max(abs(state["baseline_quality"]), 1e-6)) * 100
        print(f"Total improvement: {total_delta:+.1f}%")
    print()
    # Print history table
    print(f"{'Gen':>4} {'Mutation':>20} {'Quality':>8} {'Delta':>8} {'TPS':>7} {'PPL':>8} {'BPB':>7} {'Kept':>5}")
    print("-" * 75)
    for h in state["history"]:
        print(f"{h['gen']:4d} {h['mutation']:>20s} {h['quality_score']:8.4f} "
              f"{h['delta']:>8s} {h['tps']:7.0f} {h['ppl']:8.2f} "
              f"{h.get('bpb', 0):7.4f} {' YES' if h['kept'] else ' NO'}")


def main():
    """Command-line entry point for the mutation loop.

    Steps:
      1. Load the persisted state and apply ``--time-budget`` / ``--tps-floor``.
      2. With ``--dry-run``, list the untested mutations and stop.
      3. If no baseline is recorded, train and evaluate it (generation 0);
         with ``--baseline``, stop after that.
      4. For each untested mutation: train, reject if tps is below the floor,
         evaluate, and keep it (with a git commit) when its quality score
         beats the best kept score so far. State is saved after every
         generation.
      5. Print a summary table.

    A first Ctrl+C sets ``_SHUTDOWN``; the loop then stops before starting
    the next generation. A generation whose training or eval fails after
    shutdown was requested is not recorded, so a resumed run retries it.
    """
    signal.signal(signal.SIGINT, _handle_sigint)
    parser = argparse.ArgumentParser(description="HYDRA autoresearch mutation loop")
    parser.add_argument("--dry-run", action="store_true", help="Show plan, don't train")
    parser.add_argument("--baseline", action="store_true", help="Only run baseline")
    parser.add_argument("--time-budget", type=int, default=600, help="Time budget per run (s)")
    parser.add_argument("--tps-floor", type=int, default=62000, help="Minimum acceptable TPS")
    args = parser.parse_args()

    state = load_state()
    state["time_budget"] = args.time_budget
    state["tps_floor"] = args.tps_floor
    tested = set(state["mutations_tested"])
    remaining = [m for m in MUTATIONS if m["name"] not in tested]

    print("=" * 70)
    print("HYDRA AUTORESEARCH MUTATION LOOP")
    print("=" * 70)
    print(f"Time budget per run: {state['time_budget']}s")
    print(f"TPS floor: {state['tps_floor']}")
    print(f"Current gen: {state['current_gen']}")
    print(f"Mutations tested: {len(tested)}/{len(MUTATIONS)}")
    print(f"Mutations kept: {state['mutations_kept']}")
    print(f"Remaining: {[m['name'] for m in remaining]}")
    print()

    if args.dry_run:
        print("[DRY RUN] Would test these mutations in order:")
        for i, m in enumerate(remaining):
            print(f" {i + 1}. {m['name']} ({m['env']})")
        return

    # -----------------------------------------------------------------------
    # Baseline (Gen 0)
    # -----------------------------------------------------------------------
    if state["baseline_quality"] is None:
        print("[GEN 0] Running baseline training + evaluation...")
        train_metrics = run_training(state["time_budget"])
        if train_metrics is None:
            print("[FAIL] Baseline training failed")
            save_state(state)
            return
        print("[GEN 0] Running quality evaluation...")
        eval_metrics = run_eval_after_training()
        if eval_metrics is None:
            print("[FAIL] Baseline eval failed")
            save_state(state)
            return
        baseline_tps = train_metrics.get("tps", 0)
        baseline_quality = eval_metrics.get("quality_score", 0)
        state["baseline_quality"] = baseline_quality
        state["baseline_tps"] = baseline_tps
        state["current_gen"] = 0
        state["history"].append(_scored_entry(
            0, "baseline", baseline_quality, baseline_quality, "0.0%",
            baseline_tps, eval_metrics, True,
        ))
        save_state(state)
        print(f"[GEN 0] BASELINE: quality={baseline_quality:.4f} tps={baseline_tps:.0f}")
    else:
        print(f"[RESUME] Baseline quality={state['baseline_quality']:.4f} tps={state['baseline_tps']:.0f}")
    if args.baseline:
        return

    # -----------------------------------------------------------------------
    # Mutation loop
    # -----------------------------------------------------------------------
    current_quality = state["baseline_quality"]
    # Track best quality so far (from last kept mutation, not just baseline)
    kept_entries = [h for h in state["history"] if h.get("kept")]
    if kept_entries:
        current_quality = kept_entries[-1]["quality_score"]

    for mutation in remaining:
        if _SHUTDOWN:
            print("[AUTORESEARCH] Shutdown requested — saving state")
            save_state(state)
            return

        gen = state["current_gen"] + 1
        name = mutation["name"]
        env_str = mutation["env"]
        print(f"\n[GEN {gen}] Testing {name} ({env_str})...")
        print(f" Current best quality: {current_quality:.4f}")

        # Train with mutation
        # NOTE(review): only this mutation's env var is passed to the run; the
        # env vars of previously kept mutations are not re-applied here, so
        # each candidate trains on the otherwise unmodified configuration
        # while being compared against the best kept score. Confirm whether
        # kept mutations are meant to carry forward.
        print(f" Training ({state['time_budget']}s)...", flush=True)
        train_metrics = run_training(state["time_budget"], extra_env=env_str)
        if train_metrics is None:
            if _SHUTDOWN:
                # Ctrl+C typed at a terminal is delivered to the child process
                # as well, so this failure is most likely the interrupt, not
                # the mutation. Leave it untested so a resumed run retries it.
                print("[AUTORESEARCH] Shutdown requested — saving state")
                save_state(state)
                return
            print(f" [SKIP] Training failed for {name}")
            _record_generation(state, gen, name,
                               _failure_entry(gen, name, current_quality, "FAIL"))
            continue

        tps = train_metrics.get("tps", 0)
        # TPS floor check
        if tps < state["tps_floor"]:
            print(f" [REJECT] TPS={tps:.0f} < floor={state['tps_floor']} — skipping eval")
            _record_generation(state, gen, name, _failure_entry(
                gen, name, current_quality, f"TPS_FAIL({tps:.0f})",
                tps=tps, bpb=train_metrics.get("val_bpb", 0),
            ))
            continue

        # Evaluate
        print(" Evaluating...", flush=True)
        eval_metrics = run_eval_after_training(extra_env=env_str)
        if eval_metrics is None:
            if _SHUTDOWN:
                # Same reasoning as for an interrupted training run.
                print("[AUTORESEARCH] Shutdown requested — saving state")
                save_state(state)
                return
            print(f" [SKIP] Eval failed for {name}")
            _record_generation(state, gen, name, _failure_entry(
                gen, name, current_quality, "EVAL_FAIL", tps=tps,
            ))
            continue

        quality = eval_metrics.get("quality_score", 0)
        # Relative change versus the best kept score; the max() guards a
        # zero baseline.
        delta_pct = ((quality - current_quality) / max(abs(current_quality), 1e-6)) * 100
        delta_str = f"{delta_pct:+.1f}%"
        kept = quality > current_quality and tps >= state["tps_floor"]
        status = "KEEP" if kept else "DISCARD"
        entry = _scored_entry(gen, name, quality, current_quality, delta_str,
                              tps, eval_metrics, kept)
        print(f"\n[GEN {gen}] {name}: quality={quality:.4f} ({delta_str}) tps={tps:.0f} -> {status}")
        if kept:
            current_quality = quality
            state["mutations_kept"].append(name)
            git_commit(f"autoresearch: gen {gen} {name} quality {delta_str}")
        _record_generation(state, gen, name, entry)

    # -----------------------------------------------------------------------
    # Summary
    # -----------------------------------------------------------------------
    _print_summary(state, current_quality)


if __name__ == "__main__":
    main()