"""Tier-1 evaluation driver: baseline vs looped Laguna on knowledge-MC tasks.

Runs a matrix of loop configs (baseline / layer / block / naive) over one or more
multiple-choice tasks and reports acc, acc_norm, and the delta vs baseline (pp).

Real run (GPU):
    uv run python scripts/tf_run_eval.py --model poolside/Laguna-XS.2 \
        --tasks arc_challenge,openbookqa --dtype bfloat16 --device cuda

Local plumbing check (CPU, tiny random model + REAL tokenizer + REAL dataset):
    uv run python scripts/tf_run_eval.py --tiny --tasks arc_challenge --limit 20

The --tiny path exercises the entire real code path (tokenizer, dataset loading,
scoring, config matrix) against a tiny random-weight model; metrics are meaningless
but it proves the GPU run is turn-key.
"""
from __future__ import annotations
import argparse
import json
import sys
import time
from datetime import datetime
from pathlib import Path
# Prepend the parent of this file's directory to sys.path before the project imports
# below -- presumably so `looped_laguna` resolves when this file is run directly as a
# script rather than as an installed package (verify against the repo layout).
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
import torch
from looped_laguna import LoopConfig, load_model_and_tokenizer
from looped_laguna.eval import DATASET_LOADERS, format_results, run_matrix
# Default for --tokenizer: the `laguna_src` entry in the parent of this file's directory.
DEFAULT_TOKENIZER = str(Path(__file__).resolve().parent.parent / "laguna_src")
def build_configs(num_layers: int, ks: list[int], width: int, center: float, controls: bool = True) -> dict:
    """Config matrix, named `{mode}-{strategy}-K{k}` (mode x strategy is a clean 2x2).

    strategy: rk    = damped K-stage Runge-Kutta (the proposed method)
              naive = undamped x<-g(x) repeated K times (control, expected to collapse)
    mode:     layer = iterate each window layer in place (required for MoE)
              block = iterate the whole window as a unit (control, routing thrash)

    We sweep K for the proposed config (layer-rk). With `controls=True` we also add the
    other three corners of the 2x2 at K_max (block-rk isolates mode; layer-naive isolates
    strategy; block-naive is the canonical "naive looped transformer"). Pass controls=False
    for a clean K-sweep (baseline + layer-rk only) that doesn't duplicate the controls a
    breadth run already covers.

    Args:
        num_layers: layer count handed to `LoopConfig.from_depth_fraction`.
        ks: loop counts K; one `layer-rk-K{k}` entry is created per value.
        width: loop window width, forwarded as `width`.
        center: loop window center, forwarded as `center_frac`.
        controls: when true, add the three control configs at `max(ks)`.

    Returns:
        Dict mapping config name to its `LoopConfig`; "baseline" maps to None.
        Insertion order is baseline, the K sweep, then the controls.

    Raises:
        ValueError: if `controls` is true and `ks` is empty, since there is no
            K_max to build the control configs at.
    """
    window = LoopConfig.from_depth_fraction(num_layers, center_frac=center, width=width).window
    configs: dict[str, LoopConfig | None] = {"baseline": None}

    # The proposed method: one layer-mode config per requested K.
    for k in ks:
        configs[f"layer-rk-K{k}"] = LoopConfig(window=window, K=k, mode="layer")

    if controls:
        if not ks:
            # max([]) would raise an opaque "max() arg is an empty sequence"; say what is wrong.
            raise ValueError(
                "build_configs: `ks` must be non-empty when controls=True "
                "(the control configs are built at max(ks))"
            )
        k_max = max(ks)
        configs[f"block-rk-K{k_max}"] = LoopConfig(window=window, K=k_max, mode="block")
        configs[f"layer-naive-K{k_max}"] = LoopConfig(window=window, K=k_max, mode="layer", naive=True)
        configs[f"block-naive-K{k_max}"] = LoopConfig(window=window, K=k_max, mode="block", naive=True)
    return configs
def main() -> None:
    """Parse CLI arguments, evaluate the config matrix on each task, and report results.

    Flow: parse and validate the arguments, load the model and tokenizer, build the
    loop-config matrix with `build_configs`, then for each requested task load its
    examples, score every config through `run_matrix`, and print the lines produced
    by `format_results` plus the elapsed time.

    Unless --no-save is given, a JSON file holding {"meta", "results"} is rewritten
    after every finished config and again after every task. Each rewrite goes to a
    temporary sibling file that is then renamed over the target, so an interruption
    in the middle of a write cannot truncate results saved earlier. Unless
    --no-peritem is given, a `<output stem>_peritem.jsonl` path is also handed to
    `run_matrix` for per-item data.

    Raises:
        SystemExit: on a malformed --ks value, an empty --tasks list, or a task
            name that is not in `DATASET_LOADERS`.
    """
    p = argparse.ArgumentParser(description="Tier-1 eval: baseline vs looped Laguna on knowledge-MC tasks.")
    p.add_argument("--model", default="poolside/Laguna-XS.2",
                   help="HF model id or local path of the real model (ignored with --tiny)")
    p.add_argument("--tokenizer", default=DEFAULT_TOKENIZER,
                   help="tokenizer path/id (default: the vendored laguna_src)")
    p.add_argument("--tasks", default="arc_challenge",
                   help="comma-separated task names (see DATASET_LOADERS in eval.py)")
    p.add_argument("--limit", type=int, default=None, help="max examples per task (default: all)")
    p.add_argument("--ks", default="2,3", help="comma-separated loop counts K to sweep for layer-rk")
    p.add_argument("--no-controls", dest="controls", action="store_false",
                   help="only run baseline + layer-rk per K (skip block/naive controls), e.g. for a K-sweep")
    p.add_argument("--width", type=int, default=4, help="loop window width in layers")
    p.add_argument("--center", type=float, default=0.5, help="loop window center as a depth fraction (0-1)")
    p.add_argument("--dtype", default="bfloat16", help="model dtype: bfloat16/float16/float32")
    p.add_argument("--device", default="cuda", help="device or device_map: cuda/cpu/auto")
    p.add_argument("--batch-size", type=int, default=16,
                   help="padded-batch size for scoring (bump to 32-64 on big GPUs; lower for long tasks)")
    p.add_argument("--tiny", action="store_true",
                   help="use a tiny random-weight model instead of real weights (plumbing check)")
    p.add_argument("--tiny-layers", type=int, default=8, help="number of layers for the --tiny model")
    p.add_argument("--output", default=None,
                   help="results JSON path (default: auto-named results_eval_<timestamp>.json)")
    p.add_argument("--no-save", action="store_true", help="do not write results to disk")
    p.add_argument("--no-peritem", dest="peritem", action="store_false",
                   help="don't save per-item raw data (per-choice LLs/gold/subject) for re-aggregation")
    args = p.parse_args()

    # Report a malformed --ks as a normal usage error instead of a bare ValueError traceback.
    try:
        ks = [int(x) for x in args.ks.split(",")]
    except ValueError:
        p.error(f"--ks must be a comma-separated list of integers, got {args.ks!r}")

    # Drop empty entries (e.g. a trailing comma) and repeated names: results are keyed by
    # task, so a repeat would only re-run the task and overwrite its own entry.
    tasks = list(dict.fromkeys(name for name in (t.strip() for t in args.tasks.split(",")) if name))
    if not tasks:
        p.error("--tasks must name at least one task")
    for t in tasks:
        if t not in DATASET_LOADERS:
            raise SystemExit(f"unknown task {t!r}; known: {sorted(DATASET_LOADERS)}")

    model, tokenizer = load_model_and_tokenizer(
        args.model, args.tokenizer, dtype=args.dtype, device=args.device,
        tiny=args.tiny, tiny_layers=args.tiny_layers,
    )
    num_layers = model.config.num_hidden_layers
    configs = build_configs(num_layers, ks, args.width, args.center, controls=args.controls)
    # Every non-baseline config is built from the same window; read it off the first one.
    window = next(c.window for c in configs.values() if c is not None)
    print(f"model: {'tiny' if args.tiny else args.model} | layers={num_layers} | loop window={window}")
    print(f"configs: {list(configs)}")

    # Always persist (incrementally) unless --no-save, so a crash never loses finished work.
    save_path = None if args.no_save else (
        args.output or f"results_eval_{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
    )
    if save_path:
        # Make sure the destination directory exists before any result is produced.
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    meta = {"model": "tiny" if args.tiny else args.model, "num_layers": num_layers,
            "window": list(window), "ks": ks, "tasks": tasks, "limit": args.limit}
    all_results: dict = {}

    # Per-item raw data (per-choice LLs + gold/subject/lengths) for later re-aggregation.
    if save_path and args.peritem:
        peritem_path = str(Path(save_path).with_suffix("")) + "_peritem.jsonl"
    else:
        peritem_path = None

    def save() -> None:
        """Write meta + results collected so far to `save_path` (no-op with --no-save)."""
        if not save_path:
            return
        target = Path(save_path)
        # Write a sibling temp file, then rename it over the target: an interruption
        # mid-write leaves the previous complete file in place rather than a truncated one.
        tmp = target.with_name(target.name + ".tmp")
        tmp.write_text(json.dumps({"meta": meta, "results": all_results}, indent=2), encoding="utf-8")
        tmp.replace(target)

    if save_path:
        print(f"saving results to {save_path} (updated after every config)")
    if peritem_path:
        Path(peritem_path).write_text("")  # truncate; run_matrix appends one line per task
        print(f"saving per-item data to {peritem_path}")

    for task in tasks:
        examples = DATASET_LOADERS[task](limit=args.limit)
        print(f"\n=== {task} ({len(examples)} items) ===")
        all_results[task] = {}  # partial configs land here as they finish
        t0 = time.time()
        results = run_matrix(
            model, tokenizer, examples, configs, batch_size=args.batch_size,
            peritem_path=peritem_path, task_name=task,
            # `t=task` binds the current task now, not when the callback eventually fires.
            on_result=lambda name, r, t=task: (all_results[t].__setitem__(name, r), save()),
        )
        all_results[task] = results  # full results, now with deltas + significance
        save()
        for line in format_results(results):
            print(line)
        print(f"  ({time.time() - t0:.1f}s)")

    if save_path:
        print(f"\nresults saved to {save_path}")
# Run the evaluation only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|