File size: 7,609 Bytes
7e9d95d
 
 
 
 
 
cd009b8
7e9d95d
 
 
cd009b8
7e9d95d
 
 
 
 
 
 
 
 
 
 
 
de93199
7e9d95d
 
 
 
 
 
e9d19ff
6458fca
7e9d95d
 
 
 
eac8329
7e9d95d
 
 
 
 
 
 
eac8329
 
 
 
 
7e9d95d
 
 
 
 
eac8329
 
 
 
 
7e9d95d
 
 
 
 
 
f9606c7
 
 
 
 
 
 
 
 
eac8329
 
f9606c7
 
 
 
12d0fcb
 
f9606c7
 
 
de93199
 
 
eac8329
 
7e9d95d
 
 
 
 
 
 
 
e9d19ff
 
 
 
7e9d95d
eac8329
7e9d95d
 
 
 
de93199
 
 
 
 
 
 
 
eac8329
 
 
de93199
 
 
 
 
 
eac8329
 
 
de93199
7e9d95d
 
 
de93199
7e9d95d
de93199
12d0fcb
eac8329
de93199
 
6458fca
de93199
6458fca
 
de93199
7e9d95d
de93199
 
7e9d95d
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""Tier-1 evaluation driver: baseline vs looped Laguna on knowledge-MC tasks.

Runs a matrix of loop configs (baseline / layer / block / naive) over one or more
multiple-choice tasks and reports acc, acc_norm, and the delta vs baseline (pp).

Real run (GPU):
    uv run python scripts/tf_run_eval.py --model poolside/Laguna-XS.2 \
        --tasks arc_challenge,openbookqa --dtype bfloat16 --device cuda

Local plumbing check (CPU, tiny random model + REAL tokenizer + REAL dataset):
    uv run python scripts/tf_run_eval.py --tiny --tasks arc_challenge --limit 20

The --tiny path exercises the entire real code path (tokenizer, dataset loading,
scoring, config matrix) against a tiny random-weight model; metrics are meaningless
but it proves the GPU run is turn-key.
"""

from __future__ import annotations

import argparse
import json
import sys
import time
from datetime import datetime
from pathlib import Path

# Put this file's grandparent directory at the front of sys.path before the
# `looped_laguna` imports below (presumably the repo root, so the package resolves
# when this script is run directly -- confirm against the repo layout).
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

import torch

from looped_laguna import LoopConfig, load_model_and_tokenizer
from looped_laguna.eval import DATASET_LOADERS, format_results, run_matrix

# Default value for --tokenizer: the `laguna_src` directory located in this file's
# grandparent directory, as an absolute path string.
DEFAULT_TOKENIZER = str(Path(__file__).resolve().parent.parent / "laguna_src")


def build_configs(num_layers: int, ks: list[int], width: int, center: float, controls: bool = True) -> dict:
    """Build the config matrix, named `{mode}-{strategy}-K{k}` (mode x strategy is a clean 2x2).

      strategy:  rk    = damped K-stage Runge-Kutta (the proposed method)
                 naive = undamped x<-g(x) repeated K times (control, expected to collapse)
      mode:      layer = iterate each window layer in place (required for MoE)
                 block = iterate the whole window as a unit (control, routing thrash)

    We sweep K for the proposed config (layer-rk). With `controls=True` we also add the
    other three corners of the 2x2 at K_max (block-rk isolates mode; layer-naive isolates
    strategy; block-naive is the canonical "naive looped transformer"). Pass controls=False
    for a clean K-sweep (baseline + layer-rk only) that doesn't duplicate the controls a
    breadth run already covers.

    Args:
        num_layers: layer count handed to `LoopConfig.from_depth_fraction`.
        ks: loop counts K; one `layer-rk-K{k}` entry is produced per value.
        width: loop window width, forwarded as `width`.
        center: loop window center, forwarded as `center_frac`.
        controls: when true, also add the three control configs at `max(ks)`.

    Returns:
        A dict in insertion order: `"baseline"` (mapped to `None`) first, then the
        layer-rk sweep, then the controls (if requested). Every non-baseline value is a
        `LoopConfig` sharing the same window.

    Raises:
        ValueError: if `controls` is true and `ks` is empty (there is no K_max to place
            the controls at).
    """
    window = LoopConfig.from_depth_fraction(num_layers, center_frac=center, width=width).window
    configs: dict[str, LoopConfig | None] = {"baseline": None}
    for k in ks:
        configs[f"layer-rk-K{k}"] = LoopConfig(window=window, K=k, mode="layer")
    if not controls:
        return configs

    # Fail with an actionable message instead of max()'s "arg is an empty sequence".
    if not ks:
        raise ValueError("build_configs: `ks` must contain at least one K when controls=True")
    k_max = max(ks)
    configs[f"block-rk-K{k_max}"] = LoopConfig(window=window, K=k_max, mode="block")
    configs[f"layer-naive-K{k_max}"] = LoopConfig(window=window, K=k_max, mode="layer", naive=True)
    configs[f"block-naive-K{k_max}"] = LoopConfig(window=window, K=k_max, mode="block", naive=True)
    return configs




def main() -> None:
    """CLI entry point: evaluate the baseline and looped configs on each requested task.

    Parses the command line, loads the model and tokenizer, builds the config matrix with
    `build_configs`, then for every task loads its examples, runs `run_matrix`, and prints
    the lines produced by `format_results`. Unless --no-save is given, results are written
    to a JSON file each time `run_matrix` reports a result through `on_result` and again
    when a task finishes; the file is replaced atomically so an interrupted write cannot
    corrupt previously saved results.

    Raises:
        SystemExit: on invalid --ks / --tasks values (argparse error, exit status 2) or an
            unknown task name.
    """
    p = argparse.ArgumentParser(description="Tier-1 eval: baseline vs looped Laguna on knowledge-MC tasks.")
    p.add_argument("--model", default="poolside/Laguna-XS.2",
                   help="HF model id or local path of the real model (ignored with --tiny)")
    p.add_argument("--tokenizer", default=DEFAULT_TOKENIZER,
                   help="tokenizer path/id (default: the vendored laguna_src)")
    p.add_argument("--tasks", default="arc_challenge",
                   help="comma-separated task names (see DATASET_LOADERS in eval.py)")
    p.add_argument("--limit", type=int, default=None, help="max examples per task (default: all)")
    p.add_argument("--ks", default="2,3", help="comma-separated loop counts K to sweep for layer-rk")
    p.add_argument("--no-controls", dest="controls", action="store_false",
                   help="only run baseline + layer-rk per K (skip block/naive controls), e.g. for a K-sweep")
    p.add_argument("--width", type=int, default=4, help="loop window width in layers")
    p.add_argument("--center", type=float, default=0.5, help="loop window center as a depth fraction (0-1)")
    p.add_argument("--dtype", default="bfloat16", help="model dtype: bfloat16/float16/float32")
    p.add_argument("--device", default="cuda", help="device or device_map: cuda/cpu/auto")
    p.add_argument("--batch-size", type=int, default=16,
                   help="padded-batch size for scoring (bump to 32-64 on big GPUs; lower for long tasks)")
    p.add_argument("--tiny", action="store_true",
                   help="use a tiny random-weight model instead of real weights (plumbing check)")
    p.add_argument("--tiny-layers", type=int, default=8, help="number of layers for the --tiny model")
    p.add_argument("--output", default=None,
                   help="results JSON path (default: auto-named results_eval_<timestamp>.json)")
    p.add_argument("--no-save", action="store_true", help="do not write results to disk")
    p.add_argument("--no-peritem", dest="peritem", action="store_false",
                   help="don't save per-item raw data (per-choice LLs/gold/subject) for re-aggregation")
    args = p.parse_args()

    # Validate the cheap inputs before the (expensive) model load; empty entries such as
    # a trailing comma are ignored rather than treated as values.
    try:
        ks = [int(x) for x in args.ks.split(",") if x.strip()]
    except ValueError:
        p.error(f"--ks must be a comma-separated list of integers, got {args.ks!r}")
    if not ks:
        p.error("--ks must name at least one loop count")

    tasks = [t.strip() for t in args.tasks.split(",") if t.strip()]
    if not tasks:
        p.error("--tasks must name at least one task")
    for t in tasks:
        if t not in DATASET_LOADERS:
            raise SystemExit(f"unknown task {t!r}; known: {sorted(DATASET_LOADERS)}")

    # Always persist (incrementally) unless --no-save, so a crash never loses finished work.
    # The destination is resolved and its directory created up front so a bad --output
    # path fails now instead of after the first config has been evaluated.
    save_path = None if args.no_save else (
        args.output or f"results_eval_{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
    )
    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)

    model, tokenizer = load_model_and_tokenizer(
        args.model, args.tokenizer, dtype=args.dtype, device=args.device,
        tiny=args.tiny, tiny_layers=args.tiny_layers,
    )
    num_layers = model.config.num_hidden_layers
    configs = build_configs(num_layers, ks, args.width, args.center, controls=args.controls)
    # Every non-baseline config shares one window; take it from the first such config.
    window = next(c.window for c in configs.values() if c is not None)
    print(f"model: {'tiny' if args.tiny else args.model} | layers={num_layers} | loop window={window}")
    print(f"configs: {list(configs)}")

    meta = {"model": "tiny" if args.tiny else args.model, "num_layers": num_layers,
            "window": list(window), "ks": ks, "tasks": tasks, "limit": args.limit}
    all_results: dict = {}

    # Per-item raw data (per-choice LLs + gold/subject/lengths) for later re-aggregation.
    peritem_path = str(Path(save_path).with_suffix("")) + "_peritem.jsonl" if (save_path and args.peritem) else None

    def save() -> None:
        """Write meta + all results to `save_path` via a temp file and an atomic rename."""
        if not save_path:
            return
        target = Path(save_path)
        tmp = target.with_name(target.name + ".tmp")
        tmp.write_text(json.dumps({"meta": meta, "results": all_results}, indent=2))
        # Swap the finished file into place so readers never see a half-written JSON.
        tmp.replace(target)

    if save_path:
        print(f"saving results to {save_path} (updated after every config)")
    if peritem_path:
        Path(peritem_path).write_text("")  # truncate; run_matrix appends one line per task
        print(f"saving per-item data to {peritem_path}")

    for task in tasks:
        examples = DATASET_LOADERS[task](limit=args.limit)
        print(f"\n=== {task} ({len(examples)} items) ===")
        all_results[task] = {}  # partial configs land here as they finish

        def record(name, result, _task=task):
            # Bind the task as a default so the callback stays tied to this iteration.
            all_results[_task][name] = result
            save()

        t0 = time.time()
        results = run_matrix(
            model, tokenizer, examples, configs, batch_size=args.batch_size,
            peritem_path=peritem_path, task_name=task,
            on_result=record,
        )
        all_results[task] = results  # full results, now with deltas + significance
        save()
        for line in format_results(results):
            print(line)
        print(f"  ({time.time() - t0:.1f}s)")

    if save_path:
        print(f"\nresults saved to {save_path}")

# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()