add code/ loader snapshot

Browse files

Files changed (12) hide show

code/data/__init__.py +0 -0
code/data/dataset.py +171 -0
code/data/ground_truth.py +106 -0
code/data/sources.py +38 -0
code/data/trace_format.py +229 -0
code/eval/__init__.py +0 -0
code/eval/eval_cruxeval_codi.py +119 -0
code/eval/eval_cruxeval_sft.py +129 -0
code/tokens.py +39 -0
code/train/__init__.py +0 -0
code/train/train_codi.py +280 -0
code/wb.py +18 -0

code/data/__init__.py ADDED Viewed

File without changes

code/data/dataset.py ADDED Viewed

	@@ -0,0 +1,171 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+"""
+CRUXEval-O dataset: deterministic train/val split + ground-truth execution
+traces -> (input_ids, labels) for teacher-forcing CODI.
+Neutral data layer shared by training (``cwm.training.data``) and eval
+(``evals.cruxeval.run_eval_codi``); depends on nothing in either, so the
+split and trace format never drift. Thin HuggingFace-tokenizer wrapper over
+the verbatim Table 9 trace generator (``.ground_truth`` / ``.trace_format``):
+build the seeded prompt, tokenize ``prompt + render_frames_to_generation(frames)``,
+and mask the prompt out of the labels (teacher-forced, so labels == input_ids
+with the prompt prefix set to ``-100``).
+"""
+from __future__ import annotations
+from .ground_truth import ground_truth_trace, make_trace_context
+from .trace_format import (
+    ACTION_SEP,
+    LINE_SEP,
+    TraceEvent,
+    render_frames_to_generation,
+)
+# Label value written over the prompt positions so they are masked out of the labels
+# (see build_example).
+IGNORE_INDEX = -100
+def _prompt_str(code: str, input_str: str) -> str:
+    ctx = make_trace_context(code, input_str)
+    return f"<|trace_context_start|>{ctx}<|frame_sep|><|call_sep|>{{}}<|action_sep|>def main():\n<|frame_sep|>"
+def _tokenize_trace(code, input_str, tokenizer, *, max_seq_len, max_frames):
+    """``(prompt_ids, trace_ids, spans)``; None to skip. Trace must terminate in
+    RETURN/EXCEPTION and have >=1 LINE span. Span ``(i, j)``: ``trace_ids[i]`` is
+    ``<|line_sep|>``, ``j`` its ``<|action_sep|>``, ``trace_ids[i+1:j]`` the locals
+    a CODI student swaps for a latent block. Single source of membership so the SFT
+    baseline and CODI train on identical data."""
+    frames, error = ground_truth_trace(code, input_str, align_to_prompt=True, max_frames=max_frames)
+    if not frames or error == "frames_exceeded":
+        return None
+    if frames[-1].event not in (TraceEvent.RETURN, TraceEvent.EXCEPTION):
+        return None
+    # Qwen has no BOS (bos_token_id is None); CWM did. Prepend only if present.
+    bos = [tokenizer.bos_token_id] if tokenizer.bos_token_id is not None else []
+    prompt_ids = bos + tokenizer.encode(_prompt_str(code, input_str), add_special_tokens=False)
+    trace_ids = tokenizer.encode(render_frames_to_generation(frames), add_special_tokens=False)
+    if len(prompt_ids) + len(trace_ids) > max_seq_len:
+        return None
+    ls = tokenizer.convert_tokens_to_ids(LINE_SEP)
+    asep = tokenizer.convert_tokens_to_ids(ACTION_SEP)
+    spans, i, n = [], 0, len(trace_ids)
+    while i < n:
+        if trace_ids[i] == ls:
+            j = i + 1
+            while j < n and trace_ids[j] != asep:
+                j += 1
+            if j == n:
+                break
+            spans.append((i, j))
+            i = j + 1
+        else:
+            i += 1
+    if not spans:
+        return None
+    return prompt_ids, trace_ids, spans
+def build_example(code, input_str, tokenizer, *, max_seq_len, max_frames=-1):
+    """SFT ``(input_ids, labels)`` with the prompt masked; None to skip."""
+    r = _tokenize_trace(code, input_str, tokenizer, max_seq_len=max_seq_len, max_frames=max_frames)
+    if r is None:
+        return None
+    prompt_ids, trace_ids, _ = r
+    return prompt_ids + trace_ids, [IGNORE_INDEX] * len(prompt_ids) + trace_ids
+def build_codi_example(code, input_str, tokenizer, *, max_seq_len, max_frames=-1):
+    """Multi-span CODI example ``{prompt_ids, trace_ids, spans}``; None to skip."""
+    r = _tokenize_trace(code, input_str, tokenizer, max_seq_len=max_seq_len, max_frames=max_frames)
+    if r is None:
+        return None
+    prompt_ids, trace_ids, spans = r
+    return {"prompt_ids": prompt_ids, "trace_ids": trace_ids, "spans": spans}
+def _load_cache(cache_dir, n_samples):
+    """Load precomputed tokenized examples (precompute.py); slice to n_samples."""
+    from datasets import load_from_disk
+    ex = list(load_from_disk(cache_dir))
+    return ex[:n_samples] if n_samples > 0 else ex
+def build_codi_dataset(
+    tokenizer, *, sources=("mbpp", "humaneval", "pyx"), n_samples: int = -1,
+    max_seq_len: int = 4096, max_frames: int = -1, cache_dir: str | None = None
+) -> list[dict]:
+    """CODI examples (prompt/reasoning/answer) over ``sources``, or a precomputed cache."""
+    if cache_dir:
+        ex = _load_cache(cache_dir, n_samples)
+        return [e for e in ex if len(e["prompt_ids"]) + len(e["trace_ids"]) <= max_seq_len]
+    rows = rows_for_sources(sources)
+    if n_samples > 0:
+        rows = rows[:n_samples]
+    out = []
+    for r in rows:
+        try:
+            out.append(build_codi_example(r["code"], r["input"], tokenizer,
+                                          max_seq_len=max_seq_len, max_frames=max_frames))
+        except Exception:
+            pass
+    return [ex for ex in out if ex is not None]
+def build_codi_single_dataset(
+    tokenizer, *, sources=("mbpp", "humaneval", "pyx"), n_samples: int = -1,
+    max_seq_len: int = 4096, max_frames: int = -1, cache_dir: str | None = None
+) -> list[dict]:
+    """Faithful single-block CODI: split each trace at its last ``<|return_sep|>`` into
+    ``{prompt_ids, reasoning_ids, answer_ids}`` (reasoning = whole trace, answer = final
+    RETURN frame). Derived from the multi-span examples; no separate cache needed."""
+    rsep = tokenizer.convert_tokens_to_ids("<|return_sep|>")
+    out = []
+    for e in build_codi_dataset(tokenizer, sources=sources, n_samples=n_samples,
+                                max_seq_len=max_seq_len, max_frames=max_frames, cache_dir=cache_dir):
+        t = e["trace_ids"]
+        idx = [i for i, x in enumerate(t) if x == rsep]
+        if not idx or idx[-1] == 0:
+            continue
+        out.append({"prompt_ids": e["prompt_ids"], "reasoning_ids": t[:idx[-1]], "answer_ids": t[idx[-1]:]})
+    return out
+def rows_for_sources(sources):
+    """Merge {id,code,input,output} rows across sources (all rows; train vs test
+    is split by dataset, e.g. cruxeval is held out for eval)."""
+    from . import sources as _src
+    rows = []
+    for name in sources:
+        for i, row in enumerate(_src.load_one(name)):
+            missing = [k for k in ("id", "code", "input", "output") if k not in row]
+            if missing:
+                raise ValueError(f"{name} row {i} missing keys: {missing}")
+            if not all(isinstance(row[k], str) for k in ("code", "input", "output")):
+                raise TypeError(f"{name} row {i} must use string code/input/output")
+            row = dict(row)
+            row["id"] = str(row["id"])
+            rows.append(row)
+    return rows
+def build_dataset(
+    tokenizer, *, sources=("mbpp", "humaneval", "pyx"), n_samples: int = -1,
+    max_seq_len: int = 8192, max_frames: int = -1, cache_dir: str | None = None
+) -> list[tuple[list[int], list[int]]]:
+    """Tokenized trace examples over ``sources``, or a precomputed cache."""
+    if cache_dir:
+        ex = _load_cache(cache_dir, n_samples)
+        return [(e["input_ids"], e["labels"]) for e in ex if len(e["input_ids"]) <= max_seq_len]
+    rows = rows_for_sources(sources)
+    if n_samples > 0:
+        rows = rows[:n_samples]
+    examples = (
+        build_example(
+            r["code"], r["input"], tokenizer,
+            max_seq_len=max_seq_len, max_frames=max_frames,
+        )
+        for r in rows
+    )
+    return [ex for ex in examples if ex is not None]

code/data/ground_truth.py ADDED Viewed

	@@ -0,0 +1,106 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+"""Ground-truth execution traces in CWM's frame format.
+Runs ``f(input)`` under ``sys.settrace`` and records CALL/LINE/RETURN/EXCEPTION
+frames with diff-based locals (unchanged vars render as ``".."``), values via
+``repr``. A synthetic ``def main(): return f(<input>)`` wraps the function; the
+seeded ``call main()`` frame is dropped by default to align with the trace
+prompt. Not a bit-exact replica of Meta's internal tracer (see README.md).
+"""
+from __future__ import annotations
+import linecache
+import sys
+from typing import Any
+from .trace_format import DIFF_PLACEHOLDER, TraceEvent, TraceFrame, normalize_source
+# Synthetic filename the trace context is compiled under; the trace hook uses it to
+# tell user code from library frames, and linecache serves source lines for it.
+_FILENAME = "<cwm_trace>"
+# Name of the synthetic wrapper function that calls f(<input>).
+_ENTRY = "main"
+class _FramesExceeded(Exception):
+    """Raised from the trace hook to abort a run once ``max_frames`` frames are recorded."""
+    pass
+def make_trace_context(code: str, input_str: str) -> str:
+    return f"\n{code}\ndef main():  # << START_OF_TRACE\n    return f({input_str})\n"
+def render_value(value: Any) -> str:
+    try:
+        return repr(value)
+    except Exception:
+        return "<unrepr>"
+def ground_truth_trace(
+    code: str, input_str: str, align_to_prompt: bool = True, max_frames: int = -1
+) -> tuple[list[TraceFrame], str | None]:
+    """Return (frames, error) for executing ``f(input_str)``. ``error`` is
+    non-None if the program raised; frames up to that point are still returned.
+    """
+    context = make_trace_context(code, input_str)
+    linecache.cache[_FILENAME] = (len(context), None, context.splitlines(keepends=True), _FILENAME)
+    frames: list[TraceFrame] = []
+    scope_prev: dict[int, dict[str, str]] = {}  # id(frame) -> last rendered locals
+    entry = None
+    def source(frame):
+        return normalize_source(linecache.getline(_FILENAME, frame.f_lineno))
+    def diff_locals(frame):
+        prev = scope_prev.get(id(frame), {})
+        out, rendered = {}, {}
+        for name, val in frame.f_locals.items():
+            r = render_value(val)
+            rendered[name] = r
+            out[name] = DIFF_PLACEHOLDER if prev.get(name) == r else r
+        scope_prev[id(frame)] = rendered
+        return out
+    def trace(frame, event, arg):
+        nonlocal entry
+        # Abort loop-heavy programs, but only from our file (not GC/__del__ frames).
+        if max_frames > 0 and len(frames) >= max_frames and frame.f_code.co_filename == _FILENAME:
+            raise _FramesExceeded
+        if entry is None:
+            if event == "call" and frame.f_code.co_name == _ENTRY:
+                entry = id(frame)
+            else:
+                return None
+        # Only trace user code from our context, not library frames.
+        if frame.f_code.co_filename != _FILENAME:
+            return None
+        if event == "call":
+            frames.append(TraceFrame(event=TraceEvent.CALL, source=source(frame), locals=diff_locals(frame)))
+        elif event == "line":
+            frames.append(TraceFrame(event=TraceEvent.LINE, source=source(frame), locals=diff_locals(frame)))
+        elif event == "return":
+            frames.append(TraceFrame(event=TraceEvent.RETURN, source=source(frame), arg=render_value(arg)))
+        elif event == "exception":
+            name = getattr(arg[0], "__name__", str(arg[0]))
+            frames.append(TraceFrame(event=TraceEvent.EXCEPTION, source=source(frame), arg=render_value(name)))
+        return trace
+    ns: dict[str, Any] = {}
+    exec(compile(context, _FILENAME, "exec"), ns)  # define f, main untraced
+    error = None
+    old = sys.gettrace()
+    sys.settrace(trace)
+    try:
+        ns[_ENTRY]()
+    except _FramesExceeded:
+        error = "frames_exceeded"
+    except Exception as e:
+        error = f"{type(e).__name__}: {e}"
+    finally:
+        sys.settrace(old)
+    # Drop the seeded ``call main()`` frame so frames align with the prompt.
+    if align_to_prompt and frames and frames[0].event == TraceEvent.CALL and frames[0].source.startswith("def main()"):
+        frames = frames[1:]
+    return frames, error

code/data/sources.py ADDED Viewed

	@@ -0,0 +1,38 @@

+"""Dataset name(s) -> merged {id, code, input, output} rows.
+Add a converted dataset by running its folder's convert.py (saves ./data via
+save_to_disk) and listing it in _LOCAL. cruxeval keeps its own Hub-fallback
+loader and is held out entirely for eval (eval_cruxeval_*.py), never trained on.
+"""
+from __future__ import annotations
+import os
+from pathlib import Path
+_LOCAL = {"mbpp": "MBPP", "humaneval": "HumanEval", "pyx": "PyX"}  # name -> folder, data in ./data
+def load_cruxeval():
+    """Full CRUXEval-O (the held-out test set). Prefer a local save_to_disk copy;
+    the HF builder FileLock dies on NFS caches."""
+    local_dir = os.environ.get("CRUXEVAL_DIR")
+    if local_dir and os.path.isdir(local_dir):
+        from datasets import load_from_disk
+        return list(load_from_disk(local_dir))
+    from datasets import load_dataset
+    return list(load_dataset("cruxeval-org/cruxeval", split="test"))
+def load_one(name: str) -> list[dict]:
+    key = name.strip().lower()
+    if key == "cruxeval":
+        return load_cruxeval()
+    if key in _LOCAL:
+        from datasets import load_from_disk
+        d = os.environ.get(key.upper() + "_DIR") or str(Path(__file__).parent / _LOCAL[key] / "data")
+        return list(load_from_disk(d))
+    raise ValueError(f"unknown data source {name!r}; pick from {['cruxeval', *_LOCAL]}")

code/data/trace_format.py ADDED Viewed

	@@ -0,0 +1,229 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+"""
+Shared CWM execution-trace representation and parsing.
+CWM predicts an execution trace as a sequence of *frames*, each consisting of an
+*observation* (the local-variable state) and an *action* (the executed source
+line). The on-the-wire format (see PROMPTING_GUIDE.md and demos/cwmdbg.py) is:
+    <|call_sep|>$LOCALS<|action_sep|>$SOURCE<|frame_sep|>
+    <|line_sep|>$LOCALS<|action_sep|>$SOURCE<|frame_sep|>
+    <|return_sep|><|action_sep|>$SOURCE<|arg_sep|>$VALUE<|frame_sep|>
+    <|exception_sep|><|action_sep|>$SOURCE<|arg_sep|>$VALUE<|frame_sep|>
+`$LOCALS` is a JSON object mapping variable names to *string* values; each value
+is the JSON encoding of the underlying Python value (e.g. `"5"`, `"\"abc\""`,
+`"[1, 2]"`). Locals use a diff-based representation: a variable whose value is
+unchanged since the previous frame in the same scope is rendered as the
+placeholder string `".."`. `$VALUE` (return/exception frames) is the JSON
+encoding of the returned/raised value, stored as a JSON string.
+This module is GPU-free and import-light so it can be unit-tested directly.
+"""
+from __future__ import annotations
+import json
+from dataclasses import dataclass, field
+from enum import Enum
+# Literal piece strings as they appear when a generation is decoded with
+# cut_at_stop_tokens=False (matches CWMInstructTokenizer.*_ID constants).
+CALL_SEP = "<|call_sep|>"
+LINE_SEP = "<|line_sep|>"
+RETURN_SEP = "<|return_sep|>"
+EXCEPTION_SEP = "<|exception_sep|>"
+ACTION_SEP = "<|action_sep|>"
+ARG_SEP = "<|arg_sep|>"
+FRAME_SEP = "<|frame_sep|>"
+END_OF_TEXT = "<|end_of_text|>"
+# Rendered in place of a local whose value is unchanged since the previous
+# frame of the same scope (diff-based locals).
+DIFF_PLACEHOLDER = ".."
+# Trailing comment that marks the trace entry line; normalize_source removes it.
+_START_MARKER = "  # << START_OF_TRACE"
+class TraceEvent(Enum):
+    """Kind of a trace frame; each member corresponds to one leading event token."""
+    CALL = "call"
+    LINE = "line"
+    RETURN = "return"
+    EXCEPTION = "exception"
+# Event token -> event kind, used when parsing a generated segment.
+_EVENT_TOKENS: dict[str, TraceEvent] = {
+    CALL_SEP: TraceEvent.CALL,
+    LINE_SEP: TraceEvent.LINE,
+    RETURN_SEP: TraceEvent.RETURN,
+    EXCEPTION_SEP: TraceEvent.EXCEPTION,
+}
+# Inverse map (event kind -> token), used when rendering frames back to text.
+_EVENT_TO_TOKEN: dict[TraceEvent, str] = {v: k for k, v in _EVENT_TOKENS.items()}
+@dataclass
+class TraceFrame:
+    """A single execution-trace frame.
+    `locals_str` is the raw `$LOCALS` text exactly as it appears between the
+    event token and `<|action_sep|>` (empty string for return/exception
+    frames). `locals` is its parsed form (a dict of name -> JSON-string-value),
+    or None if it failed to parse as a JSON object. `source` is the action line
+    with the START_OF_TRACE marker and trailing newline stripped.
+    """
+    event: TraceEvent
+    source: str
+    locals_str: str = ""
+    locals: dict[str, str] | None = None
+    # `$VALUE` payload of return/exception frames; left at None for call/line frames.
+    arg: str | None = None
+    # Set by the parser when `<|action_sep|>` is missing or the locals are not a JSON object.
+    malformed: bool = False
+    # Token counts (filled when a tokenizer is available); used for the
+    # "Avg State/Action Length (Token)" statistics rows of Table 9.
+    state_tokens: int = 0
+    action_tokens: int = 0
+    @property
+    def has_locals(self) -> bool:
+        """True for CALL and LINE frames, the only events rendered with `$LOCALS`."""
+        return self.event in (TraceEvent.CALL, TraceEvent.LINE)
+def normalize_source(source: str) -> str:
+    """Strip the trace start marker and trailing newline from a source line."""
+    return source.rstrip("\n").rstrip(_START_MARKER).rstrip()
+def parse_locals(locals_str: str) -> dict[str, str] | None:
+    """Parse a `$LOCALS` payload into a dict, or None if it is not a JSON object."""
+    locals_str = locals_str.strip()
+    if locals_str == "":
+        return {}
+    try:
+        obj = json.loads(locals_str)
+    except json.JSONDecodeError:
+        return None
+    if not isinstance(obj, dict):
+        return None
+    # Values are always JSON strings; coerce defensively.
+    return {str(k): v if isinstance(v, str) else json.dumps(v) for k, v in obj.items()}
+def parse_generated_trace(generation: str) -> tuple[list[TraceFrame], bool]:
+    """Parse a full-trace generation string into frames.
+    Returns (frames, well_formed). `well_formed` is True when every frame had a
+    leading event token and an `<|action_sep|>` (and an `<|arg_sep|>` for
+    return/exception frames) and the generation contained no leftover garbage
+    between the last frame and end-of-text. This drives the "Valid Trace Format"
+    metric. Individual frames are still returned even when malformed so that the
+    other metrics can be computed over whatever parsed cleanly.
+    """
+    # Everything after end-of-text is irrelevant.
+    if END_OF_TEXT in generation:
+        generation = generation.split(END_OF_TEXT, 1)[0]
+    frames: list[TraceFrame] = []
+    well_formed = True
+    segments = generation.split(FRAME_SEP)
+    # The final segment is the text after the last frame_sep; for a clean trace
+    # it should be empty (the model emitted frame_sep then end_of_text).
+    trailing = segments.pop() if segments else ""
+    if trailing.strip() not in ("",):
+        well_formed = False
+    for seg in segments:
+        if seg.strip() == "":
+            # Stray empty segment (e.g. leading text before first token).
+            continue
+        frame, ok = _parse_segment(seg)
+        if frame is None:
+            well_formed = False
+            continue
+        well_formed = well_formed and ok
+        frames.append(frame)
+    if not frames:
+        well_formed = False
+    return frames, well_formed
+def _parse_segment(seg: str) -> tuple[TraceFrame | None, bool]:
+    # Identify the (first) event token.
+    event: TraceEvent | None = None
+    for tok, evt in _EVENT_TOKENS.items():
+        idx = seg.find(tok)
+        if idx != -1:
+            event = evt
+            seg = seg[idx + len(tok):]
+            break
+    if event is None:
+        return None, False
+    ok = True
+    if event in (TraceEvent.CALL, TraceEvent.LINE):
+        if ACTION_SEP not in seg:
+            return (
+                TraceFrame(event=event, source="", malformed=True),
+                False,
+            )
+        locals_str, source = seg.split(ACTION_SEP, 1)
+        parsed = parse_locals(locals_str)
+        return (
+            TraceFrame(
+                event=event,
+                source=normalize_source(source),
+                locals_str=locals_str.strip(),
+                locals=parsed,
+                malformed=parsed is None,
+            ),
+            ok,
+        )
+    # RETURN / EXCEPTION
+    if ACTION_SEP not in seg:
+        return TraceFrame(event=event, source="", malformed=True), False
+    seg = seg.split(ACTION_SEP, 1)[1]
+    if ARG_SEP in seg:
+        source, arg = seg.split(ARG_SEP, 1)
+        arg = _parse_arg(arg)
+    else:
+        source, arg = seg, None
+        ok = False
+    return (
+        TraceFrame(event=event, source=normalize_source(source), arg=arg),
+        ok,
+    )
+def render_frames_to_generation(frames: list[TraceFrame]) -> str:
+    """Render frames back to the on-the-wire generation string.
+    Inverse of ``parse_generated_trace`` for well-formed frames. Used by tests
+    (a ground-truth trace rendered this way must round-trip to a perfect score)
+    and to materialize a reference trace string for inspection.
+    """
+    out: list[str] = []
+    for f in frames:
+        out.append(_EVENT_TO_TOKEN[f.event])
+        if f.has_locals:
+            out.append(json.dumps(f.locals if f.locals is not None else {}))
+        out.append(ACTION_SEP)
+        out.append(f.source)
+        if f.event in (TraceEvent.RETURN, TraceEvent.EXCEPTION):
+            out.append(ARG_SEP)
+            out.append(json.dumps(f.arg))
+        out.append(FRAME_SEP)
+    out.append(END_OF_TEXT)
+    return "".join(out)
+def _parse_arg(arg_str: str) -> str | None:
+    arg_str = arg_str.strip()
+    if arg_str == "":
+        return None
+    try:
+        # The frame stores json.dumps(value_string); unwrap one level so `arg`
+        # is the source-literal value string (e.g. '"x9ja"' or '17').
+        loaded = json.loads(arg_str)
+        return loaded if isinstance(loaded, str) else arg_str
+    except json.JSONDecodeError:
+        return arg_str

code/eval/__init__.py ADDED Viewed

File without changes

code/eval/eval_cruxeval_codi.py ADDED Viewed

	@@ -0,0 +1,119 @@

+"""CRUXEval-O latent eval: the CODI student generates the trace, but at every
+<|line_sep|> the frame's $LOCALS is replaced by a latent block (latent_start +
+latent_steps recurrent latents + latent_end), mirroring training _student.
+"""
+import argparse
+import json
+import os
+from datetime import timedelta
+import torch
+import torch.distributed as dist
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+from data.dataset import _prompt_str
+from data.sources import load_cruxeval
+from eval.eval_cruxeval_sft import check_correct, extract_answer_trace_full
+from tokens import add_trace_tokens, token_ids
+from train.train_codi import CodiModel
+def load_codi(m, latent_steps, dev):
+    """Load tokenizer, trace-token ids and a ``CodiModel`` from checkpoint dir ``m``.
+    Returns ``(tok, ids, model)`` with the model moved to ``dev`` and set to eval mode.
+    Two checkpoint layouts are handled: a full ``CodiModel`` state dict in
+    ``pytorch_model.bin``, or an exported backbone plus ``thought_projector.pt``.
+    """
+    tok = AutoTokenizer.from_pretrained(m, use_fast=True)
+    add_trace_tokens(tok)
+    ids = token_ids(tok)
+    # Build the backbone from config only (no pretrained weights); they are loaded below.
+    base = AutoModelForCausalLM.from_config(AutoConfig.from_pretrained(m), torch_dtype=torch.bfloat16)
+    model = CodiModel(base, latent_start_id=ids["<|latent_start|>"],
+                      latent_end_id=ids["<|latent_end|>"], latent_steps=latent_steps)
+    # NOTE(review): torch.load unpickles the file; only point this at trusted
+    # checkpoints (or consider weights_only=True) — confirm with the owners.
+    if os.path.exists(f"{m}/pytorch_model.bin"):  # epoch checkpoint: full CodiModel
+        model.load_state_dict(torch.load(f"{m}/pytorch_model.bin", map_location="cpu"))
+    else:  # final export: backbone safetensors + separate projector
+        model.model = AutoModelForCausalLM.from_pretrained(m, torch_dtype=torch.bfloat16)
+        model.prj.load_state_dict(torch.load(f"{m}/thought_projector.pt", map_location="cpu"))
+    return tok, ids, model.to(dev).eval()
+@torch.no_grad()
+def gen_latent(model, prompt_ids, ls_id, eot, max_new):
+    """Greedy-decode up to ``max_new`` tokens, running a latent block after each ``<|line_sep|>``.
+    ``prompt_ids`` is a 1-D id tensor (it is batched here with ``[None]``).
+    Decoding stops at ``eot``, which is not included. Returns the list of
+    emitted token ids; whatever ``model._latent_block`` consumes is not part of
+    that list. ``_latent_block`` is defined outside this file — it is called as
+    ``cache -> (cache, logits)``.
+    """
+    dev = prompt_ids.device
+    # Prefill: one pass over the prompt to build the KV cache and first logits.
+    o = model.model(input_ids=prompt_ids[None], use_cache=True)
+    cache, logits = o.past_key_values, o.logits[:, -1]
+    out = []
+    for _ in range(max_new):
+        t = int(logits.argmax(-1))
+        if t == eot:
+            break
+        out.append(t)
+        # Feed the chosen token so the cache includes it before deciding what comes next.
+        o = model.model(input_ids=torch.tensor([[t]], device=dev), past_key_values=cache, use_cache=True)
+        cache = o.past_key_values
+        if t == ls_id:  # drop $LOCALS, insert latent block; its logits predict <|action_sep|>
+            cache, logits = model._latent_block(cache)
+        else:
+            logits = o.logits[:, -1]
+    return out
+def main():
+    """CLI entry point: latent CRUXEval-O eval, optionally sharded across torchrun ranks.
+    Each rank scores every ``world``-th row, the counts are summed across ranks,
+    and rank 0 prints pass@1 / valid_format and optionally writes a JSON report.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model", required=True)
+    ap.add_argument("--n_samples", type=int, default=-1)
+    ap.add_argument("--max_new_tokens", type=int, default=8192)
+    ap.add_argument("--latent_steps", type=int, default=1)
+    ap.add_argument("--out", default="")
+    args = ap.parse_args()
+    # Distributed mode is inferred from the RANK environment variable.
+    ddp = "RANK" in os.environ
+    rank = int(os.environ.get("RANK", 0))
+    world = int(os.environ.get("WORLD_SIZE", 1))
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    if ddp:
+        dist.init_process_group("nccl", timeout=timedelta(hours=1))  # ranks finish at different times under long gens
+    torch.cuda.set_device(local_rank)
+    tok, ids, model = load_codi(args.model, args.latent_steps, local_rank)
+    ls_id, eot = ids["<|line_sep|>"], ids["<|end_of_text|>"]
+    rows = load_cruxeval()
+    if args.n_samples > 0:
+        rows = rows[: args.n_samples]
+    # n is the global row count; the metrics below are divided by it after reduction.
+    # NOTE(review): n == 0 (empty dataset) would divide by zero in the report.
+    n = len(rows)
+    shard = rows[rank::world]
+    n_correct = n_fmt = 0
+    results = []
+    for i, r in enumerate(shard):
+        # NOTE(review): no BOS is prepended here, whereas data.dataset prepends
+        # tokenizer.bos_token_id when it is not None — confirm for tokenizers with a BOS.
+        enc = tok(_prompt_str(r["code"], r["input"]), return_tensors="pt",
+                  add_special_tokens=False).to(local_rank)
+        gen = tok.decode(gen_latent(model, enc["input_ids"][0], ls_id, eot, args.max_new_tokens),
+                         skip_special_tokens=False)
+        pred = extract_answer_trace_full(gen)
+        ok = pred is not None and check_correct(r["code"], r["output"], pred)
+        # valid_format counts generations from which an answer could be extracted.
+        n_fmt += pred is not None
+        n_correct += ok
+        results.append({"id": r["id"], "expected": r["output"], "predicted": pred, "correct": ok, "generation": gen})
+        if rank == 0 and (i + 1) % 20 == 0:
+            print(f"  rank0 {i+1}/{len(shard)}  pass@1={n_correct/(i+1):.4f}", flush=True)
+    if ddp:
+        # Sum the counters over all ranks, then collect per-row results on rank 0.
+        t = torch.tensor([n_correct, n_fmt], device=local_rank)
+        dist.all_reduce(t)
+        n_correct, n_fmt = int(t[0]), int(t[1])
+        gathered = [None] * world
+        dist.gather_object(results, gathered if rank == 0 else None, dst=0)
+        if rank == 0:
+            results = [x for part in gathered for x in part]
+    if rank == 0:
+        print(f"\nCRUXEval-O latent pass@1={n_correct / n:.4f}  "
+              f"valid_format={n_fmt / n:.4f}  (n={n}, greedy)")
+        if args.out:
+            with open(args.out, "w") as f:
+                json.dump({"pass_at_1": n_correct / n, "valid_format": n_fmt / n,
+                           "n": n, "results": results}, f, indent=2)
+    if ddp:
+        dist.destroy_process_group()
+if __name__ == "__main__":
+    main()

code/eval/eval_cruxeval_sft.py ADDED Viewed

	@@ -0,0 +1,129 @@

+"""Stage 1 baseline eval: CRUXEval-O output prediction via full-trace generation.
+Feed the training prompt (seeds frame 0), let the SFT model generate the trace,
+take main()'s last return value as the predicted output, score by execution.
+Greedy => pass@1 is the exact-match fraction. Reuses cwm_andre eval logic.
+"""
+import argparse
+import json
+import os
+import subprocess
+import sys
+from datetime import timedelta
+import torch
+import torch.distributed as dist
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from data.dataset import _prompt_str
+from data.sources import load_cruxeval
+from tokens import add_trace_tokens, token_ids
+# Literal separator strings searched for in decoded generations.
+ARG_SEP, FRAME_SEP, RETURN_SEP = "<|arg_sep|>", "<|frame_sep|>", "<|return_sep|>"
+def extract_answer_trace_full(gen: str) -> str | None:
+    """Value of main()'s last RETURN frame: ...<|arg_sep|>"value"<|frame_sep|>."""
+    r = gen.rfind(RETURN_SEP)
+    if r == -1:
+        return None
+    a = gen.find(ARG_SEP, r)
+    if a == -1:
+        return None
+    rest = gen[a + len(ARG_SEP):]
+    end = rest.find(FRAME_SEP)
+    val = (rest[:end] if end != -1 else rest).strip()
+    if not val:
+        return None
+    try:
+        return json.loads(val)
+    except json.JSONDecodeError:
+        return val
+def check_correct(code: str, expected: str, predicted: str, timeout: float = 3.0) -> bool:
+    """Execute `code; assert expected == predicted` (CRUXEval semantics)."""
+    test = f"{code}\nassert {expected} == {predicted}"
+    try:
+        return subprocess.run(
+            [sys.executable, "-c", test], timeout=timeout, capture_output=True
+        ).returncode == 0
+    except Exception:
+        return False
+def main():
+    """CLI entry point: batched greedy full-trace CRUXEval-O eval for the SFT model.
+    Each torchrun rank scores every ``world``-th row, counts are summed across
+    ranks, and rank 0 prints pass@1 / valid_format and optionally writes a JSON report.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model", required=True)
+    ap.add_argument("--n_samples", type=int, default=-1)
+    ap.add_argument("--max_new_tokens", type=int, default=8192)
+    ap.add_argument("--batch_size", type=int, default=8)
+    ap.add_argument("--out", default="")
+    args = ap.parse_args()
+    # DDP-style data parallelism for inference: torchrun sets RANK/WORLD_SIZE/LOCAL_RANK.
+    ddp = "RANK" in os.environ
+    rank = int(os.environ.get("RANK", 0))
+    world = int(os.environ.get("WORLD_SIZE", 1))
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    if ddp:
+        dist.init_process_group("nccl", timeout=timedelta(hours=1))  # ranks finish at different times under long gens
+    torch.cuda.set_device(local_rank)
+    tok = AutoTokenizer.from_pretrained(args.model, use_fast=True)
+    add_trace_tokens(tok)  # idempotent; ensures trace tokens present
+    tok.padding_side = "left"  # left-pad so all generated tokens start at the same offset
+    eot_id = token_ids(tok)["<|end_of_text|>"]
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model, torch_dtype=torch.bfloat16).to(local_rank).eval()
+    rows = load_cruxeval()
+    if args.n_samples > 0:
+        rows = rows[: args.n_samples]
+    # n is the global row count; the metrics below are divided by it after reduction.
+    # NOTE(review): n == 0 (empty dataset) would divide by zero in the report.
+    n = len(rows)
+    shard = rows[rank::world]  # disjoint round-robin split across ranks
+    n_correct = n_fmt = 0
+    results = []
+    for bi, batch_start in enumerate(range(0, len(shard), args.batch_size)):
+        batch = shard[batch_start: batch_start + args.batch_size]
+        # NOTE(review): no BOS is prepended here, whereas data.dataset prepends
+        # tokenizer.bos_token_id when it is not None — confirm for tokenizers with a BOS.
+        enc = tok([_prompt_str(r["code"], r["input"]) for r in batch],
+                  return_tensors="pt", padding=True, add_special_tokens=False).to(local_rank)
+        with torch.no_grad():
+            out = model.generate(**enc, max_new_tokens=args.max_new_tokens, do_sample=False,
+                                 eos_token_id=eot_id, pad_token_id=eot_id)
+        for j, r in enumerate(batch):
+            # Slice off the (left-padded) prompt; what remains is the generated trace.
+            gen = tok.decode(out[j, enc["input_ids"].shape[1]:], skip_special_tokens=False)
+            pred = extract_answer_trace_full(gen)
+            ok = pred is not None and check_correct(r["code"], r["output"], pred)
+            # valid_format counts generations from which an answer could be extracted.
+            n_fmt += pred is not None
+            n_correct += ok
+            results.append({"id": r["id"], "expected": r["output"], "predicted": pred, "correct": ok, "generation": gen})
+        if rank == 0 and (bi + 1) % 5 == 0:
+            done = batch_start + len(batch)
+            print(f"  rank0 {done}/{len(shard)}  pass@1={n_correct/done:.4f}", flush=True)
+    # Reduce metrics and gather per-row results across ranks.
+    if ddp:
+        t = torch.tensor([n_correct, n_fmt], device=local_rank)
+        dist.all_reduce(t)
+        n_correct, n_fmt = int(t[0]), int(t[1])
+        gathered = [None] * world
+        dist.gather_object(results, gathered if rank == 0 else None, dst=0)
+        if rank == 0:
+            results = [x for part in gathered for x in part]
+    if rank == 0:
+        print(f"\nCRUXEval-O pass@1={n_correct / n:.4f}  "
+              f"valid_format={n_fmt / n:.4f}  (n={n}, greedy)")
+        if args.out:
+            with open(args.out, "w") as f:
+                json.dump({"pass_at_1": n_correct / n, "valid_format": n_fmt / n,
+                           "n": n, "results": results}, f, indent=2)
+    if ddp:
+        dist.destroy_process_group()
+if __name__ == "__main__":
+    main()

code/tokens.py ADDED Viewed

	@@ -0,0 +1,39 @@

+"""CWM trace special tokens + tokenizer/embedding setup for a non-CWM base."""
+# Trace-format tokens (mirrors data/trace_format.py) + latent delimiters.
+# Every entry is registered as a special token by add_trace_tokens() and is
+# expected to encode to exactly one id (checked in token_ids()).
+TRACE_TOKENS = [
+    "<|trace_context_start|>",
+    "<|call_sep|>", "<|line_sep|>", "<|return_sep|>", "<|exception_sep|>",
+    "<|action_sep|>", "<|arg_sep|>", "<|frame_sep|>", "<|end_of_text|>",
+    # Latent-block delimiters; train/train_codi.py looks these two up by name.
+    "<|latent_start|>", "<|latent_end|>",
+]
+def add_trace_tokens(tokenizer) -> int:
+    """Add the trace tokens as special tokens. Returns the count newly added."""
+    return tokenizer.add_tokens(TRACE_TOKENS, special_tokens=True)
+def resize_and_init(model, tokenizer, n_added: int) -> None:
+    """Resize embeddings to the tokenizer; init new rows to the existing mean."""
+    old = model.get_input_embeddings().weight.shape[0]
+    model.resize_token_embeddings(len(tokenizer))
+    if n_added <= 0:
+        return
+    seen = set()
+    for emb in (model.get_input_embeddings(), model.get_output_embeddings()):
+        if emb is None or id(emb) in seen:  # tied embeddings: resize once
+            continue
+        seen.add(id(emb))
+        w = emb.weight.data
+        w[old:] = w[:old].mean(dim=0, keepdim=True)
+def token_ids(tokenizer) -> dict[str, int]:
+    """Map each trace token to its single id (asserts single-token encoding)."""
+    ids = {}
+    for t in TRACE_TOKENS:
+        enc = tokenizer.encode(t, add_special_tokens=False)
+        assert len(enc) == 1, f"{t!r} did not encode to a single id: {enc}"
+        ids[t] = enc[0]
+    return ids

code/train/__init__.py ADDED Viewed

File without changes

code/train/train_codi.py ADDED Viewed

	@@ -0,0 +1,280 @@

+"""Stage 2b: per-frame CODI self-distillation (multi-span).
+Shared-weight teacher+student initialized from the Stage-1 SFT model.
+- Teacher reads the full explicit trace (prompt+trace), CE = L_teacher.
+- Student replaces each LINE frame's $LOCALS with a latent block (latent_start +
+  `latent_steps` recurrent latents + latent_end; last hidden -> prj -> next embed)
+  and teacher-forces the rest, CE = L_student over the emitted (non-locals) text.
+- KD aligns the hidden at each frame's `<|action_sep|>` (student after latents vs
+  teacher after locals), teacher detached. L = a*Lt + b*Ls + g*Lkd.
+"""
+import argparse
+import os
+import random
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, Trainer, TrainingArguments
+from transformers.trainer_utils import get_last_checkpoint
+from transformers.utils import WEIGHTS_NAME
+from data.dataset import IGNORE_INDEX, build_codi_dataset
+from tokens import add_trace_tokens, token_ids
+from wb import wandb_init
+class CodiModel(nn.Module):
+    """Wraps a causal LM with a thought projector and computes the CODI loss.
+    ``forward`` processes examples one at a time and returns
+    ``a * teacher_CE + b * student_CE + g * KD`` averaged over the examples,
+    plus the three detached sub-losses for logging.
+    """
+    def __init__(self, base, *, latent_start_id, latent_end_id, latent_steps,
+                 a=1.0, b=1.0, g=1.0, kd_layers=None, single_anchor=False,
+                 ss_prob=0.0, ss_ramp_frac=0.5, teacher=None, kd_target="hidden", kd_temp=2.0,
+                 line_sep_id=None, recon_w=0.0):
+        """Build the wrapper around ``base``.
+        Args:
+            base: causal LM exposing ``config.hidden_size``, ``model`` (backbone),
+                ``lm_head`` and ``get_input_embeddings()``.
+            latent_start_id, latent_end_id: token ids that open/close a latent block.
+            latent_steps: number of recurrent latent steps inside each latent block.
+            a, b, g: weights of teacher CE, student CE and KD in the total loss.
+            kd_layers: indices into ``hidden_states`` used for KD; ``None`` means
+                every entry after index 0. Forced to ``[-1]`` for logit KD, or when
+                a frozen teacher is given and no layers were requested.
+            single_anchor: keep only the last span's KD anchor.
+            ss_prob, ss_ramp_frac: scheduled-sampling maximum probability and ramp
+                fraction; the live probability ``ss_p`` starts at 0.0 and is set
+                from outside (see ``CodiTrainer.compute_loss``).
+            teacher: optional separate teacher model used for KD targets only.
+            kd_target: ``"hidden"`` (smooth-L1 on hiddens) or ``"logit"`` (KL).
+            kd_temp: temperature for logit KD.
+            line_sep_id, recon_w: accepted but not read anywhere in this class.
+        """
+        super().__init__()
+        self.model = base
+        h = base.config.hidden_size
+        # CODI thought projector (last hidden -> next latent input).
+        self.prj = nn.Sequential(
+            nn.Linear(h, h, bias=False), nn.GELU(),
+            nn.Linear(h, h, bias=False), nn.LayerNorm(h),
+        )
+        # Put the projector on the same device/dtype as the base embeddings.
+        ref = base.get_input_embeddings().weight
+        self.prj.to(device=ref.device, dtype=ref.dtype)
+        self.latent_steps, self.a, self.b, self.g = latent_steps, a, b, g
+        self.teacher = [teacher] if teacher is not None else None  # list -> hidden from state_dict/DDP/optim
+        self.kd_target, self.kd_temp = kd_target, kd_temp  # hidden: smooth_l1 on kd_layers; logit: KL on lm_head
+        if kd_target == "logit" or (teacher is not None and kd_layers is None):
+            kd_layers = [-1]  # logit KD is defined on the last layer only; frozen default = key (last) hidden
+        self.kd_layers = kd_layers  # None -> all layers
+        self.single_anchor = single_anchor  # KD at last span only (vanilla-CODI ablation)
+        # scheduled sampling: ss_p (ramped per step) of post-latent lines feed the student's own argmax
+        self.ss_prob, self.ss_ramp_frac, self.ss_p = ss_prob, ss_ramp_frac, 0.0
+        # 1x1 id tensors for the latent delimiters; non-persistent, so not saved in the state_dict.
+        self.register_buffer("_ls_tok", torch.tensor([[latent_start_id]], dtype=torch.long), persistent=False)
+        self.register_buffer("_le_tok", torch.tensor([[latent_end_id]], dtype=torch.long), persistent=False)
+        # Aliases to the backbone and LM head of `base` (same modules, second attribute name).
+        self.body = base.model
+        self.head = base.lm_head
+    def _kd(self, hs):
+        """Select the KD layers from a ``hidden_states`` tuple (``hs[1:]`` when ``kd_layers`` is None)."""
+        return hs[1:] if self.kd_layers is None else tuple(hs[l] for l in self.kd_layers)
+    def _emb(self, ids):
+        """Embed token ids with the base model's input embedding."""
+        return self.model.get_input_embeddings()(ids)
+    def _teacher(self, full_ids, labels, kd_pos):
+        """Teacher pass over the full explicit sequence.
+        Args:
+            full_ids: 1-D ids of prompt + trace.
+            labels: 1-D labels aligned with ``full_ids`` (unused with a frozen teacher).
+            kd_pos: positions in ``full_ids`` at which KD targets are read.
+        Returns:
+            ``(ce, kd)``: ``ce`` is the shifted next-token cross-entropy, or ``None``
+            with a frozen teacher; ``kd`` is a list of per-layer tensors indexed at
+            ``kd_pos`` (teacher logits instead of hiddens when ``kd_target == "logit"``).
+        """
+        pos = torch.tensor(kd_pos, device=full_ids.device)
+        if self.teacher is not None:  # frozen teacher: KD targets only, no teacher CE
+            tch, dev = self.teacher[0], full_ids.device
+            # The teacher is not a registered submodule, so it is moved to the input device lazily here.
+            if next(tch.parameters()).device != dev:
+                tch.to(dev)
+            with torch.no_grad():
+                if self.kd_target == "logit":  # target = teacher's own next-token logits
+                    return None, [tch(input_ids=full_ids[None], use_cache=False).logits[0, pos]]
+                hs = tch(input_ids=full_ids[None], use_cache=False, output_hidden_states=True).hidden_states
+                return None, [l[0, pos] for l in self._kd(hs)]
+        # Shared-weight teacher: two forwards of self.model, one for KD targets and one for CE.
+        with torch.no_grad():  # KD targets are detached; take hiddens without a backward graph
+            hs = self.model(input_ids=full_ids[None], use_cache=False, output_hidden_states=True).hidden_states
+            kd = [l[0, pos] for l in self._kd(hs)]
+        # CE forward without output_hidden_states so grad-checkpointing actually frees layer acts.
+        self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
+        logits = self.model(input_ids=full_ids[None], use_cache=False).logits
+        self.model.gradient_checkpointing_disable()  # teacher-only; student keeps KV cache
+        # Shift by one: logits at position t are scored against labels at t+1.
+        ce = F.cross_entropy(logits[0, :-1], labels[1:], ignore_index=IGNORE_INDEX)
+        return ce, kd
+    def _latent_block(self, cache):
+        """latent_start + `latent_steps` recurrent latents + latent_end on top of
+        `cache`. Returns (new cache, logits predicting the next real token)."""
+        o = self.body(inputs_embeds=self._emb(self._ls_tok), past_key_values=cache, use_cache=True)
+        cache, h = o.past_key_values, o.last_hidden_state[:, -1:]
+        for _ in range(self.latent_steps):
+            # Each latent input is the projected last hidden state of the previous step.
+            o = self.body(inputs_embeds=self.prj(h), past_key_values=cache, use_cache=True)
+            cache, h = o.past_key_values, o.last_hidden_state[:, -1:]
+        o = self.body(inputs_embeds=self._emb(self._le_tok), past_key_values=cache, use_cache=True)
+        return o.past_key_values, self.head(o.last_hidden_state[:, -1])
+    def _student(self, prompt_ids, trace_ids, spans):
+        """Student pass: explicit text segments interleaved with latent blocks.
+        Args:
+            prompt_ids: 1-D prompt ids.
+            trace_ids: 1-D trace ids.
+            spans: ``(i, j)`` index pairs into ``trace_ids``; ``trace_ids[i+1:j]`` is
+                replaced by a latent block. Must be non-empty (``kd_vecs[0]`` is read).
+        Returns:
+            ``(ce, s_kd)``: cross-entropy over all emitted text tokens, and a list
+            with one stacked tensor per KD layer (one row per span).
+        """
+        # Segments cover trace_ids in order; locals (trace_ids[i+1:j]) are dropped
+        # and replaced by a latent block. kd=True marks a frame's <|action_sep|>.
+        segs, prev, kd = [], 0, False
+        for i, j in spans:
+            segs.append(("text", trace_ids[prev:i + 1], kd))
+            segs.append(("latent", None, False))
+            prev, kd = j, True
+        segs.append(("text", trace_ids[prev:], kd))
+        last = len(segs) - 1
+        out = self.model(inputs_embeds=self._emb(prompt_ids[None]), use_cache=True)
+        cache, prev_logits = out.past_key_values, out.logits[:, -1]  # predicts trace_ids[0]
+        ce_logits, ce_targets, kd_vecs = [], [], []
+        for s, (kind, ids, kd) in enumerate(segs):
+            if kind == "latent":  # prev_logits predicted dropped locals; overwrite, no CE
+                cache, prev_logits = self._latent_block(cache)
+                continue
+            inp = ids
+            if kd and 0 < self.ss_p and random.random() < self.ss_p:
+                # scheduled sampling: replace the code (not action_sep / line_sep) with the student's own
+                # argmax via a no-grad pass on a detached cache clone; CE targets below stay GT.
+                end = ids.numel() if s == last else ids.numel() - 1
+                c = DynamicCache()
+                for i, ly in enumerate(cache.layers):
+                    c.update(ly.keys.detach(), ly.values.detach(), i)
+                with torch.no_grad():
+                    pred = self.model(inputs_embeds=self._emb(ids[None]), past_key_values=c, use_cache=True).logits[0].argmax(-1)
+                # pred[k] is the prediction for position k+1, hence the one-step shift.
+                inp = ids.clone(); inp[1:end] = pred[:end - 1]
+            # The segment's first token is scored with the logits carried over from the previous step.
+            ce_logits.append(prev_logits); ce_targets.append(ids[:1])
+            out = self.model(inputs_embeds=self._emb(inp[None]), past_key_values=cache,
+                             use_cache=True, output_hidden_states=kd)  # hiddens only for KD anchors
+            cache, logits = out.past_key_values, out.logits[0]
+            if ids.numel() > 1:
+                ce_logits.append(logits[:-1]); ce_targets.append(ids[1:])
+            prev_logits = logits[-1:]
+            if kd:  # action_sep is this segment's first token
+                kd_vecs.append([hs[0, 0] for hs in self._kd(out.hidden_states)])
+        ce = F.cross_entropy(torch.cat(ce_logits), torch.cat(ce_targets))
+        # Regroup from per-span lists of layers to per-layer tensors stacked over spans.
+        s_kd = [torch.stack([v[l] for v in kd_vecs]) for l in range(len(kd_vecs[0]))]
+        return ce, s_kd
+    def _kd_loss(self, s_kd, t_kd):
+        """KD loss between stacked student anchors and detached teacher targets.
+        ``"logit"`` mode: the student hiddens go through ``self.head`` and are compared
+        to the teacher logits with temperature-scaled KL (times ``T*T``).
+        Otherwise: smooth-L1 between student and teacher hiddens.
+        """
+        s, t = torch.stack(s_kd), torch.stack(t_kd).detach()
+        if self.kd_target == "logit":  # s=student hidden, t=frozen-teacher logits; KL on distributions
+            T = self.kd_temp
+            sl, tl = self.head(s).flatten(0, -2) / T, t.flatten(0, -2) / T
+            return F.kl_div(F.log_softmax(sl, -1), F.softmax(tl, -1), reduction="batchmean") * T * T
+        return F.smooth_l1_loss(s, t)
+    def forward(self, examples):
+        """Compute the combined loss over a list of examples.
+        Args:
+            examples: non-empty list of dicts with ``"prompt_ids"``, ``"trace_ids"``
+                (id sequences) and ``"spans"`` (``(i, j)`` pairs into the trace).
+        Returns:
+            dict with ``"loss"`` (differentiable) and detached ``"teacher_loss"``,
+            ``"student_loss"``, ``"kd_loss"``, each averaged over the examples.
+        """
+        dev = self.model.get_input_embeddings().weight.device
+        tl = sl = kl = 0.0
+        for ex in examples:
+            prompt = torch.tensor(ex["prompt_ids"], device=dev)
+            trace = torch.tensor(ex["trace_ids"], device=dev)
+            spans = ex["spans"]
+            full = torch.cat([prompt, trace])
+            # Labels mask the prompt with IGNORE_INDEX; skipped with a frozen teacher (no teacher CE).
+            labels = None if self.teacher else torch.cat([full.new_full((len(prompt),), IGNORE_INDEX), trace])
+            # KD anchor = second index of each span, offset into the full sequence.
+            kd_pos = [len(prompt) + j for _, j in spans]
+            t_ce, t_kd = self._teacher(full, labels, kd_pos)
+            s_ce, s_kd = self._student(prompt, trace, spans)
+            if self.single_anchor:  # keep only the last frame's anchor (per layer)
+                t_kd, s_kd = [t[-1:] for t in t_kd], [s[-1:] for s in s_kd]
+            tl = tl + (t_ce if t_ce is not None else 0.0)  # frozen teacher -> no teacher CE
+            sl, kl = sl + s_ce, kl + self._kd_loss(s_kd, t_kd)
+        n = len(examples)
+        loss = self.a * tl / n + self.b * sl / n + self.g * kl / n
+        t_log = (tl / n).detach() if torch.is_tensor(tl) else torch.tensor(0.0)  # 0 under frozen teacher
+        return {"loss": loss, "teacher_loss": t_log,
+                "student_loss": (sl / n).detach(), "kd_loss": (kl / n).detach()}
+class CodiTrainer(Trainer):
+    def compute_loss(self, model, inputs, return_outputs=False, **kw):
+        core = model.module if hasattr(model, "module") else model
+        if core.ss_prob:  # linear ramp 0 -> ss_prob over the first ss_ramp_frac of training
+            core.ss_p = self._ss = core.ss_prob * min(1.0, self.state.global_step / max(1.0, core.ss_ramp_frac * self.state.max_steps))
+        out = model(inputs["examples"])
+        self._sub = {k: out[k].detach() for k in ("teacher_loss", "student_loss", "kd_loss")}
+        return (out["loss"], out) if return_outputs else out["loss"]
+    def log(self, logs, *a, **k):  # surface sub-losses to console + wandb
+        if hasattr(self, "_sub"):
+            logs.update({k: v.item() for k, v in self._sub.items()})
+        if hasattr(self, "_ss"):
+            logs["ss_p"] = self._ss
+        super().log(logs, *a, **k)
+    def _save(self, output_dir=None, state_dict=None):
+        # tied backbone weights -> safetensors (5.x default) rejects shared tensors; torch.save instead.
+        output_dir = output_dir or self.args.output_dir
+        os.makedirs(output_dir, exist_ok=True)
+        torch.save(state_dict or self.model.state_dict(), os.path.join(output_dir, WEIGHTS_NAME))
+        torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
+        # also write config/tokenizer/projector so each ckpt is eval-loadable (small, no weight dup).
+        self.model.model.config.save_pretrained(output_dir)
+        self.tok.save_pretrained(output_dir)
+        torch.save(self.model.prj.state_dict(), os.path.join(output_dir, "thought_projector.pt"))
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model", required=True)  # Stage-1 SFT dir
+    ap.add_argument("--output_dir", required=True)
+    ap.add_argument("--sources", nargs="+", default=["mbpp", "humaneval", "pyx"])
+    ap.add_argument("--cache_dir", default="data/cache/codi_train")  # offline tokenized examples from precompute.py
+    ap.add_argument("--n_samples", type=int, default=-1)
+    ap.add_argument("--max_seq_len", type=int, default=4096)
+    ap.add_argument("--max_frames", type=int, default=-1)
+    ap.add_argument("--latent_steps", type=int, default=1)
+    ap.add_argument("--epochs", type=float, default=10.0)
+    ap.add_argument("--lr", type=float, default=1e-5)
+    ap.add_argument("--batch_size", type=int, default=1)
+    ap.add_argument("--grad_accum", type=int, default=4)
+    ap.add_argument("--max_steps", type=int, default=-1)
+    ap.add_argument("--save_steps", type=int, default=500)
+    ap.add_argument("--alpha", type=float, default=1.0)
+    ap.add_argument("--beta", type=float, default=1.0)
+    ap.add_argument("--gamma", type=float, default=1.0)
+    ap.add_argument("--kd_layers", nargs="+", type=int, default=None)  # default: all layers (frozen -> last)
+    ap.add_argument("--frozen_teacher", default="")  # path to frozen SFT teacher; "" -> shared-weight (legacy)
+    ap.add_argument("--kd_target", default="hidden", choices=["hidden", "logit"])  # key-hidden align: smooth_l1 vs KL
+    ap.add_argument("--kd_temp", type=float, default=2.0)  # logit-KD temperature
+    ap.add_argument("--single_anchor", action="store_true")  # KD at last frame only (vanilla CODI)
+    ap.add_argument("--ss_prob", type=float, default=0.0)  # scheduled-sampling max prob (0 = off)
+    ap.add_argument("--ss_ramp_frac", type=float, default=0.5)  # ramp ss_prob over this frac of steps
+    args = ap.parse_args()
+    tok = AutoTokenizer.from_pretrained(args.model, use_fast=True)
+    add_trace_tokens(tok)  # idempotent
+    ids = token_ids(tok)
+    base = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16)
+    base.config.use_cache = True
+    teacher = None
+    if args.frozen_teacher:
+        teacher = AutoModelForCausalLM.from_pretrained(args.frozen_teacher, torch_dtype=torch.bfloat16)
+        teacher.config.use_cache = False
+        teacher.eval().requires_grad_(False)
+    model = CodiModel(base, latent_start_id=ids["<|latent_start|>"], latent_end_id=ids["<|latent_end|>"],
+                      latent_steps=args.latent_steps, a=args.alpha, b=args.beta, g=args.gamma,
+                      kd_layers=args.kd_layers, single_anchor=args.single_anchor,
+                      ss_prob=args.ss_prob, ss_ramp_frac=args.ss_ramp_frac,
+                      teacher=teacher, kd_target=args.kd_target, kd_temp=args.kd_temp)
+    ds = build_codi_dataset(tok, sources=args.sources, cache_dir=args.cache_dir,
+                            n_samples=args.n_samples, max_seq_len=args.max_seq_len, max_frames=args.max_frames)
+    print(f"{len(ds)} codi examples, latent_steps={args.latent_steps}")
+    report_to = wandb_init(args, "codi")
+    targs = TrainingArguments(
+        output_dir=args.output_dir,
+        per_device_train_batch_size=args.batch_size,
+        gradient_accumulation_steps=args.grad_accum,
+        num_train_epochs=args.epochs,
+        max_steps=args.max_steps,
+        learning_rate=args.lr,
+        lr_scheduler_type="cosine",
+        warmup_ratio=0.03,
+        weight_decay=0.1,
+        max_grad_norm=1.0,
+        bf16=True,
+        optim="paged_adamw_8bit",
+        ddp_find_unused_parameters=False,
+        logging_steps=5,
+        save_strategy="steps",
+        save_steps=args.save_steps,
+        save_total_limit=None,
+        report_to=report_to,
+        remove_unused_columns=False,
+        label_names=[],
+    )
+    trainer = CodiTrainer(
+        model=model, args=targs, train_dataset=ds,
+        data_collator=lambda b: {"examples": b},
+    )
+    trainer.tok = tok
+    # Native checkpoints (CodiModel wrapper + optimizer) auto-resume if interrupted.
+    ckpt = get_last_checkpoint(args.output_dir) if os.path.isdir(args.output_dir) else None
+    trainer.train(resume_from_checkpoint=ckpt)
+    trainer._save_checkpoint(trainer.model, trial=None)  # final step as a resumable, eval-loadable checkpoint-<step>
+if __name__ == "__main__":
+    main()

code/wb.py ADDED Viewed

	@@ -0,0 +1,18 @@

+"""wandb: default-on, offline (compute nodes have no internet -> `wandb sync` later),
+never blocks training. Returns report_to for TrainingArguments."""
+import os
+def wandb_init(args, stage):
+    if int(os.environ.get("RANK", "0")) != 0:  # rank0 only under DDP
+        return []
+    try:
+        import wandb
+        os.environ.setdefault("WANDB_MODE", "offline")
+        wandb.init(project="codi_trace", name=f"{stage}-{os.path.basename(args.output_dir)}",
+                   dir=args.output_dir, config=vars(args))
+        return ["wandb"]
+    except Exception as e:
+        print(f"wandb disabled: {e}")
+        return []