| { |
| "title": "Weight-Space Geometry of Offline Reasoning Training", |
| "subtitle": "Same data, different losses, same circuits? Six offline reasoning losses, one base model, identical math rollouts — do they converge in weight space?", |
| "venue": "ICML 2026 · Mechanistic Interpretability Workshop", |
| "paper": "https://arxiv.org/abs/2606.23740", |
| "repo": "https://github.com/AlexWortega/capabilityvectors", |
| "methodColors": { |
| "sft": "#2f5d7c", |
| "rft": "#3f7a6d", |
| "dft": "#c08a1e", |
| "rift": "#7c5a86", |
| "grpo": "#bf4d2e", |
| "dpo": "#2b2420", |
| "online_grpo": "#46785f", |
| "online_dapo": "#9c3526", |
| "base": "#9b948a" |
| }, |
| "methodLabels": { |
| "sft": "SFT", "rft": "RFT", "dft": "DFT", "rift": "RIFT", |
| "grpo": "Offline GRPO", "dpo": "DPO", |
| "online_grpo": "Online GRPO", "online_dapo": "Online DAPO", "base": "Base" |
| }, |
| "methods": [ |
| {"id": "sft", "name": "SFT", "loss": "cross-entropy on all rollouts", "neg": "—", "reward": "—", "ref": "—", "formula": "\\mathcal{L}_{\\text{SFT}} = -\\,\\mathbb{E}_{(x,y)\\sim\\mathcal{D}}\\;\\log \\pi_\\theta(y\\mid x)"}, |
| {"id": "rft", "name": "RFT", "loss": "cross-entropy on positives only", "neg": "filtered out", "reward": "implicit", "ref": "—", "formula": "\\mathcal{L}_{\\text{RFT}} = -\\,\\mathbb{E}_{(x,y)\\sim\\mathcal{D}}\\;\\mathbb{1}\\!\\left[r(x,y){=}1\\right]\\,\\log \\pi_\\theta(y\\mid x)"}, |
| {"id": "dft", "name": "DFT", "loss": "CE × sg(π_θ)", "neg": "down-weighted","reward": "—", "ref": "—", "formula": "\\mathcal{L}_{\\text{DFT}} = -\\,\\mathbb{E}_{(x,y)\\sim\\mathcal{D}}\\;\\operatorname{sg}\\!\\big(\\pi_\\theta(y\\mid x)\\big)\\,\\log \\pi_\\theta(y\\mid x)"}, |
| {"id": "rift", "name": "RIFT", "loss": "reward-weighted cross-entropy", "neg": "weighted", "reward": "yes", "ref": "—", "formula": "\\mathcal{L}_{\\text{RIFT}} = -\\,\\mathbb{E}_{(x,y)\\sim\\mathcal{D}}\\;r(x,y)\\,\\log \\pi_\\theta(y\\mid x)"}, |
| {"id": "grpo", "name": "Offline GRPO", "loss": "group-relative advantage", "neg": "yes", "reward": "yes", "ref": "yes", "formula": "\\mathcal{L}_{\\text{GRPO}} = -\\,\\mathbb{E}_{x}\\;\\tfrac{1}{G}\\!\\sum_{i=1}^{G} \\hat{A}_i\\,\\log \\pi_\\theta(y_i\\mid x),\\quad \\hat{A}_i=\\dfrac{r_i-\\operatorname{mean}(\\mathbf{r})}{\\operatorname{std}(\\mathbf{r})}"}, |
| {"id": "dpo", "name": "DPO", "loss": "contrastive log-ratio", "neg": "paired", "reward": "implicit", "ref": "yes", "formula": "\\mathcal{L}_{\\text{DPO}} = -\\,\\mathbb{E}\\,\\log\\sigma\\!\\Big(\\beta\\big[\\log\\tfrac{\\pi_\\theta(y_w\\mid x)}{\\pi_{\\text{ref}}(y_w\\mid x)}-\\log\\tfrac{\\pi_\\theta(y_l\\mid x)}{\\pi_{\\text{ref}}(y_l\\mid x)}\\big]\\Big)"} |
| ], |
| "abstract": "Offline reinforcement-learning losses (RFT, RIFT, DFT, Offline GRPO, DPO) are widely used to distill reasoning from large teachers into smaller students, and are usually compared on downstream accuracy alone. We ask a different question: are they mechanistically distinct, or do they converge to a similar weight update? We train six methods on identical math rollouts from a single base model (Qwen3-4B) with attention-only LoRA, then dissect the resulting weight deltas with cosine similarity, principal-angle subspace analysis, linear mode connectivity, and CKA. The picture that emerges is sharp: the reward-weighted MLE family collapses onto one direction, DPO lives in a near-orthogonal subspace, and on-policy RL is geometrically unlike everything offline — even though every method saw exactly the same data.", |
| "captions": { |
| "cosine": "Cosine similarity between the stacked LoRA weight deltas ΔW (gauge-invariant, over 144 attention modules). Toggle to add the two on-policy methods. Click any cell to inspect that pair layer-by-layer below.", |
| "perlayer": "Per-layer cosine of ΔW across all 36 transformer blocks. Pick pairs in the legend; drag the slider to read off any layer. The SFT/RFT/RIFT family stays collinear everywhere; DPO and online RL hug zero.", |
| "cka": "Linear CKA of hidden representations, layer by layer. Most pairs stay near 1.0 — but DPO's representation similarity collapses in the late layers, making it the only method that rewires what the network computes rather than just how it writes it.", |
| "svd": "Top-1 singular-direction agreement of ΔW per layer (output subspace u). Even when raw cosine is low, the dominant output direction can stay aligned — the signature of a shared solution reached through different input-side bases.", |
| "angles": "Median principal angle between ΔW subspaces (SFT vs each method, over 144 modules): the dominant top-1 angle and the worst of the top-10 directions. Small angle = shared subspace. SFT–RFT sit ~7° apart; SFT–DPO opens to ~55°.", |
| "geometry": "Update size and rank. Frobenius norm ‖ΔW‖ and average effective rank per method. DPO moves a tiny distance at high rank (a small, broad nudge); the SFT family moves far along a low-rank direction.", |
| "lmc": "Linear mode connectivity: masked-answer loss interpolated between two adapters (α: 0→1). A flat or monotone path means a shared loss basin; a bump in the middle is a barrier. DPO shows a barrier; SFT↔Offline-GRPO does not.", |
| "accuracy": "Greedy pass@1. The reward-orthogonal methods (DPO, online GRPO/DAPO) keep the base model's accuracy, while the SFT direction drops GSM8K below base. Note: DPO uses a 10× smaller learning rate (the standard convention).", |
| "seedlr": "Robustness of the geometry to random seed and learning rate. Raw weight-cosine between two seeds is low, but the top-1 output direction is seed-stable — low cosine is an input-subspace init artifact, not a different solution. A 10× learning-rate change rotates ΔW; it does not merely rescale it." |
| } |
| } |
|
|