AlexWortega's picture
Upload data/meta.json with huggingface_hub
0734eb6 verified
Raw
History Blame Contribute Delete
5.87 kB
{
"title": "Weight-Space Geometry of Offline Reasoning Training",
"subtitle": "Same data, different losses, same circuits? Six offline reasoning losses, one base model, identical math rollouts — do they converge in weight space?",
"venue": "ICML 2026 · Mechanistic Interpretability Workshop",
"paper": "https://arxiv.org/abs/2606.23740",
"repo": "https://github.com/AlexWortega/capabilityvectors",
"methodColors": {
"sft": "#2f5d7c",
"rft": "#3f7a6d",
"dft": "#c08a1e",
"rift": "#7c5a86",
"grpo": "#bf4d2e",
"dpo": "#2b2420",
"online_grpo": "#46785f",
"online_dapo": "#9c3526",
"base": "#9b948a"
},
"methodLabels": {
"sft": "SFT", "rft": "RFT", "dft": "DFT", "rift": "RIFT",
"grpo": "Offline GRPO", "dpo": "DPO",
"online_grpo": "Online GRPO", "online_dapo": "Online DAPO", "base": "Base"
},
"methods": [
{"id": "sft", "name": "SFT", "loss": "cross-entropy on all rollouts", "neg": "—", "reward": "—", "ref": "—", "formula": "\\mathcal{L}_{\\text{SFT}} = -\\,\\mathbb{E}_{(x,y)\\sim\\mathcal{D}}\\;\\log \\pi_\\theta(y\\mid x)"},
{"id": "rft", "name": "RFT", "loss": "cross-entropy on positives only", "neg": "filtered out", "reward": "implicit", "ref": "—", "formula": "\\mathcal{L}_{\\text{RFT}} = -\\,\\mathbb{E}_{(x,y)\\sim\\mathcal{D}}\\;\\mathbb{1}\\!\\left[r(x,y){=}1\\right]\\,\\log \\pi_\\theta(y\\mid x)"},
{"id": "dft", "name": "DFT", "loss": "CE × sg(π_θ)", "neg": "down-weighted","reward": "—", "ref": "—", "formula": "\\mathcal{L}_{\\text{DFT}} = -\\,\\mathbb{E}_{(x,y)\\sim\\mathcal{D}}\\;\\operatorname{sg}\\!\\big(\\pi_\\theta(y\\mid x)\\big)\\,\\log \\pi_\\theta(y\\mid x)"},
{"id": "rift", "name": "RIFT", "loss": "reward-weighted cross-entropy", "neg": "weighted", "reward": "yes", "ref": "—", "formula": "\\mathcal{L}_{\\text{RIFT}} = -\\,\\mathbb{E}_{(x,y)\\sim\\mathcal{D}}\\;r(x,y)\\,\\log \\pi_\\theta(y\\mid x)"},
{"id": "grpo", "name": "Offline GRPO", "loss": "group-relative advantage", "neg": "yes", "reward": "yes", "ref": "yes", "formula": "\\mathcal{L}_{\\text{GRPO}} = -\\,\\mathbb{E}_{x}\\;\\tfrac{1}{G}\\!\\sum_{i=1}^{G} \\hat{A}_i\\,\\log \\pi_\\theta(y_i\\mid x),\\quad \\hat{A}_i=\\dfrac{r_i-\\operatorname{mean}(\\mathbf{r})}{\\operatorname{std}(\\mathbf{r})}"},
{"id": "dpo", "name": "DPO", "loss": "contrastive log-ratio", "neg": "paired", "reward": "implicit", "ref": "yes", "formula": "\\mathcal{L}_{\\text{DPO}} = -\\,\\mathbb{E}\\,\\log\\sigma\\!\\Big(\\beta\\big[\\log\\tfrac{\\pi_\\theta(y_w\\mid x)}{\\pi_{\\text{ref}}(y_w\\mid x)}-\\log\\tfrac{\\pi_\\theta(y_l\\mid x)}{\\pi_{\\text{ref}}(y_l\\mid x)}\\big]\\Big)"}
],
"abstract": "Offline reinforcement-learning losses (RFT, RIFT, DFT, Offline GRPO, DPO) are widely used to distill reasoning from large teachers into smaller students, and are usually compared on downstream accuracy alone. We ask a different question: are they mechanistically distinct, or do they converge to a similar weight update? We train six methods on identical math rollouts from a single base model (Qwen3-4B) with attention-only LoRA, then dissect the resulting weight deltas with cosine similarity, principal-angle subspace analysis, linear mode connectivity, and CKA. The picture that emerges is sharp: the reward-weighted MLE family collapses onto one direction, DPO lives in a near-orthogonal subspace, and on-policy RL is geometrically unlike everything offline — even though every method saw exactly the same data.",
"captions": {
"cosine": "Cosine similarity between the stacked LoRA weight deltas ΔW (gauge-invariant, over 144 attention modules). Toggle to add the two on-policy methods. Click any cell to inspect that pair layer-by-layer below.",
"perlayer": "Per-layer cosine of ΔW across all 36 transformer blocks. Pick pairs in the legend; drag the slider to read off any layer. The SFT/RFT/RIFT family stays colinear everywhere; DPO and online RL hug zero.",
"cka": "Linear CKA of hidden representations, layer by layer. Most pairs stay near 1.0 — but DPO's representation similarity collapses in the late layers, the only method that rewires what the network computes rather than just how it writes it.",
"svd": "Top-1 singular-direction agreement of ΔW per layer (output subspace u). Even when raw cosine is low, the dominant output direction can stay aligned — the signature of a shared solution reached through different input-side bases.",
"angles": "Median principal angle between ΔW subspaces (SFT vs each method, over 144 modules): the dominant top-1 angle and the worst of the top-10 directions. Small angle = shared subspace. SFT–RFT sit ~7° apart; SFT–DPO opens to ~55°.",
"geometry": "Update size and rank. Frobenius norm ‖ΔW‖ and average effective rank per method. DPO moves a tiny distance at high rank (a small, broad nudge); the SFT family moves far along a low-rank direction.",
"lmc": "Linear mode connectivity: masked-answer loss interpolated between two adapters (α: 0→1). A flat or monotone path means a shared loss basin; a bump in the middle is a barrier. DPO shows a barrier; SFT↔Offline-GRPO does not.",
"accuracy": "Greedy pass@1. The reward-orthogonal methods (DPO, online GRPO/DAPO) keep the base model's accuracy, while the SFT direction drops GSM8K below base. Note: DPO uses a 10× smaller learning rate (the standard convention).",
"seedlr": "Robustness of the geometry to random seed and learning rate. Raw weight-cosine between two seeds is low, but the top-1 output direction is seed-stable — low cosine is an input-subspace init artifact, not a different solution. A 10× learning-rate change rotates ΔW, it does not merely rescale it."
}
}