File size: 3,775 Bytes
21647a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""ladder.py — channel_ladder_configs (ADR-013, the experiment design).

The isolated-channel ladder REPLACES the old combined alpha=0.2/beta=0.4 recipe
(superseded; see docs/ALTERED_MINDS_TIE_IN.md Phase 3). Per ADR-013, a combined
run confounds four effects (task RL, self-distillation of altered reasoning,
frontier-teacher imitation, KL anchoring) and is scientifically uninterpretable.
Worse, SDPO against the altered model's OWN hint-conditioned forward pass can
AMPLIFY the distortion, so it is an experimental intervention, not a stabilizer.

The ladder isolates channels so each effect is attributable:

  | Arm | alpha_sdpo | beta_replay | Purpose                          |
  |-----|------------|-------------|----------------------------------|
  | A0  | —          | —           | altered SFT, no RL (control)     |
  | A1  | 0.0        | 0.0         | GRPO-only baseline               |
  | A2  | 0.02       | 0.0         | +SDPO small (amplification probe)|
  | A3  | 0.0        | 0.05        | +replay-DPO small (washout probe)|
  | A4  | 0.02       | 0.05        | combined — ONLY after A1-A3      |

``kl_beta`` (the KL-to-altered-init coefficient) is 0.02 for every RL arm
(A1-A4). A0 is a no-RL sentinel, so its ``alpha_sdpo``, ``beta_replay`` and
``kl_beta`` are all ``None`` (shown as — in the table above).
"""
from __future__ import annotations

from typing import Any

# Public API of this module: the arm-config factory and the KL coefficient
# that the factory stamps onto every RL arm.
__all__ = ["channel_ladder_configs", "LADDER_KL_BETA"]

#: KL-to-altered-init coefficient applied to every RL arm (A1-A4). The no-RL
#: control arm A0 does not use it; its ``kl_beta`` entry is ``None``.
LADDER_KL_BETA: float = 0.02


def channel_ladder_configs() -> list[dict[str, Any]]:
    """Build the isolated-channel ladder as an ordered list of arm configs.

    The result is a brand-new list of five brand-new dicts on every call, in
    the order A0, A1, A2, A3, A4. Every dict carries the same keys, in this
    order: ``arm``, ``alpha_sdpo``, ``beta_replay``, ``kl_beta``, ``note``.

    A0 is the no-RL sentinel, so its ``alpha_sdpo``, ``beta_replay`` and
    ``kl_beta`` are all ``None``. A1-A4 share ``LADDER_KL_BETA`` as their
    ``kl_beta`` and differ only in which extra channel(s) are non-zero.

    Intended use: a runner sweeps the arms with IDENTICAL seeds/prompts, so
    that a change in the alteration signature between A1 and A2 (or A1 and
    A3) can be attributed to the one channel that arm switches on. A4 turns
    on both channels and is meant to be run only after A1-A3.
    """
    # Column order of each row below; it is also the key order of each dict.
    columns = ("arm", "alpha_sdpo", "beta_replay", "kl_beta", "note")

    # Read the shared coefficient once; all four RL arms use the same value.
    kl = LADDER_KL_BETA

    # One row per arm: (arm, alpha_sdpo, beta_replay, kl_beta, note).
    rows = (
        (
            "A0",
            None,
            None,
            None,
            "Control: altered SFT checkpoint, NO RL. Sentinel arm used to "
            "anchor the pre-RL alteration signature.",
        ),
        (
            "A1",
            0.0,
            0.0,
            kl,
            "GRPO-only baseline (both extra channels OFF). Isolates the "
            "effect of task-driven RL alone on the alteration.",
        ),
        (
            "A2",
            0.02,
            0.0,
            kl,
            "+SDPO small (amplification probe). SDPO ONLY vs A1: tests "
            "whether self-distillation against the altered model's own "
            "hint-conditioned forward pass AMPLIFIES the distortion.",
        ),
        (
            "A3",
            0.0,
            0.05,
            kl,
            "+replay-DPO small (washout probe). Trace-replay-DPO ONLY vs "
            "A1: tests whether frontier-teacher disagreement WASHES OUT the "
            "alteration toward base.",
        ),
        (
            "A4",
            0.02,
            0.05,
            kl,
            "Combined — run ONLY after A1-A3 are interpretable. Confounds "
            "channels by design; meaningful only as a capstone once the "
            "isolated arms are understood.",
        ),
    )

    # Materialise fresh dicts so callers may mutate the result freely.
    return [dict(zip(columns, row)) for row in rows]