feat(wave-b): ADR-013 LMA integration + B4 end-to-end SDPO-fires proof + doc refresh

21647a4 about 1 month ago

3.78 kB

	"""ladder.py — channel_ladder_configs (ADR-013, the experiment design).

	The isolated-channel ladder REPLACES the old combined alpha=0.2/beta=0.4 recipe
	(superseded; see docs/ALTERED_MINDS_TIE_IN.md Phase 3). Per ADR-013, a combined
	run confounds four effects (task RL, self-distillation of altered reasoning,
	frontier-teacher imitation, KL anchoring) and is scientifically uninterpretable.
	Worse, SDPO against the altered model's OWN hint-conditioned forward pass can
	AMPLIFY the distortion, so it is an experimental intervention, not a stabilizer.

	The ladder isolates channels so each effect is attributable:

	| Arm | alpha_sdpo | beta_replay | Purpose                          |
	|-----|------------|-------------|----------------------------------|
	| A0  | —          | —           | altered SFT, no RL (control)     |
	| A1  | 0.0        | 0.0         | GRPO-only baseline               |
	| A2  | 0.02       | 0.0         | +SDPO small (amplification probe)|
	| A3  | 0.0        | 0.05        | +replay-DPO small (washout probe)|
	| A4  | 0.02       | 0.05        | combined — ONLY after A1-A3      |

	``kl_beta`` (KL-to-altered-init coef) = 0.02 for all RL arms. A0 is a sentinel
	(no RL) so its alpha/beta/kl_beta are None.
	"""
	from __future__ import annotations

	from typing import Any

	# Public API: the ladder factory and the KL coefficient its RL arms share.
	__all__ = ["channel_ladder_configs", "LADDER_KL_BETA"]

	#: KL-to-altered-init coefficient applied to every RL arm (A1-A4).
	#: The no-RL control arm (A0) does not use it; its ``kl_beta`` is ``None``.
	LADDER_KL_BETA = 0.02


	def channel_ladder_configs() -> list[dict[str, Any]]:
	    """Return the ordered A0-A4 arm configs of the isolated-channel ladder.

	    Each arm is a dict with keys ``arm``, ``alpha_sdpo``, ``beta_replay``,
	    ``kl_beta`` and ``note``. A0 is the no-RL sentinel, so its
	    ``alpha_sdpo``/``beta_replay``/``kl_beta`` are ``None``; every RL arm
	    (A1-A4) uses ``LADDER_KL_BETA`` as its ``kl_beta``.

	    A runner sweeps these with IDENTICAL seeds/prompts so any observed change
	    in the alteration signature is attributable to the single channel that
	    arm turns on relative to A1.

	    Returns:
	        A new list of five new dicts, ordered A0, A1, A2, A3, A4. The list
	        and dicts are rebuilt on every call, so callers may mutate them
	        without affecting later calls.
	    """
	    # Name each "small" coefficient once so the combined arm (A4) cannot
	    # drift from the isolated arms (A2, A3) whose channels it combines.
	    sdpo_small = 0.02
	    replay_small = 0.05

	    return [
	        {
	            "arm": "A0",
	            "alpha_sdpo": None,
	            "beta_replay": None,
	            "kl_beta": None,
	            "note": (
	                "Control: altered SFT checkpoint, NO RL. Sentinel arm used to "
	                "anchor the pre-RL alteration signature."
	            ),
	        },
	        {
	            "arm": "A1",
	            "alpha_sdpo": 0.0,
	            "beta_replay": 0.0,
	            "kl_beta": LADDER_KL_BETA,
	            "note": (
	                "GRPO-only baseline (both extra channels OFF). Isolates the "
	                "effect of task-driven RL alone on the alteration."
	            ),
	        },
	        {
	            "arm": "A2",
	            "alpha_sdpo": sdpo_small,
	            "beta_replay": 0.0,
	            "kl_beta": LADDER_KL_BETA,
	            "note": (
	                "+SDPO small (amplification probe). SDPO ONLY vs A1: tests "
	                "whether self-distillation against the altered model's own "
	                "hint-conditioned forward pass AMPLIFIES the distortion."
	            ),
	        },
	        {
	            "arm": "A3",
	            "alpha_sdpo": 0.0,
	            "beta_replay": replay_small,
	            "kl_beta": LADDER_KL_BETA,
	            "note": (
	                "+replay-DPO small (washout probe). Trace-replay-DPO ONLY vs "
	                "A1: tests whether frontier-teacher disagreement WASHES OUT the "
	                "alteration toward base."
	            ),
	        },
	        {
	            "arm": "A4",
	            "alpha_sdpo": sdpo_small,
	            "beta_replay": replay_small,
	            "kl_beta": LADDER_KL_BETA,
	            "note": (
	                "Combined — run ONLY after A1-A3 are interpretable. Confounds "
	                "channels by design; meaningful only as a capstone once the "
	                "isolated arms are understood."
	            ),
	        },
	    ]