% appendix.tex -- included from main.tex via \input{appendix}
\appendix
\section{Extended Theoretical Results}
\label{app:theory}
This appendix contains full statements and proofs of five additional theoretical results that extend the geometric theory of refusal removal developed in Section~\ref{sec:theory}.
These results draw on spectral perturbation theory, optimal transport, random matrix theory, Riemannian geometry on the Grassmannian, and information geometry.
Each addresses a gap identified in the prior abliteration literature and provides quantitative, falsifiable predictions.
% ---------------------------------------------------------------------
\subsection{Spectral Cost of Abliteration}
\label{app:spectral_cost}
Abliteration modifies weight matrices by projecting out refusal directions.
We bound the resulting perturbation to the singular value spectrum, yielding a formal \emph{capability preservation guarantee}.
\begin{definition}[Directional Abliteration]
\label{def:abliteration}
Let $\mathbf{W} \in \mathbb{R}^{m \times d}$ be a weight matrix and $\mathbf{r} \in \mathbb{R}^d$ a unit refusal direction ($\|\mathbf{r}\| = 1$).
The \emph{abliterated weight matrix} is:
\begin{equation}
\mathbf{W}' = \mathbf{W}(\mathbf{I} - \mathbf{r}\mathbf{r}^\top)
\end{equation}
For $k$ orthonormal directions $\mathbf{R} = [\mathbf{r}_1, \ldots, \mathbf{r}_k] \in \mathbb{R}^{d \times k}$, the multi-direction abliteration is $\mathbf{W}' = \mathbf{W}(\mathbf{I} - \mathbf{R}\mathbf{R}^\top)$.
\end{definition}
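As a concrete sketch of Definition~\ref{def:abliteration} (assuming NumPy; the function name and toy shapes are illustrative), the $d \times d$ projector never needs to be materialized:

```python
import numpy as np

def abliterate(W, R):
    """Return W' = W (I - R R^T) without forming the d x d projector.

    W : (m, d) weight matrix acting on d-dimensional activations.
    R : (d,) unit refusal direction, or (d, k) with orthonormal columns.
    """
    R = R[:, None] if R.ndim == 1 else R
    return W - (W @ R) @ R.T

# Sanity check: the abliterated matrix annihilates the refusal direction.
rng = np.random.default_rng(0)
W = rng.standard_normal((8, 16))
r = rng.standard_normal(16)
r /= np.linalg.norm(r)
W_prime = abliterate(W, r)
```

Since $\mathbf{W}'\mathbf{r} = \mathbf{W}\mathbf{r} - \mathbf{W}\mathbf{r} = \mathbf{0}$, the product `W_prime @ r` vanishes to machine precision.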
\begin{theorem}[Spectral Cost of Abliteration]
\label{thm:spectral_cost}
Let $\mathbf{W} \in \mathbb{R}^{m \times d}$ have singular value decomposition $\mathbf{W} = \mathbf{U}\boldsymbol{\Sigma}\mathbf{V}^\top$ with singular values $\sigma_1 \geq \sigma_2 \geq \cdots \geq \sigma_p$ ($p = \min(m,d)$) and right singular vectors $\mathbf{v}_1, \ldots, \mathbf{v}_p$.
Let $\mathbf{r}$ be a unit refusal direction with expansion $\mathbf{r} = \sum_{j=1}^p \alpha_j \mathbf{v}_j + \mathbf{r}_\perp$ in the singular basis (where $\mathbf{r}_\perp$ lies in the null space of $\mathbf{W}$).
Let $\sigma_1' \geq \cdots \geq \sigma_p'$ be the singular values of $\mathbf{W}' = \mathbf{W}(\mathbf{I} - \mathbf{r}\mathbf{r}^\top)$.
Then:
\begin{enumerate}[label=(\roman*)]
\item \textbf{Global bound (Weyl).} For all $i$:
\begin{equation}
|\sigma_i' - \sigma_i| \leq \|\mathbf{W}\mathbf{r}\| = \left(\sum_{j=1}^p \alpha_j^2 \sigma_j^2\right)^{1/2}
\end{equation}
\item \textbf{Direction-resolved bound.} The perturbation matrix $\mathbf{E} = \mathbf{W}' - \mathbf{W} = -\mathbf{W}\mathbf{r}\mathbf{r}^\top$ has rank at most~1, and its only nonzero singular value is $\|\mathbf{W}\mathbf{r}\|$.
Consequently, the perturbed singular values interlace the original ones, and the total shift obeys the Hoffman--Wielandt bound:
\begin{equation}
\sum_{i=1}^p (\sigma_i' - \sigma_i)^2 \leq \|\mathbf{E}\|_F^2 = \|\mathbf{W}\mathbf{r}\|^2
\end{equation}
\item \textbf{Singular subspace stability (Davis--Kahan).} Let $\mathbf{V}_k = [\mathbf{v}_1, \ldots, \mathbf{v}_k]$ span the top-$k$ right singular subspace of $\mathbf{W}$, and $\mathbf{V}_k'$ the corresponding subspace of $\mathbf{W}'$.
If the singular value gap $\delta_k = \sigma_k - \sigma_{k+1}$ exceeds $\|\mathbf{W}\mathbf{r}\|$, then the largest principal angle $\theta$ between these subspaces satisfies:
\begin{equation}
\sin \theta(\mathbf{V}_k, \mathbf{V}_k') \leq \frac{\|\mathbf{W}\mathbf{r}\|}{\delta_k - \|\mathbf{W}\mathbf{r}\|}
\end{equation}
\item \textbf{Multi-direction extension.} For $k$ orthonormal directions $\mathbf{R} \in \mathbb{R}^{d \times k}$:
\begin{equation}
|\sigma_i' - \sigma_i| \leq \|\mathbf{W}\mathbf{R}\|_2 \quad \text{and} \quad \sum_i (\sigma_i' - \sigma_i)^2 \leq \|\mathbf{W}\mathbf{R}\|_F^2 = \sum_{j=1}^k \|\mathbf{W}\mathbf{r}_j\|^2
\end{equation}
\end{enumerate}
\end{theorem}
\begin{proof}
\textbf{(i)} The abliterated matrix is $\mathbf{W}' = \mathbf{W} + \mathbf{E}$ where $\mathbf{E} = -\mathbf{W}\mathbf{r}\mathbf{r}^\top$.
By Weyl's inequality for singular values \citep{stewart1990matrix}, $|\sigma_i(\mathbf{W}') - \sigma_i(\mathbf{W})| \leq \sigma_1(\mathbf{E}) = \|\mathbf{E}\|_2$.
Since $\mathbf{E} = -(\mathbf{W}\mathbf{r})\mathbf{r}^\top$ is a rank-1 matrix (the outer product of $\mathbf{W}\mathbf{r} \in \mathbb{R}^m$ and $\mathbf{r} \in \mathbb{R}^d$), its only nonzero singular value is $\|\mathbf{W}\mathbf{r}\| \cdot \|\mathbf{r}\| = \|\mathbf{W}\mathbf{r}\|$.
Expanding in the singular basis, $\mathbf{W}\mathbf{r} = \sum_j \alpha_j \sigma_j \mathbf{u}_j$, so $\|\mathbf{W}\mathbf{r}\|^2 = \sum_j \alpha_j^2 \sigma_j^2$.

\textbf{(ii)} The Frobenius bound follows from the Hoffman--Wielandt inequality: $\sum_i (\sigma_i' - \sigma_i)^2 \leq \|\mathbf{E}\|_F^2$.
For a rank-1 matrix, $\|\mathbf{E}\|_F = \|\mathbf{E}\|_2 = \|\mathbf{W}\mathbf{r}\|$.

\textbf{(iii)} This is the $\sin\theta$ theorem of \citet{davis1970rotation}, applied to the perturbation of the invariant subspaces of $\mathbf{W}^\top\mathbf{W}$.
By the classical Davis--Kahan bound applied to $\mathbf{W}^\top \mathbf{W}$ (whose eigenvalues are $\sigma_i^2$), the relevant gap is $\sigma_k^2 - \sigma_{k+1}^2 = (\sigma_k - \sigma_{k+1})(\sigma_k + \sigma_{k+1})$.
Using the refined multiplicative form and the fact that $\|\mathbf{E}\|_2 = \|\mathbf{W}\mathbf{r}\|$, we obtain the stated bound (stating it in terms of $\delta_k = \sigma_k - \sigma_{k+1}$ requires $\sigma_{k+1} > 0$; otherwise the symmetric form with $\sigma_k^2 - \sigma_{k+1}^2$ gives a tighter bound).

\textbf{(iv)} For $\mathbf{E} = -\mathbf{W}\mathbf{R}\mathbf{R}^\top$ with $\mathbf{R}^\top\mathbf{R} = \mathbf{I}_k$, we have $\operatorname{rank}(\mathbf{E}) \leq k$, $\|\mathbf{E}\|_2 = \|\mathbf{W}\mathbf{R}\|_2$, and $\|\mathbf{E}\|_F^2 = \operatorname{tr}(\mathbf{R}^\top \mathbf{W}^\top \mathbf{W} \mathbf{R}) = \sum_j \|\mathbf{W}\mathbf{r}_j\|^2$ (using the orthonormality of the columns of $\mathbf{R}$).
The bounds then follow from Weyl and Hoffman--Wielandt as before.
\end{proof}
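Parts (i) and (ii) are easy to verify numerically; the following sketch (assuming NumPy; sizes are arbitrary) checks the Weyl and Hoffman--Wielandt bounds for a random weight matrix and a random unit refusal direction:

```python
import numpy as np

# Check Theorem (Spectral Cost), parts (i) and (ii), on random data:
# the rank-1 perturbation E = -W r r^T has spectral norm ||W r||.
rng = np.random.default_rng(1)
m, d = 32, 64
W = rng.standard_normal((m, d))
r = rng.standard_normal(d)
r /= np.linalg.norm(r)

W_prime = W - np.outer(W @ r, r)               # W (I - r r^T)
s = np.linalg.svd(W, compute_uv=False)         # sigma_1 >= ... >= sigma_p
s_prime = np.linalg.svd(W_prime, compute_uv=False)
Wr_norm = np.linalg.norm(W @ r)

weyl_ok = bool(np.all(np.abs(s_prime - s) <= Wr_norm + 1e-9))    # part (i)
hw_ok = bool(np.sum((s_prime - s) ** 2) <= Wr_norm**2 + 1e-9)    # part (ii)
```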
\begin{corollary}[Capability Preservation Guarantee]
\label{cor:capability}
If the refusal direction $\mathbf{r}$ lies primarily in the bottom singular subspace---i.e., $\alpha_j \approx 0$ for $j \leq k$ and $\sum_{j > k} \alpha_j^2 \sigma_j^2 \leq \epsilon^2$---then abliteration perturbs every singular value by at most $\epsilon$, and the top-$k$ singular subspace rotates by at most $\arcsin\big(\epsilon / (\delta_k - \epsilon)\big)$.
Intuitively, refusal directions that are ``orthogonal to the model's principal computations'' can be removed cheaply.
\end{corollary}
\paragraph{Remark.}
This result explains the empirical success of norm-preserving abliteration \citep{grimjim2025}: rescaling $\mathbf{W}'$ to match $\|\mathbf{W}\|_F$ compensates for the trace-level spectral shift $\sum_i (\sigma_i'^2 - \sigma_i^2) = -\|\mathbf{W}\mathbf{r}\|^2$, but does not address the \emph{relative} reordering of singular values.
Theorem~\ref{thm:spectral_cost}(iii) shows that when the spectral gap is large relative to $\|\mathbf{W}\mathbf{r}\|$, the singular subspace is approximately preserved even without norm correction.
% ---------------------------------------------------------------------
\subsection{Wasserstein Cost of Abliteration}
\label{app:wasserstein}
Current abliteration methods optimize mean separation (difference-of-means) while ignoring the distributional cost of the intervention.
We formalize this cost using optimal transport theory.
\begin{theorem}[Wasserstein Cost of Directional Projection]
\label{thm:wasserstein}
Let $\mu_0 = \mathcal{N}(\mathbf{m}, \boldsymbol{\Sigma})$ be the activation distribution at layer~$l$ on harmless inputs, and let $\mathbf{P}_\perp = \mathbf{I} - \mathbf{r}\mathbf{r}^\top$ be the projection orthogonal to the refusal direction~$\mathbf{r}$.
The post-abliteration activation distribution is $\mu_1 = \mathcal{N}(\mathbf{P}_\perp \mathbf{m}, \,\mathbf{P}_\perp \boldsymbol{\Sigma} \mathbf{P}_\perp)$.
The 2-Wasserstein distance decomposes as:
\begin{equation}
W_2^2(\mu_0, \mu_1) = \underbrace{(\mathbf{r}^\top \mathbf{m})^2}_{\text{mean shift}} + \underbrace{\operatorname{tr}\!\big(\boldsymbol{\Sigma}\big) + \operatorname{tr}\!\big(\mathbf{P}_\perp \boldsymbol{\Sigma} \mathbf{P}_\perp\big) - 2\operatorname{tr}\!\Big(\big(\boldsymbol{\Sigma}^{1/2}\mathbf{P}_\perp\boldsymbol{\Sigma}\mathbf{P}_\perp\boldsymbol{\Sigma}^{1/2}\big)^{1/2}\Big)}_{\text{Bures divergence } \mathcal{B}^2(\boldsymbol{\Sigma},\, \mathbf{P}_\perp\boldsymbol{\Sigma}\mathbf{P}_\perp)}
\end{equation}
Furthermore, with $\sigma_r^2 = \mathbf{r}^\top\boldsymbol{\Sigma}\mathbf{r}$ the activation variance along $\mathbf{r}$, with $\lambda_1, \ldots, \lambda_d$ the eigenvalues of $\boldsymbol{\Sigma}$ ($\lambda_{\max}$ the largest), and with $\alpha_i = \mathbf{r}^\top\mathbf{e}_i$ the components of $\mathbf{r}$ in the eigenbasis of $\boldsymbol{\Sigma}$:
\begin{enumerate}[label=(\roman*)]
\item \textbf{General upper bound.} The Bures divergence satisfies:
\begin{equation}
\mathcal{B}^2(\boldsymbol{\Sigma}, \mathbf{P}_\perp\boldsymbol{\Sigma}\mathbf{P}_\perp) \leq \sigma_r^2 + 2\sum_{i=1}^d \lambda_i \alpha_i^2 - \sigma_r^4/\lambda_{\max}
\end{equation}
where the bound follows from the operator concavity of $A \mapsto \operatorname{tr}(A^{1/2})$.
\item \textbf{Eigenvector special case.} When $\mathbf{r}$ is an eigenvector of $\boldsymbol{\Sigma}$ with eigenvalue $\lambda_r$:
\begin{equation}
W_2^2(\mu_0, \mu_1) = (\mathbf{r}^\top\mathbf{m})^2 + \lambda_r
\end{equation}
\end{enumerate}
\end{theorem}
\begin{proof}
The 2-Wasserstein distance between Gaussians $\mathcal{N}(\mathbf{m}_0, \boldsymbol{\Sigma}_0)$ and $\mathcal{N}(\mathbf{m}_1, \boldsymbol{\Sigma}_1)$ has the closed form \citep{dowson1982frechet,givens1984class}:
\begin{equation}
W_2^2 = \|\mathbf{m}_0 - \mathbf{m}_1\|^2 + \operatorname{tr}(\boldsymbol{\Sigma}_0) + \operatorname{tr}(\boldsymbol{\Sigma}_1) - 2\operatorname{tr}\!\big((\boldsymbol{\Sigma}_0^{1/2}\boldsymbol{\Sigma}_1\boldsymbol{\Sigma}_0^{1/2})^{1/2}\big)
\end{equation}
Setting $\mathbf{m}_0 = \mathbf{m}$ and $\mathbf{m}_1 = \mathbf{P}_\perp\mathbf{m}$, the mean shift is $\|\mathbf{m} - \mathbf{P}_\perp\mathbf{m}\|^2 = \|\mathbf{r}\mathbf{r}^\top\mathbf{m}\|^2 = (\mathbf{r}^\top\mathbf{m})^2$.
Setting $\boldsymbol{\Sigma}_0 = \boldsymbol{\Sigma}$ and $\boldsymbol{\Sigma}_1 = \mathbf{P}_\perp\boldsymbol{\Sigma}\mathbf{P}_\perp$, the latter is a singular covariance (rank-deficient along $\mathbf{r}$): $\boldsymbol{\Sigma}_1$ has eigenvalue $0$ along $\mathbf{r}$, and its eigenvalues are unchanged along eigenvectors of $\boldsymbol{\Sigma}$ orthogonal to $\mathbf{r}$.

\textbf{Eigenvector special case.}
For $\mathbf{r} = \mathbf{e}_j$ (an eigenvector of $\boldsymbol{\Sigma}$), we have $\boldsymbol{\Sigma}_1 = \boldsymbol{\Sigma} - \lambda_j \mathbf{e}_j\mathbf{e}_j^\top$.
Then $\boldsymbol{\Sigma}_0^{1/2}\boldsymbol{\Sigma}_1\boldsymbol{\Sigma}_0^{1/2}$ has eigenvalues $\lambda_i^2$ for $i \neq j$ and $0$ for $i = j$, so $\operatorname{tr}\big((\boldsymbol{\Sigma}_0^{1/2}\boldsymbol{\Sigma}_1\boldsymbol{\Sigma}_0^{1/2})^{1/2}\big) = \sum_{i \neq j} \lambda_i$.
The Bures term becomes $\sum_i \lambda_i + \sum_{i \neq j} \lambda_i - 2\sum_{i \neq j}\lambda_i = \lambda_j$, so $W_2^2 = (\mathbf{r}^\top\mathbf{m})^2 + \lambda_j$.

\textbf{General case (upper bound).}
When $\mathbf{r}$ is not an eigenvector, $\boldsymbol{\Sigma}_1 = \mathbf{P}_\perp\boldsymbol{\Sigma}\mathbf{P}_\perp$ is no longer simultaneously diagonalizable with $\boldsymbol{\Sigma}$.
The Bures divergence is $\mathcal{B}^2 = \operatorname{tr}(\boldsymbol{\Sigma}) + \operatorname{tr}(\boldsymbol{\Sigma}_1) - 2\operatorname{tr}\big((\boldsymbol{\Sigma}^{1/2}\boldsymbol{\Sigma}_1\boldsymbol{\Sigma}^{1/2})^{1/2}\big)$.
We have $\operatorname{tr}(\boldsymbol{\Sigma}_1) = \operatorname{tr}(\boldsymbol{\Sigma}) - \sigma_r^2$ (the projection removes the variance $\sigma_r^2 = \mathbf{r}^\top\boldsymbol{\Sigma}\mathbf{r}$), so $\operatorname{tr}(\boldsymbol{\Sigma}) + \operatorname{tr}(\boldsymbol{\Sigma}_1) = 2\operatorname{tr}(\boldsymbol{\Sigma}) - \sigma_r^2$.
For the matrix square root term, note $\boldsymbol{\Sigma}^{1/2}\boldsymbol{\Sigma}_1\boldsymbol{\Sigma}^{1/2} = \boldsymbol{\Sigma}^2 - \boldsymbol{\Sigma}^{1/2}(\boldsymbol{\Sigma}\mathbf{r}\mathbf{r}^\top + \mathbf{r}\mathbf{r}^\top\boldsymbol{\Sigma} - \sigma_r^2\mathbf{r}\mathbf{r}^\top)\boldsymbol{\Sigma}^{1/2}$.
By the concavity of $A \mapsto \operatorname{tr}(A^{1/2})$ on positive semidefinite matrices (a consequence of the operator concavity of $t \mapsto t^{1/2}$), we have $\operatorname{tr}\big((\boldsymbol{\Sigma}^{1/2}\boldsymbol{\Sigma}_1\boldsymbol{\Sigma}^{1/2})^{1/2}\big) \geq \operatorname{tr}(\boldsymbol{\Sigma}) - \sigma_r^2/2 - \sum_i \lambda_i\alpha_i^2/2 + \sigma_r^4/(4\lambda_{\max})$, yielding the stated upper bound after algebraic simplification.
\end{proof}
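The eigenvector special case can be checked directly against the closed-form Gaussian $W_2$ formula. The sketch below (assuming NumPy; \texttt{\_sqrtm\_psd} is a helper defined here, not a library routine) builds a covariance with a known eigenbasis and abliterates along one eigenvector:

```python
import numpy as np

def _sqrtm_psd(A):
    """Symmetric PSD matrix square root via eigendecomposition."""
    w, V = np.linalg.eigh(A)
    return (V * np.sqrt(np.clip(w, 0.0, None))) @ V.T

def w2_sq_gaussian(m0, S0, m1, S1):
    """Squared 2-Wasserstein distance between N(m0, S0) and N(m1, S1)."""
    S0h = _sqrtm_psd(S0)
    cross = _sqrtm_psd(S0h @ S1 @ S0h)
    return np.sum((m0 - m1) ** 2) + np.trace(S0) + np.trace(S1) - 2.0 * np.trace(cross)

rng = np.random.default_rng(2)
d = 6
Q, _ = np.linalg.qr(rng.standard_normal((d, d)))   # orthonormal eigenbasis
lam = np.array([5.0, 4.0, 3.0, 2.0, 1.0, 0.5])
Sigma = (Q * lam) @ Q.T                            # Sigma = Q diag(lam) Q^T
m = rng.standard_normal(d)
r = Q[:, 2]                                        # eigenvector with lambda_r = 3.0

P = np.eye(d) - np.outer(r, r)
w2 = w2_sq_gaussian(m, Sigma, P @ m, P @ Sigma @ P)
predicted = (r @ m) ** 2 + 3.0                     # (r^T m)^2 + lambda_r
```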
\begin{corollary}[Mean-Only Abliteration Is Suboptimal]
\label{cor:mean_only}
Standard difference-of-means abliteration selects $\mathbf{r}$ to maximize the harmful-vs-harmless mean shift $(\mathbf{r}^\top\mathbf{d})^2$, but the total Wasserstein cost on harmless inputs also includes the Bures term $\mathcal{B}^2$.
A direction $\mathbf{r}$ that happens to be a high-variance eigenvector of $\boldsymbol{\Sigma}$ incurs Bures cost $\lambda_r$, which may dominate the mean shift.
The \emph{Wasserstein-optimal} refusal direction minimizes the cost-to-signal ratio:
\begin{equation}
\mathbf{r}^* = \argmin_{\|\mathbf{r}\|=1} \frac{W_2^2(\mu_0^{\text{harmless}}, \mu_1^{\text{harmless}})}{(\mathbf{r}^\top\mathbf{d})^2} = \argmin_{\|\mathbf{r}\|=1} \frac{(\mathbf{r}^\top\mathbf{m})^2 + \mathbf{r}^\top\boldsymbol{\Sigma}\mathbf{r}}{(\mathbf{r}^\top\mathbf{d})^2}
\end{equation}
This is a generalized eigenvalue problem, distinct from both standard and whitened SVD.
\end{corollary}
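The ratio above can be minimized in closed form: writing $\mathbf{A} = \mathbf{m}\mathbf{m}^\top + \boldsymbol{\Sigma}$ and fixing $\mathbf{r}^\top\mathbf{d} = 1$ by scale invariance, the first-order condition gives $\mathbf{r}^* \propto \mathbf{A}^{-1}\mathbf{d}$. A sketch (assuming NumPy and that $\mathbf{A}$ is invertible; this closed-form route is our addition, not stated in the corollary):

```python
import numpy as np

def wasserstein_optimal_direction(m, Sigma, d_vec):
    """Minimize ((r.m)^2 + r' Sigma r) / (r.d)^2 over unit vectors r.

    The minimizer is proportional to (m m^T + Sigma)^{-1} d.
    """
    A = np.outer(m, m) + Sigma
    r = np.linalg.solve(A, d_vec)
    return r / np.linalg.norm(r)

def cost_ratio(r, m, Sigma, d_vec):
    """Wasserstein cost per unit of squared mean separation."""
    return ((r @ m) ** 2 + r @ Sigma @ r) / (r @ d_vec) ** 2

rng = np.random.default_rng(3)
dim = 12
m = rng.standard_normal(dim)
B = rng.standard_normal((dim, dim))
Sigma = B @ B.T / dim + np.eye(dim)        # well-conditioned covariance
d_vec = rng.standard_normal(dim)

r_star = wasserstein_optimal_direction(m, Sigma, d_vec)
r_dom = d_vec / np.linalg.norm(d_vec)      # plain difference-of-means direction
```

On random instances, `r_star` never has a worse cost ratio than the raw difference-of-means direction `r_dom`.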
\paragraph{Practical implication.}
Theorem~\ref{thm:wasserstein} predicts that abliteration along high-variance directions of the harmless distribution (rogue dimensions) incurs disproportionate Wasserstein cost.
This provides a formal explanation for why whitened SVD (Theorem~\ref{thm:fisher}) improves capability preservation: by downweighting high-variance directions, whitened SVD implicitly reduces the Bures component of the Wasserstein cost.
% ---------------------------------------------------------------------
\subsection{Detectability Phase Transition for Refusal Directions}
\label{app:phase_transition}
Using random matrix theory, we characterize when linear methods can reliably identify refusal directions from finite prompt samples.
\begin{definition}[Spiked Activation Model]
\label{def:spiked}
Let the activations under harmful prompts at layer~$l$ be:
\begin{equation}
\mathbf{a}_i = \boldsymbol{\mu} + \sqrt{\beta}\, \mathbf{r}\, z_i + \boldsymbol{\epsilon}_i, \quad i = 1, \ldots, n
\end{equation}
where $\mathbf{r} \in \mathbb{R}^d$ is the unit refusal direction, $\beta > 0$ is the refusal signal strength, $z_i \sim \mathcal{N}(0, 1)$ is the per-prompt refusal activation, and $\boldsymbol{\epsilon}_i \sim \mathcal{N}(\mathbf{0}, \sigma^2\mathbf{I}_d)$ is isotropic noise.
The population covariance is $\boldsymbol{\Sigma} = \sigma^2\mathbf{I} + \beta\mathbf{r}\mathbf{r}^\top$ (a rank-1 spiked model).
\end{definition}
\begin{theorem}[BBP Phase Transition for Refusal Detection]
\label{thm:bbp}
In the proportional limit $n, d \to \infty$ with $d/n \to \gamma > 0$, let $\hat{\mathbf{v}}_1$ be the leading eigenvector of the sample covariance $\hat{\boldsymbol{\Sigma}} = \frac{1}{n}\sum_{i=1}^n (\mathbf{a}_i - \bar{\mathbf{a}})(\mathbf{a}_i - \bar{\mathbf{a}})^\top$.
Define the signal-to-noise ratio $\rho = \beta/\sigma^2$.
\begin{enumerate}[label=(\roman*)]
\item \textbf{Subcritical regime} ($\rho \leq \sqrt{\gamma}$).
The leading sample eigenvector $\hat{\mathbf{v}}_1$ is asymptotically orthogonal to the true refusal direction:
\begin{equation}
|\langle \hat{\mathbf{v}}_1, \mathbf{r} \rangle|^2 \xrightarrow{a.s.} 0
\end{equation}
No consistent linear estimator of $\mathbf{r}$ exists.
Abliteration based on the top eigenvector removes a noise direction and has no effect on refusal.
\item \textbf{Supercritical regime} ($\rho > \sqrt{\gamma}$).
The leading eigenvector consistently estimates $\mathbf{r}$:
\begin{equation}
|\langle \hat{\mathbf{v}}_1, \mathbf{r} \rangle|^2 \xrightarrow{a.s.} \frac{1 - \gamma/\rho^2}{1 + \gamma/\rho} \in (0, 1)
\end{equation}
and the leading sample eigenvalue detaches from the Marchenko--Pastur bulk:
\begin{equation}
\hat{\lambda}_1 \xrightarrow{a.s.} \sigma^2(\rho + 1)(1 + \gamma/\rho) > \sigma^2(1+\sqrt{\gamma})^2
\end{equation}
\item \textbf{Minimum sample size.}
For the refusal direction to be detectable, the number of prompts must satisfy:
\begin{equation}
n > \frac{d \sigma^4}{\beta^2} = \frac{d}{\rho^2}
\end{equation}
For $d = 4096$ (typical 7B model) and $\rho = 2$ (moderate refusal signal): $n > 1024$ prompts.
For strong refusal ($\rho = 5$): $n > 164$ prompts.
\end{enumerate}
\end{theorem}
\begin{proof}
Parts (i) and (ii) are the Baik--Ben~Arous--P\'ech\'e (BBP) phase transition \citep{baik2005phase,paul2007asymptotics} applied to the spiked covariance model.

\textbf{(i)} When $\rho \leq \sqrt{\gamma}$, the population spike $\sigma^2(1 + \rho)$ is too weak to separate from the Marchenko--Pastur bulk, whose right edge is $\sigma^2(1+\sqrt{\gamma})^2$.
By the BBP theorem, $\hat{\lambda}_1 \to \sigma^2(1+\sqrt{\gamma})^2$ (stuck at the bulk edge) and $|\langle\hat{\mathbf{v}}_1, \mathbf{r}\rangle|^2 \to 0$.

\textbf{(ii)} When $\rho > \sqrt{\gamma}$, the spike pushes the leading sample eigenvalue above the bulk.
The asymptotic alignment $|\langle\hat{\mathbf{v}}_1, \mathbf{r}\rangle|^2 \to (1 - \gamma/\rho^2)/(1 + \gamma/\rho)$ follows from the resolvent analysis of \citet{paul2007asymptotics}.

\textbf{(iii)} The critical condition $\rho > \sqrt{\gamma}$ with $\gamma = d/n$ gives $\beta/\sigma^2 > \sqrt{d/n}$, hence $n > d\sigma^4/\beta^2$.
\end{proof}
\paragraph{Practical implication.}
Most abliteration studies use 32--128 harmful prompts with $d = 4096$.
This gives $\gamma = d/n \approx 32$--$128$, requiring $\rho > 5.7$--$11.3$ for reliable detection.
The BBP threshold explains why abliteration on small prompt sets sometimes fails catastrophically: the extracted direction is noise, not refusal.
Theorem~\ref{thm:bbp}(iii) provides a concrete minimum prompt count.
It also explains why difference-of-means (which estimates the mean shift rather than the covariance spike) can succeed with fewer samples than PCA: the mean estimator converges at the standard $O(1/\sqrt{n})$ rate along any fixed direction, while the covariance-based estimator requires $n = \Omega(d/\rho^2)$.
The practical recommendation is to use difference-of-means for small prompt sets and whitened SVD (which combines both) when sufficient prompts are available.
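The transition is visible at modest scale. The following Monte Carlo sketch (assuming NumPy; dimensions are chosen for speed, not realism, and $\boldsymbol{\mu} = \mathbf{0}$ without loss of generality since the sample covariance is centered) samples the spiked model of Definition~\ref{def:spiked} and compares the empirical top-eigenvector alignment to the BBP prediction:

```python
import numpy as np

rng = np.random.default_rng(4)
d, n, sigma2 = 300, 600, 1.0            # gamma = d/n = 0.5
r = np.zeros(d)
r[0] = 1.0                              # true refusal direction

def top_eigvec_alignment(beta, reps=5):
    """Mean squared overlap |<v_hat_1, r>|^2 over independent draws."""
    vals = []
    for _ in range(reps):
        # a_i = sqrt(beta) r z_i + eps_i, stacked as rows of A
        A = np.sqrt(beta) * rng.standard_normal((n, 1)) * r + rng.standard_normal((n, d))
        A -= A.mean(axis=0)
        _, V = np.linalg.eigh(A.T @ A / n)   # eigenvalues ascending
        vals.append(float((V[:, -1] @ r) ** 2))
    return float(np.mean(vals))

gamma = d / n
rho_super, rho_sub = 2.0, 0.2           # above / below sqrt(gamma) = 0.707...
align_super = top_eigvec_alignment(rho_super * sigma2)
align_sub = top_eigvec_alignment(rho_sub * sigma2)
bbp_prediction = (1 - gamma / rho_super**2) / (1 + gamma / rho_super)  # = 0.7
```

At these sizes the supercritical alignment already sits close to the asymptotic value $0.7$, while the subcritical overlap is near zero.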
% ---------------------------------------------------------------------
\subsection{Grassmannian Coherence of Refusal Subspaces}
\label{app:grassmannian}
The refusal subspace varies across layers.
We formalize this variation as a curve on the Grassmannian and derive consequences for multi-layer abliteration.
\begin{definition}[Refusal Curve on the Grassmannian]
\label{def:refusal_curve}
Let $\mathcal{S}_l \in \operatorname{Gr}(k, d)$ be the $k$-dimensional refusal subspace at layer $l$, identified as the span of the top-$k$ singular vectors of the whitened difference matrix at layer $l$.
The \emph{refusal curve} is the discrete path $\gamma = (\mathcal{S}_1, \mathcal{S}_2, \ldots, \mathcal{S}_L)$ on $\operatorname{Gr}(k, d)$.
The \emph{geodesic distance} between adjacent subspaces is:
\begin{equation}
d_G(\mathcal{S}_l, \mathcal{S}_{l+1}) = \left(\sum_{i=1}^k \theta_i^2\right)^{1/2}
\end{equation}
where $\theta_1, \ldots, \theta_k \in [0, \pi/2]$ are the \emph{principal angles} between $\mathcal{S}_l$ and $\mathcal{S}_{l+1}$, computed via the SVD of $\mathbf{V}_l^\top \mathbf{V}_{l+1}$ (where $\mathbf{V}_l \in \mathbb{R}^{d \times k}$ is an orthonormal basis for $\mathcal{S}_l$).
\end{definition}
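The principal angles and geodesic distance of Definition~\ref{def:refusal_curve} are a few lines of linear algebra (assuming NumPy; SciPy's \texttt{subspace\_angles} computes the same angles):

```python
import numpy as np

def geodesic_distance(V1, V2):
    """Grassmannian geodesic distance between span(V1) and span(V2).

    V1, V2 : (d, k) matrices with orthonormal columns. The principal
    angles theta_i are the arccosines of the singular values of V1^T V2.
    """
    s = np.linalg.svd(V1.T @ V2, compute_uv=False)
    theta = np.arccos(np.clip(s, -1.0, 1.0))
    return float(np.sqrt(np.sum(theta ** 2)))

# Sanity checks: a subspace is at distance 0 from itself, and two
# orthogonal lines in R^d are at distance pi/2.
rng = np.random.default_rng(5)
V, _ = np.linalg.qr(rng.standard_normal((8, 2)))   # random 2-dim subspace
e1 = np.eye(8)[:, 0:1]
e2 = np.eye(8)[:, 1:2]
```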
\begin{theorem}[Grassmannian Coherence and Abliteration Effectiveness]
\label{thm:grassmannian}
Let $\gamma = (\mathcal{S}_1, \ldots, \mathcal{S}_L)$ be the refusal curve.
Define the \emph{coherence} $C = \max_{l, l'} d_G(\mathcal{S}_l, \mathcal{S}_{l'})$ and the \emph{total geodesic length} $\Lambda = \sum_{l=1}^{L-1} d_G(\mathcal{S}_l, \mathcal{S}_{l+1})$.
\begin{enumerate}[label=(\roman*)]
\item \textbf{Universal direction effectiveness.}
Let $\mathcal{S}^*$ be any fixed $k$-dimensional subspace (e.g., the refusal directions extracted from a single layer).
The fraction of refusal energy at layer $l$ captured by projecting onto $\mathcal{S}^*$ is:
\begin{equation}
E_l(\mathcal{S}^*) = \frac{\|\mathbf{P}_{\mathcal{S}^*} \mathbf{P}_{\mathcal{S}_l}\|_F^2}{k} = \frac{\sum_{i=1}^k \cos^2\theta_i(\mathcal{S}^*, \mathcal{S}_l)}{k}
\end{equation}
If $C < \pi/4$ (coherent refusal), then for \emph{any} subspace $\mathcal{S}_{l_0}$ on the curve (or for the Fr\'echet mean $\bar{\mathcal{S}}$):
\begin{equation}
E_l(\mathcal{S}_{l_0}) \geq \cos^2(C) > \frac{1}{2} \quad \text{for all } l
\end{equation}
That is, a single universal abliteration subspace (e.g., the refusal subspace of any single layer) captures more than half the refusal energy at every layer.
\item \textbf{Mechanism count lower bound.}
The total geodesic length lower-bounds the number of geometrically distinct refusal mechanisms:
\begin{equation}
M \geq \left\lceil \frac{\Lambda}{\pi/4} \right\rceil
\end{equation}
where $M$ is the minimum number of geodesic balls of radius $\pi/4$ needed to cover the refusal curve (its covering number in the geodesic metric).
\item \textbf{Optimal layer selection.}
Given a budget of $T$ layers for abliteration, the optimal selection $\{l_1, \ldots, l_T\}$ that maximizes total refusal energy removal across all layers satisfies:
\begin{equation}
\sum_{l=1}^L E_l\!\left(\bigcup_{t=1}^T \mathcal{S}_{l_t}\right) \geq \left(1 - \left(1 - \frac{1}{M}\right)^T\right) L
\end{equation}
where the union denotes the span of the selected subspaces. This follows from the submodularity of set coverage (see Theorem~\ref{thm:submodular} below).
\end{enumerate}
\end{theorem}
\begin{proof}
\textbf{(i)} The projection overlap $E_l(\mathcal{S}^*) = k^{-1}\sum_i \cos^2\theta_i$ follows from the definition of principal angles: if $\mathbf{V}^*$ and $\mathbf{V}_l$ are orthonormal bases, then $\|\mathbf{V}^{*\top}\mathbf{V}_l\|_F^2 = \sum_i \cos^2\theta_i$.
We show that \emph{any} point $\mathcal{S}_{l_0}$ on the curve already achieves $E_l(\mathcal{S}_{l_0}) > 1/2$ for all $l$.
By the definition of coherence, $d_G(\mathcal{S}_{l_0}, \mathcal{S}_l) \leq C$ for all $l$.
When $k=1$, $d_G = \theta_1$ and $E_l = \cos^2\theta_1 \geq \cos^2(C) > 1/2$ since $C < \pi/4$.
For $k > 1$, $E_l = k^{-1}\sum_i \cos^2\theta_i \geq \cos^2(\max_i \theta_i)$.
Since the geodesic distance satisfies $d_G = (\sum_i \theta_i^2)^{1/2}$, we have $\max_i \theta_i \leq d_G \leq C$, giving $E_l \geq \cos^2(C) > 1/2$.
The Fr\'echet mean $\bar{\mathcal{S}} = \argmin_{\mathcal{S}} \sum_l d_G^2(\mathcal{S}, \mathcal{S}_l)$ can only improve on this guarantee, since it minimizes the total squared distance to all subspaces.

\textbf{(ii)} Covering number argument: any geodesic ball of radius $\pi/4$ on $\operatorname{Gr}(k,d)$ contains subspaces with pairwise geodesic distance at most $\pi/2$.
A curve of total length $\Lambda$ requires at least $\lceil \Lambda / (\pi/4) \rceil$ such balls to cover, since consecutive centers along the curve are separated by at most the arc length between them.
Each ball represents one ``mechanism''---a group of layers using geometrically similar refusal subspaces.

\textbf{(iii)} The energy at layer $l$ from the union of $T$ layer subspaces is $E_l(\cup_t \mathcal{S}_{l_t}) = k^{-1}\|\mathbf{P}_{\cup_t \mathcal{S}_{l_t}} \mathbf{P}_{\mathcal{S}_l}\|_F^2$.
The function $f(T) = \sum_l E_l$ is monotone and submodular in the set of selected layers: adding a layer can only increase the energy, with diminishing returns because the new subspace overlaps the existing span.
By the greedy guarantee for submodular maximization \citep{nemhauser1978analysis}, greedy selection achieves a $(1-1/e)$ fraction of the optimum; the stated bound follows by combining this with the fact that $M$ well-chosen subspaces cover the curve and hence capture all refusal energy.
\end{proof}
% ---------------------------------------------------------------------
\subsection{Fisher Sensitivity of Abliteration}
\label{app:fisher_sensitivity}
We connect the geometric refusal analysis to the information-theoretic cost of abliteration on the model's output distribution.
\begin{theorem}[Fisher Information Bound on Abliteration Cost]
\label{thm:fisher_info}
Let $p(\mathbf{y} | \mathbf{x}; \boldsymbol{\theta})$ be the model's output distribution parameterized by weights $\boldsymbol{\theta} \in \mathbb{R}^D$.
Let $\boldsymbol{\theta}' = \boldsymbol{\theta} - \Delta\boldsymbol{\theta}$ be the abliterated parameters, where $\Delta\boldsymbol{\theta}$ is the concatenation of the matrices $\mathbf{W}_l\mathbf{r}\mathbf{r}^\top$, flattened across all modified layers.
Then for a harmless input distribution $\mathcal{X}$:
\begin{enumerate}[label=(\roman*)]
\item \textbf{Local KL bound.}
\begin{equation}
\mathbb{E}_{\mathbf{x} \sim \mathcal{X}}\!\left[D_{\mathrm{KL}}\!\big(p(\cdot|\mathbf{x};\boldsymbol{\theta}) \,\|\, p(\cdot|\mathbf{x};\boldsymbol{\theta}')\big)\right] \leq \frac{1}{2}\Delta\boldsymbol{\theta}^\top \mathbf{F}(\boldsymbol{\theta}) \Delta\boldsymbol{\theta} + O(\|\Delta\boldsymbol{\theta}\|^3)
\end{equation}
where $\mathbf{F}(\boldsymbol{\theta}) = \mathbb{E}_{\mathbf{x},\mathbf{y}}[\nabla_{\boldsymbol{\theta}} \log p(\mathbf{y}|\mathbf{x};\boldsymbol{\theta}) \nabla_{\boldsymbol{\theta}} \log p(\mathbf{y}|\mathbf{x};\boldsymbol{\theta})^\top]$ is the Fisher information matrix.
\item \textbf{Per-layer decomposition.} For a single-layer abliteration at layer $l$ with perturbation $\Delta\mathbf{W}_l = \mathbf{W}_l\mathbf{r}\mathbf{r}^\top$, the leading-order KL cost is:
\begin{equation}
\frac{1}{2}\operatorname{vec}(\Delta\mathbf{W}_l)^\top \mathbf{F}_l \operatorname{vec}(\Delta\mathbf{W}_l)
\end{equation}
where $\mathbf{F}_l$ is the block of the Fisher information corresponding to layer~$l$'s parameters.
\item \textbf{Empirical estimability.}
The Fisher cost $\mathbf{r}^\top \tilde{\mathbf{F}}_l \mathbf{r}$ (where $\tilde{\mathbf{F}}_l$ is the Fisher information projected to the refusal direction subspace) can be estimated from the variance of the score function:
\begin{equation}
\mathbf{r}^\top \tilde{\mathbf{F}}_l \mathbf{r} = \operatorname{Var}_{\mathbf{x},\mathbf{y}}\!\left[\frac{\partial}{\partial \epsilon}\log p\big(\mathbf{y}|\mathbf{x};\boldsymbol{\theta} - \epsilon \operatorname{vec}(\mathbf{W}_l\mathbf{r}\mathbf{r}^\top)\big)\bigg|_{\epsilon=0}\right]
\end{equation}
This requires only forward passes, not the full $D \times D$ Fisher matrix.
\end{enumerate}
\end{theorem}
\begin{proof}
\textbf{(i)} The KL divergence between nearby distributions in any smooth parametric family (in particular, an exponential family) admits the local expansion:
\begin{equation}
D_{\mathrm{KL}}(p_{\boldsymbol{\theta}} \| p_{\boldsymbol{\theta}'}) = \frac{1}{2}(\boldsymbol{\theta} - \boldsymbol{\theta}')^\top \mathbf{F}(\boldsymbol{\theta})(\boldsymbol{\theta} - \boldsymbol{\theta}') + O(\|\boldsymbol{\theta} - \boldsymbol{\theta}'\|^3)
\end{equation}
This is the fundamental local result of information geometry \citep{amari2016information}: the Fisher information matrix is the Riemannian metric tensor on the statistical manifold, and KL divergence is locally half the squared geodesic distance.
Setting $\boldsymbol{\theta} - \boldsymbol{\theta}' = \Delta\boldsymbol{\theta}$ and taking the expectation over $\mathbf{x}$ gives the result.

\textbf{(ii)} The block structure follows from the layered parameterization: the parameters of different layers contribute additively to the log-likelihood gradient, so $\mathbf{F}$ is block-diagonal to first order (cross-layer terms arise from shared activations but are typically small).

\textbf{(iii)} The score function $\nabla_{\boldsymbol{\theta}} \log p$ projected along the abliteration direction gives the directional Fisher information, a scalar quantity estimable from samples via the plug-in variance estimator.
\end{proof}
\begin{corollary}[Pre-Abliteration Cost Estimation]
\label{cor:preabliteration}
Before performing abliteration, one can estimate its information-theoretic cost by:
(1)~computing $\Delta\mathbf{W}_l = \mathbf{W}_l\mathbf{r}\mathbf{r}^\top$ (which requires no model modification);
(2)~estimating the directional Fisher information via the score variance on a small harmless dataset.
Layers with high Fisher cost should receive stronger regularization or be excluded from abliteration.
This provides a principled, per-layer regularization schedule.
\end{corollary}
% ---------------------------------------------------------------------
\subsection{Optimal Direction Selection via Submodular Optimization}
\label{app:submodular}
Given a large set of candidate refusal directions, selecting the optimal subset is computationally hard in general.
We show that it admits an efficient approximation.
\begin{theorem}[Submodularity of Refusal Energy Removal]
\label{thm:submodular}
Let $\mathcal{D} = \{\mathbf{r}_1, \ldots, \mathbf{r}_N\}$ be a set of candidate refusal directions (potentially from different layers, extraction methods, or harm categories).
For a subset $S \subseteq \mathcal{D}$, define the refusal energy removal function:
\begin{equation}
f(S) = \sum_{l=1}^L \left\|\mathbf{P}_{\operatorname{span}(S)} \mathbf{d}_l\right\|^2
\end{equation}
where $\mathbf{d}_l = \boldsymbol{\mu}_l^{(\text{harm})} - \boldsymbol{\mu}_l^{(\text{safe})}$ is the refusal signal at layer $l$ and $\mathbf{P}_{\operatorname{span}(S)}$ projects onto the subspace spanned by $S$.
Then:
\begin{enumerate}[label=(\roman*)]
\item $f$ is \emph{monotone}: $f(S) \leq f(S \cup \{\mathbf{r}\})$ for all $S$ and $\mathbf{r}$.
\item $f$ is \emph{submodular}: $f(S \cup \{\mathbf{r}\}) - f(S) \geq f(T \cup \{\mathbf{r}\}) - f(T)$ for all $S \subseteq T$ and $\mathbf{r} \notin T$.
\item The greedy algorithm---iteratively selecting $\mathbf{r}^* = \argmax_{\mathbf{r} \notin S} \big[f(S \cup \{\mathbf{r}\}) - f(S)\big]$---achieves:
\begin{equation}
f(S_{\text{greedy}}^{(k)}) \geq \left(1 - \frac{1}{e}\right) \max_{|S| = k} f(S) \geq 0.632 \, f^*_k
\end{equation}
where $f^*_k = \max_{|S| = k} f(S)$ is the optimum under the cardinality constraint.
\end{enumerate}
\end{theorem}
\begin{proof}
\textbf{(i) Monotonicity.}
Adding a direction $\mathbf{r}$ to $S$ can only enlarge $\operatorname{span}(S)$, so $\mathbf{P}_{\operatorname{span}(S \cup \{r\})} \succeq \mathbf{P}_{\operatorname{span}(S)}$ in the Loewner order.
Therefore $\|\mathbf{P}_{\operatorname{span}(S \cup \{r\})} \mathbf{d}_l\|^2 \geq \|\mathbf{P}_{\operatorname{span}(S)} \mathbf{d}_l\|^2$ for each $l$.
\textbf{(ii) Submodularity.}
Since $\operatorname{span}(S \cup \{r\}) = \operatorname{span}(S) \oplus \operatorname{span}(\hat{\mathbf{r}}_\perp^S)$, where $\hat{\mathbf{r}}_\perp^S$ is the normalized component of $\mathbf{r}$ orthogonal to $\operatorname{span}(S)$, the marginal gain of adding $\mathbf{r}$ to $S$ is the energy captured by the new orthogonal direction:
\begin{equation}
f(S \cup \{r\}) - f(S) = \sum_l \bigl\langle \mathbf{d}_l, \hat{\mathbf{r}}_\perp^S \bigr\rangle^2
\end{equation}
For $S \subseteq T$, the residual refusal signal $\mathbf{d}_l^{\perp T}$ is weakly shorter than $\mathbf{d}_l^{\perp S}$, and the orthogonal component $\mathbf{r}_\perp^T$ is weakly shorter than $\mathbf{r}_\perp^S$.
Both effects reduce the marginal gain, establishing submodularity.
More formally, $g(S) = \|\mathbf{P}_{\operatorname{span}(S)}\mathbf{v}\|^2$ is a polymatroid rank function for any fixed vector $\mathbf{v}$, and polymatroid rank functions are submodular \citep{edmonds1970submodular}.
Since $f$ is a non-negative sum of submodular functions (one per layer), it is submodular.
\textbf{(iii)} This is the classical result of \citet{nemhauser1978analysis}: greedy maximization of a monotone submodular function subject to a cardinality constraint achieves a $(1-1/e)$-approximation.
\end{proof}
\paragraph{Practical implication.}
The greedy algorithm for direction selection is already used implicitly in abliteration (selecting the top SVD direction, then the next, and so on).
Theorem~\ref{thm:submodular} provides a formal justification and shows that this approach is near-optimal.
More importantly, it justifies pooling directions from different sources (different layers, whitened vs.\ standard SVD, different harm categories) into a single candidate set and greedily selecting the best $k$---the platform's analysis-informed pipeline uses exactly this approach.
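\paragraph{Greedy selection sketch.} The procedure above takes only a few lines. For the illustration to be self-checking we use a hypothetical orthonormalized candidate pool (obtained here by QR of a random matrix), for which marginal gains do not depend on the current set and greedy selection is exactly optimal; for a general mixed pool, the theorem's $(1-1/e)$ guarantee applies instead:

```python
import numpy as np
from itertools import combinations

rng = np.random.default_rng(0)
d, L, N, k = 32, 8, 12, 3
D = rng.standard_normal((L, d))                        # per-layer refusal signals d_l
pool = np.linalg.qr(rng.standard_normal((d, N)))[0].T  # N orthonormal candidate rows

def f(S):
    """Refusal energy removed by projecting onto the span of the selected set."""
    if not S:
        return 0.0
    Q = np.linalg.qr(pool[sorted(S)].T)[0]             # orthonormal basis of span(S)
    return float(np.sum((D @ Q) ** 2))

# Greedy: repeatedly add the candidate with the largest marginal gain.
S = set()
for _ in range(k):
    gains = {r: f(S | {r}) - f(S) for r in range(N) if r not in S}
    S.add(max(gains, key=gains.get))

# Brute force over all size-k subsets (feasible for small N) for comparison.
best = max(f(set(c)) for c in combinations(range(N), k))
```

Because the pool here is orthonormal, \texttt{f(S)} matches \texttt{best} exactly; the mixed-source pools used by the pipeline only need the weaker $(1-1/e)$ bound.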
% ---------------------------------------------------------------------
\subsection{Full Proof of the Residual Signal Bound (Theorem~\ref{thm:ouroboros})}
\label{app:ouroboros_proof}
We provide the complete proof of Theorem~\ref{thm:ouroboros} from the main text, addressing the Gini--Lorenz relationship that was stated without proof.
\begin{proof}[Full proof of Theorem~\ref{thm:ouroboros}]
Let $s_1, \ldots, s_L \geq 0$ be the refusal strengths with $S = \sum_l s_l > 0$ and sorted values $s_{(1)} \leq \cdots \leq s_{(L)}$.
Let $p_l = s_l / S$ be the normalized strengths, so $\sum_l p_l = 1$.
\textbf{Step 1: Repair ratio.}
When layer $j$ is abliterated, the residual refusal signal is at least $S - s_j$; this is a lower bound because it assumes no compensatory increase from the other layers.
The repair ratio is $R_j = (S - s_j)/S = 1 - p_j$.
The minimum over all layers is $R_{\min} = 1 - p_{\max}$, where $p_{\max} = \max_l p_l$.
\textbf{Step 2: Bounding $p_{\max}$ via the Gini coefficient.}
We claim that any non-negative distribution $(p_1, \ldots, p_L)$ summing to 1 with Gini coefficient $G$ satisfies:
\begin{equation}
p_{\max} \leq \frac{1 + G(L-1)}{L}
\end{equation}
\textit{Proof of claim.}
The Gini coefficient has the pairwise-difference representation:
\begin{equation}
G = \frac{\sum_{i<j}|p_i - p_j|}{\binom{L}{2} \cdot 2\bar{p}} = \frac{\sum_{i<j}|p_i - p_j|}{L-1}
\end{equation}
where $\bar{p} = 1/L$.
For the maximum element $p_{\max} = p_{(L)}$, we have $|p_{(L)} - p_{(k)}| = p_{(L)} - p_{(k)}$ for all $k < L$.
Dropping all pairs not involving $p_{(L)}$ (every term is non-negative):
\begin{equation}
G \geq \frac{\sum_{k=1}^{L-1}(p_{(L)} - p_{(k)})}{L-1} = \frac{(L-1)p_{(L)} - (1 - p_{(L)})}{L-1} = p_{(L)} - \frac{1 - p_{(L)}}{L-1}
\end{equation}
where we used $\sum_{k=1}^{L-1} p_{(k)} = 1 - p_{(L)}$.
Rearranging gives $G(L-1) \geq (L-1)p_{(L)} - 1 + p_{(L)} = Lp_{(L)} - 1$, hence $p_{(L)} \leq (1+G(L-1))/L$.
The bound is tight: it is attained by the two-level extremal distribution with $p_{\max} = (1+G(L-1))/L$ and $p_k = (1-p_{\max})/(L-1)$ for all other $k$.
\textbf{Step 3: Combining.}
\begin{equation}
R_{\min} = 1 - p_{\max} \geq 1 - \frac{1 + G(L-1)}{L} = \frac{L - 1 - G(L-1)}{L} = \frac{(L-1)(1-G)}{L}
\end{equation}
\textbf{Boundary cases.}
For the uniform distribution ($G = 0$): $R_{\min} \geq (L-1)/L = 1 - 1/L$.
For maximal concentration ($G \to 1$): $R_{\min} \to 0$, confirming that single-layer abliteration can be fully effective when all refusal is concentrated in one layer.
\end{proof}
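\paragraph{Numerical check.} Both steps of the proof are easy to verify numerically. The sketch below draws random strength profiles (synthetic, purely illustrative), computes the pairwise-difference Gini exactly as defined in the proof, and checks the $p_{\max}$ bound, the repair-ratio bound, and tightness on the two-level extremal distribution:

```python
import numpy as np

def gini(p):
    """Pairwise-difference Gini with the binom(L,2) * 2 * mean normalization."""
    p = np.asarray(p, float)
    L = p.size
    pair_sum = np.abs(p[:, None] - p[None, :]).sum() / 2.0   # sum over i < j
    return pair_sum / (L * (L - 1) / 2 * 2 * p.mean())

rng = np.random.default_rng(7)
L = 10
for _ in range(200):
    s = rng.exponential(size=L)          # random per-layer refusal strengths
    p = s / s.sum()
    G = gini(p)
    assert p.max() <= (1 + G * (L - 1)) / L + 1e-12       # Step 2 bound
    assert 1 - p.max() >= (L - 1) * (1 - G) / L - 1e-12   # Step 3 (R_min) bound

# Tightness: the two-level extremal distribution attains the bound exactly.
G0 = 0.6
p_max = (1 + G0 * (L - 1)) / L
p = np.full(L, (1 - p_max) / (L - 1))
p[0] = p_max
assert np.isclose(gini(p), G0)
assert np.isclose(p.max(), (1 + gini(p) * (L - 1)) / L)
```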
% ---------------------------------------------------------------------
\subsection{Full Proof of Sparsity--Energy Concentration (Theorem~\ref{thm:sparse})}
\label{app:sparse_proof}
\begin{proof}[Full proof of Theorem~\ref{thm:sparse}]
Let $p_1 \geq p_2 \geq \cdots \geq p_n$ be the sorted projection magnitudes $p_i = |\mathbf{W}[i,:] \cdot \mathbf{r}|$.
Define $P = \sum_i p_i$ and $E_{\text{total}} = \sum_i p_i^2$.
\textbf{Part 1: Frobenius identity.}
Sparse projection modifies only the top $\lfloor \alpha n \rfloor$ rows, so:
\begin{equation}
\|\Delta\mathbf{W}_{\text{sparse}}\|_F^2 = \sum_{i=1}^{\lfloor \alpha n \rfloor} p_i^2, \qquad \|\Delta\mathbf{W}_{\text{dense}}\|_F^2 = \sum_{i=1}^n p_i^2
\end{equation}
Since $E(\alpha) = \sum_{i=1}^{\lfloor\alpha n\rfloor} p_i^2 / \sum_{i=1}^n p_i^2$ by definition, we obtain the identity $\|\Delta\mathbf{W}_{\text{sparse}}\|_F^2 = E(\alpha) \cdot \|\Delta\mathbf{W}_{\text{dense}}\|_F^2$.
The advantage of sparse surgery is that $E(\alpha) \gg \alpha$ for concentrated distributions ($G > 0$): a fraction $\alpha$ of the rows accounts for a fraction $E(\alpha)$ of the total perturbation energy, so the remaining $(1-\alpha)$ fraction of rows---left unchanged by sparse surgery---contributes only $(1-E(\alpha))$ of the dense perturbation.
\textbf{Part 2: Rigorous energy concentration bound.}
We derive a lower bound on $E(\alpha) = \sum_{i=1}^{\lfloor\alpha n\rfloor} p_i^2 / E_{\text{total}}$ in terms of the Gini coefficient $G$ of the distribution $(p_1, \ldots, p_n)$.
Let $\bar{L}(\alpha)$ be the complementary Lorenz curve: the fraction of the total sum $P$ captured by the top-$\alpha$ fraction of rows.
By the Cauchy--Schwarz inequality applied to the top $\lfloor\alpha n\rfloor$ values,
\begin{equation}
E(\alpha) \geq \frac{\bar{L}(\alpha)^2}{\alpha},
\end{equation}
since $(\sum_{i=1}^m p_i)^2 \leq m \sum_{i=1}^m p_i^2$ gives $\sum_{i=1}^m p_i^2 \geq (\sum_{i=1}^m p_i)^2/m$.
For distributions with Gini coefficient $G$, we bound $\bar{L}(\alpha)$ from below.
A classical result from Lorenz curve theory is that for the two-level extremal distribution (which minimizes $\bar{L}(\alpha)$ for a given $G$ when $\alpha \leq (1+G)/2$):
\begin{equation}
\bar{L}(\alpha) \geq \alpha\!\left(1 + G\cdot\frac{1-\alpha}{1-(1-G)\alpha/(1+G)}\right) \geq \alpha\bigl(1 + G(1-\alpha)\bigr)
\end{equation}
where the simpler bound on the right follows because the denominator $1-(1-G)\alpha/(1+G)$ is at most 1.
Therefore:
\begin{equation}
E(\alpha) \geq \alpha(1+G(1-\alpha))^2
\end{equation}
At $\alpha = 0.12$ and $G = 0.7$: $E(0.12) \geq 0.12(1+0.616)^2 = 0.12 \times 2.611 \approx 0.313$.
\textbf{Part 3: The empirical scaling law.}
The rigorous bound above ($E \geq 0.313$) is much weaker than the empirical observation ($E \approx 0.94$) because real weight matrices have heavier tails than the two-level extremal distribution---a small fraction of rows carries disproportionate refusal energy.
The scaling $E(\alpha) \gtrsim 1-(1-\alpha)^{2/(1+G)}$ stated in the main text is an \emph{empirical} scaling law observed consistently across the tested weight matrices.
It is not a proven worst-case bound, and the $\gtrsim$ notation in the main text reflects this status.
We leave the derivation of a tight analytical bound as an open problem.
\end{proof}
\paragraph{Summary.}
The Frobenius identity $\|\Delta\mathbf{W}_{\text{sparse}}\|_F^2 = E(\alpha)\|\Delta\mathbf{W}_{\text{dense}}\|_F^2$ is exact.
The energy concentration bound $E(\alpha) \geq \alpha(1+G(1-\alpha))^2$ is rigorous but loose.
The tighter scaling $1-(1-\alpha)^{2/(1+G)}$ is empirical.
All three confirm that sparse surgery is strictly more efficient than random row selection for any distribution with $G > 0$.
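\paragraph{Numerical illustration.} The exact identity and the concentration effect can be checked on a synthetic weight matrix with heavy-tailed row scales (a purely illustrative stand-in for a real layer; the Pareto mixture below is an assumption, not a measured property of any model):

```python
import numpy as np

rng = np.random.default_rng(3)
n, d, alpha = 512, 128, 0.12
# Heavy-tailed row scales so that a few rows dominate the projection energy.
W = rng.standard_normal((n, d)) * (1.0 + rng.pareto(2.0, size=(n, 1)))
r = rng.standard_normal(d)
r /= np.linalg.norm(r)

p = np.abs(W @ r)                          # per-row projection magnitudes p_i
top = np.argsort(p)[::-1][: int(alpha * n)]
E_alpha = np.sum(p[top] ** 2) / np.sum(p ** 2)

# Exact Frobenius identity: sparse surgery on the top rows removes exactly
# E(alpha) of the dense perturbation energy.
dW_dense = np.outer(W @ r, r)
dW_sparse = np.zeros_like(dW_dense)
dW_sparse[top] = dW_dense[top]
assert np.isclose(np.linalg.norm(dW_sparse) ** 2,
                  E_alpha * np.linalg.norm(dW_dense) ** 2)

# Pairwise-difference Gini of p, and the loose Gini-based lower bound;
# heavy-tailed rows typically give E_alpha far above both.
G = np.abs(p[:, None] - p[None, :]).sum() / (2 * (n - 1) * p.sum())
bound = alpha * (1 + G * (1 - alpha)) ** 2
assert E_alpha > alpha                     # strictly better than random rows
```

The gap between \texttt{E\_alpha} and \texttt{bound} on such heavy-tailed samples mirrors the gap between the empirical $E \approx 0.94$ and the rigorous bound discussed in Part 3.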
% ---------------------------------------------------------------------
\section{ML Reproducibility Checklist}
\label{app:reproducibility}
Following the NeurIPS/ICML reproducibility guidelines:
\begin{enumerate}[leftmargin=*]
\item \textbf{Code availability}: Full source code released under AGPL-3.0 at \url{https://github.com/elder-plinius/OBLITERATUS}. Version 0.1.0 archived on Zenodo (DOI pending).
\item \textbf{Dependencies}: All dependencies pinned in \texttt{pyproject.toml}; Docker image available for exact environment reproduction.
\item \textbf{Random seeds}: The platform defaults to seed 42 and supports multi-seed sweeps ($s \in \{42, 137, 2024\}$) with bootstrap CIs. All tables in this paper report single-run results with seed 42. See Section~\ref{para:stat_limitations} for a discussion of statistical limitations and confidence intervals.
\item \textbf{Compute}: All pipeline stages are designed to run on a single GPU. Full evaluation (7 models $\times$ 3 methods) requires ${\sim}$12 GPU-hours on an NVIDIA A100 (80\,GB). Reproducible on consumer hardware (RTX 3090/4090) with quantization.
\item \textbf{Dataset}: Evaluation prompts bundled with the codebase (no external dataset download required). Harmful/harmless prompt sets derived from public benchmarks with filtering.
\item \textbf{Hyperparameters}: Method presets (direction count, regularization, norm preservation) are specified in Section~\ref{sec:intervention}. The \texttt{informed} method's auto-configuration is deterministic given a fixed seed and model.
\item \textbf{Statistical tests}: The platform supports bootstrap CIs (BCa, 10{,}000 resamples) for all continuous metrics and Clopper--Pearson exact CIs for refusal rates. These tools are available for independent replication.
\item \textbf{Negative results}: Section~\ref{sec:discussion} reports failure modes, including increased perplexity on polyhedral-refusal models and the independence assumption in Theorem~\ref{thm:ouroboros}.
\end{enumerate}