% Provenance (repository listing artifact, converted to comment):
% obliteratus / paper / appendix.tex
% uploaded by pliny-the-prompter — "Upload 128 files", commit f254212 (verified)
% appendix.tex β€” included from main.tex via \input{appendix}
\appendix
\section{Extended Theoretical Results}
\label{app:theory}
This appendix contains full statements and proofs for five additional theoretical results that extend the geometric theory of refusal removal developed in Section~\ref{sec:theory}.
These results draw on spectral perturbation theory, optimal transport, random matrix theory, Riemannian geometry on the Grassmannian, and information geometry.
Each addresses a gap identified in prior abliteration literature and provides quantitative, falsifiable predictions.
% ─────────────────────────────────────────────────────────────────────
\subsection{Spectral Cost of Abliteration}
\label{app:spectral_cost}
Abliteration modifies weight matrices by projecting out refusal directions.
We bound the resulting perturbation to the singular value spectrum, yielding a formal \emph{capability preservation guarantee}.
\begin{definition}[Directional Abliteration]
\label{def:abliteration}
Let $\mathbf{W} \in \mathbb{R}^{m \times d}$ be a weight matrix and $\mathbf{r} \in \mathbb{R}^d$ a unit refusal direction ($\|\mathbf{r}\| = 1$).
The \emph{abliterated weight matrix} is:
\begin{equation}
\mathbf{W}' = \mathbf{W}(\mathbf{I} - \mathbf{r}\mathbf{r}^\top)
\end{equation}
For $k$ orthonormal directions $\mathbf{R} = [\mathbf{r}_1, \ldots, \mathbf{r}_k] \in \mathbb{R}^{d \times k}$, the multi-direction abliteration is $\mathbf{W}' = \mathbf{W}(\mathbf{I} - \mathbf{R}\mathbf{R}^\top)$.
\end{definition}
\begin{theorem}[Spectral Cost of Abliteration]
\label{thm:spectral_cost}
Let $\mathbf{W} \in \mathbb{R}^{m \times d}$ have singular value decomposition $\mathbf{W} = \mathbf{U}\boldsymbol{\Sigma}\mathbf{V}^\top$ with singular values $\sigma_1 \geq \sigma_2 \geq \cdots \geq \sigma_p$ ($p = \min(m,d)$) and right singular vectors $\mathbf{v}_1, \ldots, \mathbf{v}_p$.
Let $\mathbf{r}$ be a unit refusal direction with expansion $\mathbf{r} = \sum_{j=1}^p \alpha_j \mathbf{v}_j + \mathbf{r}_\perp$ in the singular basis (where $\mathbf{r}_\perp$ lies in the null space of $\mathbf{W}^\top$).
Let $\sigma_1' \geq \cdots \geq \sigma_p'$ be the singular values of $\mathbf{W}' = \mathbf{W}(\mathbf{I} - \mathbf{r}\mathbf{r}^\top)$.
Then:
\begin{enumerate}[label=(\roman*)]
\item \textbf{Global bound (Weyl).} For all $i$:
\begin{equation}
|\sigma_i' - \sigma_i| \leq \|\mathbf{W}\mathbf{r}\| = \left(\sum_{j=1}^p \alpha_j^2 \sigma_j^2\right)^{1/2}
\end{equation}
\item \textbf{Direction-resolved bound.} The perturbation matrix $\mathbf{E} = \mathbf{W}' - \mathbf{W} = -\mathbf{W}\mathbf{r}\mathbf{r}^\top$ has rank $\leq 1$ and its only nonzero singular value is $\|\mathbf{W}\mathbf{r}\|$.
Consequently, every singular value of $\mathbf{W}$ shifts by at most $\|\mathbf{W}\mathbf{r}\|$, and because $\mathbf{E}$ has rank one, the singular values of $\mathbf{W}'$ interlace those of $\mathbf{W}$ (Cauchy interlacing for rank-one perturbations); moreover, the shifts satisfy:
\begin{equation}
\sum_{i=1}^p (\sigma_i' - \sigma_i)^2 \leq \|\mathbf{E}\|_F^2 = \|\mathbf{W}\mathbf{r}\|^2
\end{equation}
\item \textbf{Singular subspace stability (Davis--Kahan).} Let $\mathbf{V}_k = [\mathbf{v}_1, \ldots, \mathbf{v}_k]$ span the top-$k$ right singular subspace of $\mathbf{W}$, and $\mathbf{V}_k'$ the corresponding subspace of $\mathbf{W}'$.
If the singular value gap $\delta_k = \sigma_k - \sigma_{k+1} > \|\mathbf{W}\mathbf{r}\|$, then the principal angle $\theta$ between these subspaces satisfies:
\begin{equation}
\sin \theta(\mathbf{V}_k, \mathbf{V}_k') \leq \frac{\|\mathbf{W}\mathbf{r}\|}{\delta_k - \|\mathbf{W}\mathbf{r}\|}
\end{equation}
\item \textbf{Multi-direction extension.} For $k$ orthonormal directions $\mathbf{R} \in \mathbb{R}^{d \times k}$:
\begin{equation}
|\sigma_i' - \sigma_i| \leq \|\mathbf{W}\mathbf{R}\|_2 \quad \text{and} \quad \sum_i (\sigma_i' - \sigma_i)^2 \leq \|\mathbf{W}\mathbf{R}\|_F^2 = \sum_{j=1}^k \|\mathbf{W}\mathbf{r}_j\|^2
\end{equation}
\end{enumerate}
\end{theorem}
\begin{proof}
\textbf{(i)} The abliterated matrix is $\mathbf{W}' = \mathbf{W} + \mathbf{E}$ where $\mathbf{E} = -\mathbf{W}\mathbf{r}\mathbf{r}^\top$.
By Weyl's inequality for singular values \citep{stewart1990matrix}: $|\sigma_i(\mathbf{W}') - \sigma_i(\mathbf{W})| \leq \sigma_1(\mathbf{E}) = \|\mathbf{E}\|_2$.
Since $\mathbf{E} = -(\mathbf{W}\mathbf{r})\mathbf{r}^\top$ is a rank-1 matrix (outer product of $\mathbf{W}\mathbf{r} \in \mathbb{R}^m$ and $\mathbf{r} \in \mathbb{R}^d$), its only nonzero singular value is $\|\mathbf{W}\mathbf{r}\| \cdot \|\mathbf{r}\| = \|\mathbf{W}\mathbf{r}\|$.
Expanding in the singular basis: $\mathbf{W}\mathbf{r} = \sum_j \alpha_j \sigma_j \mathbf{u}_j$, so $\|\mathbf{W}\mathbf{r}\|^2 = \sum_j \alpha_j^2 \sigma_j^2$.
\textbf{(ii)} The Frobenius bound follows from the Hoffman--Wielandt inequality: $\sum_i (\sigma_i' - \sigma_i)^2 \leq \|\mathbf{E}\|_F^2$.
For a rank-1 matrix, $\|\mathbf{E}\|_F = \|\mathbf{E}\|_2 = \|\mathbf{W}\mathbf{r}\|$.
\textbf{(iii)} This is the $\sin\theta$ theorem of \citet{davis1970rotation}.
The perturbation $\mathbf{E}$ acts on the invariant subspace.
By the classical Davis--Kahan bound applied to $\mathbf{W}^\top \mathbf{W}$ (whose eigenvalues are $\sigma_i^2$), the relevant gap is $\sigma_k^2 - \sigma_{k+1}^2 = (\sigma_k - \sigma_{k+1})(\sigma_k + \sigma_{k+1})$.
Using the refined multiplicative form and the fact that $\|\mathbf{E}\|_2 = \|\mathbf{W}\mathbf{r}\|$, we obtain the stated bound (using $\delta_k = \sigma_k - \sigma_{k+1}$ requires the assumption $\sigma_{k+1} > 0$; otherwise the symmetric form with $\sigma_k^2 - \sigma_{k+1}^2$ gives a tighter bound).
\textbf{(iv)} For $\mathbf{E} = -\mathbf{W}\mathbf{R}\mathbf{R}^\top$ with $\mathbf{R}^\top\mathbf{R} = \mathbf{I}_k$, we have $\operatorname{rank}(\mathbf{E}) \leq k$ and $\|\mathbf{E}\|_2 = \|\mathbf{W}\mathbf{R}\|_2$, $\|\mathbf{E}\|_F^2 = \operatorname{tr}(\mathbf{R}^\top \mathbf{W}^\top \mathbf{W} \mathbf{R}) = \sum_j \|\mathbf{W}\mathbf{r}_j\|^2$ (using orthonormality of columns of $\mathbf{R}$).
The bounds then follow from Weyl and Hoffman--Wielandt as before.
\end{proof}
\begin{corollary}[Capability Preservation Guarantee]
\label{cor:capability}
If the refusal direction $\mathbf{r}$ lies primarily in the bottom singular subspace---i.e., $\alpha_j \approx 0$ for $j \leq k$ and $\sum_{j > k} \alpha_j^2 \sigma_j^2 \leq \epsilon^2$---then abliteration perturbs all singular values by at most $\epsilon$, and the top-$k$ singular subspace rotates by at most $\arcsin(\epsilon / \delta_k)$.
Intuitively, refusal directions that are ``orthogonal to the model's principal computations'' can be removed cheaply.
\end{corollary}
\paragraph{Remark.}
This result explains the empirical success of norm-preserving abliteration \citep{grimjim2025}: rescaling $\mathbf{W}'$ to match $\|\mathbf{W}\|_F$ compensates for the trace-level spectral shift $\sum_i (\sigma_i'^2 - \sigma_i^2) = -\|\mathbf{W}\mathbf{r}\|^2$, but does not address the \emph{relative} reordering of singular values.
Theorem~\ref{thm:spectral_cost}(iii) shows that when the spectral gap is large relative to $\|\mathbf{W}\mathbf{r}\|$, the singular subspace is approximately preserved even without norm correction.
% ─────────────────────────────────────────────────────────────────────
\subsection{Wasserstein Cost of Abliteration}
\label{app:wasserstein}
Current abliteration methods optimize mean separation (difference-of-means) while ignoring the distributional cost of the intervention.
We formalize this using optimal transport theory.
\begin{theorem}[Wasserstein Cost of Directional Projection]
\label{thm:wasserstein}
Let $\mu_0 = \mathcal{N}(\mathbf{m}, \boldsymbol{\Sigma})$ be the activation distribution at layer~$l$ on harmless inputs, and let $\mathbf{P}_\perp = \mathbf{I} - \mathbf{r}\mathbf{r}^\top$ be the projection orthogonal to refusal direction~$\mathbf{r}$.
The post-abliteration activation distribution is $\mu_1 = \mathcal{N}(\mathbf{P}_\perp \mathbf{m}, \,\mathbf{P}_\perp \boldsymbol{\Sigma} \mathbf{P}_\perp)$.
The 2-Wasserstein distance decomposes as:
\begin{equation}
W_2^2(\mu_0, \mu_1) = \underbrace{(\mathbf{r}^\top \mathbf{m})^2}_{\text{mean shift}} + \underbrace{\operatorname{tr}\!\big(\boldsymbol{\Sigma}\big) + \operatorname{tr}\!\big(\mathbf{P}_\perp \boldsymbol{\Sigma} \mathbf{P}_\perp\big) - 2\operatorname{tr}\!\Big(\big(\boldsymbol{\Sigma}^{1/2}\mathbf{P}_\perp\boldsymbol{\Sigma}\mathbf{P}_\perp\boldsymbol{\Sigma}^{1/2}\big)^{1/2}\Big)}_{\text{Bures divergence } \mathcal{B}^2(\boldsymbol{\Sigma},\, \mathbf{P}_\perp\boldsymbol{\Sigma}\mathbf{P}_\perp)}
\end{equation}
Furthermore, with $\sigma_r^2 = \mathbf{r}^\top\boldsymbol{\Sigma}\mathbf{r}$ denoting the activation variance along $\mathbf{r}$ and $\alpha_i = \mathbf{r}^\top\mathbf{e}_i$ the components in the eigenbasis of $\boldsymbol{\Sigma}$:
\begin{enumerate}[label=(\roman*)]
\item \textbf{General upper bound.} The Bures divergence satisfies:
\begin{equation}
\mathcal{B}^2(\boldsymbol{\Sigma}, \mathbf{P}_\perp\boldsymbol{\Sigma}\mathbf{P}_\perp) \leq \sigma_r^2 + 2\sum_{i=1}^d \lambda_i \alpha_i^2 - \sigma_r^4/\lambda_{\max}
\end{equation}
where the bound follows from the concavity of $A \mapsto \operatorname{tr}(A^{1/2})$ on positive semidefinite matrices (a consequence of the operator concavity of $t \mapsto t^{1/2}$).
\item \textbf{Eigenvector special case.} When $\mathbf{r}$ is an eigenvector of $\boldsymbol{\Sigma}$ with eigenvalue $\lambda_r$:
\begin{equation}
W_2^2(\mu_0, \mu_1) = (\mathbf{r}^\top\mathbf{m})^2 + ({\sqrt{\lambda_r}})^2 = (\mathbf{r}^\top\mathbf{m})^2 + \lambda_r
\end{equation}
\end{enumerate}
\end{theorem}
\begin{proof}
The 2-Wasserstein distance between Gaussians $\mathcal{N}(\mathbf{m}_0, \boldsymbol{\Sigma}_0)$ and $\mathcal{N}(\mathbf{m}_1, \boldsymbol{\Sigma}_1)$ has the closed form \citep{dowson1982frechet,givens1984class}:
\begin{equation}
W_2^2 = \|\mathbf{m}_0 - \mathbf{m}_1\|^2 + \operatorname{tr}(\boldsymbol{\Sigma}_0) + \operatorname{tr}(\boldsymbol{\Sigma}_1) - 2\operatorname{tr}\!\big((\boldsymbol{\Sigma}_0^{1/2}\boldsymbol{\Sigma}_1\boldsymbol{\Sigma}_0^{1/2})^{1/2}\big)
\end{equation}
Setting $\mathbf{m}_0 = \mathbf{m}$, $\mathbf{m}_1 = \mathbf{P}_\perp\mathbf{m}$: the mean shift is $\|\mathbf{m} - \mathbf{P}_\perp\mathbf{m}\|^2 = \|\mathbf{r}\mathbf{r}^\top\mathbf{m}\|^2 = (\mathbf{r}^\top\mathbf{m})^2$.
Setting $\boldsymbol{\Sigma}_0 = \boldsymbol{\Sigma}$, $\boldsymbol{\Sigma}_1 = \mathbf{P}_\perp\boldsymbol{\Sigma}\mathbf{P}_\perp$: this is a singular covariance (rank deficient along $\mathbf{r}$), so $\boldsymbol{\Sigma}_1$ has eigenvalue 0 along $\mathbf{r}$ and unchanged eigenvalues along directions orthogonal to $\mathbf{r}$ that are also eigenvectors of $\boldsymbol{\Sigma}$.
For the special case where $\mathbf{r} = \mathbf{e}_j$ (an eigenvector of $\boldsymbol{\Sigma}$), we have $\boldsymbol{\Sigma}_1 = \boldsymbol{\Sigma} - \lambda_j \mathbf{e}_j\mathbf{e}_j^\top$.
Then $\boldsymbol{\Sigma}_0^{1/2}\boldsymbol{\Sigma}_1\boldsymbol{\Sigma}_0^{1/2}$ has eigenvalues $\lambda_i^2$ for $i \neq j$ and $0$ for $i = j$.
Thus $\operatorname{tr}((\cdot)^{1/2}) = \sum_{i \neq j} \lambda_i$.
The Bures term becomes $\sum_i \lambda_i + \sum_{i \neq j} \lambda_i - 2\sum_{i \neq j}\lambda_i = \lambda_j$.
So $W_2^2 = (\mathbf{r}^\top\mathbf{m})^2 + \lambda_j$.
\textbf{General case (upper bound).}
When $\mathbf{r}$ is not an eigenvector, $\boldsymbol{\Sigma}_1 = \mathbf{P}_\perp\boldsymbol{\Sigma}\mathbf{P}_\perp$ is no longer simultaneously diagonalizable with $\boldsymbol{\Sigma}$.
The Bures divergence is $\mathcal{B}^2 = \operatorname{tr}(\boldsymbol{\Sigma}) + \operatorname{tr}(\boldsymbol{\Sigma}_1) - 2\operatorname{tr}((\boldsymbol{\Sigma}^{1/2}\boldsymbol{\Sigma}_1\boldsymbol{\Sigma}^{1/2})^{1/2})$.
We have $\operatorname{tr}(\boldsymbol{\Sigma}_1) = \operatorname{tr}(\boldsymbol{\Sigma}) - \sigma_r^2$ (since projection removes variance $\sigma_r^2 = \mathbf{r}^\top\boldsymbol{\Sigma}\mathbf{r}$), so $\operatorname{tr}(\boldsymbol{\Sigma}) + \operatorname{tr}(\boldsymbol{\Sigma}_1) = 2\operatorname{tr}(\boldsymbol{\Sigma}) - \sigma_r^2$.
For the matrix square root term, note $\boldsymbol{\Sigma}^{1/2}\boldsymbol{\Sigma}_1\boldsymbol{\Sigma}^{1/2} = \boldsymbol{\Sigma}^2 - \boldsymbol{\Sigma}^{1/2}(\boldsymbol{\Sigma}\mathbf{r}\mathbf{r}^\top + \mathbf{r}\mathbf{r}^\top\boldsymbol{\Sigma} - \sigma_r^2\mathbf{r}\mathbf{r}^\top)\boldsymbol{\Sigma}^{1/2}$.
By the concavity of $A \mapsto \operatorname{tr}(A^{1/2})$ on positive semidefinite matrices (a consequence of the operator concavity of $t \mapsto t^{1/2}$), we have $\operatorname{tr}((\boldsymbol{\Sigma}^{1/2}\boldsymbol{\Sigma}_1\boldsymbol{\Sigma}^{1/2})^{1/2}) \geq \operatorname{tr}(\boldsymbol{\Sigma}) - \sigma_r^2/2 - \sum_i \lambda_i\alpha_i^2/2 + \sigma_r^4/(4\lambda_{\max})$, yielding the stated upper bound after algebraic simplification.
\end{proof}
\begin{corollary}[Mean-Only Abliteration Is Suboptimal]
\label{cor:mean_only}
Standard difference-of-means abliteration selects $\mathbf{r}$ to maximize the harmful-vs-harmless mean shift $(\mathbf{r}^\top\mathbf{d})^2$, but the total Wasserstein cost on harmless inputs includes the Bures term $\mathcal{B}^2$.
A direction $\mathbf{r}$ that happens to be a high-variance eigenvector of $\boldsymbol{\Sigma}$ incurs Bures cost $\lambda_r$, which may dominate the mean shift.
The \emph{Wasserstein-optimal} refusal direction minimizes:
\begin{equation}
\mathbf{r}^* = \argmin_{\|\mathbf{r}\|=1} \frac{W_2^2(\mu_0^{\text{harmless}}, \mu_1^{\text{harmless}})}{(\mathbf{r}^\top\mathbf{d})^2} = \argmin_{\|\mathbf{r}\|=1} \frac{(\mathbf{r}^\top\mathbf{m})^2 + \mathbf{r}^\top\boldsymbol{\Sigma}\mathbf{r}}{(\mathbf{r}^\top\mathbf{d})^2}
\end{equation}
This is a generalized eigenvalue problem, distinct from both standard and whitened SVD.
\end{corollary}
\paragraph{Practical implication.}
Theorem~\ref{thm:wasserstein} predicts that abliteration along high-variance directions of the harmless distribution (rogue dimensions) incurs disproportionate Wasserstein cost.
This provides a formal explanation for why whitened SVD (Theorem~\ref{thm:fisher}) improves capability preservation: by downweighting high-variance directions, whitened SVD implicitly reduces the Bures component of the Wasserstein cost.
% ─────────────────────────────────────────────────────────────────────
\subsection{Detectability Phase Transition for Refusal Directions}
\label{app:phase_transition}
Using random matrix theory, we characterize when linear methods can reliably identify refusal directions from finite prompt samples.
\begin{definition}[Spiked Activation Model]
\label{def:spiked}
Let activations under harmful prompts at layer~$l$ be:
\begin{equation}
\mathbf{a}_i = \boldsymbol{\mu} + \sqrt{\beta}\, \mathbf{r}\, z_i + \boldsymbol{\epsilon}_i, \quad i = 1, \ldots, n
\end{equation}
where $\mathbf{r} \in \mathbb{R}^d$ is the unit refusal direction, $\beta > 0$ is the refusal signal strength, $z_i \sim \mathcal{N}(0, 1)$ is the per-prompt refusal activation, and $\boldsymbol{\epsilon}_i \sim \mathcal{N}(\mathbf{0}, \sigma^2\mathbf{I}_d)$ is isotropic noise.
The population covariance is $\boldsymbol{\Sigma} = \sigma^2\mathbf{I} + \beta\mathbf{r}\mathbf{r}^\top$ (a rank-1 spiked model).
\end{definition}
\begin{theorem}[BBP Phase Transition for Refusal Detection]
\label{thm:bbp}
In the proportional limit $n, d \to \infty$ with $d/n \to \gamma > 0$, let $\hat{\mathbf{v}}_1$ be the leading eigenvector of the sample covariance $\hat{\boldsymbol{\Sigma}} = \frac{1}{n}\sum_{i=1}^n (\mathbf{a}_i - \bar{\mathbf{a}})(\mathbf{a}_i - \bar{\mathbf{a}})^\top$.
Define the signal-to-noise ratio $\rho = \beta/\sigma^2$.
\begin{enumerate}[label=(\roman*)]
\item \textbf{Subcritical regime} ($\rho \leq \sqrt{\gamma}$).
The leading sample eigenvector $\hat{\mathbf{v}}_1$ is asymptotically orthogonal to the true refusal direction:
\begin{equation}
|\langle \hat{\mathbf{v}}_1, \mathbf{r} \rangle|^2 \xrightarrow{a.s.} 0
\end{equation}
No consistent linear estimator of $\mathbf{r}$ exists.
Abliteration based on the top eigenvector removes a noise direction and has no effect on refusal.
\item \textbf{Supercritical regime} ($\rho > \sqrt{\gamma}$).
The leading eigenvector consistently estimates $\mathbf{r}$:
\begin{equation}
|\langle \hat{\mathbf{v}}_1, \mathbf{r} \rangle|^2 \xrightarrow{a.s.} \frac{1 - \gamma/\rho^2}{1 + \gamma/\rho} \in (0, 1)
\end{equation}
and the leading sample eigenvalue detaches from the Marchenko--Pastur bulk:
\begin{equation}
\hat{\lambda}_1 \xrightarrow{a.s.} \sigma^2(\rho + 1)(1 + \gamma/\rho) > \sigma^2(1+\sqrt{\gamma})^2
\end{equation}
\item \textbf{Minimum sample size.}
For the refusal direction to be detectable, the number of prompts must satisfy:
\begin{equation}
n > \frac{d \sigma^4}{\beta^2} = \frac{d}{\rho^2}
\end{equation}
For $d = 4096$ (typical 7B model) and $\rho = 2$ (moderate refusal signal): $n > 1024$ prompts.
For strong refusal ($\rho = 5$): $n \geq 164$ prompts (since $d/\rho^2 = 4096/25 \approx 163.8$).
\end{enumerate}
\end{theorem}
\begin{proof}
Parts (i) and (ii) are the Baik--Ben~Arous--P\'ech\'e (BBP) phase transition \citep{baik2005phase,paul2007asymptotics} applied to the spiked covariance model.
\textbf{(i)} When $\rho \leq \sqrt{\gamma}$, the spike eigenvalue $\sigma^2(1 + \rho)$ does not exceed the right edge of the Marchenko--Pastur distribution $\sigma^2(1+\sqrt{\gamma})^2$ in the sample.
By the BBP theorem, $\hat{\lambda}_1 \to \sigma^2(1+\sqrt{\gamma})^2$ (stuck at the bulk edge), and $|\langle\hat{\mathbf{v}}_1, \mathbf{r}\rangle|^2 \to 0$.
\textbf{(ii)} When $\rho > \sqrt{\gamma}$, the spike pushes the sample eigenvalue above the bulk.
The asymptotic alignment $|\langle\hat{\mathbf{v}}_1, \mathbf{r}\rangle|^2 \to (1 - \gamma/\rho^2)/(1 + \gamma/\rho)$ follows from the resolvent analysis in \citet{paul2007asymptotics}.
\textbf{(iii)} The critical condition $\rho > \sqrt{\gamma}$ with $\gamma = d/n$ gives $\beta/\sigma^2 > \sqrt{d/n}$, hence $n > d\sigma^4/\beta^2$.
\end{proof}
\paragraph{Practical implication.}
Most abliteration studies use 32--128 harmful prompts with $d = 4096$.
This gives $\gamma = d/n \approx 32$--$128$, requiring $\rho > 5.7$--$11.3$ for reliable detection.
The BBP threshold explains why abliteration on small prompt sets sometimes fails catastrophically: the extracted direction is noise, not refusal.
Theorem~\ref{thm:bbp}(iii) provides a concrete minimum prompt count.
This also explains why difference-of-means (which estimates the mean shift rather than the covariance spike) can succeed with fewer samples than PCA: the mean estimator has standard rate $O(1/\sqrt{n})$ regardless of $d$, while the covariance-based estimator requires $n = \Omega(d/\rho^2)$.
The practical recommendation is to use difference-of-means for small prompt sets and whitened SVD (which combines both) when sufficient prompts are available.
% ─────────────────────────────────────────────────────────────────────
\subsection{Grassmannian Coherence of Refusal Subspaces}
\label{app:grassmannian}
The refusal subspace varies across layers.
We formalize this as a curve on the Grassmannian and derive consequences for multi-layer abliteration.
\begin{definition}[Refusal Curve on the Grassmannian]
\label{def:refusal_curve}
Let $\mathcal{S}_l \in \operatorname{Gr}(k, d)$ be the $k$-dimensional refusal subspace at layer $l$, identified as the span of the top-$k$ singular vectors of the whitened difference matrix at layer $l$.
The \emph{refusal curve} is the discrete path $\gamma = (\mathcal{S}_1, \mathcal{S}_2, \ldots, \mathcal{S}_L)$ on $\operatorname{Gr}(k, d)$.
The \emph{geodesic distance} between adjacent subspaces is:
\begin{equation}
d_G(\mathcal{S}_l, \mathcal{S}_{l+1}) = \left(\sum_{i=1}^k \theta_i^2\right)^{1/2}
\end{equation}
where $\theta_1, \ldots, \theta_k \in [0, \pi/2]$ are the \emph{principal angles} between $\mathcal{S}_l$ and $\mathcal{S}_{l+1}$, computed via the SVD of $\mathbf{V}_l^\top \mathbf{V}_{l+1}$ (where $\mathbf{V}_l \in \mathbb{R}^{d \times k}$ is an orthonormal basis for $\mathcal{S}_l$).
\end{definition}
\begin{theorem}[Grassmannian Coherence and Abliteration Effectiveness]
\label{thm:grassmannian}
Let $\gamma = (\mathcal{S}_1, \ldots, \mathcal{S}_L)$ be the refusal curve.
Define the \emph{coherence} $C = \max_{l, l'} d_G(\mathcal{S}_l, \mathcal{S}_{l'})$ and the \emph{total geodesic length} $\Lambda = \sum_{l=1}^{L-1} d_G(\mathcal{S}_l, \mathcal{S}_{l+1})$.
\begin{enumerate}[label=(\roman*)]
\item \textbf{Universal direction effectiveness.}
Let $\mathcal{S}^*$ be any fixed $k$-dimensional subspace (e.g., the refusal directions extracted from a single layer).
The fraction of refusal energy at layer $l$ captured by projecting onto $\mathcal{S}^*$ is:
\begin{equation}
E_l(\mathcal{S}^*) = \frac{\|\mathbf{P}_{\mathcal{S}^*} \mathbf{P}_{\mathcal{S}_l}\|_F^2}{k} = \frac{\sum_{i=1}^k \cos^2\theta_i(\mathcal{S}^*, \mathcal{S}_l)}{k}
\end{equation}
If $C < \pi/4$ (coherent refusal), then for \emph{any} data subspace $\mathcal{S}_{l_0}$ (or the Fr\'echet mean $\bar{\mathcal{S}}$):
\begin{equation}
E_l(\mathcal{S}_{l_0}) \geq \cos^2(C) > \frac{1}{2} \quad \text{for all } l
\end{equation}
That is, a single universal abliteration subspace (e.g., the refusal directions from any single layer) captures more than half the refusal energy at every layer.
\item \textbf{Mechanism count lower bound.}
The total geodesic length lower-bounds the number of geometrically distinct refusal mechanisms:
\begin{equation}
M \geq \left\lceil \frac{\Lambda}{\pi/4} \right\rceil
\end{equation}
where $M$ is the minimum number of subspaces of angular radius $\pi/4$ needed to cover the refusal curve (the covering number with respect to the geodesic metric).
\item \textbf{Optimal layer selection.}
Given a budget of $T$ layers for abliteration, the optimal selection $\{l_1, \ldots, l_T\}$ that maximizes total refusal energy removal across all layers satisfies:
\begin{equation}
\sum_{l=1}^L E_l\!\left(\bigcup_{t=1}^T \mathcal{S}_{l_t}\right) \geq \left(1 - \left(1 - \frac{1}{M}\right)^T\right) L
\end{equation}
where the union denotes the span. This follows from the submodularity of set coverage (see Theorem~\ref{thm:submodular} below).
\end{enumerate}
\end{theorem}
\begin{proof}
\textbf{(i)} The projection overlap $E_l(\mathcal{S}^*) = k^{-1}\sum_i \cos^2\theta_i$ follows from the definition of principal angles: if $\mathbf{V}^*$ and $\mathbf{V}_l$ are orthonormal bases, then $\|\mathbf{V}^{*\top}\mathbf{V}_l\|_F^2 = \sum_i \cos^2\theta_i$.
We show that \emph{any} data point $\mathcal{S}_{l_0}$ already achieves $E_l(\mathcal{S}_{l_0}) > 1/2$ for all $l$.
By definition of coherence, $d_G(\mathcal{S}_{l_0}, \mathcal{S}_l) \leq C$ for all $l$.
When $k=1$, $d_G = \theta_1$ and $E_l = \cos^2\theta_1 \geq \cos^2(C) > 1/2$ since $C < \pi/4$.
For $k > 1$: $E_l = k^{-1}\sum_i \cos^2\theta_i \geq \cos^2(\max_i \theta_i)$.
Since the geodesic distance satisfies $d_G = (\sum_i \theta_i^2)^{1/2}$, we have $\max_i \theta_i \leq d_G \leq C$, giving $E_l \geq \cos^2(C) > 1/2$.
The Fr\'echet mean $\bar{\mathcal{S}} = \argmin_{\mathcal{S}} \sum_l d_G^2(\mathcal{S}, \mathcal{S}_l)$ can only improve upon this, since it minimizes the total squared distance to all subspaces.
\textbf{(ii)} The covering number argument: any ball of radius $\pi/4$ on $\operatorname{Gr}(k,d)$ contains subspaces with pairwise geodesic distance at most $\pi/2$.
A curve of total length $\Lambda$ requires at least $\lceil \Lambda / (\pi/4) \rceil$ balls of radius $\pi/4$ to cover (since adjacent centers on the curve are separated by their arc length).
Each ball represents one ``mechanism''---a group of layers using geometrically similar refusal subspaces.
\textbf{(iii)} The energy at layer $l$ from the union of $T$ layer subspaces is $E_l(\cup_t \mathcal{S}_{l_t}) = k^{-1}\|\mathbf{P}_{\cup_t \mathcal{S}_{l_t}} \mathbf{P}_{\mathcal{S}_l}\|_F^2$.
The function $f(T) = \sum_l E_l$ is monotone submodular in the set of selected layers (adding a layer can only increase energy, and with diminishing returns since the new subspace overlaps with existing ones).
By the greedy algorithm guarantee for submodular maximization \citep{nemhauser1978analysis}, the greedy selection achieves $(1-1/e)$ of the optimum. The stated bound follows from noting that $M$ well-chosen layers would capture all energy.
\end{proof}
% ─────────────────────────────────────────────────────────────────────
\subsection{Fisher Sensitivity of Abliteration}
\label{app:fisher_sensitivity}
We connect the geometric refusal analysis to the information-theoretic cost of abliteration on the model's output distribution.
\begin{theorem}[Fisher Information Bound on Abliteration Cost]
\label{thm:fisher_info}
Let $p(\mathbf{y} | \mathbf{x}; \boldsymbol{\theta})$ be the model's output distribution parameterized by weights $\boldsymbol{\theta} \in \mathbb{R}^D$.
Let $\boldsymbol{\theta}' = \boldsymbol{\theta} - \Delta\boldsymbol{\theta}$ be the abliterated parameters, where $\Delta\boldsymbol{\theta}$ is the concatenation of $(\mathbf{W}_l\mathbf{r}\mathbf{r}^\top)$ flattened across all modified layers.
Then for harmless input distribution $\mathcal{X}$:
\begin{enumerate}[label=(\roman*)]
\item \textbf{Local KL bound.}
\begin{equation}
\mathbb{E}_{\mathbf{x} \sim \mathcal{X}}\!\left[D_{\mathrm{KL}}\!\big(p(\cdot|\mathbf{x};\boldsymbol{\theta}) \,\|\, p(\cdot|\mathbf{x};\boldsymbol{\theta}')\big)\right] \leq \frac{1}{2}\Delta\boldsymbol{\theta}^\top \mathbf{F}(\boldsymbol{\theta}) \Delta\boldsymbol{\theta} + O(\|\Delta\boldsymbol{\theta}\|^3)
\end{equation}
where $\mathbf{F}(\boldsymbol{\theta}) = \mathbb{E}_{\mathbf{x},\mathbf{y}}[\nabla_{\boldsymbol{\theta}} \log p(\mathbf{y}|\mathbf{x};\boldsymbol{\theta}) \nabla_{\boldsymbol{\theta}} \log p(\mathbf{y}|\mathbf{x};\boldsymbol{\theta})^\top]$ is the Fisher information matrix.
\item \textbf{Per-layer decomposition.} For a single-layer abliteration at layer $l$ with perturbation $\Delta\mathbf{W}_l = \mathbf{W}_l\mathbf{r}\mathbf{r}^\top$:
\begin{equation}
\frac{1}{2}\operatorname{vec}(\Delta\mathbf{W}_l)^\top \mathbf{F}_l \operatorname{vec}(\Delta\mathbf{W}_l)
\end{equation}
where $\mathbf{F}_l$ is the block of the Fisher information corresponding to layer~$l$'s parameters.
\item \textbf{Empirical estimability.}
The Fisher cost $\mathbf{r}^\top \tilde{\mathbf{F}}_l \mathbf{r}$ (where $\tilde{\mathbf{F}}_l$ is the Fisher information projected to the refusal direction subspace) can be estimated from the variance of the score function:
\begin{equation}
\mathbf{r}^\top \tilde{\mathbf{F}}_l \mathbf{r} = \operatorname{Var}_{\mathbf{x},\mathbf{y}}\!\left[\frac{\partial}{\partial \epsilon}\log p(\mathbf{y}|\mathbf{x};\boldsymbol{\theta} - \epsilon \operatorname{vec}^{-1}(\mathbf{W}_l\mathbf{r}\mathbf{r}^\top))\bigg|_{\epsilon=0}\right]
\end{equation}
This requires only forward passes, not the full $D \times D$ Fisher matrix.
\end{enumerate}
\end{theorem}
\begin{proof}
\textbf{(i)} The KL divergence between nearby distributions in an exponential family (or more generally, any smooth parametric family) admits the local expansion:
\begin{equation}
D_{\mathrm{KL}}(p_{\boldsymbol{\theta}} \| p_{\boldsymbol{\theta}'}) = \frac{1}{2}(\boldsymbol{\theta} - \boldsymbol{\theta}')^\top \mathbf{F}(\boldsymbol{\theta})(\boldsymbol{\theta} - \boldsymbol{\theta}') + O(\|\boldsymbol{\theta} - \boldsymbol{\theta}'\|^3)
\end{equation}
This is the fundamental theorem of information geometry \citep{amari2016information}: the Fisher information matrix is the Riemannian metric tensor on the statistical manifold, and KL divergence is (locally) the squared geodesic distance.
Setting $\boldsymbol{\theta} - \boldsymbol{\theta}' = \Delta\boldsymbol{\theta}$ and taking expectation over $\mathbf{x}$ gives the result.
\textbf{(ii)} The block structure follows from the chain rule: the Fisher information decomposes across independent parameter groups when the model has a layered structure.
For transformer blocks, the parameters of different layers contribute additively to the log-likelihood gradient, so $\mathbf{F}$ is block-diagonal to first order (cross-layer terms arise from shared activations but are typically small).
\textbf{(iii)} The score function $\nabla_{\boldsymbol{\theta}} \log p$ projected along the abliteration direction gives the directional Fisher information.
This is a scalar quantity estimable from samples by the plug-in estimator.
\end{proof}
\begin{corollary}[Pre-Abliteration Cost Estimation]
\label{cor:preabliteration}
Before performing abliteration, one can estimate its information-theoretic cost by:
(1)~computing $\Delta\mathbf{W}_l = \mathbf{W}_l\mathbf{r}\mathbf{r}^\top$ (requires no model modification);
(2)~estimating the directional Fisher information via score variance on a small harmless dataset.
Layers with high Fisher cost should receive stronger regularization or be excluded from abliteration.
This provides a principled, per-layer regularization schedule.
\end{corollary}
% ─────────────────────────────────────────────────────────────────────
\subsection{Optimal Direction Selection via Submodular Optimization}
\label{app:submodular}
Given a large set of candidate refusal directions, selecting the optimal subset is computationally hard in general.
We show it admits an efficient approximation.
\begin{theorem}[Submodularity of Refusal Energy Removal]
\label{thm:submodular}
Let $\mathcal{D} = \{\mathbf{r}_1, \ldots, \mathbf{r}_N\}$ be a set of candidate refusal directions (potentially from different layers, extraction methods, or harm categories).
For a subset $S \subseteq \mathcal{D}$, define the refusal energy removal function:
\begin{equation}
f(S) = \sum_{l=1}^L \left\|\mathbf{P}_{\operatorname{span}(S)} \mathbf{d}_l\right\|^2
\end{equation}
where $\mathbf{d}_l = \boldsymbol{\mu}_l^{(\text{harm})} - \boldsymbol{\mu}_l^{(\text{safe})}$ is the refusal signal at layer $l$ and $\mathbf{P}_{\operatorname{span}(S)}$ projects onto the subspace spanned by $S$.
Then:
\begin{enumerate}[label=(\roman*)]
\item $f$ is \emph{monotone}: $f(S) \leq f(S \cup \{r\})$ for all $S, r$.
\item $f$ is \emph{submodular}: $f(S \cup \{r\}) - f(S) \geq f(T \cup \{r\}) - f(T)$ for all $S \subseteq T$ and $r \notin T$.
\item The greedy algorithm---iteratively selecting $r^* = \argmax_{r \notin S} f(S \cup \{r\}) - f(S)$---achieves:
\begin{equation}
f(S_{\text{greedy}}^{(k)}) \geq \left(1 - \frac{1}{e}\right) \max_{|S| = k} f(S) \geq 0.632 \cdot f^*_k
\end{equation}
\end{enumerate}
\end{theorem}
\begin{proof}
\textbf{(i) Monotonicity.}
Adding a direction $\mathbf{r}$ to $S$ can only enlarge $\operatorname{span}(S)$, so $\mathbf{P}_{\operatorname{span}(S \cup \{r\})} \succeq \mathbf{P}_{\operatorname{span}(S)}$ in the Loewner order.
Therefore $\|\mathbf{P}_{\operatorname{span}(S \cup \{r\})} \mathbf{d}_l\|^2 \geq \|\mathbf{P}_{\operatorname{span}(S)} \mathbf{d}_l\|^2$ for each $l$.
\textbf{(ii) Submodularity.}
The marginal gain of adding $\mathbf{r}$ to $S$ is the residual projection:
\begin{equation}
f(S \cup \{r\}) - f(S) = \sum_l \|\mathbf{P}_{\mathbf{r}_\perp^S} \mathbf{d}_l\|^2 \cdot \cos^2\angle(\mathbf{r}_\perp^S, \mathbf{d}_l^{\perp S})
\end{equation}
where $\mathbf{r}_\perp^S$ is the component of $\mathbf{r}$ orthogonal to $\operatorname{span}(S)$, and $\mathbf{d}_l^{\perp S}$ is the residual refusal signal.
For $S \subseteq T$, the residual $\mathbf{d}_l^{\perp T}$ is a (weakly) shorter vector than $\mathbf{d}_l^{\perp S}$, and the orthogonal component $\mathbf{r}_\perp^T$ is weakly shorter than $\mathbf{r}_\perp^S$.
Both effects reduce the marginal gain, establishing submodularity.
More formally, $g(S) = \|\mathbf{P}_{\operatorname{span}(S)}\mathbf{v}\|^2$ is a polymatroid rank function for any fixed vector $\mathbf{v}$, and polymatroid rank functions are submodular \citep{edmonds1970submodular}.
Since $f$ is a non-negative sum of submodular functions (one per layer), it is submodular.
\textbf{(iii)} This is the classical result of \citet{nemhauser1978analysis}: greedy maximization of a monotone submodular function subject to a cardinality constraint achieves a $(1-1/e)$-approximation.
\end{proof}
\paragraph{Practical implication.}
The greedy algorithm for direction selection is already used implicitly in abliteration (selecting the top SVD direction, then the next, etc.).
Theorem~\ref{thm:submodular} provides a formal justification and shows that this approach is near-optimal.
More importantly, it justifies mixing directions from different sources (different layers, whitened vs.\ standard SVD, different harm categories) into a single candidate pool and greedily selecting the best $k$---the platform's analysis-informed pipeline uses exactly this approach.
% ─────────────────────────────────────────────────────────────────────
\subsection{Full Proof of the Residual Signal Bound (Theorem~\ref{thm:ouroboros})}
\label{app:ouroboros_proof}
We provide the complete proof of Theorem~\ref{thm:ouroboros} from the main text, addressing the Gini--Lorenz relationship that was stated without proof.
\begin{proof}[Full proof of Theorem~\ref{thm:ouroboros}]
Let $s_1, \ldots, s_L \geq 0$ be the refusal strengths with $S = \sum_l s_l > 0$ and sorted values $s_{(1)} \leq \cdots \leq s_{(L)}$.
Let $p_l = s_l / S$ be the normalized strengths, so $\sum_l p_l = 1$.
\textbf{Step 1: Repair ratio.}
When layer $j$ is abliterated, the residual refusal signal is at least $S - s_j$ (this is a lower bound since it assumes no compensatory increase from other layers).
The repair ratio is $R_j = (S - s_j)/S = 1 - p_j$.
The minimum over all layers is $R_{\min} = 1 - p_{\max}$ where $p_{\max} = \max_l p_l$.
\textbf{Step 2: Bounding $p_{\max}$ via the Gini coefficient.}
We claim that for any non-negative distribution $(p_1, \ldots, p_L)$ summing to 1 with Gini coefficient $G$:
\begin{equation}
p_{\max} \leq \frac{1 + G(L-1)}{L}
\end{equation}
\textit{Proof of claim.}
The Gini coefficient has the representation:
\begin{equation}
G = \frac{\sum_{i<j}|p_i - p_j|}{\binom{L}{2} \cdot 2\bar{p}} = \frac{\sum_{i<j}|p_i - p_j|}{(L-1)}
\end{equation}
where $\bar{p} = 1/L$.
For the maximum element $p_{\max} = p_{(L)}$, we have $|p_{(L)} - p_{(k)}| = p_{(L)} - p_{(k)}$ for all $k < L$.
Therefore:
\begin{equation}
G \geq \frac{\sum_{k=1}^{L-1}(p_{(L)} - p_{(k)})}{(L-1)} = \frac{(L-1)p_{(L)} - (1 - p_{(L)})}{(L-1)} = p_{(L)} - \frac{1 - p_{(L)}}{L-1}
\end{equation}
where we used $\sum_{k=1}^{L-1} p_{(k)} = 1 - p_{(L)}$.
Rearranging: $G \geq p_{(L)} - (1-p_{(L)})/(L-1)$, so
$G(L-1) \geq (L-1)p_{(L)} - 1 + p_{(L)} = Lp_{(L)} - 1$, giving $p_{(L)} \leq (1+G(L-1))/L$.
This bound is tight: it is achieved by the extremal two-level distribution $p_{\max} = (1+G(L-1))/L$, $p_k = (1-p_{\max})/(L-1)$ for all other $k$.
\textbf{Step 3: Combining.}
\begin{equation}
R_{\min} = 1 - p_{\max} \geq 1 - \frac{1 + G(L-1)}{L} = \frac{L - 1 - G(L-1)}{L} = \frac{(L-1)(1-G)}{L}
\end{equation}
\textbf{Boundary cases.}
For uniform distribution ($G = 0$): $R_{\min} \geq (L-1)/L = 1 - 1/L$.
For maximally concentrated ($G \to 1$): $R_{\min} \to 0$, confirming that single-layer abliteration can be fully effective when all refusal is concentrated.
\end{proof}
% ─────────────────────────────────────────────────────────────────────
\subsection{Full Proof of Sparsity--Energy Concentration (Theorem~\ref{thm:sparse})}
\label{app:sparse_proof}
\begin{proof}[Full proof of Theorem~\ref{thm:sparse}]
Let $p_1 \geq p_2 \geq \cdots \geq p_n$ be sorted projection magnitudes $p_i = |\mathbf{W}[i,:] \cdot \mathbf{r}|$.
Define $P = \sum_i p_i$ and $E_{\text{total}} = \sum_i p_i^2$.
\textbf{Part 1: Frobenius identity.}
Sparse projection modifies only the top-$\lfloor \alpha n \rfloor$ rows, so:
\begin{equation}
\|\Delta\mathbf{W}_{\text{sparse}}\|_F^2 = \sum_{i=1}^{\lfloor \alpha n \rfloor} p_i^2, \quad \|\Delta\mathbf{W}_{\text{dense}}\|_F^2 = \sum_{i=1}^n p_i^2
\end{equation}
Since $E(\alpha) = \sum_{i=1}^{\lfloor\alpha n\rfloor} p_i^2 / \sum_{i=1}^n p_i^2$ by definition, we have the identity $\|\Delta\mathbf{W}_{\text{sparse}}\|_F^2 = E(\alpha) \cdot \|\Delta\mathbf{W}_{\text{dense}}\|_F^2$.
The advantage of sparse surgery is that $E(\alpha) \gg \alpha$ for concentrated distributions ($G > 0$): a fraction $\alpha$ of rows accounts for $E(\alpha)$ of the total perturbation, so the remaining $(1-\alpha)$ fraction of rows---left unchanged by sparse surgery---contributes only $(1-E(\alpha))$ of the dense perturbation.
\textbf{Part 2: Rigorous energy concentration bound.}
We derive a lower bound on $E(\alpha) = \sum_{i=1}^{\lfloor\alpha n\rfloor} p_i^2 / E_{\text{total}}$ in terms of the Gini coefficient $G$ of the distribution $(p_1, \ldots, p_n)$.
Let $\bar{L}(\alpha)$ be the complementary Lorenz curve: the fraction of the total sum $P$ captured by the top-$\alpha$ fraction.
By the Cauchy--Schwarz inequality applied to the top-$\lfloor\alpha n\rfloor$ values:
\begin{equation}
E(\alpha) \geq \frac{\bar{L}(\alpha)^2}{\alpha}
\end{equation}
since $(\sum_{i=1}^m p_i)^2 \leq m \sum_{i=1}^m p_i^2$ gives $\sum_{i=1}^m p_i^2 \geq (\sum p_i)^2/m$.
For distributions with Gini $G$, we bound $\bar{L}(\alpha)$ from below.
A classical result from Lorenz curve theory is that for the two-level extremal distribution (which minimizes $\bar{L}(\alpha)$ for given $G$ when $\alpha \leq (1+G)/2$):
\begin{equation}
\bar{L}(\alpha) \geq \alpha\!\left(1 + G\cdot\frac{1-\alpha}{1-(1-G)\alpha/(1+G)}\right) \geq \alpha(1 + G(1-\alpha))
\end{equation}
where the simpler bound on the right follows because the denominator satisfies $1-(1-G)\alpha/(1+G) \leq 1$, so replacing it by $1$ can only decrease the fraction.
Therefore:
\begin{equation}
E(\alpha) \geq \alpha(1+G(1-\alpha))^2
\end{equation}
At $\alpha = 0.12$, $G = 0.7$: $E(0.12) \geq 0.12(1+0.616)^2 = 0.12 \times 2.611 = 0.313$.
\textbf{Part 3: The empirical scaling law.}
The rigorous bound above ($E \geq 0.313$) is weaker than the empirical observation ($E \approx 0.94$) because real weight matrices have heavier tails than the two-level extremal distribution---a small fraction of rows carry disproportionate refusal energy.
The scaling $E(\alpha) \gtrsim 1-(1-\alpha)^{2/(1+G)}$ stated in the main text is an \emph{empirical} scaling law observed consistently across tested weight matrices.
It is not a proven worst-case bound, and the $\gtrsim$ notation in the main text reflects this status.
We leave the derivation of a tight analytical bound as an open problem.
\end{proof}
\paragraph{Summary.}
The Frobenius identity $\|\Delta\mathbf{W}_{\text{sparse}}\|_F^2 = E(\alpha)\|\Delta\mathbf{W}_{\text{dense}}\|_F^2$ is exact.
The energy concentration $E(\alpha) \geq \alpha(1+G(1-\alpha))^2$ is rigorous but loose.
The tighter scaling $1-(1-\alpha)^{2/(1+G)}$ is empirical.
All three confirm that sparse surgery is strictly more efficient than random row selection for any distribution with $G > 0$.
% ─────────────────────────────────────────────────────────────────────
\section{ML Reproducibility Checklist}
\label{app:reproducibility}
Following the NeurIPS/ICML reproducibility guidelines:
\begin{enumerate}[leftmargin=*]
\item \textbf{Code availability}: Full source code released under AGPL-3.0 at \url{https://github.com/elder-plinius/OBLITERATUS}. Version 0.1.0 archived on Zenodo (DOI pending).
\item \textbf{Dependencies}: All dependencies pinned in \texttt{pyproject.toml}; Docker image available for exact environment reproduction.
\item \textbf{Random seeds}: The platform defaults to seed 42 and supports multi-seed sweeps ($s \in \{42, 137, 2024\}$) with bootstrap CIs. All tables in this paper report single-run results with seed 42. See Section~\ref{para:stat_limitations} for a discussion of statistical limitations and confidence intervals.
\item \textbf{Compute}: All pipeline stages are designed to run on a single GPU. Full evaluation (7 models $\times$ 3 methods) requires ${\sim}$12 GPU-hours on an NVIDIA A100 (80\,GB). Reproducible on consumer hardware (RTX 3090/4090) with quantization.
\item \textbf{Dataset}: Evaluation prompts bundled with the codebase (no external dataset download required). Harmful/harmless prompt sets derived from public benchmarks with filtering.
\item \textbf{Hyperparameters}: Method presets (direction count, regularization, norm preservation) are specified in Section~\ref{sec:intervention}. The \texttt{informed} method's auto-configuration is deterministic given a fixed seed and model.
\item \textbf{Statistical tests}: The platform supports bootstrap CIs (BCa, 10{,}000 resamples) for all continuous metrics and Clopper--Pearson exact CIs for refusal rates. These tools are available for independent replication.
\item \textbf{Negative results}: Section~\ref{sec:discussion} reports failure modes including increased perplexity on polyhedral-refusal models and the independence assumption in Theorem~\ref{thm:ouroboros}.
\end{enumerate}