Iconoclast / final_report_acl.tex
OpenAI Codex
Publish Iconoclast research release
3236af9
\begin{filecontents*}{references.bib}
@inproceedings{abadi2016dp,
  author    = {Abadi, Martin and Chu, Andy and Goodfellow, Ian and McMahan, H. Brendan and Mironov, Ilya and Talwar, Kunal and Zhang, Li},
  title     = {Deep Learning with Differential Privacy},
  booktitle = {Proceedings of the 2016 ACM SIGSAC Conference on Computer and Communications Security},
  pages     = {308--318},
  year      = {2016},
}
@misc{agnihotri2025abliteration,
  author        = {Agnihotri, Shashank and Jakubassa, Jonas and Dey, Priyam and Goyal, Sachin and Schiele, Bernt and Radhakrishnan, Venkatesh Babu and Keuper, Margret},
  title         = {A Granular Study of Safety Pretraining under Model Abliteration},
  year          = {2025},
  eprint        = {2510.02768},
  archivePrefix = {arXiv},
}
@inproceedings{akiba2019optuna,
  author    = {Akiba, Takuya and Sano, Shotaro and Yanase, Toshihiko and Ohta, Takeru and Koyama, Masanori},
  title     = {Optuna: A Next-generation Hyperparameter Optimization Framework},
  booktitle = {Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  pages     = {2623--2631},
  year      = {2019},
}
@inproceedings{arditi2024refusal,
  author    = {Arditi, Andy and Obeso, Oscar and Syed, Aaquib and Paleka, Daniel and Panickssery, Nina and Gurnee, Wes and Nanda, Neel},
  title     = {Refusal in Language Models Is Mediated by a Single Direction},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {37},
  pages     = {136037--136083},
  year      = {2024},
}
@inproceedings{bagdasaryan2019disparate,
  author    = {Bagdasaryan, Eugene and Poursaeed, Omid and Shmatikov, Vitaly},
  title     = {Differential Privacy Has Disparate Impact on Model Accuracy},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {32},
  year      = {2019},
}
@inproceedings{bourtoule2021machine,
  author    = {Bourtoule, Lucas and Chandrasekaran, Varun and Choquette-Choo, Christopher A. and Jia, Hengrui and Travers, Adelin and Zhang, Baiwu and Lie, David and Papernot, Nicolas},
  title     = {Machine Unlearning},
  booktitle = {2021 IEEE Symposium on Security and Privacy},
  pages     = {141--159},
  year      = {2021},
}
@inproceedings{brown2022privacy,
  author    = {Brown, Hannah and Lee, Katherine and Mireshghallah, Fatemehsadat and Shokri, Reza and Tram{\`e}r, Florian},
  title     = {What Does It Mean for a Language Model to Preserve Privacy?},
  booktitle = {Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency},
  pages     = {2280--2292},
  year      = {2022},
}
@inproceedings{carlini2021extracting,
  author    = {Carlini, Nicholas and Tram{\`e}r, Florian and Wallace, Eric and Jagielski, Matthew and Herbert-Voss, Ariel and Lee, Katherine and Roberts, Adam and Brown, Tom and Song, Dawn and Erlingsson, {\'U}lfar and others},
  title     = {Extracting Training Data from Large Language Models},
  booktitle = {30th USENIX Security Symposium},
  pages     = {2633--2650},
  year      = {2021},
}
@misc{chao2024jailbreakbench,
  author        = {Chao, Patrick and Robey, Alexander and Dobriban, Edgar and Hassani, Hamed and Pappas, George J. and Wong, Eric},
  title         = {{JailbreakBench}: An Open Robustness Benchmark for Jailbreaking Large Language Models},
  year          = {2024},
  eprint        = {2404.01318},
  archivePrefix = {arXiv},
}
@inproceedings{dettmers2022bitsandbytes,
  author    = {Dettmers, Tim and Lewis, Mike and Belkada, Younes and Zettlemoyer, Luke},
  title     = {{LLM.int8()}: 8-bit Matrix Multiplication for Transformers at Scale},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {35},
  pages     = {30318--30332},
  year      = {2022},
}
@inproceedings{golatkar2020eternal,
  author    = {Golatkar, Aditya and Achille, Alessandro and Soatto, Stefano},
  title     = {Eternal Sunshine of the Spotless Net: Selective Forgetting in Deep Networks},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {9304--9312},
  year      = {2020},
}
@inproceedings{gupta2021adaptive,
  author    = {Gupta, Varun and Jung, Christopher and Neel, Seth and Roth, Aaron and Sharifi-Malvajerdi, Saeed and Waites, Chris},
  title     = {Adaptive Machine Unlearning},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {34},
  pages     = {16319--16330},
  year      = {2021},
}
@inproceedings{hu2022lora,
  author    = {Hu, Edward J. and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
  title     = {{LoRA}: Low-Rank Adaptation of Large Language Models},
  booktitle = {International Conference on Learning Representations},
  year      = {2022},
}
@misc{jain2023wildjailbreak,
  author       = {{Allen Institute for AI}},
  title        = {{WildJailbreak}},
  year         = {2024},
  howpublished = {\url{https://huggingface.co/datasets/allenai/wildjailbreak}},
}
@inproceedings{koh2017influence,
  author    = {Koh, Pang Wei and Liang, Percy},
  title     = {Understanding Black-box Predictions via Influence Functions},
  booktitle = {International Conference on Machine Learning},
  pages     = {1885--1894},
  year      = {2017},
}
@misc{labonne2024abliteration,
  author       = {Labonne, Maxime},
  title        = {Uncensor Any {LLM} with Abliteration},
  year         = {2024},
  howpublished = {\url{https://huggingface.co/blog/mlabonne/abliteration}},
}
@misc{labonneHarmlessAlpaca,
  author       = {Labonne, Maxime},
  title        = {Harmless {Alpaca} Dataset},
  year         = {2024},
  howpublished = {\url{https://huggingface.co/datasets/mlabonne/harmless_alpaca}},
}
@misc{labonneHarmfulBehaviors,
  author       = {Labonne, Maxime},
  title        = {Harmful Behaviors Dataset},
  year         = {2024},
  howpublished = {\url{https://huggingface.co/datasets/mlabonne/harmful_behaviors}},
}
@misc{lai2025biprojected,
  author       = {Lai, Jim},
  title        = {Projected Abliteration and Norm-Preserving Biprojected Abliteration},
  year         = {2025},
  howpublished = {Hugging Face Blog},
}
@inproceedings{lhoest2021datasets,
  author    = {Lhoest, Quentin and Villanova del Moral, Albert and Jernite, Yacine and Thakur, Abhishek and von Platen, Patrick and Patil, Suraj and Chaumond, Julien and Drame, Mariama and Plu, Julien and Tunstall, Lewis and Davison, Joe and {\v{S}}a{\v{s}}ko, Mario and Chhablani, Gunjan and Malik, Bhavitvya and Brandeis, Simon and Le Scao, Teven and Sanh, Victor and Xu, Canwen and Patry, Nicolas and McMillan-Major, Angelina and Schmid, Philipp and Gugger, Sylvain and Delangue, Cl{\'e}ment and Matussi{\`e}re, Th{\'e}o and Debut, Lysandre and Bekman, Stas and Cistac, Pierric and Goehringer, Thibault and Mustar, Victor and Lagunas, Fran{\c{c}}ois and Rush, Alexander M. and Wolf, Thomas},
  title     = {Datasets: A Community Library for Natural Language Processing},
  booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
  pages     = {175--184},
  year      = {2021},
}
@inproceedings{lukas2023pii,
  author    = {Lukas, Nils and Salem, Ahmed and Sim, Robert and Tople, Shruti and Wutschitz, Lukas and Zanella-B{\'e}guelin, Santiago},
  title     = {Analyzing Leakage of Personally Identifiable Information in Language Models},
  booktitle = {2023 IEEE Symposium on Security and Privacy},
  pages     = {346--363},
  year      = {2023},
}
@inproceedings{mireshghallah2021privacyreg,
  author    = {Mireshghallah, Fatemehsadat and Inan, Huseyin and Hasegawa, Marcello and R{\"u}hle, Victor and Berg-Kirkpatrick, Taylor and Sim, Robert},
  title     = {Privacy Regularization: Joint Privacy-Utility Optimization in Language Models},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  pages     = {3799--3807},
  year      = {2021},
}
@misc{mireshghallah2023secret,
  author        = {Mireshghallah, Niloofar and Kim, Hyunwoo and Zhou, Xuhui and Tsvetkov, Yulia and Sap, Maarten and Shokri, Reza and Choi, Yejin},
  title         = {Can {LLMs} Keep a Secret? Testing Privacy Implications of Language Models via Contextual Integrity Theory},
  year          = {2023},
  eprint        = {2310.17884},
  archivePrefix = {arXiv},
}
@inproceedings{paszke2019pytorch,
  author    = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and K{\"o}pf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith},
  title     = {{PyTorch}: An Imperative Style, High-Performance Deep Learning Library},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {32},
  year      = {2019},
}
@misc{peft2024,
  author       = {{Hugging Face}},
  title        = {{PEFT}: State-of-the-art Parameter-Efficient Fine-Tuning},
  year         = {2024},
  howpublished = {\url{https://github.com/huggingface/peft}},
}
@inproceedings{rafailov2023dpo,
  author    = {Rafailov, Rafael and Sharma, Archit and Mitchell, Eric and Ermon, Stefano and Manning, Christopher D. and Finn, Chelsea},
  title     = {Direct Preference Optimization: Your Language Model Is Secretly a Reward Model},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {36},
  year      = {2023},
}
@inproceedings{shokri2017membership,
  author    = {Shokri, Reza and Stronati, Marco and Song, Congzheng and Shmatikov, Vitaly},
  title     = {Membership Inference Attacks Against Machine Learning Models},
  booktitle = {2017 IEEE Symposium on Security and Privacy},
  pages     = {3--18},
  year      = {2017},
}
@article{tarun2023unsir,
  author  = {Tarun, Ayush K. and Chundawat, Vikram S. and Mandal, Murari and Kankanhalli, Mohan},
  title   = {Fast Yet Effective Machine Unlearning},
  journal = {IEEE Transactions on Neural Networks and Learning Systems},
  volume  = {35},
  number  = {9},
  pages   = {13046--13055},
  year    = {2023},
}
@misc{weidmann2026heretic,
  author = {Weidmann, Philipp Emanuel and contributors},
  title  = {Heretic: Directional Abliteration for Open-Weight Models},
  year   = {2026},
  note   = {Software project referenced by the ICONOCLAST NOTICE file},
}
@inproceedings{wolf2020transformers,
  author    = {Wolf, Thomas and Debut, Lysandre and Sanh, Victor and Chaumond, Julien and Delangue, Clement and Moi, Anthony and Cistac, Pierric and Rault, Tim and Louf, Remi and Funtowicz, Morgan and Davison, Joe and Shleifer, Sam and von Platen, Patrick and Ma, Clara and Jernite, Yacine and Plu, Julien and Xu, Canwen and Le Scao, Teven and Gugger, Sylvain and Drame, Mariama and Lhoest, Quentin and Rush, Alexander M.},
  title     = {Transformers: State-of-the-Art Natural Language Processing},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
  pages     = {38--45},
  year      = {2020},
}
@inproceedings{zhao2024uma,
  author    = {Zhao, Yue and Li, Congyi and Chen, Kai},
  title     = {{UMA}: Facilitating Backdoor Scanning via Unlearning-Based Model Ablation},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
  volume    = {38},
  pages     = {21823--21831},
  year      = {2024},
}
\end{filecontents*}
\documentclass[11pt]{article}
\usepackage{acl}
\usepackage{times}
\usepackage{latexsym}
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{microtype}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{booktabs}
\usepackage{graphicx}
\usepackage{url}
\title{ICONOCLAST: Benign-Subspace-Preserved Abliteration for Efficient Representation Editing}
\author{
Aparajita Sarkar\textsuperscript{1*} \and
Urvi Desai\textsuperscript{1*} \and
Varesh Patel\textsuperscript{1*} \\
\textsuperscript{1}Rutgers University, New Brunswick, NJ, USA \\
\texttt{\{as5760, ubd4, vp752\}@scarletmail.rutgers.edu} \\
\textsuperscript{*}Equal contribution.
}
\begin{document}
\maketitle
\begin{abstract}
The original project proposal framed privacy preservation as a zero-gradient concept erasure problem: identify directions responsible for personally identifiable information (PII) recall and remove them without retraining. The implemented codebase generalizes that idea into ICONOCLAST, a representation-editing framework for ablating refusal behavior while preserving benign model behavior. ICONOCLAST estimates candidate refusal directions from contrastive harmless and harmful prompts, projects those directions away from a low-rank benign residual subspace, and applies the resulting edits through low-rank LoRA adapters over attention and MLP output projections. A multi-objective Optuna search selects edits that reduce harmful-prompt refusals, avoid benign overrefusals, and minimize first-token KL divergence from the base model. Across ten matched open-weight model evaluations, ICONOCLAST improves the lexicographic refusal/overrefusal/KL criterion over a HERETIC-style baseline in every matched row and obtains lower KL divergence in eight of ten cases, though Falcon3 exhibits a high-KL outlier. These results support benign-subspace preservation as a practical mechanism for reducing the utility cost of inference-time representation editing.
\end{abstract}
\section{Introduction}
Large language models memorize and reproduce information from training data, including sensitive personal records and policy-sensitive behaviors \citep{carlini2021extracting,lukas2023pii,brown2022privacy}. The project proposal, \textit{Surgical Privacy via Norm-Preserving Abliteration for Machine Unlearning}, targeted this privacy problem directly: it proposed extracting a PII recognition direction and removing that direction through norm-preserving abliteration. The implementation analyzed in this report keeps the central geometric hypothesis but shifts the target from PII recall to refusal behavior. Instead of retraining a model, ICONOCLAST edits internal representation pathways at inference time.
This shift is technically coherent. Both PII recall and refusal behavior can be described as contrastive concepts: there are prompts that activate the target behavior and prompts that should preserve normal helpfulness. Prior work shows that refusal behavior can be localized to low-dimensional activation directions \citep{arditi2024refusal}, and abliteration-style methods exploit this observation by projecting these directions out of model weights \citep{labonne2024abliteration,lai2025biprojected}. The risk is that the target direction is not geometrically isolated. A naive projection may reduce refusals but also damage benign task behavior, producing an alignment or utility tax.
ICONOCLAST addresses this risk by adding benign-subspace preservation. It computes candidate refusal directions from harmful and harmless prompts, estimates a low-rank subspace from harmless residual activations, and removes from each candidate direction the component aligned with this benign subspace. The final edit is encoded as a LoRA update \citep{hu2022lora} so trials can be evaluated efficiently and later merged into model weights.
The contribution of this report is a code-grounded account of the implemented system. We describe the architecture, algorithms, cluster pipeline, and empirical results found in the repository. We also make explicit where the implementation diverges from the proposal: the final artifact is best understood as a general representation-editing benchmark for refusal removal rather than a completed PII-unlearning benchmark.
\section{Related Work}
\paragraph{Privacy, memorization, and unlearning.}
Training-data extraction and membership inference show that language models can leak memorized text and reveal whether examples were present in training data \citep{shokri2017membership,carlini2021extracting,lukas2023pii}. Contextual integrity further complicates privacy because sensitive disclosures are often defined by social context rather than simple surface forms \citep{brown2022privacy,mireshghallah2023secret}. Differential privacy gives formal training-time guarantees \citep{abadi2016dp}, but can reduce utility and disproportionately harm underrepresented groups \citep{bagdasaryan2019disparate}. Machine unlearning methods attempt post-training deletion through sharding, influence estimates, or selective impair-and-repair strategies \citep{bourtoule2021machine,gupta2021adaptive,koh2017influence,golatkar2020eternal,tarun2023unsir}. These methods often require retraining, gradients, or retained data access.
\paragraph{Representation editing and abliteration.}
Mechanistic representation editing provides a lighter-weight alternative. Refusal behavior has been shown to be mediated largely by a single residual-stream direction \citep{arditi2024refusal}. Public abliteration recipes then use such directions to suppress refusal behavior without ordinary fine-tuning \citep{labonne2024abliteration,lai2025biprojected}. Similar concept-removal ideas also appear in unlearning-based model ablation for backdoor analysis \citep{zhao2024uma}. Safety and utility tradeoffs under abliteration remain active concerns \citep{agnihotri2025abliteration}.
\paragraph{Optimization and infrastructure.}
The codebase builds on the Hugging Face ecosystem for model loading, datasets, and PEFT adapters \citep{wolf2020transformers,lhoest2021datasets,peft2024}. It uses PyTorch for tensor computation \citep{paszke2019pytorch}, optional bitsandbytes quantization for large models \citep{dettmers2022bitsandbytes}, and Optuna for multi-objective hyperparameter search \citep{akiba2019optuna}. The benchmark data are drawn primarily from harmless Alpaca prompts, harmful behavior prompts, and JailbreakBench \citep{labonneHarmlessAlpaca,labonneHarmfulBehaviors,chao2024jailbreakbench}; one stress-test configuration also references WildJailbreak \citep{jain2023wildjailbreak}.
\section{Methodology}
\subsection{System Architecture}
The repository implements a Python package, \texttt{iconoclast}, with a CLI entry point. Configuration is centralized in a Pydantic settings model, which accepts CLI arguments, environment variables, dotenv values, and TOML files. The model wrapper loads a Hugging Face causal or chat model, tries configured dtypes, optionally loads in 4-bit precision, installs PEFT LoRA adapters over target linear modules, and exposes generation, hidden-state extraction, and log-probability methods.
The main pipeline has four phases. First, it loads harmless and harmful prompts. Second, it obtains per-layer residual activations by generating one token and collecting hidden states at the final prompt position. Third, it builds and optionally filters candidate directions. Fourth, Optuna proposes edit parameters, the model is reset, LoRA abliteration is applied, and the edited model is evaluated.
\subsection{Candidate Direction Extraction}
Let $G_{\ell} \in \mathbb{R}^{n_g \times d}$ be harmless residuals and $B_{\ell} \in \mathbb{R}^{n_b \times d}$ be harmful residuals at layer $\ell$. The code constructs three normalized candidates:
\begin{align}
d^{\mathrm{mean}}_{\ell} &=
\mathrm{norm}\left(\mu(B_{\ell})-\mu(G_{\ell})\right), \\
d^{\mathrm{med}}_{\ell} &=
\mathrm{norm}\left(\mathrm{med}(B_{\ell})-\mathrm{med}(G_{\ell})\right), \\
d^{\mathrm{var}}_{\ell} &=
\mathrm{norm}\left(
\frac{\mu(B_{\ell})-\mu(G_{\ell})}
{\sqrt{\frac{1}{2}(\sigma^2(B_{\ell})+\sigma^2(G_{\ell}))+\epsilon}}
\right).
\end{align}
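Here $\mu(\cdot)$, $\mathrm{med}(\cdot)$, and $\sigma^2(\cdot)$ denote the per-dimension mean, median, and variance taken over prompts, the division in $d^{\mathrm{var}}_{\ell}$ is element-wise, $\epsilon$ is a small constant for numerical stability, and $\mathrm{norm}(\cdot)$ rescales a vector to unit length. A fourth, hybrid direction linearly interpolates between the mean and variance-normalized candidates. Trials can use per-layer directions directly or a global direction interpolated between adjacent layer directions. They can also sample methods independently for attention and MLP components.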
\subsection{Benign-Subspace Preservation}
The primary ICONOCLAST modification is to estimate a benign residual subspace and remove it from candidate directions. For each layer, harmless residuals are centered and passed through low-rank PCA. Given a rank-$k$ benign basis $U_{\ell} \in \mathbb{R}^{k \times d}$ and a candidate direction $d_{\ell}$, the projected direction is
\begin{equation}
\tilde{d}_{\ell} =
\mathrm{norm}\left(d_{\ell} - \alpha U_{\ell}^{\top}U_{\ell}d_{\ell}\right),
\end{equation}
where the implemented benchmark uses full dampening, $\alpha=1$, when the benign subspace is enabled. A separate orthogonalization option removes the component parallel to the harmless mean residual. In benchmark configs, ICONOCLAST generally enables orthogonalization, row-normalized edits, and a benign-subspace rank of 8. The generated HERETIC baseline disables the benign subspace and standard orthogonalization, creating a direct comparison to simpler directional editing \citep{weidmann2026heretic}.
\subsection{LoRA Abliteration}
For each selected component and layer, ICONOCLAST edits output projection matrices with a low-rank update. Let $v$ be the selected direction and $W$ the flattened output matrix. The basic abliteration update is
\begin{equation}
\Delta W = -\lambda v(v^\top W).
\end{equation}
The implementation stores this rank-one update as LoRA matrices: $A=v^\top W$ and $B=-\lambda v$. The scalar $\lambda$ varies by distance from a sampled maximum-weight layer. With \texttt{row\_normalization = pre}, the update is scaled by original row norms. With full row normalization, the code constructs the normalized edited matrix, restores original row magnitudes, subtracts the original matrix, and compresses the delta through low-rank SVD. This approximates norm-preserving biprojected abliteration while retaining adapter-based evaluation speed.
\subsection{Objective and Metrics}
The evaluator records four behavioral quantities. A refusal count is computed through configurable marker matching on generated harmful-prompt responses. A benign overrefusal count uses the same detector on harmless prompts. Disclaimer marker hits measure policy-heavy near misses. A heuristic compliance score combines prompt keyword coverage, actionability markers, response length, and specificity cues.
Utility is measured by first-token KL divergence between the edited and base models on harmless prompts:
\begin{equation}
D_{\mathrm{KL}}(p_{\mathrm{edit}} \parallel p_{\mathrm{base}}).
\end{equation}
The Optuna study minimizes a KL-derived score and a behavior score that combines harmful refusal, overrefusal, disclaimer, and compliance-gap terms. After optimization, the system selects a Pareto-style front sorted by harmful refusals, benign overrefusals, and KL divergence.
\section{Experiments and Results}
\subsection{Experimental Setup}
The matched benchmark runs use harmless training prompts from \texttt{mlabonne/harmless\_alpaca} and harmful prompts from \texttt{JailbreakBench/JBB-Behaviors}. The common full benchmark setting uses 240 harmless training prompts, 80 harmful training prompts, 64 harmless evaluation prompts, and the remaining 20 JBB harmful prompts. Most full benchmark configurations run 48 Optuna trials with four startup trials, not the 200 trials claimed in an earlier draft. Older exploratory configs use smaller budgets and \texttt{mlabonne/harmful\_behaviors}.
Evaluations were run on Rutgers iLabs Slurm infrastructure. The scripts stage source into per-job directories, isolate model and dataset caches, run batch optimization, write \texttt{batch\_summary.json}, and clean temporary caches. Sequential orchestration was added because parallel model downloads exceeded disk quotas.
\subsection{Matched Model Comparison}
Table~\ref{tab:results} reports the best matched rows found in local \texttt{batch\_summary.json} files. Harmful refusal counts are out of 20; benign overrefusals are out of 64. The comparison criterion is lexicographic: fewer harmful refusals, then fewer benign overrefusals, then lower KL.
\begin{table*}[t]
\centering
\small
\begin{tabular}{lrrrrrr}
\toprule
Model & \multicolumn{3}{c}{ICONOCLAST} & \multicolumn{3}{c}{HERETIC} \\
\cmidrule(lr){2-4}\cmidrule(lr){5-7}
& Ref. & Over. & KL & Ref. & Over. & KL \\
\midrule
Llama-3.1-8B-Instruct & 0 & 0 & 0.0447 & 1 & 0 & 0.1854 \\
Qwen3.5-9B base & 10 & 2 & 0.0055 & 10 & 3 & 0.0160 \\
Mistral-7B-Instruct-v0.3 & 1 & 0 & 0.0554 & 4 & 0 & 0.1317 \\
Falcon3-7B-Instruct & 0 & 0 & 6.1448 & 4 & 1 & 0.1648 \\
Gemma-2-2B-IT & 1 & 0 & 0.1849 & 1 & 2 & 0.6441 \\
Phi-4-mini-instruct & 2 & 1 & 0.0204 & 2 & 1 & 0.0978 \\
Yi-1.5-9B-Chat & 2 & 0 & 0.0511 & 3 & 0 & 0.0355 \\
StableLM2-1.6B & 2 & 0 & 0.0328 & 3 & 0 & 0.0670 \\
SmolLM2-1.7B-Instruct & 1 & 1 & 0.0087 & 2 & 2 & 0.2699 \\
OLMo-2-1B-Instruct & 2 & 0 & 0.0345 & 2 & 1 & 0.0944 \\
\bottomrule
\end{tabular}
\caption{Matched benchmark summaries. Ref. is harmful-prompt refusal count out of 20; Over. is harmless-prompt overrefusal count out of 64; KL is first-token divergence from the base model.}
\label{tab:results}
\end{table*}
ICONOCLAST wins all ten rows under the repository's selection rule. It obtains strictly fewer harmful refusals in six rows, equal harmful refusals with fewer overrefusals or lower KL in four rows, and lower KL divergence in eight rows. The strongest utility-preservation cases are SmolLM2, Gemma-2, and Llama-3.1: ICONOCLAST substantially reduces KL while matching or improving behavioral metrics. Qwen3.5-9B is difficult: both methods retain 10 refusals, but ICONOCLAST reduces benign overrefusals and KL. Yi-1.5 shows the main tradeoff case: ICONOCLAST has fewer refusals, but HERETIC has lower KL.
Falcon3 is an important failure mode. ICONOCLAST achieves zero refusals and zero overrefusals, but its KL divergence is 6.1448, far above every other entry in the table. This shows that the lexicographic selection rule can prefer behavioral gains even when the edited model's first-token distribution has drifted severely from the base model. A stricter production system should impose a hard KL constraint or exclude high-KL candidates from the set of acceptable Pareto-front solutions.
\subsection{Additional Runs}
Several exploratory summaries support the same design trajectory but are not directly matched in Table~\ref{tab:results}. Qwen3-1.7B paper-directness reaches 0 harmful refusals, 0 overrefusals, and 0.0310 KL on its smaller setting. Qwen2.5-3B base reaches 1 refusal, 1 overrefusal, and 0.0263 KL. Qwen3-4B benchmark-v2 reaches 2 refusals and 0 overrefusals, but with 0.7976 KL. Phi-3.5 nullspace-v3 reaches 3 refusals, 2 overrefusals, and 0.0981 KL. No completed large-N evaluator JSON outputs were present in the analyzed local tree, so the present report should be read as an optimized holdout benchmark rather than a large-scale statistical confirmation.
\section{Conclusion}
ICONOCLAST implements the proposal's core insight that target behaviors can be edited geometrically without ordinary retraining, but it applies that insight to refusal-direction editing rather than completed PII unlearning. The codebase demonstrates a coherent architecture: contrastive residual collection, multiple direction estimators, benign-subspace projection, LoRA-encoded abliteration, and multi-objective search over behavioral and utility metrics.
The empirical evidence is promising but nuanced. Benign-subspace preservation improves the repository's matched refusal/overrefusal/KL criterion on all ten matched rows and reduces KL on eight of ten. However, the Falcon3 KL outlier and the absence of completed large-N outputs show that the method still needs stronger constraints and broader validation. Future work should return to the proposal's PII setting by constructing contrastive privacy datasets, replacing refusal markers with PII leakage metrics, and evaluating whether benign-subspace-preserved abliteration can remove privacy-relevant recall while preserving general task utility.
\section*{Limitations}
The report is based on local source, configuration, timestamp, and result files. Git history contains only one commit, and filesystem creation times may be distorted by copy or sync operations. The evaluation uses marker-based refusal detection and heuristic compliance scoring, which can misclassify responses. Finally, the ACL source assumes the standard ACL style files are available in the compilation environment.
\bibliographystyle{acl_natbib}
\bibliography{references}
\end{document}