% ── Refusal and Abliteration ──────────────────────────────────────────
@article{arditi2024refusal,
  title={Refusal in Language Models Is Mediated by a Single Direction},
  author={Arditi, Andy and Obeso, Oscar and others},
  journal={arXiv preprint arXiv:2406.11717},
  year={2024}
}
@misc{gabliteration2024,
  title={{Gabliteration}: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models},
  author={G{\"u}lmez, G{\"o}kdeniz},
  year={2025},
  eprint={2512.18901},
  archivePrefix={arXiv},
  primaryClass={cs.AI},
  url={https://arxiv.org/abs/2512.18901}
}
@misc{grimjim2025,
  title={Norm-Preserving Biprojected Abliteration},
  author={{grimjim}},
  year={2025},
  howpublished={\url{https://huggingface.co/grimjim}},
  note={HuggingFace model cards}
}
@misc{failspy_abliterator,
  title={abliterator: Refusal direction removal tool},
  author={{FailSpy}},
  year={2024},
  howpublished={\url{https://github.com/FailSpy/abliterator}}
}
% ── Concept Cones and Geometry ────────────────────────────────────────
@inproceedings{wollschlager2025geometry,
  title={The Geometry of Refusal in Large Language Models: Concept Cones and Representational Independence},
  author={Wollschl{\"a}ger, Tom and Elstner, Jannes and Geisler, Simon and Cohen-Addad, Vincent and G{\"u}nnemann, Stephan and Gasteiger, Johannes},
  booktitle={International Conference on Machine Learning (ICML)},
  year={2025},
  note={arXiv:2502.17420}
}
% ── Steering Vectors ──────────────────────────────────────────────────
@article{turner2023activation,
  title={Activation Addition: Steering Language Models Without Optimization},
  author={Turner, Alexander Matt and Thiergart, Lisa and Udell, David and Leech, Gavin and Mini, Ulisse and MacDiarmid, Monte},
  journal={arXiv preprint arXiv:2308.10248},
  year={2023}
}
@article{rimsky2024steering,
  title={Steering {Llama} 2 via Contrastive Activation Addition},
  author={Rimsky, Nina and Gabrieli, Nick and Schulz, Julian and Tong, Meg and Hubinger, Evan and Turner, Alexander Matt},
  journal={arXiv preprint arXiv:2312.06681},
  year={2024}
}
@article{li2024inference,
  title={Inference-Time Intervention: Eliciting Truthful Answers from a Language Model},
  author={Li, Kenneth and Patel, Oam and Vi{\'e}gas, Fernanda and Pfister, Hanspeter and Wattenberg, Martin},
  journal={Advances in Neural Information Processing Systems},
  volume={36},
  year={2023}
}
% ── Alignment Training Methods ────────────────────────────────────────
@article{ouyang2022training,
  title={Training Language Models to Follow Instructions with Human Feedback},
  author={Ouyang, Long and Wu, Jeff and Jiang, Xu and Almeida, Diogo and Wainwright, Carroll L and Mishkin, Pamela and Zhang, Chong and Agarwal, Sandhini and Slama, Katarina and Ray, Alex and others},
  journal={Advances in Neural Information Processing Systems},
  volume={35},
  year={2022}
}
@article{rafailov2023direct,
  title={Direct Preference Optimization: Your Language Model is Secretly a Reward Model},
  author={Rafailov, Rafael and Sharma, Archit and Mitchell, Eric and Ermon, Stefano and Manning, Christopher D and Finn, Chelsea},
  journal={Advances in Neural Information Processing Systems},
  volume={36},
  year={2023}
}
@article{bai2022constitutional,
  title={Constitutional {AI}: Harmlessness from {AI} Feedback},
  author={Bai, Yuntao and Kadavath, Saurav and Kundu, Sandipan and Askell, Amanda and Kernion, Jackson and Jones, Andy and Chen, Anna and Goldie, Anna and Mirhoseini, Azalia and McKinnon, Cameron and others},
  journal={arXiv preprint arXiv:2212.08073},
  year={2022}
}
% ── Mechanistic Interpretability ──────────────────────────────────────
@article{meng2022locating,
  title={Locating and Editing Factual Associations in {GPT}},
  author={Meng, Kevin and Bau, David and Andonian, Alex and Belinkov, Yonatan},
  journal={Advances in Neural Information Processing Systems},
  volume={35},
  year={2022}
}
@article{elhage2021mathematical,
  title={A Mathematical Framework for Transformer Circuits},
  author={Elhage, Nelson and Nanda, Neel and Olsson, Catherine and Henighan, Tom and Joseph, Nicholas and Mann, Ben and Askell, Amanda and Bai, Yuntao and Chen, Anna and Conerly, Tom and others},
  journal={Transformer Circuits Thread},
  year={2021},
  note={\url{https://transformer-circuits.pub/2021/framework/index.html}}
}
@misc{nanda2022transformerlens,
  title={{TransformerLens}},
  author={Nanda, Neel and Bloom, Joseph},
  year={2022},
  howpublished={\url{https://github.com/TransformerLensOrg/TransformerLens}}
}
@misc{nostalgebraist2020logit,
  title={interpreting {GPT}: the logit lens},
  author={{nostalgebraist}},
  year={2020},
  howpublished={\url{https://www.lesswrong.com/posts/AcKRB8wDpdaN6v6ru/}}
}
% ── Probing and Representation Analysis ───────────────────────────────
@article{alain2017understanding,
  title={Understanding Intermediate Layers Using Linear Classifier Probes},
  author={Alain, Guillaume and Bengio, Yoshua},
  journal={arXiv preprint arXiv:1610.01644},
  year={2017}
}
@inproceedings{kornblith2019similarity,
  title={Similarity of Neural Network Representations Revisited},
  author={Kornblith, Simon and Norouzi, Mohammad and Lee, Honglak and Hinton, Geoffrey},
  booktitle={International Conference on Machine Learning (ICML)},
  year={2019}
}
@inproceedings{ethayarajh2019contextual,
  title={How Contextual are Contextualized Word Representations? {C}omparing the Geometry of {BERT}, {ELMo}, and {GPT-2} Embeddings},
  author={Ethayarajh, Kawin},
  booktitle={Proceedings of EMNLP-IJCNLP},
  year={2019}
}
% ── Defense and Safety ────────────────────────────────────────────────
@inproceedings{qi2025safety,
  title={Safety Alignment Should Be Made More Than Just a Few Tokens Deep},
  author={Qi, Xiangyu and Zeng, Yi and Xie, Tinghao and Chen, Pin-Yu and Jia, Ruoxi and Mittal, Prateek and Henderson, Peter},
  booktitle={International Conference on Learning Representations (ICLR)},
  year={2025},
  note={arXiv:2406.05946}
}
@article{zou2024circuit,
  title={Improving Alignment and Robustness with Circuit Breakers},
  author={Zou, Andy and Phan, Long and Wang, Justin and Duenas, Derek and Lin, Maxwell and Andriushchenko, Maksym and others},
  journal={arXiv preprint arXiv:2406.04313},
  year={2024}
}
@article{zou2023universal,
  title={Universal and Transferable Adversarial Attacks on Aligned Language Models},
  author={Zou, Andy and Wang, Zifan and Kolter, J Zico and Fredrikson, Matt},
  journal={arXiv preprint arXiv:2307.15043},
  year={2023}
}
@article{zou2023representation,
  title={Representation Engineering: A Top-Down Approach to {AI} Transparency},
  author={Zou, Andy and Phan, Long and Chen, Sarah and Campbell, James and Guo, Phillip and Ren, Richard and Pan, Alexander and Yin, Xuwang and Mazeika, Mantas and Dombrowski, Ann-Kathrin and others},
  journal={arXiv preprint arXiv:2310.01405},
  year={2023}
}
@article{young2025comparative,
  title={Comparative Analysis of Abliteration Methods for Language Model Safety Removal},
  author={Young, Alex},
  journal={arXiv preprint arXiv:2502.05420},
  year={2025}
}
% ── Heretic and Bayesian Abliteration ─────────────────────────────────
@misc{heretic2025,
  title={Heretic: Bayesian Optimization for {LLM} Abliteration},
  author={{p-e-w}},
  year={2025},
  howpublished={\url{https://github.com/p-e-w/heretic}},
  note={Pioneered Bayesian optimization and LoRA-mediated ablation for refusal removal}
}
@inproceedings{akiba2019optuna,
  title={Optuna: A Next-generation Hyperparameter Optimization Framework},
  author={Akiba, Takuya and Sano, Shotaro and Yanase, Toshihiko and Ohta, Takeru and Koyama, Masanori},
  booktitle={Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining},
  pages={2623--2631},
  year={2019}
}
% ── LoRA and Low-Rank Adaptation ──────────────────────────────────────
@inproceedings{hu2022lora,
  title={{LoRA}: Low-Rank Adaptation of Large Language Models},
  author={Hu, Edward J and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
  booktitle={International Conference on Learning Representations},
  year={2022}
}
% ── Mixture-of-Experts ────────────────────────────────────────────────
@inproceedings{shazeer2017outrageously,
  title={Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer},
  author={Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
  booktitle={International Conference on Learning Representations},
  year={2017}
}
@article{fedus2022switch,
  title={Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity},
  author={Fedus, William and Zoph, Barret and Shazeer, Noam},
  journal={Journal of Machine Learning Research},
  volume={23},
  number={120},
  pages={1--39},
  year={2022}
}
@article{jiang2024mixtral,
  title={Mixtral of Experts},
  author={Jiang, Albert Q and Sablayrolles, Alexandre and Roux, Antoine and Mensch, Arthur and Savary, Blanche and Bamford, Chris and Chaplot, Devendra Singh and de las Casas, Diego and Hanna, Emma Bou and Bressand, Florian and others},
  journal={arXiv preprint arXiv:2401.04088},
  year={2024}
}
@article{dai2024deepseekmoe,
  title={{DeepSeekMoE}: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models},
  author={Dai, Damai and Deng, Chengqi and Zhao, Chenggang and Xu, R X and Gao, Huazuo and Chen, Deli and Li, Jiashi and Zeng, Wangding and Yu, Xingkai and Wu, Y and others},
  journal={arXiv preprint arXiv:2401.06066},
  year={2024}
}
% ── Evaluation ────────────────────────────────────────────────────────
@misc{gao2021framework,
  title={A Framework for Few-shot Language Model Evaluation},
  author={Gao, Leo and Tow, Jonathan and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and McDonell, Kyle and Muennighoff, Niklas and others},
  year={2021},
  howpublished={Zenodo}
}
% ── Datasets ──────────────────────────────────────────────────────────
@misc{taori2023alpaca,
  title={Stanford Alpaca: An Instruction-following {LLaMA} Model},
  author={Taori, Rohan and Gulrajani, Ishaan and Zhang, Tianyi and Dubois, Yann and Li, Xuechen and Guestrin, Carlos and Liang, Percy and Hashimoto, Tatsunori B},
  year={2023},
  howpublished={\url{https://github.com/tatsu-lab/stanford_alpaca}}
}