| { |
| "source_model": "Qwen/Qwen3-4B", |
| "technique": "refusal_direction_ablation", |
| "method": "advanced", |
| "method_config": { |
| "n_directions": 4, |
| "direction_method": "svd", |
| "norm_preserve": true, |
| "regularization": 0.3, |
| "refinement_passes": 2, |
| "project_biases": true, |
| "use_chat_template": true, |
| "use_whitened_svd": false, |
| "true_iterative_refinement": false, |
| "winsorize_activations": false, |
| "float_layer_interpolation": false, |
| "cot_aware": false, |
| "use_kl_optimization": false, |
| "use_lora_ablation": false, |
| "spectral_cascade": false, |
| "spectral_bands": 3, |
| "spectral_threshold": 0.05 |
| }, |
| "references": [ |
| "Arditi et al., Refusal in Language Models Is Mediated by a Single Direction (NeurIPS 2024)", |
| "Gabliteration: SVD-based multi-direction extraction (arXiv:2512.18901)", |
| "Norm-Preserving Biprojected Abliteration (grimjim, 2025)", |
| "Young, Comparative Analysis of LLM Abliteration Methods (arXiv:2512.13655)", |
| "Joad et al., More to Refusal than a Single Direction (2026)", |
| "Heretic (p-e-w, 2025): Bayesian optimization, LoRA-mediated ablation, winsorization", |
| "OBLITERATUS: Whitened SVD, EGA, CoT-aware, KL co-optimization, float interpolation (novel)" |
| ], |
| "strong_layers": [ |
| 2, |
| 3, |
| 4, |
| 5, |
| 6, |
| 7, |
| 8, |
| 9, |
| 10, |
| 11, |
| 12, |
| 13, |
| 14, |
| 15, |
| 16, |
| 17, |
| 18, |
| 19, |
| 20, |
| 21, |
| 22, |
| 23, |
| 24, |
| 25, |
| 26, |
| 27, |
| 28, |
| 29, |
| 30, |
| 31, |
| 32, |
| 33, |
| 34, |
| 35 |
| ], |
| "n_harmful_prompts": 33, |
| "n_harmless_prompts": 33, |
| "quality_metrics": { |
| "perplexity": 4.811749685438475, |
| "coherence": 1.0, |
| "refusal_rate": 0.03333333333333333, |
| "kl_divergence": 0.00011699854076141492, |
| "spectral_certification": "RED" |
| }, |
| "kl_contributions": {}, |
| "cot_preserved_layers": [], |
| "float_layer_weights": {}, |
| "lora_adapters_saved": false |
| } |