{ "source_model": "infly/OpenCoder-8B-Base", "technique": "refusal_direction_ablation", "method": "advanced", "method_config": { "n_directions": 1, "norm_preserve": false, "regularization": 0.98, "refinement_passes": 1, "project_biases": false, "use_chat_template": false, "use_whitened_svd": false, "true_iterative_refinement": false, "winsorize_activations": false, "float_layer_interpolation": false, "cot_aware": false, "use_kl_optimization": false, "use_lora_ablation": false, "spectral_cascade": true, "spectral_bands": 2, "spectral_threshold": 0.01 }, "references": [ "Arditi et al., Refusal in Language Models Is Mediated by a Single Direction (NeurIPS 2024)", "Gabliteration: SVD-based multi-direction extraction (arXiv:2512.18901)", "Norm-Preserving Biprojected Abliteration (grimjim, 2025)", "Young, Comparative Analysis of LLM Abliteration Methods (arXiv:2512.13655)", "Joad et al., More to Refusal than a Single Direction (2026)", "Heretic (p-e-w, 2025): Bayesian optimization, LoRA-mediated ablation, winsorization", "OBLITERATUS: Whitened SVD, EGA, CoT-aware, KL co-optimization, float interpolation (novel)" ], "strong_layers": [ 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17 ], "n_harmful_prompts": 512, "n_harmless_prompts": 512, "quality_metrics": { "perplexity": 4.38637179262426, "coherence": 0.8, "refusal_rate": 0.0, "kl_divergence": 0.10356692969799042, "spectral_certification": null }, "kl_contributions": {}, "cot_preserved_layers": [], "float_layer_weights": {}, "lora_adapters_saved": false }