{
"source_model": "infly/OpenCoder-8B-Base",
"technique": "refusal_direction_ablation",
"method": "advanced",
"method_config": {
"n_directions": 1,
"norm_preserve": false,
"regularization": 0.98,
"refinement_passes": 1,
"project_biases": false,
"use_chat_template": false,
"use_whitened_svd": false,
"true_iterative_refinement": false,
"winsorize_activations": false,
"float_layer_interpolation": false,
"cot_aware": false,
"use_kl_optimization": false,
"use_lora_ablation": false,
"spectral_cascade": true,
"spectral_bands": 2,
"spectral_threshold": 0.01
},
"references": [
"Arditi et al., Refusal in Language Models Is Mediated by a Single Direction (NeurIPS 2024)",
"Gabliteration: SVD-based multi-direction extraction (arXiv:2512.18901)",
"Norm-Preserving Biprojected Abliteration (grimjim, 2025)",
"Young, Comparative Analysis of LLM Abliteration Methods (arXiv:2512.13655)",
"Joad et al., More to Refusal than a Single Direction (2026)",
"Heretic (p-e-w, 2025): Bayesian optimization, LoRA-mediated ablation, winsorization",
"OBLITERATUS: Whitened SVD, EGA, CoT-aware, KL co-optimization, float interpolation (novel)"
],
"strong_layers": [
31,
30,
29,
28,
27,
26,
25,
24,
23,
22,
21,
20,
19,
18,
17
],
"n_harmful_prompts": 512,
"n_harmless_prompts": 512,
"quality_metrics": {
"perplexity": 4.38637179262426,
"coherence": 0.8,
"refusal_rate": 0.0,
"kl_divergence": 0.10356692969799042,
"spectral_certification": null
},
"kl_contributions": {},
"cot_preserved_layers": [],
"float_layer_weights": {},
"lora_adapters_saved": false
}