eAI-hf
/

QwenPoly01

Model card Files Files and versions

QwenPoly01 / abliteration_metadata.json

numpy-libs's picture

OBLITERATUS: advanced on Qwen/Qwen3-4B

d626e1f verified about 1 month ago

history blame contribute delete

1.92 kB

	{
	"source_model": "Qwen/Qwen3-4B",
	"technique": "refusal_direction_ablation",
	"method": "advanced",
	"method_config": {
	"n_directions": 4,
	"direction_method": "svd",
	"norm_preserve": true,
	"regularization": 0.3,
	"refinement_passes": 2,
	"project_biases": true,
	"use_chat_template": true,
	"use_whitened_svd": false,
	"true_iterative_refinement": false,
	"winsorize_activations": false,
	"float_layer_interpolation": false,
	"cot_aware": false,
	"use_kl_optimization": false,
	"use_lora_ablation": false,
	"spectral_cascade": false,
	"spectral_bands": 3,
	"spectral_threshold": 0.05
	},
	"references": [
	"Arditi et al., Refusal in Language Models Is Mediated by a Single Direction (NeurIPS 2024)",
	"Gabliteration: SVD-based multi-direction extraction (arXiv:2512.18901)",
	"Norm-Preserving Biprojected Abliteration (grimjim, 2025)",
	"Young, Comparative Analysis of LLM Abliteration Methods (arXiv:2512.13655)",
	"Joad et al., More to Refusal than a Single Direction (2026)",
	"Heretic (p-e-w, 2025): Bayesian optimization, LoRA-mediated ablation, winsorization",
	"OBLITERATUS: Whitened SVD, EGA, CoT-aware, KL co-optimization, float interpolation (novel)"
	],
	"strong_layers": [
	2,
	3,
	4,
	5,
	6,
	7,
	8,
	9,
	10,
	11,
	12,
	13,
	14,
	15,
	16,
	17,
	18,
	19,
	20,
	21,
	22,
	23,
	24,
	25,
	26,
	27,
	28,
	29,
	30,
	31,
	32,
	33,
	34,
	35
	],
	"n_harmful_prompts": 33,
	"n_harmless_prompts": 33,
	"quality_metrics": {
	"perplexity": 4.811749685438475,
	"coherence": 1.0,
	"refusal_rate": 0.03333333333333333,
	"kl_divergence": 0.00011699854076141492,
	"spectral_certification": "RED"
	},
	"kl_contributions": {},
	"cot_preserved_layers": [],
	"float_layer_weights": {},
	"lora_adapters_saved": false
	}