Instructions to use HaadesX/Iconoclast with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use HaadesX/Iconoclast with Transformers:
```python
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("HaadesX/Iconoclast", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
File size: 4,208 Bytes
3236af9
{
"model": "microsoft/Phi-4-mini-instruct",
"study_checkpoint_dir": "/common/users/vp752/iconoclast_ilabs/checkpoints/phi4-mini-seq",
"base_metrics": {
"refusals": 20,
"overrefusals": 1,
"harmful_marker_hits": 61,
"harmful_compliance_score": 0.1103125,
"objective_regime": "refusal_reduction"
},
"pareto_trials": [
{
"index": 28,
"refusals": 2,
"overrefusals": 1,
"harmful_marker_hits": 2,
"harmful_compliance_score": 0.8972916666666666,
"objective_regime": "refusal_reduction",
"merge_penalty": 0.0,
"kl_divergence": 0.02042904868721962,
"direction_method": "variance",
"direction_scope": "global",
"direction_index": 16.45222352347153,
"direction_blend": 0.3728440320243102,
"parameters": {
"attn.o_proj": {
"max_weight": 1.882400232860422,
"max_weight_position": 13.328367131271403,
"min_weight": 0.6652997220745103,
"min_weight_distance": 8.31304092962315
},
"mlp.down_proj": {
"max_weight": 1.5015368946798242,
"max_weight_position": 13.674168714713092,
"min_weight": 0.11981929558782818,
"min_weight_distance": 11.52494746677227
}
},
"harmful_axis_metrics": {}
},
{
"index": 35,
"refusals": 3,
"overrefusals": 1,
"harmful_marker_hits": 3,
"harmful_compliance_score": 0.8765624999999998,
"objective_regime": "refusal_reduction",
"merge_penalty": 0.0,
"kl_divergence": 0.01764761470258236,
"direction_method": "variance",
"direction_scope": "global",
"direction_index": 14.391700130129191,
"direction_blend": 0.48119901424426287,
"parameters": {
"attn.o_proj": {
"max_weight": 1.8153179845721898,
"max_weight_position": 14.096237144989855,
"min_weight": 0.1952412632909053,
"min_weight_distance": 7.563651219512723
},
"mlp.down_proj": {
"max_weight": 1.6339436954317168,
"max_weight_position": 16.614695316501514,
"min_weight": 0.14139150079012267,
"min_weight_distance": 10.933304280225325
}
},
"harmful_axis_metrics": {}
},
{
"index": 21,
"refusals": 4,
"overrefusals": 1,
"harmful_marker_hits": 5,
"harmful_compliance_score": 0.8979166666666665,
"objective_regime": "refusal_reduction",
"merge_penalty": 0.0,
"kl_divergence": 0.014471019618213177,
"direction_method": "mean",
"direction_scope": "global",
"direction_index": 14.498629309192385,
"direction_blend": 0.8300558396943958,
"parameters": {
"attn.o_proj": {
"max_weight": 1.1983582967647113,
"max_weight_position": 13.868190570832633,
"min_weight": 0.8900674738884767,
"min_weight_distance": 11.933617812175987
},
"mlp.down_proj": {
"max_weight": 1.1356281861124395,
"max_weight_position": 16.564984175000383,
"min_weight": 0.3297443238531411,
"min_weight_distance": 18.394843487603442
}
},
"harmful_axis_metrics": {}
},
{
"index": 30,
"refusals": 10,
"overrefusals": 1,
"harmful_marker_hits": 11,
"harmful_compliance_score": 0.8746875,
"objective_regime": "refusal_reduction",
"merge_penalty": 0.0,
"kl_divergence": 0.010015908628702164,
"direction_method": "median",
"direction_scope": "global",
"direction_index": 17.66484981854466,
"direction_blend": 0.21628527164515216,
"parameters": {
"attn.o_proj": {
"max_weight": 1.774744142249135,
"max_weight_position": 14.257925722400284,
"min_weight": 0.6188083561998832,
"min_weight_distance": 9.469596075803267
},
"mlp.down_proj": {
"max_weight": 1.3468990075749931,
"max_weight_position": 18.293592163081765,
"min_weight": 0.10765475308238112,
"min_weight_distance": 8.26487191097985
}
},
"harmful_axis_metrics": {}
}
]
}