File size: 4,208 Bytes
3236af9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
{
  "model": "microsoft/Phi-4-mini-instruct",
  "study_checkpoint_dir": "/common/users/vp752/iconoclast_ilabs/checkpoints/phi4-mini-seq",
  "base_metrics": {
    "refusals": 20,
    "overrefusals": 1,
    "harmful_marker_hits": 61,
    "harmful_compliance_score": 0.1103125,
    "objective_regime": "refusal_reduction"
  },
  "pareto_trials": [
    {
      "index": 28,
      "refusals": 2,
      "overrefusals": 1,
      "harmful_marker_hits": 2,
      "harmful_compliance_score": 0.8972916666666666,
      "objective_regime": "refusal_reduction",
      "merge_penalty": 0.0,
      "kl_divergence": 0.02042904868721962,
      "direction_method": "variance",
      "direction_scope": "global",
      "direction_index": 16.45222352347153,
      "direction_blend": 0.3728440320243102,
      "parameters": {
        "attn.o_proj": {
          "max_weight": 1.882400232860422,
          "max_weight_position": 13.328367131271403,
          "min_weight": 0.6652997220745103,
          "min_weight_distance": 8.31304092962315
        },
        "mlp.down_proj": {
          "max_weight": 1.5015368946798242,
          "max_weight_position": 13.674168714713092,
          "min_weight": 0.11981929558782818,
          "min_weight_distance": 11.52494746677227
        }
      },
      "harmful_axis_metrics": {}
    },
    {
      "index": 35,
      "refusals": 3,
      "overrefusals": 1,
      "harmful_marker_hits": 3,
      "harmful_compliance_score": 0.8765624999999998,
      "objective_regime": "refusal_reduction",
      "merge_penalty": 0.0,
      "kl_divergence": 0.01764761470258236,
      "direction_method": "variance",
      "direction_scope": "global",
      "direction_index": 14.391700130129191,
      "direction_blend": 0.48119901424426287,
      "parameters": {
        "attn.o_proj": {
          "max_weight": 1.8153179845721898,
          "max_weight_position": 14.096237144989855,
          "min_weight": 0.1952412632909053,
          "min_weight_distance": 7.563651219512723
        },
        "mlp.down_proj": {
          "max_weight": 1.6339436954317168,
          "max_weight_position": 16.614695316501514,
          "min_weight": 0.14139150079012267,
          "min_weight_distance": 10.933304280225325
        }
      },
      "harmful_axis_metrics": {}
    },
    {
      "index": 21,
      "refusals": 4,
      "overrefusals": 1,
      "harmful_marker_hits": 5,
      "harmful_compliance_score": 0.8979166666666665,
      "objective_regime": "refusal_reduction",
      "merge_penalty": 0.0,
      "kl_divergence": 0.014471019618213177,
      "direction_method": "mean",
      "direction_scope": "global",
      "direction_index": 14.498629309192385,
      "direction_blend": 0.8300558396943958,
      "parameters": {
        "attn.o_proj": {
          "max_weight": 1.1983582967647113,
          "max_weight_position": 13.868190570832633,
          "min_weight": 0.8900674738884767,
          "min_weight_distance": 11.933617812175987
        },
        "mlp.down_proj": {
          "max_weight": 1.1356281861124395,
          "max_weight_position": 16.564984175000383,
          "min_weight": 0.3297443238531411,
          "min_weight_distance": 18.394843487603442
        }
      },
      "harmful_axis_metrics": {}
    },
    {
      "index": 30,
      "refusals": 10,
      "overrefusals": 1,
      "harmful_marker_hits": 11,
      "harmful_compliance_score": 0.8746875,
      "objective_regime": "refusal_reduction",
      "merge_penalty": 0.0,
      "kl_divergence": 0.010015908628702164,
      "direction_method": "median",
      "direction_scope": "global",
      "direction_index": 17.66484981854466,
      "direction_blend": 0.21628527164515216,
      "parameters": {
        "attn.o_proj": {
          "max_weight": 1.774744142249135,
          "max_weight_position": 14.257925722400284,
          "min_weight": 0.6188083561998832,
          "min_weight_distance": 9.469596075803267
        },
        "mlp.down_proj": {
          "max_weight": 1.3468990075749931,
          "max_weight_position": 18.293592163081765,
          "min_weight": 0.10765475308238112,
          "min_weight_distance": 8.26487191097985
        }
      },
      "harmful_axis_metrics": {}
    }
  ]
}