viliana-dev commited on
Commit
a522118
·
verified ·
1 Parent(s): 4e8895d

Upload sandbagging probes and results

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. results/anthropic--claude-sonnet-4.6_baseline.json +152 -0
  2. results/attention_probe_results.json +46 -0
  3. results/attention_probe_results_10.json +46 -0
  4. results/attention_probe_results_25.json +46 -0
  5. results/attention_probe_results_50.json +46 -0
  6. results/attention_probe_results_75.json +46 -0
  7. results/embedding_baseline.json +152 -0
  8. results/evaluation.json +372 -0
  9. results/gemini_baseline.json +172 -0
  10. results/google--gemini-3-flash-preview_baseline.json +172 -0
  11. results/google--gemini-3.1-pro-preview_baseline.json +172 -0
  12. results/linear_probe_results.json +152 -0
  13. results/mlp_probe_results.json +122 -0
  14. results/multimax_probe_results_10.json +30 -0
  15. results/multimax_probe_results_100.json +30 -0
  16. results/multimax_probe_results_10_25.json +58 -0
  17. results/multimax_probe_results_25.json +30 -0
  18. results/multimax_probe_results_50.json +30 -0
  19. results/multimax_probe_results_50_75_100.json +86 -0
  20. results/multimax_probe_results_75.json +30 -0
  21. results/openai--gpt-5.1_baseline.json +172 -0
  22. results/rolling_means_probe_results.json +46 -0
  23. results/rolling_means_probe_results_10.json +46 -0
  24. results/rolling_means_probe_results_25.json +46 -0
  25. results/rolling_means_probe_results_50.json +46 -0
  26. results/rolling_means_probe_results_75.json +46 -0
  27. results/tfidf_baseline.json +152 -0
  28. saved_probes/mlp_L20_K50.0.pt +3 -0
  29. saved_probes/mlp_L20_K75.0.pt +3 -0
  30. saved_probes/mlp_L30_K10.0.pt +3 -0
  31. saved_probes/mlp_L30_K100.0.pt +3 -0
  32. saved_probes/mlp_L30_K25.0.pt +3 -0
  33. saved_probes/mlp_L30_K50.0.pt +3 -0
  34. saved_probes/mlp_L30_K75.0.pt +3 -0
  35. saved_probes/mlp_L35_K10.0.pt +3 -0
  36. saved_probes/mlp_L35_K100.0.pt +3 -0
  37. saved_probes/mlp_L35_K25.0.pt +3 -0
  38. saved_probes/mlp_L35_K50.0.pt +3 -0
  39. saved_probes/mlp_L35_K75.0.pt +3 -0
  40. saved_probes/multimax_L10_K10.0_seed0.pt +3 -0
  41. saved_probes/multimax_L10_K10.0_seed1.pt +3 -0
  42. saved_probes/multimax_L10_K10.0_seed2.pt +3 -0
  43. saved_probes/multimax_L10_K100.0_seed0.pt +3 -0
  44. saved_probes/multimax_L10_K100.0_seed1.pt +3 -0
  45. saved_probes/multimax_L10_K100.0_seed2.pt +3 -0
  46. saved_probes/multimax_L10_K25.0_seed0.pt +3 -0
  47. saved_probes/multimax_L10_K25.0_seed1.pt +3 -0
  48. saved_probes/multimax_L10_K25.0_seed2.pt +3 -0
  49. saved_probes/multimax_L10_K50.0_seed0.pt +3 -0
  50. saved_probes/multimax_L10_K50.0_seed1.pt +3 -0
results/anthropic--claude-sonnet-4.6_baseline.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10": {
3
+ "all": {
4
+ "accuracy": 0.5085,
5
+ "auroc": 0.5085,
6
+ "gmean2": 0.0335,
7
+ "tpr": 0.0341,
8
+ "tnr": 0.983
9
+ },
10
+ "train": {
11
+ "accuracy": 0.5041,
12
+ "auroc": 0.5041,
13
+ "gmean2": 0.024,
14
+ "tpr": 0.0244,
15
+ "tnr": 0.9837
16
+ },
17
+ "val": {
18
+ "accuracy": 0.5192,
19
+ "auroc": 0.5192,
20
+ "gmean2": 0.0385,
21
+ "tpr": 0.0385,
22
+ "tnr": 1.0
23
+ },
24
+ "test": {
25
+ "accuracy": 0.5185,
26
+ "auroc": 0.5185,
27
+ "gmean2": 0.0713,
28
+ "tpr": 0.0741,
29
+ "tnr": 0.963
30
+ }
31
+ },
32
+ "25": {
33
+ "all": {
34
+ "accuracy": 0.5114,
35
+ "auroc": 0.5114,
36
+ "gmean2": 0.0391,
37
+ "tpr": 0.0398,
38
+ "tnr": 0.983
39
+ },
40
+ "train": {
41
+ "accuracy": 0.5122,
42
+ "auroc": 0.5122,
43
+ "gmean2": 0.0323,
44
+ "tpr": 0.0325,
45
+ "tnr": 0.9919
46
+ },
47
+ "val": {
48
+ "accuracy": 0.5192,
49
+ "auroc": 0.5192,
50
+ "gmean2": 0.0385,
51
+ "tpr": 0.0385,
52
+ "tnr": 1.0
53
+ },
54
+ "test": {
55
+ "accuracy": 0.5,
56
+ "auroc": 0.5,
57
+ "gmean2": 0.0686,
58
+ "tpr": 0.0741,
59
+ "tnr": 0.9259
60
+ }
61
+ },
62
+ "50": {
63
+ "all": {
64
+ "accuracy": 0.5284,
65
+ "auroc": 0.5284,
66
+ "gmean2": 0.0828,
67
+ "tpr": 0.0852,
68
+ "tnr": 0.9716
69
+ },
70
+ "train": {
71
+ "accuracy": 0.5244,
72
+ "auroc": 0.5244,
73
+ "gmean2": 0.0787,
74
+ "tpr": 0.0813,
75
+ "tnr": 0.9675
76
+ },
77
+ "val": {
78
+ "accuracy": 0.5192,
79
+ "auroc": 0.5192,
80
+ "gmean2": 0.0385,
81
+ "tpr": 0.0385,
82
+ "tnr": 1.0
83
+ },
84
+ "test": {
85
+ "accuracy": 0.5556,
86
+ "auroc": 0.5556,
87
+ "gmean2": 0.1427,
88
+ "tpr": 0.1481,
89
+ "tnr": 0.963
90
+ }
91
+ },
92
+ "75": {
93
+ "all": {
94
+ "accuracy": 0.5341,
95
+ "auroc": 0.5341,
96
+ "gmean2": 0.1179,
97
+ "tpr": 0.125,
98
+ "tnr": 0.9432
99
+ },
100
+ "train": {
101
+ "accuracy": 0.5407,
102
+ "auroc": 0.5407,
103
+ "gmean2": 0.1237,
104
+ "tpr": 0.1301,
105
+ "tnr": 0.9512
106
+ },
107
+ "val": {
108
+ "accuracy": 0.4808,
109
+ "auroc": 0.4808,
110
+ "gmean2": 0.0355,
111
+ "tpr": 0.0385,
112
+ "tnr": 0.9231
113
+ },
114
+ "test": {
115
+ "accuracy": 0.5556,
116
+ "auroc": 0.5556,
117
+ "gmean2": 0.1715,
118
+ "tpr": 0.1852,
119
+ "tnr": 0.9259
120
+ }
121
+ },
122
+ "100": {
123
+ "all": {
124
+ "accuracy": 0.5455,
125
+ "auroc": 0.5455,
126
+ "gmean2": 0.2484,
127
+ "tpr": 0.3239,
128
+ "tnr": 0.767
129
+ },
130
+ "train": {
131
+ "accuracy": 0.5488,
132
+ "auroc": 0.5488,
133
+ "gmean2": 0.2547,
134
+ "tpr": 0.3333,
135
+ "tnr": 0.7642
136
+ },
137
+ "val": {
138
+ "accuracy": 0.5192,
139
+ "auroc": 0.5192,
140
+ "gmean2": 0.2071,
141
+ "tpr": 0.2692,
142
+ "tnr": 0.7692
143
+ },
144
+ "test": {
145
+ "accuracy": 0.5556,
146
+ "auroc": 0.5556,
147
+ "gmean2": 0.2593,
148
+ "tpr": 0.3333,
149
+ "tnr": 0.7778
150
+ }
151
+ }
152
+ }
results/attention_probe_results.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|100.0": {
3
+ "val_auroc": 0.6538,
4
+ "test_auroc": 0.7037,
5
+ "val_std": 0.0,
6
+ "test_std": 0.0,
7
+ "test_accuracy": 0.6296,
8
+ "test_accuracy_std": 0.0,
9
+ "test_gmean2": 0.3745,
10
+ "test_gmean2_std": 0.0,
11
+ "n_seeds": 1
12
+ },
13
+ "20|100.0": {
14
+ "val_auroc": 0.7071,
15
+ "test_auroc": 0.6516,
16
+ "val_std": 0.0,
17
+ "test_std": 0.0,
18
+ "test_accuracy": 0.5741,
19
+ "test_accuracy_std": 0.0,
20
+ "test_gmean2": 0.3265,
21
+ "test_gmean2_std": 0.0,
22
+ "n_seeds": 1
23
+ },
24
+ "30|100.0": {
25
+ "val_auroc": 0.679,
26
+ "test_auroc": 0.6612,
27
+ "val_std": 0.0,
28
+ "test_std": 0.0,
29
+ "test_accuracy": 0.5741,
30
+ "test_accuracy_std": 0.0,
31
+ "test_gmean2": 0.3128,
32
+ "test_gmean2_std": 0.0,
33
+ "n_seeds": 1
34
+ },
35
+ "35|100.0": {
36
+ "val_auroc": 0.6612,
37
+ "test_auroc": 0.5967,
38
+ "val_std": 0.0,
39
+ "test_std": 0.0,
40
+ "test_accuracy": 0.5556,
41
+ "test_accuracy_std": 0.0,
42
+ "test_gmean2": 0.1111,
43
+ "test_gmean2_std": 0.0,
44
+ "n_seeds": 1
45
+ }
46
+ }
results/attention_probe_results_10.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|10.0": {
3
+ "val_auroc": 0.5592,
4
+ "test_auroc": 0.6927,
5
+ "val_std": 0.0,
6
+ "test_std": 0.0,
7
+ "test_accuracy": 0.6667,
8
+ "test_accuracy_std": 0.0,
9
+ "test_gmean2": 0.439,
10
+ "test_gmean2_std": 0.0,
11
+ "n_seeds": 1
12
+ },
13
+ "20|10.0": {
14
+ "val_auroc": 0.5666,
15
+ "test_auroc": 0.6941,
16
+ "val_std": 0.0,
17
+ "test_std": 0.0,
18
+ "test_accuracy": 0.6111,
19
+ "test_accuracy_std": 0.0,
20
+ "test_gmean2": 0.3567,
21
+ "test_gmean2_std": 0.0,
22
+ "n_seeds": 1
23
+ },
24
+ "30|10.0": {
25
+ "val_auroc": 0.5695,
26
+ "test_auroc": 0.7023,
27
+ "val_std": 0.0,
28
+ "test_std": 0.0,
29
+ "test_accuracy": 0.7037,
30
+ "test_accuracy_std": 0.0,
31
+ "test_gmean2": 0.4609,
32
+ "test_gmean2_std": 0.0,
33
+ "n_seeds": 1
34
+ },
35
+ "35|10.0": {
36
+ "val_auroc": 0.574,
37
+ "test_auroc": 0.716,
38
+ "val_std": 0.0,
39
+ "test_std": 0.0,
40
+ "test_accuracy": 0.7222,
41
+ "test_accuracy_std": 0.0,
42
+ "test_gmean2": 0.5213,
43
+ "test_gmean2_std": 0.0,
44
+ "n_seeds": 1
45
+ }
46
+ }
results/attention_probe_results_25.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|25.0": {
3
+ "val_auroc": 0.5828,
4
+ "test_auroc": 0.6324,
5
+ "val_std": 0.0,
6
+ "test_std": 0.0,
7
+ "test_accuracy": 0.5556,
8
+ "test_accuracy_std": 0.0,
9
+ "test_gmean2": 0.2743,
10
+ "test_gmean2_std": 0.0,
11
+ "n_seeds": 1
12
+ },
13
+ "20|25.0": {
14
+ "val_auroc": 0.608,
15
+ "test_auroc": 0.594,
16
+ "val_std": 0.0,
17
+ "test_std": 0.0,
18
+ "test_accuracy": 0.5,
19
+ "test_accuracy_std": 0.0,
20
+ "test_gmean2": 0.1509,
21
+ "test_gmean2_std": 0.0,
22
+ "n_seeds": 1
23
+ },
24
+ "30|25.0": {
25
+ "val_auroc": 0.5962,
26
+ "test_auroc": 0.5748,
27
+ "val_std": 0.0,
28
+ "test_std": 0.0,
29
+ "test_accuracy": 0.4815,
30
+ "test_accuracy_std": 0.0,
31
+ "test_gmean2": 0.144,
32
+ "test_gmean2_std": 0.0,
33
+ "n_seeds": 1
34
+ },
35
+ "35|25.0": {
36
+ "val_auroc": 0.6331,
37
+ "test_auroc": 0.5281,
38
+ "val_std": 0.0,
39
+ "test_std": 0.0,
40
+ "test_accuracy": 0.5,
41
+ "test_accuracy_std": 0.0,
42
+ "test_gmean2": 0.0,
43
+ "test_gmean2_std": 0.0,
44
+ "n_seeds": 1
45
+ }
46
+ }
results/attention_probe_results_50.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|50.0": {
3
+ "val_auroc": 0.5592,
4
+ "test_auroc": 0.7298,
5
+ "val_std": 0.0,
6
+ "test_std": 0.0,
7
+ "test_accuracy": 0.5741,
8
+ "test_accuracy_std": 0.0,
9
+ "test_gmean2": 0.2305,
10
+ "test_gmean2_std": 0.0,
11
+ "n_seeds": 1
12
+ },
13
+ "20|50.0": {
14
+ "val_auroc": 0.6154,
15
+ "test_auroc": 0.6818,
16
+ "val_std": 0.0,
17
+ "test_std": 0.0,
18
+ "test_accuracy": 0.5926,
19
+ "test_accuracy_std": 0.0,
20
+ "test_gmean2": 0.2634,
21
+ "test_gmean2_std": 0.0,
22
+ "n_seeds": 1
23
+ },
24
+ "30|50.0": {
25
+ "val_auroc": 0.5769,
26
+ "test_auroc": 0.6557,
27
+ "val_std": 0.0,
28
+ "test_std": 0.0,
29
+ "test_accuracy": 0.5926,
30
+ "test_accuracy_std": 0.0,
31
+ "test_gmean2": 0.3457,
32
+ "test_gmean2_std": 0.0,
33
+ "n_seeds": 1
34
+ },
35
+ "35|50.0": {
36
+ "val_auroc": 0.6361,
37
+ "test_auroc": 0.6063,
38
+ "val_std": 0.0,
39
+ "test_std": 0.0,
40
+ "test_accuracy": 0.5556,
41
+ "test_accuracy_std": 0.0,
42
+ "test_gmean2": 0.1111,
43
+ "test_gmean2_std": 0.0,
44
+ "n_seeds": 1
45
+ }
46
+ }
results/attention_probe_results_75.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|75.0": {
3
+ "val_auroc": 0.6006,
4
+ "test_auroc": 0.6488,
5
+ "val_std": 0.0,
6
+ "test_std": 0.0,
7
+ "test_accuracy": 0.5556,
8
+ "test_accuracy_std": 0.0,
9
+ "test_gmean2": 0.3086,
10
+ "test_gmean2_std": 0.0,
11
+ "n_seeds": 1
12
+ },
13
+ "20|75.0": {
14
+ "val_auroc": 0.6346,
15
+ "test_auroc": 0.6173,
16
+ "val_std": 0.0,
17
+ "test_std": 0.0,
18
+ "test_accuracy": 0.5185,
19
+ "test_accuracy_std": 0.0,
20
+ "test_gmean2": 0.2675,
21
+ "test_gmean2_std": 0.0,
22
+ "n_seeds": 1
23
+ },
24
+ "30|75.0": {
25
+ "val_auroc": 0.6531,
26
+ "test_auroc": 0.642,
27
+ "val_std": 0.0,
28
+ "test_std": 0.0,
29
+ "test_accuracy": 0.5741,
30
+ "test_accuracy_std": 0.0,
31
+ "test_gmean2": 0.3128,
32
+ "test_gmean2_std": 0.0,
33
+ "n_seeds": 1
34
+ },
35
+ "35|75.0": {
36
+ "val_auroc": 0.645,
37
+ "test_auroc": 0.6104,
38
+ "val_std": 0.0,
39
+ "test_std": 0.0,
40
+ "test_accuracy": 0.5741,
41
+ "test_accuracy_std": 0.0,
42
+ "test_gmean2": 0.1481,
43
+ "test_gmean2_std": 0.0,
44
+ "n_seeds": 1
45
+ }
46
+ }
results/embedding_baseline.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10": {
3
+ "train": {
4
+ "accuracy": 0.8049,
5
+ "auroc": 0.9001,
6
+ "gmean2": 0.6476,
7
+ "tpr": 0.7886,
8
+ "tnr": 0.8211
9
+ },
10
+ "val": {
11
+ "accuracy": 0.7885,
12
+ "auroc": 0.8876,
13
+ "gmean2": 0.6036,
14
+ "tpr": 0.9231,
15
+ "tnr": 0.6538
16
+ },
17
+ "test": {
18
+ "accuracy": 0.6111,
19
+ "auroc": 0.5871,
20
+ "gmean2": 0.3649,
21
+ "tpr": 0.7037,
22
+ "tnr": 0.5185
23
+ },
24
+ "all": {
25
+ "accuracy": 0.7727,
26
+ "auroc": 0.8478,
27
+ "gmean2": 0.5966,
28
+ "tpr": 0.7955,
29
+ "tnr": 0.75
30
+ }
31
+ },
32
+ "25": {
33
+ "train": {
34
+ "accuracy": 0.8171,
35
+ "auroc": 0.9007,
36
+ "gmean2": 0.6672,
37
+ "tpr": 0.7967,
38
+ "tnr": 0.8374
39
+ },
40
+ "val": {
41
+ "accuracy": 0.7885,
42
+ "auroc": 0.895,
43
+ "gmean2": 0.6036,
44
+ "tpr": 0.9231,
45
+ "tnr": 0.6538
46
+ },
47
+ "test": {
48
+ "accuracy": 0.5741,
49
+ "auroc": 0.6049,
50
+ "gmean2": 0.3265,
51
+ "tpr": 0.6296,
52
+ "tnr": 0.5185
53
+ },
54
+ "all": {
55
+ "accuracy": 0.7756,
56
+ "auroc": 0.8504,
57
+ "gmean2": 0.6013,
58
+ "tpr": 0.7898,
59
+ "tnr": 0.7614
60
+ }
61
+ },
62
+ "50": {
63
+ "train": {
64
+ "accuracy": 0.8049,
65
+ "auroc": 0.8929,
66
+ "gmean2": 0.6468,
67
+ "tpr": 0.7724,
68
+ "tnr": 0.8374
69
+ },
70
+ "val": {
71
+ "accuracy": 0.7885,
72
+ "auroc": 0.8876,
73
+ "gmean2": 0.6124,
74
+ "tpr": 0.8846,
75
+ "tnr": 0.6923
76
+ },
77
+ "test": {
78
+ "accuracy": 0.5741,
79
+ "auroc": 0.6214,
80
+ "gmean2": 0.3265,
81
+ "tpr": 0.6296,
82
+ "tnr": 0.5185
83
+ },
84
+ "all": {
85
+ "accuracy": 0.767,
86
+ "auroc": 0.8455,
87
+ "gmean2": 0.5884,
88
+ "tpr": 0.767,
89
+ "tnr": 0.767
90
+ }
91
+ },
92
+ "75": {
93
+ "train": {
94
+ "accuracy": 0.8049,
95
+ "auroc": 0.8901,
96
+ "gmean2": 0.6462,
97
+ "tpr": 0.7642,
98
+ "tnr": 0.8455
99
+ },
100
+ "val": {
101
+ "accuracy": 0.7885,
102
+ "auroc": 0.8994,
103
+ "gmean2": 0.6124,
104
+ "tpr": 0.8846,
105
+ "tnr": 0.6923
106
+ },
107
+ "test": {
108
+ "accuracy": 0.5556,
109
+ "auroc": 0.631,
110
+ "gmean2": 0.3073,
111
+ "tpr": 0.5926,
112
+ "tnr": 0.5185
113
+ },
114
+ "all": {
115
+ "accuracy": 0.7642,
116
+ "auroc": 0.8461,
117
+ "gmean2": 0.5839,
118
+ "tpr": 0.7557,
119
+ "tnr": 0.7727
120
+ }
121
+ },
122
+ "100": {
123
+ "train": {
124
+ "accuracy": 0.8089,
125
+ "auroc": 0.8915,
126
+ "gmean2": 0.6531,
127
+ "tpr": 0.7724,
128
+ "tnr": 0.8455
129
+ },
130
+ "val": {
131
+ "accuracy": 0.7885,
132
+ "auroc": 0.8964,
133
+ "gmean2": 0.6124,
134
+ "tpr": 0.8846,
135
+ "tnr": 0.6923
136
+ },
137
+ "test": {
138
+ "accuracy": 0.5556,
139
+ "auroc": 0.6283,
140
+ "gmean2": 0.3073,
141
+ "tpr": 0.5926,
142
+ "tnr": 0.5185
143
+ },
144
+ "all": {
145
+ "accuracy": 0.767,
146
+ "auroc": 0.8462,
147
+ "gmean2": 0.5883,
148
+ "tpr": 0.7614,
149
+ "tnr": 0.7727
150
+ }
151
+ }
152
+ }
results/evaluation.json ADDED
@@ -0,0 +1,372 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "sandbagging",
3
+ "probes": {
4
+ "best_aurocs": {
5
+ "10": 0.7435,
6
+ "25": 0.7298,
7
+ "50": 0.7229,
8
+ "75": 0.7339,
9
+ "100": 0.7339
10
+ }
11
+ },
12
+ "baselines": {
13
+ "anthropic--claude-sonnet-4.6_baseline": {
14
+ "10.0": {
15
+ "accuracy": 0.5185,
16
+ "auroc": 0.5185,
17
+ "gmean2": 0.0713,
18
+ "tpr": 0.0741,
19
+ "tnr": 0.963
20
+ },
21
+ "25.0": {
22
+ "accuracy": 0.5,
23
+ "auroc": 0.5,
24
+ "gmean2": 0.0686,
25
+ "tpr": 0.0741,
26
+ "tnr": 0.9259
27
+ },
28
+ "50.0": {
29
+ "accuracy": 0.5556,
30
+ "auroc": 0.5556,
31
+ "gmean2": 0.1427,
32
+ "tpr": 0.1481,
33
+ "tnr": 0.963
34
+ },
35
+ "75.0": {
36
+ "accuracy": 0.5556,
37
+ "auroc": 0.5556,
38
+ "gmean2": 0.1715,
39
+ "tpr": 0.1852,
40
+ "tnr": 0.9259
41
+ },
42
+ "100.0": {
43
+ "accuracy": 0.5556,
44
+ "auroc": 0.5556,
45
+ "gmean2": 0.2593,
46
+ "tpr": 0.3333,
47
+ "tnr": 0.7778
48
+ }
49
+ },
50
+ "embedding_baseline": {
51
+ "10.0": {
52
+ "accuracy": 0.6111,
53
+ "auroc": 0.5871,
54
+ "gmean2": 0.3649,
55
+ "tpr": 0.7037,
56
+ "tnr": 0.5185
57
+ },
58
+ "25.0": {
59
+ "accuracy": 0.5741,
60
+ "auroc": 0.6049,
61
+ "gmean2": 0.3265,
62
+ "tpr": 0.6296,
63
+ "tnr": 0.5185
64
+ },
65
+ "50.0": {
66
+ "accuracy": 0.5741,
67
+ "auroc": 0.6214,
68
+ "gmean2": 0.3265,
69
+ "tpr": 0.6296,
70
+ "tnr": 0.5185
71
+ },
72
+ "75.0": {
73
+ "accuracy": 0.5556,
74
+ "auroc": 0.631,
75
+ "gmean2": 0.3073,
76
+ "tpr": 0.5926,
77
+ "tnr": 0.5185
78
+ },
79
+ "100.0": {
80
+ "accuracy": 0.5556,
81
+ "auroc": 0.6283,
82
+ "gmean2": 0.3073,
83
+ "tpr": 0.5926,
84
+ "tnr": 0.5185
85
+ }
86
+ },
87
+ "gemini_baseline": {
88
+ "10.0": {
89
+ "accuracy": 0.463,
90
+ "auroc": 0.463,
91
+ "gmean2": 0.1372,
92
+ "tpr": 0.7407,
93
+ "tnr": 0.1852,
94
+ "n": 54
95
+ },
96
+ "25.0": {
97
+ "accuracy": 0.4815,
98
+ "auroc": 0.4815,
99
+ "gmean2": 0.0658,
100
+ "tpr": 0.8889,
101
+ "tnr": 0.0741,
102
+ "n": 54
103
+ },
104
+ "50.0": {
105
+ "accuracy": 0.4815,
106
+ "auroc": 0.4815,
107
+ "gmean2": 0.0658,
108
+ "tpr": 0.8889,
109
+ "tnr": 0.0741,
110
+ "n": 54
111
+ },
112
+ "75.0": {
113
+ "accuracy": 0.463,
114
+ "auroc": 0.463,
115
+ "gmean2": 0.0,
116
+ "tpr": 0.9259,
117
+ "tnr": 0.0,
118
+ "n": 54
119
+ },
120
+ "100.0": {
121
+ "accuracy": 0.537,
122
+ "auroc": 0.537,
123
+ "gmean2": 0.107,
124
+ "tpr": 0.963,
125
+ "tnr": 0.1111,
126
+ "n": 54
127
+ }
128
+ },
129
+ "google--gemini-3-flash-preview_baseline": {
130
+ "10.0": {
131
+ "accuracy": 0.463,
132
+ "auroc": 0.463,
133
+ "gmean2": 0.1372,
134
+ "tpr": 0.7407,
135
+ "tnr": 0.1852,
136
+ "n": 54
137
+ },
138
+ "25.0": {
139
+ "accuracy": 0.4815,
140
+ "auroc": 0.4815,
141
+ "gmean2": 0.0658,
142
+ "tpr": 0.8889,
143
+ "tnr": 0.0741,
144
+ "n": 54
145
+ },
146
+ "50.0": {
147
+ "accuracy": 0.4815,
148
+ "auroc": 0.4815,
149
+ "gmean2": 0.0658,
150
+ "tpr": 0.8889,
151
+ "tnr": 0.0741,
152
+ "n": 54
153
+ },
154
+ "75.0": {
155
+ "accuracy": 0.463,
156
+ "auroc": 0.463,
157
+ "gmean2": 0.0,
158
+ "tpr": 0.9259,
159
+ "tnr": 0.0,
160
+ "n": 54
161
+ },
162
+ "100.0": {
163
+ "accuracy": 0.537,
164
+ "auroc": 0.537,
165
+ "gmean2": 0.107,
166
+ "tpr": 0.963,
167
+ "tnr": 0.1111,
168
+ "n": 54
169
+ }
170
+ },
171
+ "google--gemini-3.1-pro-preview_baseline": {
172
+ "10.0": {
173
+ "accuracy": 0.5185,
174
+ "auroc": 0.5185,
175
+ "gmean2": 0.1317,
176
+ "tpr": 0.8889,
177
+ "tnr": 0.1481,
178
+ "n": 54
179
+ },
180
+ "25.0": {
181
+ "accuracy": 0.4815,
182
+ "auroc": 0.4815,
183
+ "gmean2": 0.0658,
184
+ "tpr": 0.8889,
185
+ "tnr": 0.0741,
186
+ "n": 54
187
+ },
188
+ "50.0": {
189
+ "accuracy": 0.4815,
190
+ "auroc": 0.4815,
191
+ "gmean2": 0.0658,
192
+ "tpr": 0.8889,
193
+ "tnr": 0.0741,
194
+ "n": 54
195
+ },
196
+ "75.0": {
197
+ "accuracy": 0.537,
198
+ "auroc": 0.537,
199
+ "gmean2": 0.0741,
200
+ "tpr": 1.0,
201
+ "tnr": 0.0741,
202
+ "n": 54
203
+ },
204
+ "100.0": {
205
+ "accuracy": 0.5556,
206
+ "auroc": 0.5556,
207
+ "gmean2": 0.1715,
208
+ "tpr": 0.9259,
209
+ "tnr": 0.1852,
210
+ "n": 54
211
+ }
212
+ },
213
+ "openai--gpt-5.1_baseline": {
214
+ "10.0": {
215
+ "accuracy": 0.551,
216
+ "auroc": 0.5217,
217
+ "gmean2": 0.0435,
218
+ "tpr": 1.0,
219
+ "tnr": 0.0435,
220
+ "n": 49
221
+ },
222
+ "25.0": {
223
+ "accuracy": 0.5208,
224
+ "auroc": 0.5,
225
+ "gmean2": 0.0,
226
+ "tpr": 1.0,
227
+ "tnr": 0.0,
228
+ "n": 48
229
+ },
230
+ "50.0": {
231
+ "accuracy": 0.5417,
232
+ "auroc": 0.5217,
233
+ "gmean2": 0.0435,
234
+ "tpr": 1.0,
235
+ "tnr": 0.0435,
236
+ "n": 48
237
+ },
238
+ "75.0": {
239
+ "accuracy": 0.5319,
240
+ "auroc": 0.5217,
241
+ "gmean2": 0.0435,
242
+ "tpr": 1.0,
243
+ "tnr": 0.0435,
244
+ "n": 47
245
+ },
246
+ "100.0": {
247
+ "accuracy": 0.5833,
248
+ "auroc": 0.5687,
249
+ "gmean2": 0.2,
250
+ "tpr": 0.92,
251
+ "tnr": 0.2174,
252
+ "n": 48
253
+ }
254
+ },
255
+ "tfidf_baseline": {
256
+ "10.0": {
257
+ "accuracy": 0.6667,
258
+ "auroc": 0.701,
259
+ "gmean2": 0.4444,
260
+ "tpr": 0.6667,
261
+ "tnr": 0.6667
262
+ },
263
+ "25.0": {
264
+ "accuracy": 0.5926,
265
+ "auroc": 0.6543,
266
+ "gmean2": 0.3512,
267
+ "tpr": 0.5926,
268
+ "tnr": 0.5926
269
+ },
270
+ "50.0": {
271
+ "accuracy": 0.6111,
272
+ "auroc": 0.6886,
273
+ "gmean2": 0.332,
274
+ "tpr": 0.8148,
275
+ "tnr": 0.4074
276
+ },
277
+ "75.0": {
278
+ "accuracy": 0.6111,
279
+ "auroc": 0.7119,
280
+ "gmean2": 0.3457,
281
+ "tpr": 0.7778,
282
+ "tnr": 0.4444
283
+ },
284
+ "100.0": {
285
+ "accuracy": 0.6111,
286
+ "auroc": 0.7215,
287
+ "gmean2": 0.3457,
288
+ "tpr": 0.7778,
289
+ "tnr": 0.4444
290
+ }
291
+ }
292
+ },
293
+ "comparisons": {
294
+ "anthropic--claude-sonnet-4.6_baseline": {
295
+ "igc": {
296
+ "10": 0.2250000000000001,
297
+ "25": 0.2298,
298
+ "50": 0.1673,
299
+ "75": 0.17830000000000001,
300
+ "100": 0.17830000000000001
301
+ },
302
+ "detection_latency_probe": 10,
303
+ "detection_latency_baseline": null
304
+ },
305
+ "embedding_baseline": {
306
+ "igc": {
307
+ "10": 0.1564000000000001,
308
+ "25": 0.12490000000000001,
309
+ "50": 0.10150000000000003,
310
+ "75": 0.10289999999999999,
311
+ "100": 0.10560000000000003
312
+ },
313
+ "detection_latency_probe": 10,
314
+ "detection_latency_baseline": null
315
+ },
316
+ "gemini_baseline": {
317
+ "igc": {
318
+ "10": 0.2805,
319
+ "25": 0.24830000000000002,
320
+ "50": 0.2414,
321
+ "75": 0.2709,
322
+ "100": 0.19689999999999996
323
+ },
324
+ "detection_latency_probe": 10,
325
+ "detection_latency_baseline": null
326
+ },
327
+ "google--gemini-3-flash-preview_baseline": {
328
+ "igc": {
329
+ "10": 0.2805,
330
+ "25": 0.24830000000000002,
331
+ "50": 0.2414,
332
+ "75": 0.2709,
333
+ "100": 0.19689999999999996
334
+ },
335
+ "detection_latency_probe": 10,
336
+ "detection_latency_baseline": null
337
+ },
338
+ "google--gemini-3.1-pro-preview_baseline": {
339
+ "igc": {
340
+ "10": 0.2250000000000001,
341
+ "25": 0.24830000000000002,
342
+ "50": 0.2414,
343
+ "75": 0.19689999999999996,
344
+ "100": 0.17830000000000001
345
+ },
346
+ "detection_latency_probe": 10,
347
+ "detection_latency_baseline": null
348
+ },
349
+ "openai--gpt-5.1_baseline": {
350
+ "igc": {
351
+ "10": 0.2218,
352
+ "25": 0.2298,
353
+ "50": 0.20119999999999993,
354
+ "75": 0.21219999999999994,
355
+ "100": 0.1652
356
+ },
357
+ "detection_latency_probe": 10,
358
+ "detection_latency_baseline": null
359
+ },
360
+ "tfidf_baseline": {
361
+ "igc": {
362
+ "10": 0.04250000000000009,
363
+ "25": 0.07550000000000001,
364
+ "50": 0.0343,
365
+ "75": 0.02200000000000002,
366
+ "100": 0.012399999999999967
367
+ },
368
+ "detection_latency_probe": 10,
369
+ "detection_latency_baseline": 10
370
+ }
371
+ }
372
+ }
results/gemini_baseline.json ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10": {
3
+ "all": {
4
+ "accuracy": 0.5185,
5
+ "auroc": 0.5185,
6
+ "gmean2": 0.1029,
7
+ "tpr": 0.1111,
8
+ "tnr": 0.9259,
9
+ "n": 54
10
+ },
11
+ "train": {
12
+ "accuracy": 0.0,
13
+ "auroc": 0.5,
14
+ "gmean2": 0.0,
15
+ "tpr": 0.0,
16
+ "tnr": 0.0,
17
+ "n": 0
18
+ },
19
+ "val": {
20
+ "accuracy": 0.0,
21
+ "auroc": 0.5,
22
+ "gmean2": 0.0,
23
+ "tpr": 0.0,
24
+ "tnr": 0.0,
25
+ "n": 0
26
+ },
27
+ "test": {
28
+ "accuracy": 0.5185,
29
+ "auroc": 0.5185,
30
+ "gmean2": 0.1029,
31
+ "tpr": 0.1111,
32
+ "tnr": 0.9259,
33
+ "n": 54
34
+ }
35
+ },
36
+ "25": {
37
+ "all": {
38
+ "accuracy": 0.463,
39
+ "auroc": 0.463,
40
+ "gmean2": 0.0905,
41
+ "tpr": 0.1111,
42
+ "tnr": 0.8148,
43
+ "n": 54
44
+ },
45
+ "train": {
46
+ "accuracy": 0.0,
47
+ "auroc": 0.5,
48
+ "gmean2": 0.0,
49
+ "tpr": 0.0,
50
+ "tnr": 0.0,
51
+ "n": 0
52
+ },
53
+ "val": {
54
+ "accuracy": 0.0,
55
+ "auroc": 0.5,
56
+ "gmean2": 0.0,
57
+ "tpr": 0.0,
58
+ "tnr": 0.0,
59
+ "n": 0
60
+ },
61
+ "test": {
62
+ "accuracy": 0.463,
63
+ "auroc": 0.463,
64
+ "gmean2": 0.0905,
65
+ "tpr": 0.1111,
66
+ "tnr": 0.8148,
67
+ "n": 54
68
+ }
69
+ },
70
+ "50": {
71
+ "all": {
72
+ "accuracy": 0.537,
73
+ "auroc": 0.537,
74
+ "gmean2": 0.1893,
75
+ "tpr": 0.2222,
76
+ "tnr": 0.8519,
77
+ "n": 54
78
+ },
79
+ "train": {
80
+ "accuracy": 0.0,
81
+ "auroc": 0.5,
82
+ "gmean2": 0.0,
83
+ "tpr": 0.0,
84
+ "tnr": 0.0,
85
+ "n": 0
86
+ },
87
+ "val": {
88
+ "accuracy": 0.0,
89
+ "auroc": 0.5,
90
+ "gmean2": 0.0,
91
+ "tpr": 0.0,
92
+ "tnr": 0.0,
93
+ "n": 0
94
+ },
95
+ "test": {
96
+ "accuracy": 0.537,
97
+ "auroc": 0.537,
98
+ "gmean2": 0.1893,
99
+ "tpr": 0.2222,
100
+ "tnr": 0.8519,
101
+ "n": 54
102
+ }
103
+ },
104
+ "75": {
105
+ "all": {
106
+ "accuracy": 0.5,
107
+ "auroc": 0.5,
108
+ "gmean2": 0.2085,
109
+ "tpr": 0.2963,
110
+ "tnr": 0.7037,
111
+ "n": 54
112
+ },
113
+ "train": {
114
+ "accuracy": 0.0,
115
+ "auroc": 0.5,
116
+ "gmean2": 0.0,
117
+ "tpr": 0.0,
118
+ "tnr": 0.0,
119
+ "n": 0
120
+ },
121
+ "val": {
122
+ "accuracy": 0.0,
123
+ "auroc": 0.5,
124
+ "gmean2": 0.0,
125
+ "tpr": 0.0,
126
+ "tnr": 0.0,
127
+ "n": 0
128
+ },
129
+ "test": {
130
+ "accuracy": 0.5,
131
+ "auroc": 0.5,
132
+ "gmean2": 0.2085,
133
+ "tpr": 0.2963,
134
+ "tnr": 0.7037,
135
+ "n": 54
136
+ }
137
+ },
138
+ "100": {
139
+ "all": {
140
+ "accuracy": 0.5,
141
+ "auroc": 0.5,
142
+ "gmean2": 0.2469,
143
+ "tpr": 0.4444,
144
+ "tnr": 0.5556,
145
+ "n": 54
146
+ },
147
+ "train": {
148
+ "accuracy": 0.0,
149
+ "auroc": 0.5,
150
+ "gmean2": 0.0,
151
+ "tpr": 0.0,
152
+ "tnr": 0.0,
153
+ "n": 0
154
+ },
155
+ "val": {
156
+ "accuracy": 0.0,
157
+ "auroc": 0.5,
158
+ "gmean2": 0.0,
159
+ "tpr": 0.0,
160
+ "tnr": 0.0,
161
+ "n": 0
162
+ },
163
+ "test": {
164
+ "accuracy": 0.5,
165
+ "auroc": 0.5,
166
+ "gmean2": 0.2469,
167
+ "tpr": 0.4444,
168
+ "tnr": 0.5556,
169
+ "n": 54
170
+ }
171
+ }
172
+ }
results/google--gemini-3-flash-preview_baseline.json ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10": {
3
+ "all": {
4
+ "accuracy": 0.5185,
5
+ "auroc": 0.5185,
6
+ "gmean2": 0.1029,
7
+ "tpr": 0.1111,
8
+ "tnr": 0.9259,
9
+ "n": 54
10
+ },
11
+ "train": {
12
+ "accuracy": 0.0,
13
+ "auroc": 0.5,
14
+ "gmean2": 0.0,
15
+ "tpr": 0.0,
16
+ "tnr": 0.0,
17
+ "n": 0
18
+ },
19
+ "val": {
20
+ "accuracy": 0.0,
21
+ "auroc": 0.5,
22
+ "gmean2": 0.0,
23
+ "tpr": 0.0,
24
+ "tnr": 0.0,
25
+ "n": 0
26
+ },
27
+ "test": {
28
+ "accuracy": 0.5185,
29
+ "auroc": 0.5185,
30
+ "gmean2": 0.1029,
31
+ "tpr": 0.1111,
32
+ "tnr": 0.9259,
33
+ "n": 54
34
+ }
35
+ },
36
+ "25": {
37
+ "all": {
38
+ "accuracy": 0.463,
39
+ "auroc": 0.463,
40
+ "gmean2": 0.0905,
41
+ "tpr": 0.1111,
42
+ "tnr": 0.8148,
43
+ "n": 54
44
+ },
45
+ "train": {
46
+ "accuracy": 0.0,
47
+ "auroc": 0.5,
48
+ "gmean2": 0.0,
49
+ "tpr": 0.0,
50
+ "tnr": 0.0,
51
+ "n": 0
52
+ },
53
+ "val": {
54
+ "accuracy": 0.0,
55
+ "auroc": 0.5,
56
+ "gmean2": 0.0,
57
+ "tpr": 0.0,
58
+ "tnr": 0.0,
59
+ "n": 0
60
+ },
61
+ "test": {
62
+ "accuracy": 0.463,
63
+ "auroc": 0.463,
64
+ "gmean2": 0.0905,
65
+ "tpr": 0.1111,
66
+ "tnr": 0.8148,
67
+ "n": 54
68
+ }
69
+ },
70
+ "50": {
71
+ "all": {
72
+ "accuracy": 0.537,
73
+ "auroc": 0.537,
74
+ "gmean2": 0.1893,
75
+ "tpr": 0.2222,
76
+ "tnr": 0.8519,
77
+ "n": 54
78
+ },
79
+ "train": {
80
+ "accuracy": 0.0,
81
+ "auroc": 0.5,
82
+ "gmean2": 0.0,
83
+ "tpr": 0.0,
84
+ "tnr": 0.0,
85
+ "n": 0
86
+ },
87
+ "val": {
88
+ "accuracy": 0.0,
89
+ "auroc": 0.5,
90
+ "gmean2": 0.0,
91
+ "tpr": 0.0,
92
+ "tnr": 0.0,
93
+ "n": 0
94
+ },
95
+ "test": {
96
+ "accuracy": 0.537,
97
+ "auroc": 0.537,
98
+ "gmean2": 0.1893,
99
+ "tpr": 0.2222,
100
+ "tnr": 0.8519,
101
+ "n": 54
102
+ }
103
+ },
104
+ "75": {
105
+ "all": {
106
+ "accuracy": 0.5,
107
+ "auroc": 0.5,
108
+ "gmean2": 0.2085,
109
+ "tpr": 0.2963,
110
+ "tnr": 0.7037,
111
+ "n": 54
112
+ },
113
+ "train": {
114
+ "accuracy": 0.0,
115
+ "auroc": 0.5,
116
+ "gmean2": 0.0,
117
+ "tpr": 0.0,
118
+ "tnr": 0.0,
119
+ "n": 0
120
+ },
121
+ "val": {
122
+ "accuracy": 0.0,
123
+ "auroc": 0.5,
124
+ "gmean2": 0.0,
125
+ "tpr": 0.0,
126
+ "tnr": 0.0,
127
+ "n": 0
128
+ },
129
+ "test": {
130
+ "accuracy": 0.5,
131
+ "auroc": 0.5,
132
+ "gmean2": 0.2085,
133
+ "tpr": 0.2963,
134
+ "tnr": 0.7037,
135
+ "n": 54
136
+ }
137
+ },
138
+ "100": {
139
+ "all": {
140
+ "accuracy": 0.5,
141
+ "auroc": 0.5,
142
+ "gmean2": 0.2469,
143
+ "tpr": 0.4444,
144
+ "tnr": 0.5556,
145
+ "n": 54
146
+ },
147
+ "train": {
148
+ "accuracy": 0.0,
149
+ "auroc": 0.5,
150
+ "gmean2": 0.0,
151
+ "tpr": 0.0,
152
+ "tnr": 0.0,
153
+ "n": 0
154
+ },
155
+ "val": {
156
+ "accuracy": 0.0,
157
+ "auroc": 0.5,
158
+ "gmean2": 0.0,
159
+ "tpr": 0.0,
160
+ "tnr": 0.0,
161
+ "n": 0
162
+ },
163
+ "test": {
164
+ "accuracy": 0.5,
165
+ "auroc": 0.5,
166
+ "gmean2": 0.2469,
167
+ "tpr": 0.4444,
168
+ "tnr": 0.5556,
169
+ "n": 54
170
+ }
171
+ }
172
+ }
results/google--gemini-3.1-pro-preview_baseline.json ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10": {
3
+ "all": {
4
+ "accuracy": 0.5185,
5
+ "auroc": 0.5185,
6
+ "gmean2": 0.1811,
7
+ "tpr": 0.2222,
8
+ "tnr": 0.8148,
9
+ "n": 54
10
+ },
11
+ "train": {
12
+ "accuracy": 0.0,
13
+ "auroc": 0.5,
14
+ "gmean2": 0.0,
15
+ "tpr": 0.0,
16
+ "tnr": 0.0,
17
+ "n": 0
18
+ },
19
+ "val": {
20
+ "accuracy": 0.0,
21
+ "auroc": 0.5,
22
+ "gmean2": 0.0,
23
+ "tpr": 0.0,
24
+ "tnr": 0.0,
25
+ "n": 0
26
+ },
27
+ "test": {
28
+ "accuracy": 0.5185,
29
+ "auroc": 0.5185,
30
+ "gmean2": 0.1811,
31
+ "tpr": 0.2222,
32
+ "tnr": 0.8148,
33
+ "n": 54
34
+ }
35
+ },
36
+ "25": {
37
+ "all": {
38
+ "accuracy": 0.5098,
39
+ "auroc": 0.5138,
40
+ "gmean2": 0.2215,
41
+ "tpr": 0.3077,
42
+ "tnr": 0.72,
43
+ "n": 51
44
+ },
45
+ "train": {
46
+ "accuracy": 0.0,
47
+ "auroc": 0.5,
48
+ "gmean2": 0.0,
49
+ "tpr": 0.0,
50
+ "tnr": 0.0,
51
+ "n": 0
52
+ },
53
+ "val": {
54
+ "accuracy": 0.0,
55
+ "auroc": 0.5,
56
+ "gmean2": 0.0,
57
+ "tpr": 0.0,
58
+ "tnr": 0.0,
59
+ "n": 0
60
+ },
61
+ "test": {
62
+ "accuracy": 0.5098,
63
+ "auroc": 0.5138,
64
+ "gmean2": 0.2215,
65
+ "tpr": 0.3077,
66
+ "tnr": 0.72,
67
+ "n": 51
68
+ }
69
+ },
70
+ "50": {
71
+ "all": {
72
+ "accuracy": 0.6038,
73
+ "auroc": 0.6011,
74
+ "gmean2": 0.3419,
75
+ "tpr": 0.4615,
76
+ "tnr": 0.7407,
77
+ "n": 53
78
+ },
79
+ "train": {
80
+ "accuracy": 0.0,
81
+ "auroc": 0.5,
82
+ "gmean2": 0.0,
83
+ "tpr": 0.0,
84
+ "tnr": 0.0,
85
+ "n": 0
86
+ },
87
+ "val": {
88
+ "accuracy": 0.0,
89
+ "auroc": 0.5,
90
+ "gmean2": 0.0,
91
+ "tpr": 0.0,
92
+ "tnr": 0.0,
93
+ "n": 0
94
+ },
95
+ "test": {
96
+ "accuracy": 0.6038,
97
+ "auroc": 0.6011,
98
+ "gmean2": 0.3419,
99
+ "tpr": 0.4615,
100
+ "tnr": 0.7407,
101
+ "n": 53
102
+ }
103
+ },
104
+ "75": {
105
+ "all": {
106
+ "accuracy": 0.6111,
107
+ "auroc": 0.6111,
108
+ "gmean2": 0.3649,
109
+ "tpr": 0.5185,
110
+ "tnr": 0.7037,
111
+ "n": 54
112
+ },
113
+ "train": {
114
+ "accuracy": 0.0,
115
+ "auroc": 0.5,
116
+ "gmean2": 0.0,
117
+ "tpr": 0.0,
118
+ "tnr": 0.0,
119
+ "n": 0
120
+ },
121
+ "val": {
122
+ "accuracy": 0.0,
123
+ "auroc": 0.5,
124
+ "gmean2": 0.0,
125
+ "tpr": 0.0,
126
+ "tnr": 0.0,
127
+ "n": 0
128
+ },
129
+ "test": {
130
+ "accuracy": 0.6111,
131
+ "auroc": 0.6111,
132
+ "gmean2": 0.3649,
133
+ "tpr": 0.5185,
134
+ "tnr": 0.7037,
135
+ "n": 54
136
+ }
137
+ },
138
+ "100": {
139
+ "all": {
140
+ "accuracy": 0.5926,
141
+ "auroc": 0.5926,
142
+ "gmean2": 0.3388,
143
+ "tpr": 0.4815,
144
+ "tnr": 0.7037,
145
+ "n": 54
146
+ },
147
+ "train": {
148
+ "accuracy": 0.0,
149
+ "auroc": 0.5,
150
+ "gmean2": 0.0,
151
+ "tpr": 0.0,
152
+ "tnr": 0.0,
153
+ "n": 0
154
+ },
155
+ "val": {
156
+ "accuracy": 0.0,
157
+ "auroc": 0.5,
158
+ "gmean2": 0.0,
159
+ "tpr": 0.0,
160
+ "tnr": 0.0,
161
+ "n": 0
162
+ },
163
+ "test": {
164
+ "accuracy": 0.5926,
165
+ "auroc": 0.5926,
166
+ "gmean2": 0.3388,
167
+ "tpr": 0.4815,
168
+ "tnr": 0.7037,
169
+ "n": 54
170
+ }
171
+ }
172
+ }
results/linear_probe_results.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|10": {
3
+ "val_auroc": 1.0,
4
+ "test_auroc": 0.738,
5
+ "test_accuracy": 0.6481,
6
+ "test_gmean2": 0.4198
7
+ },
8
+ "10|25": {
9
+ "val_auroc": 1.0,
10
+ "test_auroc": 0.679,
11
+ "test_accuracy": 0.6481,
12
+ "test_gmean2": 0.417
13
+ },
14
+ "10|50": {
15
+ "val_auroc": 1.0,
16
+ "test_auroc": 0.6708,
17
+ "test_accuracy": 0.5926,
18
+ "test_gmean2": 0.3512
19
+ },
20
+ "10|75": {
21
+ "val_auroc": 1.0,
22
+ "test_auroc": 0.6571,
23
+ "test_accuracy": 0.6111,
24
+ "test_gmean2": 0.3649
25
+ },
26
+ "10|100": {
27
+ "val_auroc": 1.0,
28
+ "test_auroc": 0.679,
29
+ "test_accuracy": 0.5741,
30
+ "test_gmean2": 0.3128
31
+ },
32
+ "20|10": {
33
+ "val_auroc": 1.0,
34
+ "test_auroc": 0.7133,
35
+ "test_accuracy": 0.6296,
36
+ "test_gmean2": 0.3909
37
+ },
38
+ "20|25": {
39
+ "val_auroc": 1.0,
40
+ "test_auroc": 0.6571,
41
+ "test_accuracy": 0.6111,
42
+ "test_gmean2": 0.3704
43
+ },
44
+ "20|50": {
45
+ "val_auroc": 1.0,
46
+ "test_auroc": 0.6653,
47
+ "test_accuracy": 0.6667,
48
+ "test_gmean2": 0.4431
49
+ },
50
+ "20|75": {
51
+ "val_auroc": 1.0,
52
+ "test_auroc": 0.6612,
53
+ "test_accuracy": 0.6852,
54
+ "test_gmean2": 0.4691
55
+ },
56
+ "20|100": {
57
+ "val_auroc": 1.0,
58
+ "test_auroc": 0.6379,
59
+ "test_accuracy": 0.6296,
60
+ "test_gmean2": 0.3964
61
+ },
62
+ "30|10": {
63
+ "val_auroc": 1.0,
64
+ "test_auroc": 0.7366,
65
+ "test_accuracy": 0.6667,
66
+ "test_gmean2": 0.439
67
+ },
68
+ "30|25": {
69
+ "val_auroc": 1.0,
70
+ "test_auroc": 0.727,
71
+ "test_accuracy": 0.6296,
72
+ "test_gmean2": 0.3951
73
+ },
74
+ "30|50": {
75
+ "val_auroc": 1.0,
76
+ "test_auroc": 0.6927,
77
+ "test_accuracy": 0.6852,
78
+ "test_gmean2": 0.4691
79
+ },
80
+ "30|75": {
81
+ "val_auroc": 1.0,
82
+ "test_auroc": 0.6914,
83
+ "test_accuracy": 0.6667,
84
+ "test_gmean2": 0.4431
85
+ },
86
+ "30|100": {
87
+ "val_auroc": 1.0,
88
+ "test_auroc": 0.6639,
89
+ "test_accuracy": 0.6296,
90
+ "test_gmean2": 0.3951
91
+ },
92
+ "35|10": {
93
+ "val_auroc": 1.0,
94
+ "test_auroc": 0.7284,
95
+ "test_accuracy": 0.6667,
96
+ "test_gmean2": 0.4431
97
+ },
98
+ "35|25": {
99
+ "val_auroc": 1.0,
100
+ "test_auroc": 0.7298,
101
+ "test_accuracy": 0.6481,
102
+ "test_gmean2": 0.4198
103
+ },
104
+ "35|50": {
105
+ "val_auroc": 1.0,
106
+ "test_auroc": 0.7229,
107
+ "test_accuracy": 0.6481,
108
+ "test_gmean2": 0.4198
109
+ },
110
+ "35|75": {
111
+ "val_auroc": 1.0,
112
+ "test_auroc": 0.7339,
113
+ "test_accuracy": 0.6481,
114
+ "test_gmean2": 0.4198
115
+ },
116
+ "35|100": {
117
+ "val_auroc": 1.0,
118
+ "test_auroc": 0.7339,
119
+ "test_accuracy": 0.6481,
120
+ "test_gmean2": 0.4198
121
+ },
122
+ "all|10": {
123
+ "val_auroc": 1.0,
124
+ "test_auroc": 0.7435,
125
+ "test_accuracy": 0.6111,
126
+ "test_gmean2": 0.3704
127
+ },
128
+ "all|25": {
129
+ "val_auroc": 1.0,
130
+ "test_auroc": 0.7092,
131
+ "test_accuracy": 0.6296,
132
+ "test_gmean2": 0.3909
133
+ },
134
+ "all|50": {
135
+ "val_auroc": 1.0,
136
+ "test_auroc": 0.7064,
137
+ "test_accuracy": 0.6481,
138
+ "test_gmean2": 0.417
139
+ },
140
+ "all|75": {
141
+ "val_auroc": 1.0,
142
+ "test_auroc": 0.7064,
143
+ "test_accuracy": 0.6111,
144
+ "test_gmean2": 0.3731
145
+ },
146
+ "all|100": {
147
+ "val_auroc": 1.0,
148
+ "test_auroc": 0.6941,
149
+ "test_accuracy": 0.5926,
150
+ "test_gmean2": 0.3457
151
+ }
152
+ }
results/mlp_probe_results.json ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|10": {
3
+ "val_auroc": 0.5251,
4
+ "test_auroc": 0.6886,
5
+ "test_accuracy": 0.6296,
6
+ "test_gmean2": 0.3964
7
+ },
8
+ "10|25": {
9
+ "val_auroc": 0.568,
10
+ "test_auroc": 0.6502,
11
+ "test_accuracy": 0.5926,
12
+ "test_gmean2": 0.3388
13
+ },
14
+ "10|50": {
15
+ "val_auroc": 0.605,
16
+ "test_auroc": 0.6571,
17
+ "test_accuracy": 0.5185,
18
+ "test_gmean2": 0.0713
19
+ },
20
+ "10|75": {
21
+ "val_auroc": 0.6154,
22
+ "test_auroc": 0.6708,
23
+ "test_accuracy": 0.5926,
24
+ "test_gmean2": 0.3292
25
+ },
26
+ "10|100": {
27
+ "val_auroc": 0.6331,
28
+ "test_auroc": 0.6612,
29
+ "test_accuracy": 0.6296,
30
+ "test_gmean2": 0.3841
31
+ },
32
+ "20|10": {
33
+ "val_auroc": 0.5518,
34
+ "test_auroc": 0.6461,
35
+ "test_accuracy": 0.6296,
36
+ "test_gmean2": 0.3909
37
+ },
38
+ "20|25": {
39
+ "val_auroc": 0.6405,
40
+ "test_auroc": 0.6571,
41
+ "test_accuracy": 0.5926,
42
+ "test_gmean2": 0.3018
43
+ },
44
+ "20|50": {
45
+ "val_auroc": 0.6228,
46
+ "test_auroc": 0.6557,
47
+ "test_accuracy": 0.5185,
48
+ "test_gmean2": 0.2195
49
+ },
50
+ "20|75": {
51
+ "val_auroc": 0.6228,
52
+ "test_auroc": 0.631,
53
+ "test_accuracy": 0.5926,
54
+ "test_gmean2": 0.3512
55
+ },
56
+ "20|100": {
57
+ "val_auroc": 0.6479,
58
+ "test_auroc": 0.5981,
59
+ "test_accuracy": 0.5185,
60
+ "test_gmean2": 0.2565
61
+ },
62
+ "30|10": {
63
+ "val_auroc": 0.5562,
64
+ "test_auroc": 0.6996,
65
+ "test_accuracy": 0.6296,
66
+ "test_gmean2": 0.3745
67
+ },
68
+ "30|25": {
69
+ "val_auroc": 0.5725,
70
+ "test_auroc": 0.5583,
71
+ "test_accuracy": 0.6111,
72
+ "test_gmean2": 0.2497
73
+ },
74
+ "30|50": {
75
+ "val_auroc": 0.6257,
76
+ "test_auroc": 0.6447,
77
+ "test_accuracy": 0.5741,
78
+ "test_gmean2": 0.3265
79
+ },
80
+ "30|75": {
81
+ "val_auroc": 0.6538,
82
+ "test_auroc": 0.6598,
83
+ "test_accuracy": 0.5741,
84
+ "test_gmean2": 0.2881
85
+ },
86
+ "30|100": {
87
+ "val_auroc": 0.6612,
88
+ "test_auroc": 0.62,
89
+ "test_accuracy": 0.537,
90
+ "test_gmean2": 0.2716
91
+ },
92
+ "35|10": {
93
+ "val_auroc": 0.5932,
94
+ "test_auroc": 0.6749,
95
+ "test_accuracy": 0.5741,
96
+ "test_gmean2": 0.3265
97
+ },
98
+ "35|25": {
99
+ "val_auroc": 0.6331,
100
+ "test_auroc": 0.6968,
101
+ "test_accuracy": 0.5,
102
+ "test_gmean2": 0.0
103
+ },
104
+ "35|50": {
105
+ "val_auroc": 0.6538,
106
+ "test_auroc": 0.6886,
107
+ "test_accuracy": 0.5926,
108
+ "test_gmean2": 0.3292
109
+ },
110
+ "35|75": {
111
+ "val_auroc": 0.6228,
112
+ "test_auroc": 0.631,
113
+ "test_accuracy": 0.5741,
114
+ "test_gmean2": 0.2058
115
+ },
116
+ "35|100": {
117
+ "val_auroc": 0.7086,
118
+ "test_auroc": 0.7106,
119
+ "test_accuracy": 0.6111,
120
+ "test_gmean2": 0.332
121
+ }
122
+ }
results/multimax_probe_results_10.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|10.0": {
3
+ "val_auroc": 0.6627,
4
+ "test_auroc": 0.5501,
5
+ "val_std": 0.0,
6
+ "test_std": 0.0,
7
+ "n_seeds": 1
8
+ },
9
+ "20|10.0": {
10
+ "val_auroc": 0.6287,
11
+ "test_auroc": 0.5501,
12
+ "val_std": 0.0,
13
+ "test_std": 0.0,
14
+ "n_seeds": 1
15
+ },
16
+ "30|10.0": {
17
+ "val_auroc": 0.6257,
18
+ "test_auroc": 0.583,
19
+ "val_std": 0.0,
20
+ "test_std": 0.0,
21
+ "n_seeds": 1
22
+ },
23
+ "35|10.0": {
24
+ "val_auroc": 0.5414,
25
+ "test_auroc": 0.6776,
26
+ "val_std": 0.0,
27
+ "test_std": 0.0,
28
+ "n_seeds": 1
29
+ }
30
+ }
results/multimax_probe_results_100.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|100.0": {
3
+ "val_auroc": 0.6021,
4
+ "test_auroc": 0.642,
5
+ "val_std": 0.0,
6
+ "test_std": 0.0,
7
+ "n_seeds": 1
8
+ },
9
+ "20|100.0": {
10
+ "val_auroc": 0.6568,
11
+ "test_auroc": 0.6872,
12
+ "val_std": 0.0,
13
+ "test_std": 0.0,
14
+ "n_seeds": 1
15
+ },
16
+ "30|100.0": {
17
+ "val_auroc": 0.6095,
18
+ "test_auroc": 0.62,
19
+ "val_std": 0.0,
20
+ "test_std": 0.0,
21
+ "n_seeds": 1
22
+ },
23
+ "35|100.0": {
24
+ "val_auroc": 0.5932,
25
+ "test_auroc": 0.5761,
26
+ "val_std": 0.0,
27
+ "test_std": 0.0,
28
+ "n_seeds": 1
29
+ }
30
+ }
results/multimax_probe_results_10_25.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|10.0": {
3
+ "val_auroc": 0.6228,
4
+ "test_auroc": 0.5944,
5
+ "val_std": 0.0611,
6
+ "test_std": 0.0068,
7
+ "n_seeds": 3
8
+ },
9
+ "10|25.0": {
10
+ "val_auroc": 0.5774,
11
+ "test_auroc": 0.6081,
12
+ "val_std": 0.0129,
13
+ "test_std": 0.0157,
14
+ "n_seeds": 3
15
+ },
16
+ "20|10.0": {
17
+ "val_auroc": 0.6144,
18
+ "test_auroc": 0.5574,
19
+ "val_std": 0.0482,
20
+ "test_std": 0.0307,
21
+ "n_seeds": 3
22
+ },
23
+ "20|25.0": {
24
+ "val_auroc": 0.5439,
25
+ "test_auroc": 0.5693,
26
+ "val_std": 0.0505,
27
+ "test_std": 0.0132,
28
+ "n_seeds": 3
29
+ },
30
+ "30|10.0": {
31
+ "val_auroc": 0.5577,
32
+ "test_auroc": 0.6132,
33
+ "val_std": 0.0349,
34
+ "test_std": 0.0741,
35
+ "n_seeds": 3
36
+ },
37
+ "30|25.0": {
38
+ "val_auroc": 0.5143,
39
+ "test_auroc": 0.6013,
40
+ "val_std": 0.0643,
41
+ "test_std": 0.0839,
42
+ "n_seeds": 3
43
+ },
44
+ "35|10.0": {
45
+ "val_auroc": 0.5397,
46
+ "test_auroc": 0.6283,
47
+ "val_std": 0.0082,
48
+ "test_std": 0.0159,
49
+ "n_seeds": 3
50
+ },
51
+ "35|25.0": {
52
+ "val_auroc": 0.4983,
53
+ "test_auroc": 0.5709,
54
+ "val_std": 0.0657,
55
+ "test_std": 0.0442,
56
+ "n_seeds": 3
57
+ }
58
+ }
results/multimax_probe_results_25.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|25.0": {
3
+ "val_auroc": 0.5651,
4
+ "test_auroc": 0.5446,
5
+ "val_std": 0.0,
6
+ "test_std": 0.0,
7
+ "n_seeds": 1
8
+ },
9
+ "20|25.0": {
10
+ "val_auroc": 0.5385,
11
+ "test_auroc": 0.5226,
12
+ "val_std": 0.0,
13
+ "test_std": 0.0,
14
+ "n_seeds": 1
15
+ },
16
+ "30|25.0": {
17
+ "val_auroc": 0.5459,
18
+ "test_auroc": 0.5734,
19
+ "val_std": 0.0,
20
+ "test_std": 0.0,
21
+ "n_seeds": 1
22
+ },
23
+ "35|25.0": {
24
+ "val_auroc": 0.551,
25
+ "test_auroc": 0.6536,
26
+ "val_std": 0.0,
27
+ "test_std": 0.0,
28
+ "n_seeds": 1
29
+ }
30
+ }
results/multimax_probe_results_50.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|50.0": {
3
+ "val_auroc": 0.6524,
4
+ "test_auroc": 0.7106,
5
+ "val_std": 0.0,
6
+ "test_std": 0.0,
7
+ "n_seeds": 1
8
+ },
9
+ "20|50.0": {
10
+ "val_auroc": 0.5207,
11
+ "test_auroc": 0.5748,
12
+ "val_std": 0.0,
13
+ "test_std": 0.0,
14
+ "n_seeds": 1
15
+ },
16
+ "30|50.0": {
17
+ "val_auroc": 0.5607,
18
+ "test_auroc": 0.4938,
19
+ "val_std": 0.0,
20
+ "test_std": 0.0,
21
+ "n_seeds": 1
22
+ },
23
+ "35|50.0": {
24
+ "val_auroc": 0.6124,
25
+ "test_auroc": 0.6077,
26
+ "val_std": 0.0,
27
+ "test_std": 0.0,
28
+ "n_seeds": 1
29
+ }
30
+ }
results/multimax_probe_results_50_75_100.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|50.0": {
3
+ "val_auroc": 0.6243,
4
+ "test_auroc": 0.679,
5
+ "val_std": 0.0582,
6
+ "test_std": 0.0221,
7
+ "n_seeds": 3
8
+ },
9
+ "10|75.0": {
10
+ "val_auroc": 0.6464,
11
+ "test_auroc": 0.6392,
12
+ "val_std": 0.0367,
13
+ "test_std": 0.0482,
14
+ "n_seeds": 3
15
+ },
16
+ "10|100.0": {
17
+ "val_auroc": 0.6686,
18
+ "test_auroc": 0.6754,
19
+ "val_std": 0.0285,
20
+ "test_std": 0.0416,
21
+ "n_seeds": 3
22
+ },
23
+ "20|50.0": {
24
+ "val_auroc": 0.5947,
25
+ "test_auroc": 0.6324,
26
+ "val_std": 0.0238,
27
+ "test_std": 0.0372,
28
+ "n_seeds": 3
29
+ },
30
+ "20|75.0": {
31
+ "val_auroc": 0.5947,
32
+ "test_auroc": 0.6374,
33
+ "val_std": 0.032,
34
+ "test_std": 0.0371,
35
+ "n_seeds": 3
36
+ },
37
+ "20|100.0": {
38
+ "val_auroc": 0.5878,
39
+ "test_auroc": 0.6415,
40
+ "val_std": 0.0351,
41
+ "test_std": 0.0593,
42
+ "n_seeds": 3
43
+ },
44
+ "30|50.0": {
45
+ "val_auroc": 0.5789,
46
+ "test_auroc": 0.6196,
47
+ "val_std": 0.0283,
48
+ "test_std": 0.0763,
49
+ "n_seeds": 3
50
+ },
51
+ "30|75.0": {
52
+ "val_auroc": 0.5794,
53
+ "test_auroc": 0.588,
54
+ "val_std": 0.0347,
55
+ "test_std": 0.0944,
56
+ "n_seeds": 3
57
+ },
58
+ "30|100.0": {
59
+ "val_auroc": 0.6119,
60
+ "test_auroc": 0.6283,
61
+ "val_std": 0.0793,
62
+ "test_std": 0.0688,
63
+ "n_seeds": 3
64
+ },
65
+ "35|50.0": {
66
+ "val_auroc": 0.6134,
67
+ "test_auroc": 0.5722,
68
+ "val_std": 0.0525,
69
+ "test_std": 0.0289,
70
+ "n_seeds": 3
71
+ },
72
+ "35|75.0": {
73
+ "val_auroc": 0.5897,
74
+ "test_auroc": 0.5949,
75
+ "val_std": 0.0833,
76
+ "test_std": 0.0553,
77
+ "n_seeds": 3
78
+ },
79
+ "35|100.0": {
80
+ "val_auroc": 0.5819,
81
+ "test_auroc": 0.6671,
82
+ "val_std": 0.0329,
83
+ "test_std": 0.0479,
84
+ "n_seeds": 3
85
+ }
86
+ }
results/multimax_probe_results_75.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|75.0": {
3
+ "val_auroc": 0.6479,
4
+ "test_auroc": 0.6735,
5
+ "val_std": 0.0,
6
+ "test_std": 0.0,
7
+ "n_seeds": 1
8
+ },
9
+ "20|75.0": {
10
+ "val_auroc": 0.5769,
11
+ "test_auroc": 0.6255,
12
+ "val_std": 0.0,
13
+ "test_std": 0.0,
14
+ "n_seeds": 1
15
+ },
16
+ "30|75.0": {
17
+ "val_auroc": 0.5547,
18
+ "test_auroc": 0.5953,
19
+ "val_std": 0.0,
20
+ "test_std": 0.0,
21
+ "n_seeds": 1
22
+ },
23
+ "35|75.0": {
24
+ "val_auroc": 0.6243,
25
+ "test_auroc": 0.5569,
26
+ "val_std": 0.0,
27
+ "test_std": 0.0,
28
+ "n_seeds": 1
29
+ }
30
+ }
results/openai--gpt-5.1_baseline.json ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10": {
3
+ "all": {
4
+ "accuracy": 0.537,
5
+ "auroc": 0.537,
6
+ "gmean2": 0.107,
7
+ "tpr": 0.1111,
8
+ "tnr": 0.963,
9
+ "n": 54
10
+ },
11
+ "train": {
12
+ "accuracy": 0.0,
13
+ "auroc": 0.5,
14
+ "gmean2": 0.0,
15
+ "tpr": 0.0,
16
+ "tnr": 0.0,
17
+ "n": 0
18
+ },
19
+ "val": {
20
+ "accuracy": 0.0,
21
+ "auroc": 0.5,
22
+ "gmean2": 0.0,
23
+ "tpr": 0.0,
24
+ "tnr": 0.0,
25
+ "n": 0
26
+ },
27
+ "test": {
28
+ "accuracy": 0.537,
29
+ "auroc": 0.537,
30
+ "gmean2": 0.107,
31
+ "tpr": 0.1111,
32
+ "tnr": 0.963,
33
+ "n": 54
34
+ }
35
+ },
36
+ "25": {
37
+ "all": {
38
+ "accuracy": 0.5185,
39
+ "auroc": 0.5185,
40
+ "gmean2": 0.0713,
41
+ "tpr": 0.0741,
42
+ "tnr": 0.963,
43
+ "n": 54
44
+ },
45
+ "train": {
46
+ "accuracy": 0.0,
47
+ "auroc": 0.5,
48
+ "gmean2": 0.0,
49
+ "tpr": 0.0,
50
+ "tnr": 0.0,
51
+ "n": 0
52
+ },
53
+ "val": {
54
+ "accuracy": 0.0,
55
+ "auroc": 0.5,
56
+ "gmean2": 0.0,
57
+ "tpr": 0.0,
58
+ "tnr": 0.0,
59
+ "n": 0
60
+ },
61
+ "test": {
62
+ "accuracy": 0.5185,
63
+ "auroc": 0.5185,
64
+ "gmean2": 0.0713,
65
+ "tpr": 0.0741,
66
+ "tnr": 0.963,
67
+ "n": 54
68
+ }
69
+ },
70
+ "50": {
71
+ "all": {
72
+ "accuracy": 0.5741,
73
+ "auroc": 0.5741,
74
+ "gmean2": 0.2058,
75
+ "tpr": 0.2222,
76
+ "tnr": 0.9259,
77
+ "n": 54
78
+ },
79
+ "train": {
80
+ "accuracy": 0.0,
81
+ "auroc": 0.5,
82
+ "gmean2": 0.0,
83
+ "tpr": 0.0,
84
+ "tnr": 0.0,
85
+ "n": 0
86
+ },
87
+ "val": {
88
+ "accuracy": 0.0,
89
+ "auroc": 0.5,
90
+ "gmean2": 0.0,
91
+ "tpr": 0.0,
92
+ "tnr": 0.0,
93
+ "n": 0
94
+ },
95
+ "test": {
96
+ "accuracy": 0.5741,
97
+ "auroc": 0.5741,
98
+ "gmean2": 0.2058,
99
+ "tpr": 0.2222,
100
+ "tnr": 0.9259,
101
+ "n": 54
102
+ }
103
+ },
104
+ "75": {
105
+ "all": {
106
+ "accuracy": 0.6111,
107
+ "auroc": 0.6111,
108
+ "gmean2": 0.2743,
109
+ "tpr": 0.2963,
110
+ "tnr": 0.9259,
111
+ "n": 54
112
+ },
113
+ "train": {
114
+ "accuracy": 0.0,
115
+ "auroc": 0.5,
116
+ "gmean2": 0.0,
117
+ "tpr": 0.0,
118
+ "tnr": 0.0,
119
+ "n": 0
120
+ },
121
+ "val": {
122
+ "accuracy": 0.0,
123
+ "auroc": 0.5,
124
+ "gmean2": 0.0,
125
+ "tpr": 0.0,
126
+ "tnr": 0.0,
127
+ "n": 0
128
+ },
129
+ "test": {
130
+ "accuracy": 0.6111,
131
+ "auroc": 0.6111,
132
+ "gmean2": 0.2743,
133
+ "tpr": 0.2963,
134
+ "tnr": 0.9259,
135
+ "n": 54
136
+ }
137
+ },
138
+ "100": {
139
+ "all": {
140
+ "accuracy": 0.6111,
141
+ "auroc": 0.6111,
142
+ "gmean2": 0.3155,
143
+ "tpr": 0.3704,
144
+ "tnr": 0.8519,
145
+ "n": 54
146
+ },
147
+ "train": {
148
+ "accuracy": 0.0,
149
+ "auroc": 0.5,
150
+ "gmean2": 0.0,
151
+ "tpr": 0.0,
152
+ "tnr": 0.0,
153
+ "n": 0
154
+ },
155
+ "val": {
156
+ "accuracy": 0.0,
157
+ "auroc": 0.5,
158
+ "gmean2": 0.0,
159
+ "tpr": 0.0,
160
+ "tnr": 0.0,
161
+ "n": 0
162
+ },
163
+ "test": {
164
+ "accuracy": 0.6111,
165
+ "auroc": 0.6111,
166
+ "gmean2": 0.3155,
167
+ "tpr": 0.3704,
168
+ "tnr": 0.8519,
169
+ "n": 54
170
+ }
171
+ }
172
+ }
results/rolling_means_probe_results.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|100.0": {
3
+ "val_auroc": 0.6864,
4
+ "test_auroc": 0.6022,
5
+ "val_std": 0.0,
6
+ "test_std": 0.0,
7
+ "test_accuracy": 0.5556,
8
+ "test_accuracy_std": 0.0,
9
+ "test_gmean2": 0.3073,
10
+ "test_gmean2_std": 0.0,
11
+ "n_seeds": 1
12
+ },
13
+ "20|100.0": {
14
+ "val_auroc": 0.7278,
15
+ "test_auroc": 0.6214,
16
+ "val_std": 0.0,
17
+ "test_std": 0.0,
18
+ "test_accuracy": 0.5556,
19
+ "test_accuracy_std": 0.0,
20
+ "test_gmean2": 0.3073,
21
+ "test_gmean2_std": 0.0,
22
+ "n_seeds": 1
23
+ },
24
+ "30|100.0": {
25
+ "val_auroc": 0.6583,
26
+ "test_auroc": 0.62,
27
+ "val_std": 0.0,
28
+ "test_std": 0.0,
29
+ "test_accuracy": 0.537,
30
+ "test_accuracy_std": 0.0,
31
+ "test_gmean2": 0.2853,
32
+ "test_gmean2_std": 0.0,
33
+ "n_seeds": 1
34
+ },
35
+ "35|100.0": {
36
+ "val_auroc": 0.6568,
37
+ "test_auroc": 0.5274,
38
+ "val_std": 0.0,
39
+ "test_std": 0.0,
40
+ "test_accuracy": 0.537,
41
+ "test_accuracy_std": 0.0,
42
+ "test_gmean2": 0.107,
43
+ "test_gmean2_std": 0.0,
44
+ "n_seeds": 1
45
+ }
46
+ }
results/rolling_means_probe_results_10.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|10.0": {
3
+ "val_auroc": 0.571,
4
+ "test_auroc": 0.7531,
5
+ "val_std": 0.0,
6
+ "test_std": 0.0,
7
+ "test_accuracy": 0.6852,
8
+ "test_accuracy_std": 0.0,
9
+ "test_gmean2": 0.4691,
10
+ "test_gmean2_std": 0.0,
11
+ "n_seeds": 1
12
+ },
13
+ "20|10.0": {
14
+ "val_auroc": 0.5858,
15
+ "test_auroc": 0.6214,
16
+ "val_std": 0.0,
17
+ "test_std": 0.0,
18
+ "test_accuracy": 0.537,
19
+ "test_accuracy_std": 0.0,
20
+ "test_gmean2": 0.2853,
21
+ "test_gmean2_std": 0.0,
22
+ "n_seeds": 1
23
+ },
24
+ "30|10.0": {
25
+ "val_auroc": 0.6302,
26
+ "test_auroc": 0.6859,
27
+ "val_std": 0.0,
28
+ "test_std": 0.0,
29
+ "test_accuracy": 0.6852,
30
+ "test_accuracy_std": 0.0,
31
+ "test_gmean2": 0.4609,
32
+ "test_gmean2_std": 0.0,
33
+ "n_seeds": 1
34
+ },
35
+ "35|10.0": {
36
+ "val_auroc": 0.5828,
37
+ "test_auroc": 0.5473,
38
+ "val_std": 0.0,
39
+ "test_std": 0.0,
40
+ "test_accuracy": 0.5185,
41
+ "test_accuracy_std": 0.0,
42
+ "test_gmean2": 0.2675,
43
+ "test_gmean2_std": 0.0,
44
+ "n_seeds": 1
45
+ }
46
+ }
results/rolling_means_probe_results_25.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|25.0": {
3
+ "val_auroc": 0.6464,
4
+ "test_auroc": 0.5556,
5
+ "val_std": 0.0,
6
+ "test_std": 0.0,
7
+ "test_accuracy": 0.6111,
8
+ "test_accuracy_std": 0.0,
9
+ "test_gmean2": 0.3704,
10
+ "test_gmean2_std": 0.0,
11
+ "n_seeds": 1
12
+ },
13
+ "20|25.0": {
14
+ "val_auroc": 0.6553,
15
+ "test_auroc": 0.5953,
16
+ "val_std": 0.0,
17
+ "test_std": 0.0,
18
+ "test_accuracy": 0.5,
19
+ "test_accuracy_std": 0.0,
20
+ "test_gmean2": 0.2332,
21
+ "test_gmean2_std": 0.0,
22
+ "n_seeds": 1
23
+ },
24
+ "30|25.0": {
25
+ "val_auroc": 0.5902,
26
+ "test_auroc": 0.6125,
27
+ "val_std": 0.0,
28
+ "test_std": 0.0,
29
+ "test_accuracy": 0.5185,
30
+ "test_accuracy_std": 0.0,
31
+ "test_gmean2": 0.2675,
32
+ "test_gmean2_std": 0.0,
33
+ "n_seeds": 1
34
+ },
35
+ "35|25.0": {
36
+ "val_auroc": 0.6198,
37
+ "test_auroc": 0.7263,
38
+ "val_std": 0.0,
39
+ "test_std": 0.0,
40
+ "test_accuracy": 0.6296,
41
+ "test_accuracy_std": 0.0,
42
+ "test_gmean2": 0.3292,
43
+ "test_gmean2_std": 0.0,
44
+ "n_seeds": 1
45
+ }
46
+ }
results/rolling_means_probe_results_50.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|50.0": {
3
+ "val_auroc": 0.6997,
4
+ "test_auroc": 0.6735,
5
+ "val_std": 0.0,
6
+ "test_std": 0.0,
7
+ "test_accuracy": 0.5556,
8
+ "test_accuracy_std": 0.0,
9
+ "test_gmean2": 0.1427,
10
+ "test_gmean2_std": 0.0,
11
+ "n_seeds": 1
12
+ },
13
+ "20|50.0": {
14
+ "val_auroc": 0.6842,
15
+ "test_auroc": 0.6145,
16
+ "val_std": 0.0,
17
+ "test_std": 0.0,
18
+ "test_accuracy": 0.5741,
19
+ "test_accuracy_std": 0.0,
20
+ "test_gmean2": 0.3265,
21
+ "test_gmean2_std": 0.0,
22
+ "n_seeds": 1
23
+ },
24
+ "30|50.0": {
25
+ "val_auroc": 0.6524,
26
+ "test_auroc": 0.6207,
27
+ "val_std": 0.0,
28
+ "test_std": 0.0,
29
+ "test_accuracy": 0.6111,
30
+ "test_accuracy_std": 0.0,
31
+ "test_gmean2": 0.3731,
32
+ "test_gmean2_std": 0.0,
33
+ "n_seeds": 1
34
+ },
35
+ "35|50.0": {
36
+ "val_auroc": 0.6257,
37
+ "test_auroc": 0.6687,
38
+ "val_std": 0.0,
39
+ "test_std": 0.0,
40
+ "test_accuracy": 0.5741,
41
+ "test_accuracy_std": 0.0,
42
+ "test_gmean2": 0.2305,
43
+ "test_gmean2_std": 0.0,
44
+ "n_seeds": 1
45
+ }
46
+ }
results/rolling_means_probe_results_75.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10|75.0": {
3
+ "val_auroc": 0.6612,
4
+ "test_auroc": 0.6214,
5
+ "val_std": 0.0,
6
+ "test_std": 0.0,
7
+ "test_accuracy": 0.5741,
8
+ "test_accuracy_std": 0.0,
9
+ "test_gmean2": 0.2881,
10
+ "test_gmean2_std": 0.0,
11
+ "n_seeds": 1
12
+ },
13
+ "20|75.0": {
14
+ "val_auroc": 0.6583,
15
+ "test_auroc": 0.6269,
16
+ "val_std": 0.0,
17
+ "test_std": 0.0,
18
+ "test_accuracy": 0.5741,
19
+ "test_accuracy_std": 0.0,
20
+ "test_gmean2": 0.321,
21
+ "test_gmean2_std": 0.0,
22
+ "n_seeds": 1
23
+ },
24
+ "30|75.0": {
25
+ "val_auroc": 0.6464,
26
+ "test_auroc": 0.5967,
27
+ "val_std": 0.0,
28
+ "test_std": 0.0,
29
+ "test_accuracy": 0.537,
30
+ "test_accuracy_std": 0.0,
31
+ "test_gmean2": 0.2606,
32
+ "test_gmean2_std": 0.0,
33
+ "n_seeds": 1
34
+ },
35
+ "35|75.0": {
36
+ "val_auroc": 0.6642,
37
+ "test_auroc": 0.6159,
38
+ "val_std": 0.0,
39
+ "test_std": 0.0,
40
+ "test_accuracy": 0.5741,
41
+ "test_accuracy_std": 0.0,
42
+ "test_gmean2": 0.2881,
43
+ "test_gmean2_std": 0.0,
44
+ "n_seeds": 1
45
+ }
46
+ }
results/tfidf_baseline.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "10": {
3
+ "train": {
4
+ "accuracy": 0.9756,
5
+ "auroc": 0.996,
6
+ "gmean2": 0.9518,
7
+ "tpr": 0.9756,
8
+ "tnr": 0.9756
9
+ },
10
+ "val": {
11
+ "accuracy": 0.9808,
12
+ "auroc": 0.997,
13
+ "gmean2": 0.9615,
14
+ "tpr": 1.0,
15
+ "tnr": 0.9615
16
+ },
17
+ "test": {
18
+ "accuracy": 0.6667,
19
+ "auroc": 0.701,
20
+ "gmean2": 0.4444,
21
+ "tpr": 0.6667,
22
+ "tnr": 0.6667
23
+ },
24
+ "all": {
25
+ "accuracy": 0.929,
26
+ "auroc": 0.9726,
27
+ "gmean2": 0.863,
28
+ "tpr": 0.9318,
29
+ "tnr": 0.9261
30
+ }
31
+ },
32
+ "25": {
33
+ "train": {
34
+ "accuracy": 0.9756,
35
+ "auroc": 0.9976,
36
+ "gmean2": 0.9516,
37
+ "tpr": 0.9593,
38
+ "tnr": 0.9919
39
+ },
40
+ "val": {
41
+ "accuracy": 0.9615,
42
+ "auroc": 0.9896,
43
+ "gmean2": 0.9246,
44
+ "tpr": 0.9615,
45
+ "tnr": 0.9615
46
+ },
47
+ "test": {
48
+ "accuracy": 0.5926,
49
+ "auroc": 0.6543,
50
+ "gmean2": 0.3512,
51
+ "tpr": 0.5926,
52
+ "tnr": 0.5926
53
+ },
54
+ "all": {
55
+ "accuracy": 0.9148,
56
+ "auroc": 0.964,
57
+ "gmean2": 0.8367,
58
+ "tpr": 0.9034,
59
+ "tnr": 0.9261
60
+ }
61
+ },
62
+ "50": {
63
+ "train": {
64
+ "accuracy": 0.9756,
65
+ "auroc": 0.9955,
66
+ "gmean2": 0.9518,
67
+ "tpr": 0.9756,
68
+ "tnr": 0.9756
69
+ },
70
+ "val": {
71
+ "accuracy": 0.9615,
72
+ "auroc": 0.9926,
73
+ "gmean2": 0.9231,
74
+ "tpr": 1.0,
75
+ "tnr": 0.9231
76
+ },
77
+ "test": {
78
+ "accuracy": 0.6111,
79
+ "auroc": 0.6886,
80
+ "gmean2": 0.332,
81
+ "tpr": 0.8148,
82
+ "tnr": 0.4074
83
+ },
84
+ "all": {
85
+ "accuracy": 0.9176,
86
+ "auroc": 0.964,
87
+ "gmean2": 0.8407,
88
+ "tpr": 0.9545,
89
+ "tnr": 0.8807
90
+ }
91
+ },
92
+ "75": {
93
+ "train": {
94
+ "accuracy": 0.9593,
95
+ "auroc": 0.9921,
96
+ "gmean2": 0.9203,
97
+ "tpr": 0.9512,
98
+ "tnr": 0.9675
99
+ },
100
+ "val": {
101
+ "accuracy": 0.9038,
102
+ "auroc": 0.9882,
103
+ "gmean2": 0.8136,
104
+ "tpr": 0.9615,
105
+ "tnr": 0.8462
106
+ },
107
+ "test": {
108
+ "accuracy": 0.6111,
109
+ "auroc": 0.7119,
110
+ "gmean2": 0.3457,
111
+ "tpr": 0.7778,
112
+ "tnr": 0.4444
113
+ },
114
+ "all": {
115
+ "accuracy": 0.8977,
116
+ "auroc": 0.9587,
117
+ "gmean2": 0.8051,
118
+ "tpr": 0.9261,
119
+ "tnr": 0.8693
120
+ }
121
+ },
122
+ "100": {
123
+ "train": {
124
+ "accuracy": 0.9634,
125
+ "auroc": 0.9875,
126
+ "gmean2": 0.9282,
127
+ "tpr": 0.9675,
128
+ "tnr": 0.9593
129
+ },
130
+ "val": {
131
+ "accuracy": 0.9231,
132
+ "auroc": 0.9926,
133
+ "gmean2": 0.8462,
134
+ "tpr": 1.0,
135
+ "tnr": 0.8462
136
+ },
137
+ "test": {
138
+ "accuracy": 0.6111,
139
+ "auroc": 0.7215,
140
+ "gmean2": 0.3457,
141
+ "tpr": 0.7778,
142
+ "tnr": 0.4444
143
+ },
144
+ "all": {
145
+ "accuracy": 0.9034,
146
+ "auroc": 0.9574,
147
+ "gmean2": 0.8146,
148
+ "tpr": 0.9432,
149
+ "tnr": 0.8636
150
+ }
151
+ }
152
+ }
saved_probes/mlp_L20_K50.0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c96096cf50a3d28fe934dd0f7dd911d692de9abdfa318400902c5a85bfea0915
3
+ size 5247769
saved_probes/mlp_L20_K75.0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b2b72789b24811b135c1c5d9549c5e95ed0d91081fdc7acab9e7d49c1db1284
3
+ size 5247769
saved_probes/mlp_L30_K10.0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f64363fafece160b62345dc36aff7677bb152e8e6d330d274efc1fdd917e42a
3
+ size 5247769
saved_probes/mlp_L30_K100.0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d5f94f5f68fd15c2bc98afc13468bfdac9c78fed21cc12c2c439d24fdee6043
3
+ size 5247779
saved_probes/mlp_L30_K25.0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dc86863846213f64fa682096e0c51834de1ba02d3f0987d899b4116e2746a42
3
+ size 5247769
saved_probes/mlp_L30_K50.0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dca59f9c28b519e776bbed147c34d3c9e6f407625adc720e88f7d6385f6a5c64
3
+ size 5247769
saved_probes/mlp_L30_K75.0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b768a21c074dda95566a1db4aa1f25e0989825799deb2e06eb0ae08bf3e7aaf7
3
+ size 5247769
saved_probes/mlp_L35_K10.0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3012114092a3a6d671568b53402745e64e865ea6bb5e999b31bff94b79849cf
3
+ size 5247769
saved_probes/mlp_L35_K100.0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29824008523b3fc22a6cfdec0683e036d077663ee2a68c9524542bf8c64f1454
3
+ size 5247779
saved_probes/mlp_L35_K25.0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91cf550b1863bee4024f4a404583738d7a8a3ab7a1b74cb3d3915663dcdc1530
3
+ size 5247769
saved_probes/mlp_L35_K50.0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3075a5c49f7604fc606038d2f313569f7c4a97fe45e8809297b099c483d2e995
3
+ size 5247769
saved_probes/mlp_L35_K75.0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ecc2ace55f6f949c56705f6574199ba59df4fddd423ae8cf1fe0cb22e91bbd8
3
+ size 5247769
saved_probes/multimax_L10_K10.0_seed0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c103ff2ef5dbc2dd93afaf4d8939b6af86bc0f782439e31c569a39f7af209a9
3
+ size 69125
saved_probes/multimax_L10_K10.0_seed1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f76f19fd41df93f213530f892ad01a2f6aa19221890649aa9a6937d97091909
3
+ size 69125
saved_probes/multimax_L10_K10.0_seed2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8a73654ac2f00951145579e33cc3f913fde58b474025f27ad36b29e3160b8e6
3
+ size 69125
saved_probes/multimax_L10_K100.0_seed0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a6adfacb850da1ca336624928b659b4cb76228deba4875b5d5f191962b65ccf
3
+ size 69213
saved_probes/multimax_L10_K100.0_seed1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51c988edbb7a1cf83116033ea0a5ec38696424c0990b7af296b28775e90c3dd3
3
+ size 69213
saved_probes/multimax_L10_K100.0_seed2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6b61d50efdcd93aa08712804debe02e145b1c4090555b318fd2a45f6da3fcb3
3
+ size 69213
saved_probes/multimax_L10_K25.0_seed0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06d8cc589d9724bf4b831cd09a91f5264eb7c0f89f9f601eecf114c9cf0cdd95
3
+ size 69125
saved_probes/multimax_L10_K25.0_seed1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:417effd1d8ddb7a6eda0190a63de6f81a6e5a8ae606120422a52768cf4548f29
3
+ size 69125
saved_probes/multimax_L10_K25.0_seed2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5adc37fa136226fbb72e016e12636f2ab2b169af7fdbf16aceff5e1d2a1651d2
3
+ size 69125
saved_probes/multimax_L10_K50.0_seed0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4292857c93082d4245987f76387000b9cee911149a80f9e19c12ab0d4fe39cdc
3
+ size 69125
saved_probes/multimax_L10_K50.0_seed1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61b278aba5af4c4f45a4d61c36a0b7ddfde497b12eecdfc0179c9b027afd6895
3
+ size 69125