Spaces:
Sleeping
Sleeping
Upload results/leaderboard.csv with huggingface_hub
Browse files- results/leaderboard.csv +195 -0
results/leaderboard.csv
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Method,Paradigm,Model,Dataset,Layer,AUROC,AUROC_CI_Low,AUROC_CI_High,F1,Precision,Recall,FPR_at_95TPR,Latency_ms
|
| 2 |
+
SAE-Classifier (logistic_regression),sae,Gemma-2-2B,advbench,12.0,1.0,0.9999,1.0,0.9971,0.9981,0.9962,0.0,0.29
|
| 3 |
+
CC-Delta (top-100),sae,Gemma-2-2B,advbench,12.0,0.9997,0.9993,0.9999,0.9913,0.9923,0.9904,0.0,3.9
|
| 4 |
+
"GSAE (k=10, α=0.1)",sae,Gemma-2-2B,advbench,12.0,0.9999,0.9998,1.0,0.9971,0.9981,0.9962,0.0,3.22
|
| 5 |
+
"Random SAE (100 features, 5 seeds)",sae,Gemma-2-2B,advbench,12.0,0.9999,0.9996,1.0,0.9971,0.9981,0.9962,0.0,0.6
|
| 6 |
+
Linear Probe,activation,Gemma-2-2B,advbench,12.0,1.0,0.9999,1.0,0.9971,0.9981,0.9962,0.0,0.24
|
| 7 |
+
"MLP Probe ([256, 128])",activation,Gemma-2-2B,advbench,12.0,1.0,0.9999,1.0,0.9971,0.9981,0.9962,0.0,0.92
|
| 8 |
+
FJD (entropy),logit,Gemma-2-2B,advbench,12.0,0.5,0.5,0.5,0.6667,0.5,1.0,1.0,0.59
|
| 9 |
+
SAE-Classifier (logistic_regression),sae,Gemma-2-2B,harmbench,12.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.32
|
| 10 |
+
CC-Delta (top-100),sae,Gemma-2-2B,harmbench,12.0,0.9996,0.9989,1.0,0.9876,0.9785,0.9969,0.0,6.53
|
| 11 |
+
"GSAE (k=10, α=0.1)",sae,Gemma-2-2B,harmbench,12.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,5.56
|
| 12 |
+
"Random SAE (100 features, 5 seeds)",sae,Gemma-2-2B,harmbench,12.0,0.9997,0.9993,1.0,0.9891,0.9845,0.9938,0.0,0.91
|
| 13 |
+
Linear Probe,activation,Gemma-2-2B,harmbench,12.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.34
|
| 14 |
+
"MLP Probe ([256, 128])",activation,Gemma-2-2B,harmbench,12.0,1.0,1.0,1.0,0.9984,0.9969,1.0,0.0,1.03
|
| 15 |
+
FJD (entropy),logit,Gemma-2-2B,harmbench,12.0,0.5,0.5,0.5,0.6667,0.5,1.0,1.0,0.72
|
| 16 |
+
SAE-Classifier (logistic_regression),sae,Gemma-2-2B,jailbreakbench,12.0,0.9486,0.9175,0.9739,0.8832,0.8969,0.87,0.22,0.27
|
| 17 |
+
CC-Delta (top-100),sae,Gemma-2-2B,jailbreakbench,12.0,0.9158,0.8737,0.9534,0.8485,0.8571,0.84,0.35,18.96
|
| 18 |
+
"GSAE (k=10, α=0.1)",sae,Gemma-2-2B,jailbreakbench,12.0,0.9475,0.9167,0.973,0.8731,0.8866,0.86,0.24,9.22
|
| 19 |
+
"Random SAE (100 features, 5 seeds)",sae,Gemma-2-2B,jailbreakbench,12.0,0.9292,0.8937,0.9601,0.8571,0.8447,0.87,0.35,0.82
|
| 20 |
+
Linear Probe,activation,Gemma-2-2B,jailbreakbench,12.0,0.9486,0.9175,0.9739,0.8832,0.8969,0.87,0.22,0.48
|
| 21 |
+
"MLP Probe ([256, 128])",activation,Gemma-2-2B,jailbreakbench,12.0,0.9421,0.9091,0.9697,0.87,0.87,0.87,0.29,5.05
|
| 22 |
+
FJD (entropy),logit,Gemma-2-2B,jailbreakbench,12.0,0.4715,0.3924,0.5497,0.0,0.0,0.0,1.0,6.93
|
| 23 |
+
SAE-Classifier (logistic_regression),sae,Gemma-2-2B,sorry-bench,12.0,0.9997,0.9994,1.0,0.9889,0.9867,0.9911,0.0,4.67
|
| 24 |
+
CC-Delta (top-100),sae,Gemma-2-2B,sorry-bench,12.0,0.9969,0.9949,0.9984,0.971,0.9732,0.9689,0.0156,9.37
|
| 25 |
+
"GSAE (k=10, α=0.1)",sae,Gemma-2-2B,sorry-bench,12.0,0.9995,0.999,0.9999,0.9845,0.9781,0.9911,0.0,33.59
|
| 26 |
+
"Random SAE (100 features, 5 seeds)",sae,Gemma-2-2B,sorry-bench,12.0,0.9961,0.9931,0.9984,0.9735,0.9671,0.98,0.0178,2.23
|
| 27 |
+
Linear Probe,activation,Gemma-2-2B,sorry-bench,12.0,0.9997,0.9994,1.0,0.9889,0.9867,0.9911,0.0,0.49
|
| 28 |
+
"MLP Probe ([256, 128])",activation,Gemma-2-2B,sorry-bench,12.0,0.9995,0.999,0.9999,0.9878,0.9824,0.9933,0.0022,2.41
|
| 29 |
+
FJD (entropy),logit,Gemma-2-2B,sorry-bench,12.0,0.4903,0.4535,0.5269,0.6667,0.5,1.0,1.0,7.51
|
| 30 |
+
SAE-Classifier (logistic_regression),sae,Gemma-2-2B,wildjailbreak,12.0,0.9995,0.9983,1.0,0.997,0.998,0.996,0.0,3.82
|
| 31 |
+
CC-Delta (top-100),sae,Gemma-2-2B,wildjailbreak,12.0,0.9988,0.9961,1.0,0.992,0.996,0.988,0.0,7.58
|
| 32 |
+
"GSAE (k=10, α=0.1)",sae,Gemma-2-2B,wildjailbreak,12.0,0.9993,0.9978,1.0,0.994,0.996,0.992,0.0,34.47
|
| 33 |
+
"Random SAE (100 features, 5 seeds)",sae,Gemma-2-2B,wildjailbreak,12.0,0.9997,0.999,1.0,0.995,0.998,0.992,0.0,1.44
|
| 34 |
+
Linear Probe,activation,Gemma-2-2B,wildjailbreak,12.0,0.9995,0.9983,1.0,0.997,0.998,0.996,0.0,0.1
|
| 35 |
+
"MLP Probe ([256, 128])",activation,Gemma-2-2B,wildjailbreak,12.0,0.9991,0.9971,1.0,0.998,1.0,0.996,0.0,2.36
|
| 36 |
+
FJD (entropy),logit,Gemma-2-2B,wildjailbreak,12.0,0.6798,0.6446,0.7147,0.6667,0.5,1.0,0.482,7.22
|
| 37 |
+
SAE-Classifier (logistic_regression),sae,Gemma-2-2B,advbench,18.0,1.0,0.9999,1.0,0.9961,0.9981,0.9942,0.0,0.17
|
| 38 |
+
CC-Delta (top-100),sae,Gemma-2-2B,advbench,18.0,0.9994,0.9986,0.9999,0.9923,0.9923,0.9923,0.0019,3.96
|
| 39 |
+
"GSAE (k=10, α=0.1)",sae,Gemma-2-2B,advbench,18.0,0.9999,0.9998,1.0,0.9942,0.9981,0.9904,0.0,3.75
|
| 40 |
+
"Random SAE (100 features, 5 seeds)",sae,Gemma-2-2B,advbench,18.0,0.9997,0.9994,0.9999,0.9903,0.9942,0.9865,0.0,0.49
|
| 41 |
+
Linear Probe,activation,Gemma-2-2B,advbench,18.0,1.0,0.9999,1.0,0.9961,0.9981,0.9942,0.0,0.14
|
| 42 |
+
"MLP Probe ([256, 128])",activation,Gemma-2-2B,advbench,18.0,0.9999,0.9998,1.0,0.9952,0.9981,0.9923,0.0,1.0
|
| 43 |
+
FJD (entropy),logit,Gemma-2-2B,advbench,18.0,0.5,0.5,0.5,0.6667,0.5,1.0,1.0,0.58
|
| 44 |
+
SAE-Classifier (logistic_regression),sae,Gemma-2-2B,harmbench,18.0,1.0,0.9998,1.0,0.9953,0.9907,1.0,0.0,0.26
|
| 45 |
+
CC-Delta (top-100),sae,Gemma-2-2B,harmbench,18.0,0.9995,0.9987,0.9999,0.986,0.9844,0.9875,0.0,6.49
|
| 46 |
+
"GSAE (k=10, α=0.1)",sae,Gemma-2-2B,harmbench,18.0,0.9999,0.9997,1.0,0.9953,0.9907,1.0,0.0,5.61
|
| 47 |
+
"Random SAE (100 features, 5 seeds)",sae,Gemma-2-2B,harmbench,18.0,0.9995,0.9988,0.9999,0.9875,0.9875,0.9875,0.0,0.48
|
| 48 |
+
Linear Probe,activation,Gemma-2-2B,harmbench,18.0,1.0,0.9998,1.0,0.9953,0.9907,1.0,0.0,0.42
|
| 49 |
+
"MLP Probe ([256, 128])",activation,Gemma-2-2B,harmbench,18.0,1.0,0.9999,1.0,0.9953,0.9907,1.0,0.0,0.99
|
| 50 |
+
FJD (entropy),logit,Gemma-2-2B,harmbench,18.0,0.5,0.5,0.5,0.6667,0.5,1.0,1.0,0.7
|
| 51 |
+
SAE-Classifier (logistic_regression),sae,Gemma-2-2B,jailbreakbench,18.0,0.8762,0.8273,0.9205,0.7864,0.7642,0.81,0.42,1.85
|
| 52 |
+
CC-Delta (top-100),sae,Gemma-2-2B,jailbreakbench,18.0,0.8363,0.7792,0.8875,0.74,0.74,0.74,0.49,20.64
|
| 53 |
+
"GSAE (k=10, α=0.1)",sae,Gemma-2-2B,jailbreakbench,18.0,0.8738,0.8243,0.9194,0.7767,0.7547,0.8,0.5,14.62
|
| 54 |
+
"Random SAE (100 features, 5 seeds)",sae,Gemma-2-2B,jailbreakbench,18.0,0.8594,0.8065,0.908,0.7885,0.7593,0.82,0.49,3.94
|
| 55 |
+
Linear Probe,activation,Gemma-2-2B,jailbreakbench,18.0,0.8762,0.8273,0.9205,0.7864,0.7642,0.81,0.42,1.58
|
| 56 |
+
"MLP Probe ([256, 128])",activation,Gemma-2-2B,jailbreakbench,18.0,0.8662,0.8156,0.9123,0.7847,0.7523,0.82,0.44,2.35
|
| 57 |
+
FJD (entropy),logit,Gemma-2-2B,jailbreakbench,18.0,0.5,0.5,0.5,0.6667,0.5,1.0,1.0,1.51
|
| 58 |
+
SAE-Classifier (logistic_regression),sae,Gemma-2-2B,advbench,6.0,1.0,0.9999,1.0,0.9961,0.9981,0.9942,0.0,0.15
|
| 59 |
+
CC-Delta (top-100),sae,Gemma-2-2B,advbench,6.0,0.9995,0.9991,0.9999,0.9884,0.9942,0.9827,0.0,3.95
|
| 60 |
+
"GSAE (k=10, α=0.1)",sae,Gemma-2-2B,advbench,6.0,0.9999,0.9998,1.0,0.9942,0.9961,0.9923,0.0,3.37
|
| 61 |
+
"Random SAE (100 features, 5 seeds)",sae,Gemma-2-2B,advbench,6.0,0.9999,0.9997,1.0,0.9914,0.9904,0.9923,0.0,0.14
|
| 62 |
+
Linear Probe,activation,Gemma-2-2B,advbench,6.0,1.0,0.9999,1.0,0.9961,0.9981,0.9942,0.0,0.19
|
| 63 |
+
"MLP Probe ([256, 128])",activation,Gemma-2-2B,advbench,6.0,0.9999,0.9998,1.0,0.9961,1.0,0.9923,0.0,1.02
|
| 64 |
+
FJD (entropy),logit,Gemma-2-2B,advbench,6.0,0.649,0.6166,0.6818,0.6658,0.506,0.9731,0.9154,0.59
|
| 65 |
+
SAE-Classifier (logistic_regression),sae,Gemma-2-2B,harmbench,6.0,1.0,0.9999,1.0,0.9984,1.0,0.9969,0.0,0.39
|
| 66 |
+
CC-Delta (top-100),sae,Gemma-2-2B,harmbench,6.0,0.9985,0.9971,0.9996,0.9811,0.9873,0.975,0.0031,6.41
|
| 67 |
+
"GSAE (k=10, α=0.1)",sae,Gemma-2-2B,harmbench,6.0,1.0,0.9998,1.0,0.9953,0.9938,0.9969,0.0,5.5
|
| 68 |
+
"Random SAE (100 features, 5 seeds)",sae,Gemma-2-2B,harmbench,6.0,0.9994,0.9986,0.9999,0.9876,0.9815,0.9938,0.0031,0.47
|
| 69 |
+
Linear Probe,activation,Gemma-2-2B,harmbench,6.0,1.0,0.9999,1.0,0.9984,1.0,0.9969,0.0,0.63
|
| 70 |
+
"MLP Probe ([256, 128])",activation,Gemma-2-2B,harmbench,6.0,1.0,0.9999,1.0,0.9969,0.9938,1.0,0.0,0.97
|
| 71 |
+
FJD (entropy),logit,Gemma-2-2B,harmbench,6.0,0.7795,0.7439,0.8141,0.6897,0.5365,0.9656,0.8125,0.74
|
| 72 |
+
SAE-Classifier (logistic_regression),sae,Gemma-2-2B,jailbreakbench,6.0,0.9107,0.8706,0.9455,0.8276,0.8155,0.84,0.41,1.08
|
| 73 |
+
CC-Delta (top-100),sae,Gemma-2-2B,jailbreakbench,6.0,0.8608,0.8083,0.9069,0.7867,0.7477,0.83,0.43,20.49
|
| 74 |
+
"GSAE (k=10, α=0.1)",sae,Gemma-2-2B,jailbreakbench,6.0,0.9088,0.8679,0.9438,0.8155,0.7925,0.84,0.42,13.03
|
| 75 |
+
"Random SAE (100 features, 5 seeds)",sae,Gemma-2-2B,jailbreakbench,6.0,0.8375,0.7828,0.8887,0.77,0.77,0.77,0.61,4.65
|
| 76 |
+
Linear Probe,activation,Gemma-2-2B,jailbreakbench,6.0,0.9107,0.8706,0.9455,0.8276,0.8155,0.84,0.41,1.26
|
| 77 |
+
"MLP Probe ([256, 128])",activation,Gemma-2-2B,jailbreakbench,6.0,0.901,0.8579,0.938,0.7921,0.7843,0.8,0.38,5.0
|
| 78 |
+
FJD (entropy),logit,Gemma-2-2B,jailbreakbench,6.0,0.4779,0.3973,0.5568,0.6644,0.5026,0.98,0.93,1.47
|
| 79 |
+
CC-Delta (top-100) (W16k),sae,Gemma-2-2B,advbench,12.0,0.9987,0.9976,0.9995,0.9817,0.9827,0.9808,0.0038,3.58
|
| 80 |
+
CC-Delta (top-100) (W65k),sae,Gemma-2-2B,advbench,12.0,0.9951,0.9917,0.9977,0.9701,0.9729,0.9673,0.0192,10.2
|
| 81 |
+
CC-Delta (top-100) (W65k),sae,Gemma-2-2B,harmbench,12.0,0.9501,0.9348,0.9642,0.8732,0.8746,0.8719,0.2125,17.17
|
| 82 |
+
CC-Delta (top-100) (W65k),sae,Gemma-2-2B,jailbreakbench,12.0,0.8286,0.7704,0.88,0.7308,0.7037,0.76,0.59,42.67
|
| 83 |
+
SAE-Classifier (logistic_regression) (W16k),sae,Gemma-2-2B,sorry-bench,12.0,0.954,0.9406,0.9666,0.8909,0.8929,0.8889,0.2733,11.54
|
| 84 |
+
CC-Delta (top-100) (W16k),sae,Gemma-2-2B,sorry-bench,12.0,0.9521,0.9392,0.9644,0.8965,0.8886,0.9044,0.2244,21.01
|
| 85 |
+
"GSAE (k=10, α=0.1) (W16k)",sae,Gemma-2-2B,sorry-bench,12.0,0.9545,0.9414,0.9667,0.896,0.8921,0.9,0.2133,469.44
|
| 86 |
+
"Random SAE (100 features, 5 seeds) (W16k)",sae,Gemma-2-2B,sorry-bench,12.0,0.5704,0.5336,0.6068,0.2602,0.7955,0.1556,0.9156,26.12
|
| 87 |
+
Linear Probe (W16k),activation,Gemma-2-2B,sorry-bench,12.0,0.954,0.9406,0.9666,0.8909,0.8929,0.8889,0.2733,14.17
|
| 88 |
+
"MLP Probe ([256, 128]) (W16k)",activation,Gemma-2-2B,sorry-bench,12.0,0.956,0.9433,0.9676,0.8859,0.8919,0.88,0.22,16.82
|
| 89 |
+
FJD (entropy) (W16k),logit,Gemma-2-2B,sorry-bench,12.0,0.4903,0.4535,0.5269,0.6667,0.5,1.0,1.0,6.5
|
| 90 |
+
CC-Delta (top-100) (W65k),sae,Gemma-2-2B,sorry-bench,12.0,0.938,0.9218,0.953,0.8597,0.8616,0.8578,0.2822,12.09
|
| 91 |
+
SAE-Classifier (logistic_regression) (W16k),sae,Gemma-2-2B,wildjailbreak,12.0,0.9988,0.9971,0.9998,0.99,0.988,0.992,0.0,1.37
|
| 92 |
+
CC-Delta (top-100) (W16k),sae,Gemma-2-2B,wildjailbreak,12.0,0.9988,0.9966,0.9999,0.991,0.992,0.99,0.002,3.83
|
| 93 |
+
"GSAE (k=10, α=0.1) (W16k)",sae,Gemma-2-2B,wildjailbreak,12.0,0.9958,0.9911,0.9992,0.987,0.9899,0.984,0.002,247.78
|
| 94 |
+
"Random SAE (100 features, 5 seeds) (W16k)",sae,Gemma-2-2B,wildjailbreak,12.0,0.57,0.5358,0.6047,0.6949,0.5325,1.0,0.878,0.1
|
| 95 |
+
Linear Probe (W16k),activation,Gemma-2-2B,wildjailbreak,12.0,0.9988,0.9971,0.9998,0.99,0.988,0.992,0.0,0.86
|
| 96 |
+
"MLP Probe ([256, 128]) (W16k)",activation,Gemma-2-2B,wildjailbreak,12.0,0.9987,0.9975,0.9996,0.9861,0.9821,0.99,0.006,11.26
|
| 97 |
+
FJD (entropy) (W16k),logit,Gemma-2-2B,wildjailbreak,12.0,0.6798,0.6446,0.7147,0.6667,0.5,1.0,0.482,5.18
|
| 98 |
+
CC-Delta (top-100) (W65k),sae,Gemma-2-2B,wildjailbreak,12.0,0.9984,0.9954,1.0,0.995,0.994,0.996,0.002,7.82
|
| 99 |
+
LinearProbe (multi-seed raw),activation,Gemma-2-2B,jailbreakbench,12.0,0.9484,,,,,,,
|
| 100 |
+
LinearProbe (multi-seed sae),sae,Gemma-2-2B,jailbreakbench,12.0,0.7438,,,,,,,
|
| 101 |
+
MLP_Probe (multi-seed raw),activation,Gemma-2-2B,jailbreakbench,12.0,0.9386,,,,,,,
|
| 102 |
+
MLP_Probe (multi-seed sae),sae,Gemma-2-2B,jailbreakbench,12.0,0.7065,,,,,,,
|
| 103 |
+
CC_Delta (multi-seed raw),activation,Gemma-2-2B,jailbreakbench,12.0,0.9229,,,,,,,
|
| 104 |
+
CC_Delta (multi-seed sae),sae,Gemma-2-2B,jailbreakbench,12.0,0.7543,,,,,,,
|
| 105 |
+
SAE_Classifier (multi-seed raw),activation,Gemma-2-2B,jailbreakbench,12.0,0.9484,,,,,,,
|
| 106 |
+
SAE_Classifier (multi-seed sae),sae,Gemma-2-2B,jailbreakbench,12.0,0.7438,,,,,,,
|
| 107 |
+
Linear Probe,activation,Llama-3.1-8B,jailbreakbench,12.0,0.867,,,,,,,
|
| 108 |
+
Linear Probe (SAE),sae,Llama-3.1-8B,jailbreakbench,12.0,0.4765,,,,,,,
|
| 109 |
+
Linear Probe,activation,Gemma-3-4B,jailbreakbench,9.0,0.905,,,,,,,
|
| 110 |
+
Linear Probe (SAE),sae,Gemma-3-4B,jailbreakbench,9.0,0.6088,,,,,,,
|
| 111 |
+
Linear Probe,activation,Gemma-3-4B,jailbreakbench,17.0,0.9215,,,,,,,
|
| 112 |
+
Linear Probe (SAE),sae,Gemma-3-4B,jailbreakbench,17.0,0.7085,,,,,,,
|
| 113 |
+
Linear Probe,activation,Gemma-3-4B,jailbreakbench,29.0,0.812,,,,,,,
|
| 114 |
+
Linear Probe (SAE),sae,Gemma-3-4B,jailbreakbench,29.0,0.69,,,,,,,
|
| 115 |
+
Linear Probe,activation,Gemma-3-4B,wildguardtest,9.0,0.9699,,,,,,,
|
| 116 |
+
Linear Probe (SAE),sae,Gemma-3-4B,wildguardtest,9.0,0.8704,,,,,,,
|
| 117 |
+
Linear Probe,activation,Gemma-3-4B,wildguardtest,17.0,0.9742,,,,,,,
|
| 118 |
+
Linear Probe (SAE),sae,Gemma-3-4B,wildguardtest,17.0,0.9565,,,,,,,
|
| 119 |
+
Linear Probe,activation,Gemma-3-4B,wildguardtest,29.0,0.9628,,,,,,,
|
| 120 |
+
Linear Probe (SAE),sae,Gemma-3-4B,wildguardtest,29.0,0.9452,,,,,,,
|
| 121 |
+
Linear Probe,activation,Llama-3.3-70B,jailbreakbench,25.0,1.0,,,,,,,
|
| 122 |
+
Linear Probe,activation,Llama-3.3-70B,jailbreakbench,50.0,1.0,,,,,,,
|
| 123 |
+
Linear Probe (SAE),sae,Llama-3.3-70B,jailbreakbench,50.0,0.9492,,,,,,,
|
| 124 |
+
Linear Probe,activation,Llama-3.3-70B,jailbreakbench,65.0,1.0,,,,,,,
|
| 125 |
+
Linear Probe,activation,Llama-3.3-70B,wildguardtest,25.0,0.9165,,,,,,,
|
| 126 |
+
Linear Probe,activation,Llama-3.3-70B,wildguardtest,50.0,0.905,,,,,,,
|
| 127 |
+
Linear Probe (SAE),sae,Llama-3.3-70B,wildguardtest,50.0,0.82,,,,,,,
|
| 128 |
+
Linear Probe,activation,Llama-3.3-70B,wildguardtest,65.0,0.899,,,,,,,
|
| 129 |
+
LlamaGuard-3,external,LlamaGuard-3-8B,jailbreakbench,,0.885,0.839,0.9259,0.892,0.8407,0.95,0.18,448.64
|
| 130 |
+
WildGuard,external,WildGuard,wildguardtest,,0.9458,,,0.8814,0.9181,0.8475,0.0587,528.2144
|
| 131 |
+
ClassAware-SAE (lambda=0.0),sae,Gemma-2-2B,jailbreakbench,12.0,0.88,,,,,,,
|
| 132 |
+
ClassAware-SAE (lambda=0.01),sae,Gemma-2-2B,jailbreakbench,12.0,0.877,,,,,,,
|
| 133 |
+
ClassAware-SAE (lambda=0.1),sae,Gemma-2-2B,jailbreakbench,12.0,0.8925,,,,,,,
|
| 134 |
+
ClassAware-SAE (lambda=1.0),sae,Gemma-2-2B,jailbreakbench,12.0,0.9445,,,,,,,
|
| 135 |
+
ClassAware-SAE (lambda=10.0),sae,Gemma-2-2B,jailbreakbench,12.0,0.9535,,,,,,,
|
| 136 |
+
ClassAware-SAE (lambda=0.0),sae,Gemma-2-2B,wildguardtest,12.0,0.8684,,,,,,,
|
| 137 |
+
ClassAware-SAE (lambda=0.01),sae,Gemma-2-2B,wildguardtest,12.0,0.8706,,,,,,,
|
| 138 |
+
ClassAware-SAE (lambda=0.1),sae,Gemma-2-2B,wildguardtest,12.0,0.8739,,,,,,,
|
| 139 |
+
ClassAware-SAE (lambda=1.0),sae,Gemma-2-2B,wildguardtest,12.0,0.9002,,,,,,,
|
| 140 |
+
ClassAware-SAE (lambda=10.0),sae,Gemma-2-2B,wildguardtest,12.0,0.9204,,,,,,,
|
| 141 |
+
ClassAware-SAE (lambda=0.0),sae,Gemma-2-2B,harmbench,12.0,0.9989,,,,,,,
|
| 142 |
+
ClassAware-SAE (lambda=0.01),sae,Gemma-2-2B,harmbench,12.0,0.9987,,,,,,,
|
| 143 |
+
ClassAware-SAE (lambda=0.1),sae,Gemma-2-2B,harmbench,12.0,0.9993,,,,,,,
|
| 144 |
+
ClassAware-SAE (lambda=1.0),sae,Gemma-2-2B,harmbench,12.0,0.9999,,,,,,,
|
| 145 |
+
ClassAware-SAE (lambda=10.0),sae,Gemma-2-2B,harmbench,12.0,1.0,,,,,,,
|
| 146 |
+
ClassAware-SAE (lambda=0.0),sae,Gemma-2-2B,advbench,12.0,0.9998,,,,,,,
|
| 147 |
+
ClassAware-SAE (lambda=0.01),sae,Gemma-2-2B,advbench,12.0,0.9998,,,,,,,
|
| 148 |
+
ClassAware-SAE (lambda=0.1),sae,Gemma-2-2B,advbench,12.0,0.9998,,,,,,,
|
| 149 |
+
ClassAware-SAE (lambda=1.0),sae,Gemma-2-2B,advbench,12.0,0.9999,,,,,,,
|
| 150 |
+
ClassAware-SAE (lambda=10.0),sae,Gemma-2-2B,advbench,12.0,0.9999,,,,,,,
|
| 151 |
+
ClassAware-SAE (lambda=0.0),sae,Gemma-2-2B,sorry-bench,12.0,0.9963,,,,,,,
|
| 152 |
+
ClassAware-SAE (lambda=0.01),sae,Gemma-2-2B,sorry-bench,12.0,0.9966,,,,,,,
|
| 153 |
+
ClassAware-SAE (lambda=0.1),sae,Gemma-2-2B,sorry-bench,12.0,0.9976,,,,,,,
|
| 154 |
+
ClassAware-SAE (lambda=1.0),sae,Gemma-2-2B,sorry-bench,12.0,0.9994,,,,,,,
|
| 155 |
+
ClassAware-SAE (lambda=10.0),sae,Gemma-2-2B,sorry-bench,12.0,0.9998,,,,,,,
|
| 156 |
+
ClassAware-SAE (lambda=0.0),sae,Gemma-2-2B,wildjailbreak,12.0,0.9997,,,,,,,
|
| 157 |
+
ClassAware-SAE (lambda=0.01),sae,Gemma-2-2B,wildjailbreak,12.0,0.9997,,,,,,,
|
| 158 |
+
ClassAware-SAE (lambda=0.1),sae,Gemma-2-2B,wildjailbreak,12.0,0.9998,,,,,,,
|
| 159 |
+
ClassAware-SAE (lambda=1.0),sae,Gemma-2-2B,wildjailbreak,12.0,0.9996,,,,,,,
|
| 160 |
+
ClassAware-SAE (lambda=10.0),sae,Gemma-2-2B,wildjailbreak,12.0,0.9996,,,,,,,
|
| 161 |
+
Linear Probe (raw),activation,Gemma-2-2B,advbench,12.0,0.9972,,,,,,,
|
| 162 |
+
Linear Probe (SAE),sae,Gemma-2-2B,advbench,12.0,0.9581,,,,,,,
|
| 163 |
+
InterpGuard (hybrid_concat),hybrid,Gemma-2-2B,advbench,12.0,0.9981,,,,,,,
|
| 164 |
+
Linear Probe (raw),activation,Gemma-2-2B,harmbench,12.0,0.9878,,,,,,,
|
| 165 |
+
Linear Probe (SAE),sae,Gemma-2-2B,harmbench,12.0,0.8908,,,,,,,
|
| 166 |
+
InterpGuard (hybrid_concat),hybrid,Gemma-2-2B,harmbench,12.0,0.9853,,,,,,,
|
| 167 |
+
Linear Probe (raw),activation,Gemma-2-2B,jailbreakbench,12.0,0.9565,,,,,,,
|
| 168 |
+
Linear Probe (SAE),sae,Gemma-2-2B,jailbreakbench,12.0,0.7065,,,,,,,
|
| 169 |
+
InterpGuard (hybrid_concat),hybrid,Gemma-2-2B,jailbreakbench,12.0,0.943,,,,,,,
|
| 170 |
+
Linear Probe (residual),activation,Gemma-2-2B,jailbreakbench,12.0,0.9345,,,,,,,
|
| 171 |
+
InterpGuard (hybrid_residual),hybrid,Gemma-2-2B,jailbreakbench,12.0,0.9225,,,,,,,
|
| 172 |
+
Linear Probe (raw),activation,Gemma-2-2B,sorry-bench,12.0,0.9951,,,,,,,
|
| 173 |
+
Linear Probe (SAE),sae,Gemma-2-2B,sorry-bench,12.0,0.9132,,,,,,,
|
| 174 |
+
InterpGuard (hybrid_concat),hybrid,Gemma-2-2B,sorry-bench,12.0,0.993,,,,,,,
|
| 175 |
+
Linear Probe (raw),activation,Gemma-2-2B,wildjailbreak,12.0,1.0,,,,,,,
|
| 176 |
+
Linear Probe (SAE),sae,Gemma-2-2B,wildjailbreak,12.0,1.0,,,,,,,
|
| 177 |
+
InterpGuard (hybrid_concat),hybrid,Gemma-2-2B,wildjailbreak,12.0,1.0,,,,,,,
|
| 178 |
+
InterpGuard (hybrid),hybrid,Gemma-2-2B,jailbreakbench,12.0,0.943,,,,,,,
|
| 179 |
+
Linear Probe (raw),activation,Gemma-2-2B,wildguardtest,12.0,0.9321,,,,,,,
|
| 180 |
+
Linear Probe (SAE),sae,Gemma-2-2B,wildguardtest,12.0,0.8294,,,,,,,
|
| 181 |
+
InterpGuard (hybrid),hybrid,Gemma-2-2B,wildguardtest,12.0,0.9321,,,,,,,
|
| 182 |
+
Linear Probe (raw),activation,Llama-3.1-8B,jailbreakbench,12.0,0.867,,,,,,,
|
| 183 |
+
InterpGuard (hybrid),hybrid,Llama-3.1-8B,jailbreakbench,12.0,0.821,,,,,,,
|
| 184 |
+
Linear Probe (raw),activation,Gemma-3-4B,jailbreakbench,17.0,0.9205,,,,,,,
|
| 185 |
+
Linear Probe (SAE),sae,Gemma-3-4B,jailbreakbench,17.0,0.685,,,,,,,
|
| 186 |
+
InterpGuard (hybrid),hybrid,Gemma-3-4B,jailbreakbench,17.0,0.918,,,,,,,
|
| 187 |
+
Linear Probe (raw),activation,Gemma-3-4B,wildguardtest,17.0,0.9835,,,,,,,
|
| 188 |
+
Linear Probe (SAE),sae,Gemma-3-4B,wildguardtest,17.0,0.9587,,,,,,,
|
| 189 |
+
InterpGuard (hybrid),hybrid,Gemma-3-4B,wildguardtest,17.0,0.9849,,,,,,,
|
| 190 |
+
Linear Probe (raw),activation,Llama-3.3-70B,jailbreakbench,50.0,1.0,,,,,,,
|
| 191 |
+
Linear Probe (SAE),sae,Llama-3.3-70B,jailbreakbench,50.0,0.9505,,,,,,,
|
| 192 |
+
InterpGuard (hybrid),hybrid,Llama-3.3-70B,jailbreakbench,50.0,1.0,,,,,,,
|
| 193 |
+
Linear Probe (raw),activation,Llama-3.3-70B,wildguardtest,50.0,0.9045,,,,,,,
|
| 194 |
+
Linear Probe (SAE),sae,Llama-3.3-70B,wildguardtest,50.0,0.8235,,,,,,,
|
| 195 |
+
InterpGuard (hybrid),hybrid,Llama-3.3-70B,wildguardtest,50.0,0.9035,,,,,,,
|