Spaces:
Running
Running
File size: 8,026 Bytes
2607058 e3ac6d9 2607058 493ead8 30360e9 493ead8 30360e9 2607058 493ead8 30360e9 493ead8 30360e9 2607058 e3ac6d9 30360e9 e3ac6d9 30360e9 493ead8 30360e9 493ead8 30360e9 493ead8 30360e9 493ead8 30360e9 2607058 e57087d 2607058 e3ac6d9 30360e9 e3ac6d9 30360e9 2607058 e3ac6d9 30360e9 e3ac6d9 30360e9 2607058 493ead8 30360e9 493ead8 30360e9 2607058 493ead8 30360e9 493ead8 30360e9 2607058 30360e9 e57087d 30360e9 2607058 e57087d 2607058 30360e9 2607058 e57087d 2607058 5f3beb1 30360e9 2607058 30360e9 5b4f4bd 2607058 30360e9 5b4f4bd 2607058 5b4f4bd f4cfdd4 5b4f4bd e57087d 5b4f4bd 2607058 493ead8 30360e9 f4cfdd4 1a83a4e 30360e9 5b4f4bd 30360e9 f4cfdd4 30360e9 c1ae531 5b4f4bd 30360e9 c1ae531 2607058 30360e9 5b4f4bd 30360e9 5b4f4bd 30360e9 e57087d 5b4f4bd e57087d 5b4f4bd 2607058 30360e9 2607058 5b4f4bd 2607058 30360e9 2607058 e3ac6d9 30360e9 5b4f4bd 2607058 5f3beb1 2607058 5f3beb1 2607058 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 |
import React, { useState, useEffect } from 'react';
const BenchmarkChart = () => {
// Benchmark results, ordered from highest to lowest human-baseline ASR
// (attack success rate). Each row is [model name, baseline ASR, ASR per
// transformation method]; all rates are percentages. The rows are expanded
// into { model, baseline, methods } records, which is the shape the rest of
// the component reads.
const benchmarkData = [
  ['Grok 4', 68.67, { keyword_objective_combined: 85.15 }],
  ['Deepseek R1-0528', 68.67, { keyword_objective_combined: 83.76 }],
  ['Gemini 2.5 Pro', 55.67, { keyword_objective_combined: 74.14, root_problem: 67.19 }],
  ['Mixtral 8x22B', 48.0, { keyword_objective_combined: 66.82 }],
  ['Llama 4 Maverick Instruct', 45.0, { keyword_objective_combined: 56.46 }],
  ['Gemini 3', 32.5, { bio_topic_change: 53.68 }],
  ['Claude 4 Sonnet', 26.33, { keyword_objective_combined: 28.64 }],
  ['GPT o3', 22.0, { keyword_objective_combined: 30.53 }],
  ['Claude Opus 4.1', 20.67, { keyword_objective_combined: 23.56 }],
  ['GPT 5', 8.33, { keyword_objective_combined: 11.68, root_problem: 12.46 }],
].map(([model, baseline, methods]) => ({ model, baseline, methods }));
// Which view the chart shows: 'baseline' (the initial value) or 'transformation'.
// handleToggle switches between the two.
const [currentPhase, setCurrentPhase] = useState('baseline');
// Returns the ASR value (a percentage) to display for one model in the given phase.
//
// modelData: a record with a numeric `baseline` and an optional `methods`
//            object whose values are per-method ASR numbers.
// phase:     'baseline' or 'transformation'.
//
// - 'baseline'       -> the model's baseline value.
// - 'transformation' -> the largest of the baseline and every method value,
//                       so the result is never below the baseline.
// - anything else    -> 0.
const getCurrentValue = (modelData, phase) => {
  switch (phase) {
    case 'baseline':
      return modelData.baseline;
    case 'transformation': {
      // An entry without a `methods` object simply has no uplift; fall back to
      // an empty object instead of letting Object.values throw on undefined.
      const methodValues = Object.values(modelData.methods || {});
      return Math.max(modelData.baseline, ...methodValues);
    }
    default:
      return 0;
  }
};
// Click handler for the control button: switches the displayed phase between
// 'baseline' and 'transformation'.
const handleToggle = () => {
  // Use the functional updater so the flip is computed from the latest state
  // rather than the value captured when this render's closure was created.
  setCurrentPhase((previousPhase) =>
    previousPhase === 'baseline' ? 'transformation' : 'baseline'
  );
};
return (
<div className="min-h-screen bg-gradient-to-br from-slate-900 to-slate-800 p-4">
<div className="max-w-6xl mx-auto">
{/* Header */}
<div className="text-center mb-6">
<h1 className="text-3xl font-bold text-white mb-3">
JulyAI Safety Benchmark For Frontier Models
</h1>
<p className="text-slate-300">
Attack Success Rate Analysis with Transformation Methods
</p>
{/* Control Button */}
<div className="flex justify-center">
<button
onClick={handleToggle}
className="px-6 py-2 bg-blue-600 hover:bg-blue-700 text-white rounded-lg font-semibold transition-colors"
>
{currentPhase === 'baseline' ? 'Show Transformation Impact' : 'Reset to Baseline'}
</button>
</div>
</div>
{/* Chart Container - Longer */}
<div className="bg-white rounded-xl shadow-2xl p-4">
{/* Legend - At Top of Chart */}
<div className="mb-6 flex justify-center space-x-6 text-sm bg-gray-50 p-3 rounded-lg">
<div className="flex items-center space-x-2">
<div className="w-3 h-3 bg-gradient-to-r from-blue-500 to-blue-600 rounded"></div>
<span className="text-gray-700">Human Baseline</span>
</div>
<div className="flex items-center space-x-2">
<div className="w-3 h-3 bg-gradient-to-r from-green-500 to-green-600 rounded"></div>
<span className="text-gray-700">Transformation Extension</span>
</div>
</div>
<div className="h-[600px] overflow-y-auto pr-2">
<div className="space-y-2">
{benchmarkData.map((modelData, index) => {
const currentValue = getCurrentValue(modelData, currentPhase);
const baselineValue = modelData.baseline;
const maxValue = 90;
const baselineWidth = (baselineValue / maxValue) * 100;
const totalWidth = (currentValue / maxValue) * 100;
const extensionWidth = totalWidth - baselineWidth;
const gain = currentValue - baselineValue;
return (
<div key={modelData.model} className="relative">
{/* Model Name and Value - Cleaner Layout */}
<div className="flex items-center justify-between mb-1">
<div>
<h3 className="font-semibold text-gray-800 text-sm">
{modelData.model}
</h3>
</div>
<div className="text-right flex items-center space-x-2">
{gain > 0 && (
<span className="text-xs font-semibold text-green-600">
+{gain.toFixed(1)}
</span>
)}
<span className="text-lg font-bold text-gray-700">
{currentValue.toFixed(1)}%
</span>
</div>
</div>
{/* Progress Bar - Shows Growth from Baseline */}
<div className="relative h-6 bg-gray-200 rounded-full overflow-hidden">
{/* Baseline Bar (Blue) - Flat, no rounding for seamless extension */}
<div
className="absolute left-0 top-0 h-full bg-gradient-to-r from-blue-500 to-blue-600"
style={{ width: `${Math.max(baselineWidth, 5)}%` }}
/>
{/* Extension Bar (Green) - Only rounded at the end */}
<div
className="absolute top-0 h-full bg-gradient-to-r from-green-500 to-green-600 rounded-r-full transition-all duration-1000 ease-out"
style={{
left: `${baselineWidth}%`,
width: currentPhase === 'transformation' ? `${extensionWidth}%` : '0%'
}}
/>
</div>
</div>
);
})}
</div>
</div>
</div>
{/* Methodology Note - Moved Below Chart */}
<div className="mt-6 p-3 bg-yellow-900/30 border border-yellow-500/30 rounded-lg max-w-4xl mx-auto">
<div className="flex items-start space-x-3">
<div className="text-yellow-400 mt-1">⚠️</div>
<div className="text-left">
<p className="text-yellow-200 font-semibold mb-2">Methodology Note</p>
<p className="text-yellow-100 text-sm leading-relaxed">
<strong>Additive Visualization:</strong> This chart shows cumulative impact by progressively adding each transformation method's individual attack success rate.
Values >100% represent transformation of multiple conversations off one failed, human seed conversation.
Results are based on HarmBench Grading methodology and should be interpreted as relative performance indicators.
</p>
</div>
</div>
</div>
{/* Footer */}
<div className="mt-4 text-center text-slate-400 space-y-1">
<p className="text-sm">
Sorted by Human Baseline ASR (highest to lowest) • Click button above for manual control
</p>
<p className="text-xs">
Bars extend from baseline to show transformation method impact
</p>
</div>
</div>
</div>
);
};
// Default export: the chart component.
export default BenchmarkChart;