import React, { useState, useEffect } from 'react'; const BenchmarkChart = () => { // Real data sorted by Human Baseline ASR (highest to lowest) const benchmarkData = [ { model: "Grok 4", baseline: 68.67, methods: { keyword_objective_combined: 85.15 } }, { model: "Deepseek R1-0528", baseline: 68.67, methods: { keyword_objective_combined: 83.76 } }, { model: "Gemini 2.5 Pro", baseline: 55.67, methods: { keyword_objective_combined: 74.14, root_problem: 67.19 } }, { model: "Gemini 3", baseline: 55.67, methods: { keyword_objective_combined: 55.67, } }, { model: "Mixtral 8x22B", baseline: 48.00, methods: { keyword_objective_combined: 66.82 } }, { model: "Llama 4 Maverick Instruct", baseline: 45.00, methods: { keyword_objective_combined: 56.46 } }, { model: "Claude 4 Sonnet", baseline: 26.33, methods: { keyword_objective_combined: 28.64 } }, { model: "GPT o3", baseline: 22.00, methods: { keyword_objective_combined: 30.53 } }, { model: "Claude Opus 4.1", baseline: 20.67, methods: { keyword_objective_combined: 23.56 } }, { model: "GPT 5", baseline: 8.33, methods: { keyword_objective_combined: 11.68, root_problem: 12.46 } } ]; const [currentPhase, setCurrentPhase] = useState('baseline'); const [currentMethodIndex, setCurrentMethodIndex] = useState(0); const synthesisMethodsOrder = ['keyword_objective_combined', 'root_problem']; const getCurrentValue = (modelData, phase) => { if (phase === 'baseline') { return modelData.baseline; } else if (phase === 'additive_synthesis') { let maxASR = modelData.baseline; for (let i = 0; i < currentMethodIndex; i++) { const method = synthesisMethodsOrder[i]; if (modelData.methods[method] !== undefined) { maxASR = Math.max(maxASR, modelData.methods[method]); } } return maxASR; } return 0; }; const getCurrentMethod = (modelData, phase) => { if (phase === 'baseline') return 'Human Baseline'; if (currentMethodIndex === 0) return 'Human Baseline'; const availableMethods = []; for (let i = 0; i < currentMethodIndex; i++) { const method = synthesisMethodsOrder[i]; if (modelData.methods[method] !== undefined) { availableMethods.push(method); } } if (availableMethods.length === 0) return 'Human Baseline'; const lastMethod = availableMethods[availableMethods.length - 1]; let transformationName; if (lastMethod === 'keyword_objective_combined') transformationName = 'Keyword/Objective Transformation'; else if (lastMethod === 'root_problem') transformationName = 'Root Problem Transformation'; else transformationName = lastMethod.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase()); return `Human Baseline + ${transformationName}`; }; const handleManualAnimate = () => { if (currentPhase === 'baseline') { setCurrentPhase('additive_synthesis'); setCurrentMethodIndex(0); // Manually step through methods setTimeout(() => setCurrentMethodIndex(1), 500); setTimeout(() => setCurrentMethodIndex(2), 2500); } else { setCurrentPhase('baseline'); setCurrentMethodIndex(0); } }; return (

{/* Header */}

JulyAI Safety Benchmark For Frontier Models

Attack Success Rate Analysis with Transformation Methods

{/* Control Button */}

{/* Chart Container - Longer */}

{/* Legend - At Top of Chart */}

Human Baseline

Transformation Extension

{benchmarkData.map((modelData, index) => { const currentValue = getCurrentValue(modelData, currentPhase); const baselineValue = modelData.baseline; const maxValue = 90; const baselineWidth = (baselineValue / maxValue) * 100; const totalWidth = (currentValue / maxValue) * 100; const extensionWidth = totalWidth - baselineWidth; const gain = currentValue - baselineValue; return (

{/* Model Name and Value - Cleaner Layout */}

{modelData.model}

{gain > 0 && ( +{gain.toFixed(1)} )} {currentValue.toFixed(1)}%

{/* Progress Bar - Shows Growth from Baseline */}

{/* Baseline Bar (Blue) - Flat, no rounding for seamless extension */}

{/* Extension Bar (Green) - Only rounded at the end */}

0 ? `${extensionWidth}%` : '0%' }} />

); })}

{/* Methodology Note - Moved Below Chart */}

⚠️

Methodology Note

Additive Visualization: This chart shows cumulative impact by progressively adding each transformation method's individual attack success rate. Values >100% represent transformation of multiple conversations off one failed, human seed conversation. Results are based on HarmBench Grading methodology and should be interpreted as relative performance indicators.

{/* Footer */}

Sorted by Human Baseline ASR (highest to lowest) • Click button above for manual control

Bars extend from baseline to show transformation method impact

); }; export default BenchmarkChart;