Spaces:
Running
Running
| import React, { useState, useEffect } from 'react'; | |
| const BenchmarkChart = () => { | |
| // Real data from your CSV | |
| const benchmarkData = [ | |
| { | |
| model: "Claude 4 Sonnet", | |
| direct_conversation: 26.33, | |
| keyword_objective_combined: 3.13 | |
| }, | |
| { | |
| model: "Claude Opus 4.1", | |
| direct_conversation: 20.67, | |
| keyword_objective_combined: 3.65 | |
| }, | |
| { | |
| model: "Deepseek R1-0528", | |
| direct_conversation: 68.67, | |
| keyword_objective_combined: 48.18 | |
| }, | |
| { | |
| model: "GPT 5", | |
| direct_conversation: 8.33, | |
| keyword_objective_combined: 3.65, | |
| bio_topic_change: 23.5, | |
| enhancement: 10, | |
| root_problem: 4.5 | |
| }, | |
| { | |
| model: "GPT 5 mini", | |
| direct_conversation: 7.67, | |
| keyword_objective_combined: 3.91, | |
| bio_topic_change: 14.5, | |
| enhancement: 5.5, | |
| root_problem: 3 | |
| }, | |
| { | |
| model: "GPT o3", | |
| direct_conversation: 22, | |
| keyword_objective_combined: 10.94 | |
| }, | |
| { | |
| model: "Gemini 2.5 Pro", | |
| direct_conversation: 55.67, | |
| keyword_objective_combined: 41.67, | |
| bio_topic_change: 53.5, | |
| enhancement: 47, | |
| root_problem: 26 | |
| }, | |
| { | |
| model: "Grok 4", | |
| direct_conversation: 68.67, | |
| keyword_objective_combined: 52.6 | |
| }, | |
| { | |
| model: "Llama 3.1 405B", | |
| direct_conversation: 67, | |
| keyword_objective_combined: 41.67 | |
| } | |
| ]; | |
| const [currentPhase, setCurrentPhase] = useState('baseline'); | |
| const [currentMethodIndex, setCurrentMethodIndex] = useState(0); | |
| const synthesisMethodsOrder = ['keyword_objective_combined', 'bio_topic_change', 'enhancement', 'root_problem']; | |
| const phases = [ | |
| { key: 'baseline', label: 'Direct Conversation (Baseline)' }, | |
| { key: 'additive_synthesis', label: 'Adding Synthesis Methods' } | |
| ]; | |
| useEffect(() => { | |
| const interval = setInterval(() => { | |
| setCurrentPhase(prev => prev === 'baseline' ? 'additive_synthesis' : 'baseline'); | |
| setCurrentMethodIndex(0); // Reset when switching phases | |
| }, 12000); // Increased total cycle time to 12 seconds | |
| return () => clearInterval(interval); | |
| }, []); | |
| useEffect(() => { | |
| if (currentPhase === 'additive_synthesis') { | |
| const methodInterval = setInterval(() => { | |
| setCurrentMethodIndex(prev => { | |
| const nextIndex = prev + 1; | |
| // Stay at final state (all methods added) for longer | |
| if (nextIndex > synthesisMethodsOrder.length) { | |
| return synthesisMethodsOrder.length; // Stay at max for longer | |
| } | |
| return nextIndex; | |
| }); | |
| }, 2000); // Slower progression - 2 seconds per method | |
| return () => clearInterval(methodInterval); | |
| } | |
| }, [currentPhase]); | |
| const getCurrentValue = (modelData, phase) => { | |
| if (phase === 'baseline') { | |
| return modelData.direct_conversation || 0; | |
| } else if (phase === 'additive_synthesis') { | |
| let cumulativeValue = modelData.direct_conversation || 0; | |
| // Add each synthesis method's contribution up to currentMethodIndex | |
| for (let i = 0; i < currentMethodIndex; i++) { | |
| const method = synthesisMethodsOrder[i]; | |
| if (modelData[method] !== undefined) { | |
| cumulativeValue += modelData[method]; | |
| } | |
| } | |
| return cumulativeValue; | |
| } | |
| return 0; | |
| }; | |
| const getCurrentMethodsAdded = (modelData, phase) => { | |
| if (phase === 'baseline') return ['Direct Conversation']; | |
| const methods = ['Direct Conversation']; | |
| for (let i = 0; i < currentMethodIndex; i++) { | |
| const method = synthesisMethodsOrder[i]; | |
| if (modelData[method] !== undefined) { | |
| methods.push(method.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase())); | |
| } | |
| } | |
| return methods; | |
| }; | |
| const getBarColor = (modelData, phase) => { | |
| if (phase === 'baseline') { | |
| return 'from-blue-500 to-blue-600'; | |
| } else { | |
| // Green gradient for additive synthesis | |
| return 'from-green-500 to-green-600'; | |
| } | |
| }; | |
| return ( | |
| <div className="min-h-screen bg-gradient-to-br from-slate-900 to-slate-800 p-8"> | |
| <div className="max-w-6xl mx-auto"> | |
| {/* Header */} | |
| <div className="text-center mb-12"> | |
| <h1 className="text-4xl font-bold text-white mb-4"> | |
| LLM Safety Benchmark Results | |
| </h1> | |
| <p className="text-slate-300 text-lg"> | |
| SafetyBench Aug 2025 - Success Rate Comparison | |
| </p> | |
| {/* Methodology Disclaimer */} | |
| <div className="mt-6 p-4 bg-yellow-900/30 border border-yellow-500/30 rounded-lg max-w-4xl mx-auto"> | |
| <div className="flex items-start space-x-3"> | |
| <div className="text-yellow-400 mt-1">⚠️</div> | |
| <div className="text-left"> | |
| <p className="text-yellow-200 font-semibold mb-2">Methodology Note</p> | |
| <p className="text-yellow-100 text-sm leading-relaxed"> | |
| <strong>Additive Visualization:</strong> This chart shows cumulative impact by progressively adding each synthesis method's individual success rate. | |
| Values >100% represent theoretical maximum vulnerability discovery when combining multiple attack vectors. | |
| Results are based on SafetyBench Aug 2025 testing methodology and should be interpreted as relative performance indicators. | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| <div className="mt-4 p-4 bg-slate-800 rounded-lg inline-block"> | |
| <p className="text-white font-semibold"> | |
| Current View: {phases.find(p => p.key === currentPhase)?.label} | |
| </p> | |
| {currentPhase === 'additive_synthesis' && currentMethodIndex > 0 && ( | |
| <p className="text-slate-300 text-sm mt-1"> | |
| Adding Method {currentMethodIndex}: {synthesisMethodsOrder[currentMethodIndex - 1]?.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase())} | |
| </p> | |
| )} | |
| </div> | |
| </div> | |
| {/* Chart Container */} | |
| <div className="bg-white rounded-2xl shadow-2xl p-8"> | |
| <div className="space-y-6"> | |
| {benchmarkData.map((modelData, index) => { | |
| const currentValue = getCurrentValue(modelData, currentPhase); | |
| const baselineValue = modelData.direct_conversation; | |
| const maxValue = 100; // Increased max scale since we're adding values | |
| const barWidth = (currentValue / maxValue) * 100; | |
| const methodsAdded = getCurrentMethodsAdded(modelData, currentPhase); | |
| const totalGain = currentValue - baselineValue; | |
| return ( | |
| <div key={modelData.model} className="relative"> | |
| {/* Model Name and Methods */} | |
| <div className="flex items-center justify-between mb-2"> | |
| <div> | |
| <h3 className="font-semibold text-gray-800 text-lg"> | |
| {modelData.model} | |
| </h3> | |
| <p className="text-sm text-gray-600"> | |
| {methodsAdded.join(' + ')} | |
| </p> | |
| </div> | |
| <div className="text-right"> | |
| <span className="text-2xl font-bold text-gray-700"> | |
| {currentValue.toFixed(1)}% | |
| </span> | |
| {currentPhase === 'additive_synthesis' && totalGain > 0 && ( | |
| <div className="text-sm font-semibold text-green-600"> | |
| +{totalGain.toFixed(1)}% total gain | |
| </div> | |
| )} | |
| </div> | |
| </div> | |
| {/* Progress Bar */} | |
| <div className="relative h-12 bg-gray-200 rounded-full overflow-hidden"> | |
| <div | |
| className={`h-full bg-gradient-to-r ${getBarColor(modelData, currentPhase)} rounded-full transition-all duration-[1800ms] ease-in-out flex items-center justify-end pr-4`} | |
| style={{ width: `${Math.max(barWidth, 5)}%` }} | |
| > | |
| <div className="text-white font-semibold text-sm"> | |
| {currentValue > 8 ? `${currentValue.toFixed(1)}%` : ''} | |
| </div> | |
| </div> | |
| </div> | |
| {/* Method Breakdown */} | |
| {currentPhase === 'additive_synthesis' && currentMethodIndex > 0 && ( | |
| <div className="mt-2 text-xs text-gray-500 space-y-1"> | |
| <div>Baseline: {baselineValue.toFixed(1)}%</div> | |
| {synthesisMethodsOrder.slice(0, currentMethodIndex).map(method => { | |
| if (modelData[method] !== undefined) { | |
| return ( | |
| <div key={method}> | |
| + {method.replace(/_/g, ' ')}: {modelData[method].toFixed(1)}% | |
| </div> | |
| ); | |
| } | |
| return null; | |
| })} | |
| </div> | |
| )} | |
| </div> | |
| ); | |
| })} | |
| </div> | |
| {/* Legend */} | |
| <div className="mt-8 flex justify-center space-x-6 flex-wrap"> | |
| <div className="flex items-center space-x-2"> | |
| <div className="w-4 h-4 bg-gradient-to-r from-blue-500 to-blue-600 rounded"></div> | |
| <span className="text-gray-700">Baseline (Direct Conversation)</span> | |
| </div> | |
| <div className="flex items-center space-x-2"> | |
| <div className="w-4 h-4 bg-gradient-to-r from-green-500 to-green-600 rounded"></div> | |
| <span className="text-gray-700">Additive Synthesis Methods</span> | |
| </div> | |
| </div> | |
| </div> | |
| {/* Footer Info */} | |
| <div className="mt-8 text-center text-slate-400 space-y-2"> | |
| <p>Animation cycles every 12 seconds: baseline (6s) → progressive method addition (2s each) → final results (4s)</p> | |
| <p className="text-sm"> | |
| Data from SafetyBench Aug 2025 • Synthesis methods test different attack vectors | |
| </p> | |
| <div className="text-xs mt-4 max-w-4xl mx-auto space-y-1"> | |
| <p><strong>Interpretation:</strong> Each synthesis method represents a different approach to testing model vulnerabilities</p> | |
| <p><strong>Additive Display:</strong> Shows theoretical maximum impact when combining all available synthesis techniques</p> | |
| <p><strong>Baseline:</strong> Direct conversation represents standard prompting without augmentation techniques</p> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| ); | |
| }; | |
| export default BenchmarkChart; |