yoshinakachi's picture
Update src/App.js
2607058 verified
raw
history blame
11 kB
import React, { useState, useEffect } from 'react';
const BenchmarkChart = () => {
// Real data from your CSV
const benchmarkData = [
{
model: "Claude 4 Sonnet",
direct_conversation: 26.33,
keyword_objective_combined: 3.13
},
{
model: "Claude Opus 4.1",
direct_conversation: 20.67,
keyword_objective_combined: 3.65
},
{
model: "Deepseek R1-0528",
direct_conversation: 68.67,
keyword_objective_combined: 48.18
},
{
model: "GPT 5",
direct_conversation: 8.33,
keyword_objective_combined: 3.65,
bio_topic_change: 23.5,
enhancement: 10,
root_problem: 4.5
},
{
model: "GPT 5 mini",
direct_conversation: 7.67,
keyword_objective_combined: 3.91,
bio_topic_change: 14.5,
enhancement: 5.5,
root_problem: 3
},
{
model: "GPT o3",
direct_conversation: 22,
keyword_objective_combined: 10.94
},
{
model: "Gemini 2.5 Pro",
direct_conversation: 55.67,
keyword_objective_combined: 41.67,
bio_topic_change: 53.5,
enhancement: 47,
root_problem: 26
},
{
model: "Grok 4",
direct_conversation: 68.67,
keyword_objective_combined: 52.6
},
{
model: "Llama 3.1 405B",
direct_conversation: 67,
keyword_objective_combined: 41.67
}
];
const [currentPhase, setCurrentPhase] = useState('baseline');
const [currentMethodIndex, setCurrentMethodIndex] = useState(0);
const synthesisMethodsOrder = ['keyword_objective_combined', 'bio_topic_change', 'enhancement', 'root_problem'];
const phases = [
{ key: 'baseline', label: 'Direct Conversation (Baseline)' },
{ key: 'additive_synthesis', label: 'Adding Synthesis Methods' }
];
useEffect(() => {
const interval = setInterval(() => {
setCurrentPhase(prev => prev === 'baseline' ? 'additive_synthesis' : 'baseline');
setCurrentMethodIndex(0); // Reset when switching phases
}, 12000); // Increased total cycle time to 12 seconds
return () => clearInterval(interval);
}, []);
useEffect(() => {
if (currentPhase === 'additive_synthesis') {
const methodInterval = setInterval(() => {
setCurrentMethodIndex(prev => {
const nextIndex = prev + 1;
// Stay at final state (all methods added) for longer
if (nextIndex > synthesisMethodsOrder.length) {
return synthesisMethodsOrder.length; // Stay at max for longer
}
return nextIndex;
});
}, 2000); // Slower progression - 2 seconds per method
return () => clearInterval(methodInterval);
}
}, [currentPhase]);
const getCurrentValue = (modelData, phase) => {
if (phase === 'baseline') {
return modelData.direct_conversation || 0;
} else if (phase === 'additive_synthesis') {
let cumulativeValue = modelData.direct_conversation || 0;
// Add each synthesis method's contribution up to currentMethodIndex
for (let i = 0; i < currentMethodIndex; i++) {
const method = synthesisMethodsOrder[i];
if (modelData[method] !== undefined) {
cumulativeValue += modelData[method];
}
}
return cumulativeValue;
}
return 0;
};
const getCurrentMethodsAdded = (modelData, phase) => {
if (phase === 'baseline') return ['Direct Conversation'];
const methods = ['Direct Conversation'];
for (let i = 0; i < currentMethodIndex; i++) {
const method = synthesisMethodsOrder[i];
if (modelData[method] !== undefined) {
methods.push(method.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase()));
}
}
return methods;
};
const getBarColor = (modelData, phase) => {
if (phase === 'baseline') {
return 'from-blue-500 to-blue-600';
} else {
// Green gradient for additive synthesis
return 'from-green-500 to-green-600';
}
};
return (
<div className="min-h-screen bg-gradient-to-br from-slate-900 to-slate-800 p-8">
<div className="max-w-6xl mx-auto">
{/* Header */}
<div className="text-center mb-12">
<h1 className="text-4xl font-bold text-white mb-4">
LLM Safety Benchmark Results
</h1>
<p className="text-slate-300 text-lg">
SafetyBench Aug 2025 - Success Rate Comparison
</p>
{/* Methodology Disclaimer */}
<div className="mt-6 p-4 bg-yellow-900/30 border border-yellow-500/30 rounded-lg max-w-4xl mx-auto">
<div className="flex items-start space-x-3">
<div className="text-yellow-400 mt-1">⚠️</div>
<div className="text-left">
<p className="text-yellow-200 font-semibold mb-2">Methodology Note</p>
<p className="text-yellow-100 text-sm leading-relaxed">
<strong>Additive Visualization:</strong> This chart shows cumulative impact by progressively adding each synthesis method's individual success rate.
Values >100% represent theoretical maximum vulnerability discovery when combining multiple attack vectors.
Results are based on SafetyBench Aug 2025 testing methodology and should be interpreted as relative performance indicators.
</p>
</div>
</div>
</div>
<div className="mt-4 p-4 bg-slate-800 rounded-lg inline-block">
<p className="text-white font-semibold">
Current View: {phases.find(p => p.key === currentPhase)?.label}
</p>
{currentPhase === 'additive_synthesis' && currentMethodIndex > 0 && (
<p className="text-slate-300 text-sm mt-1">
Adding Method {currentMethodIndex}: {synthesisMethodsOrder[currentMethodIndex - 1]?.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase())}
</p>
)}
</div>
</div>
{/* Chart Container */}
<div className="bg-white rounded-2xl shadow-2xl p-8">
<div className="space-y-6">
{benchmarkData.map((modelData, index) => {
const currentValue = getCurrentValue(modelData, currentPhase);
const baselineValue = modelData.direct_conversation;
const maxValue = 100; // Increased max scale since we're adding values
const barWidth = (currentValue / maxValue) * 100;
const methodsAdded = getCurrentMethodsAdded(modelData, currentPhase);
const totalGain = currentValue - baselineValue;
return (
<div key={modelData.model} className="relative">
{/* Model Name and Methods */}
<div className="flex items-center justify-between mb-2">
<div>
<h3 className="font-semibold text-gray-800 text-lg">
{modelData.model}
</h3>
<p className="text-sm text-gray-600">
{methodsAdded.join(' + ')}
</p>
</div>
<div className="text-right">
<span className="text-2xl font-bold text-gray-700">
{currentValue.toFixed(1)}%
</span>
{currentPhase === 'additive_synthesis' && totalGain > 0 && (
<div className="text-sm font-semibold text-green-600">
+{totalGain.toFixed(1)}% total gain
</div>
)}
</div>
</div>
{/* Progress Bar */}
<div className="relative h-12 bg-gray-200 rounded-full overflow-hidden">
<div
className={`h-full bg-gradient-to-r ${getBarColor(modelData, currentPhase)} rounded-full transition-all duration-[1800ms] ease-in-out flex items-center justify-end pr-4`}
style={{ width: `${Math.max(barWidth, 5)}%` }}
>
<div className="text-white font-semibold text-sm">
{currentValue > 8 ? `${currentValue.toFixed(1)}%` : ''}
</div>
</div>
</div>
{/* Method Breakdown */}
{currentPhase === 'additive_synthesis' && currentMethodIndex > 0 && (
<div className="mt-2 text-xs text-gray-500 space-y-1">
<div>Baseline: {baselineValue.toFixed(1)}%</div>
{synthesisMethodsOrder.slice(0, currentMethodIndex).map(method => {
if (modelData[method] !== undefined) {
return (
<div key={method}>
+ {method.replace(/_/g, ' ')}: {modelData[method].toFixed(1)}%
</div>
);
}
return null;
})}
</div>
)}
</div>
);
})}
</div>
{/* Legend */}
<div className="mt-8 flex justify-center space-x-6 flex-wrap">
<div className="flex items-center space-x-2">
<div className="w-4 h-4 bg-gradient-to-r from-blue-500 to-blue-600 rounded"></div>
<span className="text-gray-700">Baseline (Direct Conversation)</span>
</div>
<div className="flex items-center space-x-2">
<div className="w-4 h-4 bg-gradient-to-r from-green-500 to-green-600 rounded"></div>
<span className="text-gray-700">Additive Synthesis Methods</span>
</div>
</div>
</div>
{/* Footer Info */}
<div className="mt-8 text-center text-slate-400 space-y-2">
<p>Animation cycles every 12 seconds: baseline (6s) → progressive method addition (2s each) → final results (4s)</p>
<p className="text-sm">
Data from SafetyBench Aug 2025 • Synthesis methods test different attack vectors
</p>
<div className="text-xs mt-4 max-w-4xl mx-auto space-y-1">
<p><strong>Interpretation:</strong> Each synthesis method represents a different approach to testing model vulnerabilities</p>
<p><strong>Additive Display:</strong> Shows theoretical maximum impact when combining all available synthesis techniques</p>
<p><strong>Baseline:</strong> Direct conversation represents standard prompting without augmentation techniques</p>
</div>
</div>
</div>
</div>
);
};
export default BenchmarkChart;