Spaces:
Running
Running
| import React, { useState, useEffect } from 'react'; | |
| const BenchmarkChart = () => { | |
| // Real data sorted by Human Baseline ASR (highest to lowest) | |
| const benchmarkData = [ | |
| { | |
| model: "Grok 4", | |
| baseline: 68.67, | |
| methods: { | |
| keyword_objective_combined: 85.15 | |
| } | |
| }, | |
| { | |
| model: "Deepseek R1-0528", | |
| baseline: 68.67, | |
| methods: { | |
| keyword_objective_combined: 83.76 | |
| } | |
| }, | |
| { | |
| model: "Gemini 2.5 Pro", | |
| baseline: 55.67, | |
| methods: { | |
| keyword_objective_combined: 74.14, | |
| root_problem: 67.19 | |
| } | |
| }, | |
| { | |
| model: "Gemini 3", | |
| baseline: 55.67, | |
| methods: { | |
| keyword_objective_combined: 55.67, | |
| } | |
| }, | |
| { | |
| model: "Mixtral 8x22B", | |
| baseline: 48.00, | |
| methods: { | |
| keyword_objective_combined: 66.82 | |
| } | |
| }, | |
| { | |
| model: "Llama 4 Maverick Instruct", | |
| baseline: 45.00, | |
| methods: { | |
| keyword_objective_combined: 56.46 | |
| } | |
| }, | |
| { | |
| model: "Claude 4 Sonnet", | |
| baseline: 26.33, | |
| methods: { | |
| keyword_objective_combined: 28.64 | |
| } | |
| }, | |
| { | |
| model: "GPT o3", | |
| baseline: 22.00, | |
| methods: { | |
| keyword_objective_combined: 30.53 | |
| } | |
| }, | |
| { | |
| model: "Claude Opus 4.1", | |
| baseline: 20.67, | |
| methods: { | |
| keyword_objective_combined: 23.56 | |
| } | |
| }, | |
| { | |
| model: "GPT 5", | |
| baseline: 8.33, | |
| methods: { | |
| keyword_objective_combined: 11.68, | |
| root_problem: 12.46 | |
| } | |
| } | |
| ]; | |
| const [currentPhase, setCurrentPhase] = useState('baseline'); | |
| const [currentMethodIndex, setCurrentMethodIndex] = useState(0); | |
| const synthesisMethodsOrder = ['keyword_objective_combined', 'root_problem']; | |
| const getCurrentValue = (modelData, phase) => { | |
| if (phase === 'baseline') { | |
| return modelData.baseline; | |
| } else if (phase === 'additive_synthesis') { | |
| let maxASR = modelData.baseline; | |
| for (let i = 0; i < currentMethodIndex; i++) { | |
| const method = synthesisMethodsOrder[i]; | |
| if (modelData.methods[method] !== undefined) { | |
| maxASR = Math.max(maxASR, modelData.methods[method]); | |
| } | |
| } | |
| return maxASR; | |
| } | |
| return 0; | |
| }; | |
| const getCurrentMethod = (modelData, phase) => { | |
| if (phase === 'baseline') return 'Human Baseline'; | |
| if (currentMethodIndex === 0) return 'Human Baseline'; | |
| const availableMethods = []; | |
| for (let i = 0; i < currentMethodIndex; i++) { | |
| const method = synthesisMethodsOrder[i]; | |
| if (modelData.methods[method] !== undefined) { | |
| availableMethods.push(method); | |
| } | |
| } | |
| if (availableMethods.length === 0) return 'Human Baseline'; | |
| const lastMethod = availableMethods[availableMethods.length - 1]; | |
| let transformationName; | |
| if (lastMethod === 'keyword_objective_combined') transformationName = 'Keyword/Objective Transformation'; | |
| else if (lastMethod === 'root_problem') transformationName = 'Root Problem Transformation'; | |
| else transformationName = lastMethod.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase()); | |
| return `Human Baseline + ${transformationName}`; | |
| }; | |
| const handleManualAnimate = () => { | |
| if (currentPhase === 'baseline') { | |
| setCurrentPhase('additive_synthesis'); | |
| setCurrentMethodIndex(0); | |
| // Manually step through methods | |
| setTimeout(() => setCurrentMethodIndex(1), 500); | |
| setTimeout(() => setCurrentMethodIndex(2), 2500); | |
| } else { | |
| setCurrentPhase('baseline'); | |
| setCurrentMethodIndex(0); | |
| } | |
| }; | |
| return ( | |
| <div className="min-h-screen bg-gradient-to-br from-slate-900 to-slate-800 p-4"> | |
| <div className="max-w-6xl mx-auto"> | |
| {/* Header */} | |
| <div className="text-center mb-6"> | |
| <h1 className="text-3xl font-bold text-white mb-3"> | |
| JulyAI Safety Benchmark For Frontier Models | |
| </h1> | |
| <p className="text-slate-300"> | |
| Attack Success Rate Analysis with Transformation Methods | |
| </p> | |
| {/* Control Button */} | |
| <div className="flex justify-center"> | |
| <button | |
| onClick={handleManualAnimate} | |
| className="px-6 py-2 bg-blue-600 hover:bg-blue-700 text-white rounded-lg font-semibold transition-colors" | |
| > | |
| {currentPhase === 'baseline' ? 'Show Transformation Impact' : 'Reset to Baseline'} | |
| </button> | |
| </div> | |
| </div> | |
| {/* Chart Container - Longer */} | |
| <div className="bg-white rounded-xl shadow-2xl p-4"> | |
| {/* Legend - At Top of Chart */} | |
| <div className="mb-6 flex justify-center space-x-6 text-sm bg-gray-50 p-3 rounded-lg"> | |
| <div className="flex items-center space-x-2"> | |
| <div className="w-3 h-3 bg-gradient-to-r from-blue-500 to-blue-600 rounded"></div> | |
| <span className="text-gray-700">Human Baseline</span> | |
| </div> | |
| <div className="flex items-center space-x-2"> | |
| <div className="w-3 h-3 bg-gradient-to-r from-green-500 to-green-600 rounded"></div> | |
| <span className="text-gray-700">Transformation Extension</span> | |
| </div> | |
| </div> | |
| <div className="h-[600px] overflow-y-auto pr-2"> | |
| <div className="space-y-2"> | |
| {benchmarkData.map((modelData, index) => { | |
| const currentValue = getCurrentValue(modelData, currentPhase); | |
| const baselineValue = modelData.baseline; | |
| const maxValue = 90; | |
| const baselineWidth = (baselineValue / maxValue) * 100; | |
| const totalWidth = (currentValue / maxValue) * 100; | |
| const extensionWidth = totalWidth - baselineWidth; | |
| const gain = currentValue - baselineValue; | |
| return ( | |
| <div key={modelData.model} className="relative"> | |
| {/* Model Name and Value - Cleaner Layout */} | |
| <div className="flex items-center justify-between mb-1"> | |
| <div> | |
| <h3 className="font-semibold text-gray-800 text-sm"> | |
| {modelData.model} | |
| </h3> | |
| </div> | |
| <div className="text-right flex items-center space-x-2"> | |
| {gain > 0 && ( | |
| <span className="text-xs font-semibold text-green-600"> | |
| +{gain.toFixed(1)} | |
| </span> | |
| )} | |
| <span className="text-lg font-bold text-gray-700"> | |
| {currentValue.toFixed(1)}% | |
| </span> | |
| </div> | |
| </div> | |
| {/* Progress Bar - Shows Growth from Baseline */} | |
| <div className="relative h-6 bg-gray-200 rounded-full overflow-hidden"> | |
| {/* Baseline Bar (Blue) - Flat, no rounding for seamless extension */} | |
| <div | |
| className="absolute left-0 top-0 h-full bg-gradient-to-r from-blue-500 to-blue-600" | |
| style={{ width: `${Math.max(baselineWidth, 5)}%` }} | |
| /> | |
| {/* Extension Bar (Green) - Only rounded at the end */} | |
| <div | |
| className="absolute top-0 h-full bg-gradient-to-r from-green-500 to-green-600 rounded-r-full transition-all duration-2000 ease-out" | |
| style={{ | |
| left: `${baselineWidth}%`, | |
| width: currentPhase === 'additive_synthesis' && currentMethodIndex > 0 ? `${extensionWidth}%` : '0%' | |
| }} | |
| /> | |
| </div> | |
| </div> | |
| ); | |
| })} | |
| </div> | |
| </div> | |
| </div> | |
| {/* Methodology Note - Moved Below Chart */} | |
| <div className="mt-6 p-3 bg-yellow-900/30 border border-yellow-500/30 rounded-lg max-w-4xl mx-auto"> | |
| <div className="flex items-start space-x-3"> | |
| <div className="text-yellow-400 mt-1">⚠️</div> | |
| <div className="text-left"> | |
| <p className="text-yellow-200 font-semibold mb-2">Methodology Note</p> | |
| <p className="text-yellow-100 text-sm leading-relaxed"> | |
| <strong>Additive Visualization:</strong> This chart shows cumulative impact by progressively adding each transformation method's individual attack success rate. | |
| Values >100% represent transformation of multiple conversations off one failed, human seed conversation. | |
| Results are based on HarmBench Grading methodology and should be interpreted as relative performance indicators. | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| {/* Footer */} | |
| <div className="mt-4 text-center text-slate-400 space-y-1"> | |
| <p className="text-sm"> | |
| Sorted by Human Baseline ASR (highest to lowest) • Click button above for manual control | |
| </p> | |
| <p className="text-xs"> | |
| Bars extend from baseline to show transformation method impact | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| ); | |
| }; | |
| export default BenchmarkChart; |