Spaces:

GoJulyAI
/

benchmark-enhancements

Running

App Files Community

yoshinakachi committed on Aug 21

Commit

2607058

verified ·

1 Parent(s): c9c02ef

Update src/App.js

Browse files

Files changed (1) hide show

src/App.js +270 -20

src/App.js CHANGED Viewed

@@ -1,25 +1,275 @@
-import logo from './logo.svg';
-import './App.css';
-function App() {
   return (
-    <div className="App">
-      <header className="App-header">
-        <img src={logo} className="App-logo" alt="logo" />
-        <p>
-          Edit <code>src/App.js</code> and save to reload.
-        </p>
-        <a
-          className="App-link"
-          href="https://reactjs.org"
-          target="_blank"
-          rel="noopener noreferrer"
-        >
-          Learn React
-        </a>
-      </header>
     </div>
   );
-}
-export default App;

+import React, { useState, useEffect } from 'react';
+const BenchmarkChart = () => {
+  // Real data from your CSV
+  const benchmarkData = [
+    {
+      model: "Claude 4 Sonnet",
+      direct_conversation: 26.33,
+      keyword_objective_combined: 3.13
+    },
+    {
+      model: "Claude Opus 4.1",
+      direct_conversation: 20.67,
+      keyword_objective_combined: 3.65
+    },
+    {
+      model: "Deepseek R1-0528",
+      direct_conversation: 68.67,
+      keyword_objective_combined: 48.18
+    },
+    {
+      model: "GPT 5",
+      direct_conversation: 8.33,
+      keyword_objective_combined: 3.65,
+      bio_topic_change: 23.5,
+      enhancement: 10,
+      root_problem: 4.5
+    },
+    {
+      model: "GPT 5 mini",
+      direct_conversation: 7.67,
+      keyword_objective_combined: 3.91,
+      bio_topic_change: 14.5,
+      enhancement: 5.5,
+      root_problem: 3
+    },
+    {
+      model: "GPT o3",
+      direct_conversation: 22,
+      keyword_objective_combined: 10.94
+    },
+    {
+      model: "Gemini 2.5 Pro",
+      direct_conversation: 55.67,
+      keyword_objective_combined: 41.67,
+      bio_topic_change: 53.5,
+      enhancement: 47,
+      root_problem: 26
+    },
+    {
+      model: "Grok 4",
+      direct_conversation: 68.67,
+      keyword_objective_combined: 52.6
+    },
+    {
+      model: "Llama 3.1 405B",
+      direct_conversation: 67,
+      keyword_objective_combined: 41.67
+    }
+  ];
+  const [currentPhase, setCurrentPhase] = useState('baseline');
+  const [currentMethodIndex, setCurrentMethodIndex] = useState(0);
+  const synthesisMethodsOrder = ['keyword_objective_combined', 'bio_topic_change', 'enhancement', 'root_problem'];
+  const phases = [
+    { key: 'baseline', label: 'Direct Conversation (Baseline)' },
+    { key: 'additive_synthesis', label: 'Adding Synthesis Methods' }
+  ];
+  useEffect(() => {
+    const interval = setInterval(() => {
+      setCurrentPhase(prev => prev === 'baseline' ? 'additive_synthesis' : 'baseline');
+      setCurrentMethodIndex(0); // Reset when switching phases
+    }, 12000); // Increased total cycle time to 12 seconds
+    return () => clearInterval(interval);
+  }, []);
+  useEffect(() => {
+    if (currentPhase === 'additive_synthesis') {
+      const methodInterval = setInterval(() => {
+        setCurrentMethodIndex(prev => {
+          const nextIndex = prev + 1;
+          // Stay at final state (all methods added) for longer
+          if (nextIndex > synthesisMethodsOrder.length) {
+            return synthesisMethodsOrder.length; // Stay at max for longer
+          }
+          return nextIndex;
+        });
+      }, 2000); // Slower progression - 2 seconds per method
+      return () => clearInterval(methodInterval);
+    }
+  }, [currentPhase]);
+  const getCurrentValue = (modelData, phase) => {
+    if (phase === 'baseline') {
+      return modelData.direct_conversation || 0;
+    } else if (phase === 'additive_synthesis') {
+      let cumulativeValue = modelData.direct_conversation || 0;
+      // Add each synthesis method's contribution up to currentMethodIndex
+      for (let i = 0; i < currentMethodIndex; i++) {
+        const method = synthesisMethodsOrder[i];
+        if (modelData[method] !== undefined) {
+          cumulativeValue += modelData[method];
+        }
+      }
+      return cumulativeValue;
+    }
+    return 0;
+  };
+  const getCurrentMethodsAdded = (modelData, phase) => {
+    if (phase === 'baseline') return ['Direct Conversation'];
+    const methods = ['Direct Conversation'];
+    for (let i = 0; i < currentMethodIndex; i++) {
+      const method = synthesisMethodsOrder[i];
+      if (modelData[method] !== undefined) {
+        methods.push(method.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase()));
+      }
+    }
+    return methods;
+  };
+  const getBarColor = (modelData, phase) => {
+    if (phase === 'baseline') {
+      return 'from-blue-500 to-blue-600';
+    } else {
+      // Green gradient for additive synthesis
+      return 'from-green-500 to-green-600';
+    }
+  };
   return (
+    <div className="min-h-screen bg-gradient-to-br from-slate-900 to-slate-800 p-8">
+      <div className="max-w-6xl mx-auto">
+        {/* Header */}
+        <div className="text-center mb-12">
+          <h1 className="text-4xl font-bold text-white mb-4">
+            LLM Safety Benchmark Results
+          </h1>
+          <p className="text-slate-300 text-lg">
+            SafetyBench Aug 2025 - Success Rate Comparison
+          </p>
+          {/* Methodology Disclaimer */}
+          <div className="mt-6 p-4 bg-yellow-900/30 border border-yellow-500/30 rounded-lg max-w-4xl mx-auto">
+            <div className="flex items-start space-x-3">
+              <div className="text-yellow-400 mt-1">⚠️</div>
+              <div className="text-left">
+                <p className="text-yellow-200 font-semibold mb-2">Methodology Note</p>
+                <p className="text-yellow-100 text-sm leading-relaxed">
+                  <strong>Additive Visualization:</strong> This chart shows cumulative impact by progressively adding each synthesis method's individual success rate.
+                  Values >100% represent theoretical maximum vulnerability discovery when combining multiple attack vectors.
+                  Results are based on SafetyBench Aug 2025 testing methodology and should be interpreted as relative performance indicators.
+                </p>
+              </div>
+            </div>
+          </div>
+          <div className="mt-4 p-4 bg-slate-800 rounded-lg inline-block">
+            <p className="text-white font-semibold">
+              Current View: {phases.find(p => p.key === currentPhase)?.label}
+            </p>
+            {currentPhase === 'additive_synthesis' && currentMethodIndex > 0 && (
+              <p className="text-slate-300 text-sm mt-1">
+                Adding Method {currentMethodIndex}: {synthesisMethodsOrder[currentMethodIndex - 1]?.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase())}
+              </p>
+            )}
+          </div>
+        </div>
+        {/* Chart Container */}
+        <div className="bg-white rounded-2xl shadow-2xl p-8">
+          <div className="space-y-6">
+            {benchmarkData.map((modelData, index) => {
+              const currentValue = getCurrentValue(modelData, currentPhase);
+              const baselineValue = modelData.direct_conversation;
+              const maxValue = 100; // Increased max scale since we're adding values
+              const barWidth = (currentValue / maxValue) * 100;
+              const methodsAdded = getCurrentMethodsAdded(modelData, currentPhase);
+              const totalGain = currentValue - baselineValue;
+              return (
+                <div key={modelData.model} className="relative">
+                  {/* Model Name and Methods */}
+                  <div className="flex items-center justify-between mb-2">
+                    <div>
+                      <h3 className="font-semibold text-gray-800 text-lg">
+                        {modelData.model}
+                      </h3>
+                      <p className="text-sm text-gray-600">
+                        {methodsAdded.join(' + ')}
+                      </p>
+                    </div>
+                    <div className="text-right">
+                      <span className="text-2xl font-bold text-gray-700">
+                        {currentValue.toFixed(1)}%
+                      </span>
+                      {currentPhase === 'additive_synthesis' && totalGain > 0 && (
+                        <div className="text-sm font-semibold text-green-600">
+                          +{totalGain.toFixed(1)}% total gain
+                        </div>
+                      )}
+                    </div>
+                  </div>
+                  {/* Progress Bar */}
+                  <div className="relative h-12 bg-gray-200 rounded-full overflow-hidden">
+                    <div
+                      className={`h-full bg-gradient-to-r ${getBarColor(modelData, currentPhase)} rounded-full transition-all duration-[1800ms] ease-in-out flex items-center justify-end pr-4`}
+                      style={{ width: `${Math.max(barWidth, 5)}%` }}
+                    >
+                      <div className="text-white font-semibold text-sm">
+                        {currentValue > 8 ? `${currentValue.toFixed(1)}%` : ''}
+                      </div>
+                    </div>
+                  </div>
+                  {/* Method Breakdown */}
+                  {currentPhase === 'additive_synthesis' && currentMethodIndex > 0 && (
+                    <div className="mt-2 text-xs text-gray-500 space-y-1">
+                      <div>Baseline: {baselineValue.toFixed(1)}%</div>
+                      {synthesisMethodsOrder.slice(0, currentMethodIndex).map(method => {
+                        if (modelData[method] !== undefined) {
+                          return (
+                            <div key={method}>
+                              + {method.replace(/_/g, ' ')}: {modelData[method].toFixed(1)}%
+                            </div>
+                          );
+                        }
+                        return null;
+                      })}
+                    </div>
+                  )}
+                </div>
+              );
+            })}
+          </div>
+          {/* Legend */}
+          <div className="mt-8 flex justify-center space-x-6 flex-wrap">
+            <div className="flex items-center space-x-2">
+              <div className="w-4 h-4 bg-gradient-to-r from-blue-500 to-blue-600 rounded"></div>
+              <span className="text-gray-700">Baseline (Direct Conversation)</span>
+            </div>
+            <div className="flex items-center space-x-2">
+              <div className="w-4 h-4 bg-gradient-to-r from-green-500 to-green-600 rounded"></div>
+              <span className="text-gray-700">Additive Synthesis Methods</span>
+            </div>
+          </div>
+        </div>
+        {/* Footer Info */}
+        <div className="mt-8 text-center text-slate-400 space-y-2">
+          <p>Animation cycles every 12 seconds: baseline (6s) → progressive method addition (2s each) → final results (4s)</p>
+          <p className="text-sm">
+            Data from SafetyBench Aug 2025 • Synthesis methods test different attack vectors
+          </p>
+          <div className="text-xs mt-4 max-w-4xl mx-auto space-y-1">
+            <p><strong>Interpretation:</strong> Each synthesis method represents a different approach to testing model vulnerabilities</p>
+            <p><strong>Additive Display:</strong> Shows theoretical maximum impact when combining all available synthesis techniques</p>
+            <p><strong>Baseline:</strong> Direct conversation represents standard prompting without augmentation techniques</p>
+          </div>
+        </div>
+      </div>
     </div>
   );
+};
+export default BenchmarkChart;