File size: 10,967 Bytes
2607058
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f3beb1
 
2607058
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f3beb1
 
2607058
5f3beb1
2607058
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
import React, { useState, useEffect } from 'react';

const BenchmarkChart = () => {
  // Real data from your CSV
  const benchmarkData = [
    {
      model: "Claude 4 Sonnet",
      direct_conversation: 26.33,
      keyword_objective_combined: 3.13
    },
    {
      model: "Claude Opus 4.1",
      direct_conversation: 20.67,
      keyword_objective_combined: 3.65
    },
    {
      model: "Deepseek R1-0528",
      direct_conversation: 68.67,
      keyword_objective_combined: 48.18
    },
    {
      model: "GPT 5",
      direct_conversation: 8.33,
      keyword_objective_combined: 3.65,
      bio_topic_change: 23.5,
      enhancement: 10,
      root_problem: 4.5
    },
    {
      model: "GPT 5 mini",
      direct_conversation: 7.67,
      keyword_objective_combined: 3.91,
      bio_topic_change: 14.5,
      enhancement: 5.5,
      root_problem: 3
    },
    {
      model: "GPT o3",
      direct_conversation: 22,
      keyword_objective_combined: 10.94
    },
    {
      model: "Gemini 2.5 Pro",
      direct_conversation: 55.67,
      keyword_objective_combined: 41.67,
      bio_topic_change: 53.5,
      enhancement: 47,
      root_problem: 26
    },
    {
      model: "Grok 4",
      direct_conversation: 68.67,
      keyword_objective_combined: 52.6
    },
    {
      model: "Llama 3.1 405B",
      direct_conversation: 67,
      keyword_objective_combined: 41.67
    }
  ];

  const [currentPhase, setCurrentPhase] = useState('baseline');
  const [currentMethodIndex, setCurrentMethodIndex] = useState(0);
  
  const synthesisMethodsOrder = ['keyword_objective_combined', 'bio_topic_change', 'enhancement', 'root_problem'];
  
  const phases = [
    { key: 'baseline', label: 'Direct Conversation (Baseline)' },
    { key: 'additive_synthesis', label: 'Adding Synthesis Methods' }
  ];

  useEffect(() => {
    const interval = setInterval(() => {
      setCurrentPhase(prev => prev === 'baseline' ? 'additive_synthesis' : 'baseline');
      setCurrentMethodIndex(0); // Reset when switching phases
    }, 12000); // Increased total cycle time to 12 seconds

    return () => clearInterval(interval);
  }, []);

  useEffect(() => {
    if (currentPhase === 'additive_synthesis') {
      const methodInterval = setInterval(() => {
        setCurrentMethodIndex(prev => {
          const nextIndex = prev + 1;
          // Stay at final state (all methods added) for longer
          if (nextIndex > synthesisMethodsOrder.length) {
            return synthesisMethodsOrder.length; // Stay at max for longer
          }
          return nextIndex;
        });
      }, 2000); // Slower progression - 2 seconds per method
      return () => clearInterval(methodInterval);
    }
  }, [currentPhase]);

  const getCurrentValue = (modelData, phase) => {
    if (phase === 'baseline') {
      return modelData.direct_conversation || 0;
    } else if (phase === 'additive_synthesis') {
      let cumulativeValue = modelData.direct_conversation || 0;
      
      // Add each synthesis method's contribution up to currentMethodIndex
      for (let i = 0; i < currentMethodIndex; i++) {
        const method = synthesisMethodsOrder[i];
        if (modelData[method] !== undefined) {
          cumulativeValue += modelData[method];
        }
      }
      
      return cumulativeValue;
    }
    return 0;
  };

  const getCurrentMethodsAdded = (modelData, phase) => {
    if (phase === 'baseline') return ['Direct Conversation'];
    
    const methods = ['Direct Conversation'];
    for (let i = 0; i < currentMethodIndex; i++) {
      const method = synthesisMethodsOrder[i];
      if (modelData[method] !== undefined) {
        methods.push(method.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase()));
      }
    }
    return methods;
  };

  const getBarColor = (modelData, phase) => {
    if (phase === 'baseline') {
      return 'from-blue-500 to-blue-600';
    } else {
      // Green gradient for additive synthesis
      return 'from-green-500 to-green-600';
    }
  };

  return (
    <div className="min-h-screen bg-gradient-to-br from-slate-900 to-slate-800 p-8">
      <div className="max-w-6xl mx-auto">
        {/* Header */}
        <div className="text-center mb-12">
          <h1 className="text-4xl font-bold text-white mb-4">
            LLM Safety Benchmark Results
          </h1>
          <p className="text-slate-300 text-lg">
            SafetyBench Aug 2025 - Success Rate Comparison
          </p>
          
          {/* Methodology Disclaimer */}
          <div className="mt-6 p-4 bg-yellow-900/30 border border-yellow-500/30 rounded-lg max-w-4xl mx-auto">
            <div className="flex items-start space-x-3">
              <div className="text-yellow-400 mt-1">⚠️</div>
              <div className="text-left">
                <p className="text-yellow-200 font-semibold mb-2">Methodology Note</p>
                <p className="text-yellow-100 text-sm leading-relaxed">
                  <strong>Additive Visualization:</strong> This chart shows cumulative impact by progressively adding each synthesis method's individual success rate. 
                  Values >100% represent theoretical maximum vulnerability discovery when combining multiple attack vectors. 
                  Results are based on SafetyBench Aug 2025 testing methodology and should be interpreted as relative performance indicators.
                </p>
              </div>
            </div>
          </div>
          
          <div className="mt-4 p-4 bg-slate-800 rounded-lg inline-block">
            <p className="text-white font-semibold">
              Current View: {phases.find(p => p.key === currentPhase)?.label}
            </p>
            {currentPhase === 'additive_synthesis' && currentMethodIndex > 0 && (
              <p className="text-slate-300 text-sm mt-1">
                Adding Method {currentMethodIndex}: {synthesisMethodsOrder[currentMethodIndex - 1]?.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase())}
              </p>
            )}
          </div>
        </div>

        {/* Chart Container */}
        <div className="bg-white rounded-2xl shadow-2xl p-8">
          <div className="space-y-6">
            {benchmarkData.map((modelData, index) => {
              const currentValue = getCurrentValue(modelData, currentPhase);
              const baselineValue = modelData.direct_conversation;
              const maxValue = 100; // Increased max scale since we're adding values
              const barWidth = (currentValue / maxValue) * 100;
              const methodsAdded = getCurrentMethodsAdded(modelData, currentPhase);
              const totalGain = currentValue - baselineValue;
              
              return (
                <div key={modelData.model} className="relative">
                  {/* Model Name and Methods */}
                  <div className="flex items-center justify-between mb-2">
                    <div>
                      <h3 className="font-semibold text-gray-800 text-lg">
                        {modelData.model}
                      </h3>
                      <p className="text-sm text-gray-600">
                        {methodsAdded.join(' + ')}
                      </p>
                    </div>
                    <div className="text-right">
                      <span className="text-2xl font-bold text-gray-700">
                        {currentValue.toFixed(1)}%
                      </span>
                      {currentPhase === 'additive_synthesis' && totalGain > 0 && (
                        <div className="text-sm font-semibold text-green-600">
                          +{totalGain.toFixed(1)}% total gain
                        </div>
                      )}
                    </div>
                  </div>
                  
                  {/* Progress Bar */}
                  <div className="relative h-12 bg-gray-200 rounded-full overflow-hidden">
                    <div 
                      className={`h-full bg-gradient-to-r ${getBarColor(modelData, currentPhase)} rounded-full transition-all duration-[1800ms] ease-in-out flex items-center justify-end pr-4`}
                      style={{ width: `${Math.max(barWidth, 5)}%` }}
                    >
                      <div className="text-white font-semibold text-sm">
                        {currentValue > 8 ? `${currentValue.toFixed(1)}%` : ''}
                      </div>
                    </div>
                  </div>
                  
                  {/* Method Breakdown */}
                  {currentPhase === 'additive_synthesis' && currentMethodIndex > 0 && (
                    <div className="mt-2 text-xs text-gray-500 space-y-1">
                      <div>Baseline: {baselineValue.toFixed(1)}%</div>
                      {synthesisMethodsOrder.slice(0, currentMethodIndex).map(method => {
                        if (modelData[method] !== undefined) {
                          return (
                            <div key={method}>
                              + {method.replace(/_/g, ' ')}: {modelData[method].toFixed(1)}%
                            </div>
                          );
                        }
                        return null;
                      })}
                    </div>
                  )}
                </div>
              );
            })}
          </div>
          
          {/* Legend */}
          <div className="mt-8 flex justify-center space-x-6 flex-wrap">
            <div className="flex items-center space-x-2">
              <div className="w-4 h-4 bg-gradient-to-r from-blue-500 to-blue-600 rounded"></div>
              <span className="text-gray-700">Baseline (Direct Conversation)</span>
            </div>
            <div className="flex items-center space-x-2">
              <div className="w-4 h-4 bg-gradient-to-r from-green-500 to-green-600 rounded"></div>
              <span className="text-gray-700">Additive Synthesis Methods</span>
            </div>
          </div>
        </div>
        
        {/* Footer Info */}
        <div className="mt-8 text-center text-slate-400 space-y-2">
          <p>Animation cycles every 12 seconds: baseline (6s) → progressive method addition (2s each) → final results (4s)</p>
          <p className="text-sm">
            Data from SafetyBench Aug 2025 • Synthesis methods test different attack vectors
          </p>
          <div className="text-xs mt-4 max-w-4xl mx-auto space-y-1">
            <p><strong>Interpretation:</strong> Each synthesis method represents a different approach to testing model vulnerabilities</p>
            <p><strong>Additive Display:</strong> Shows theoretical maximum impact when combining all available synthesis techniques</p>
            <p><strong>Baseline:</strong> Direct conversation represents standard prompting without augmentation techniques</p>
          </div>
        </div>
      </div>
    </div>
  );
};

export default BenchmarkChart;