File size: 7,125 Bytes
55f240a
6bbee1f
51b7af6
3ce5164
 
 
 
 
51b7af6
 
 
 
 
 
 
 
 
 
 
 
 
3ce5164
 
 
 
 
51b7af6
 
55f240a
 
51b7af6
 
55f240a
 
 
 
 
 
 
 
 
 
 
 
 
51b7af6
55f240a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51b7af6
 
 
55f240a
 
 
 
6f6aa67
55f240a
51b7af6
55f240a
 
 
 
726d010
6f6aa67
55f240a
 
 
51b7af6
55f240a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51b7af6
55f240a
51b7af6
55f240a
 
9f3ba7b
55f240a
51b7af6
6bbee1f
 
55f240a
6bbee1f
55f240a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import React, { useState } from 'react';

/**
 * Overall sycophancy results, one entry per model, in display order.
 * Each entry carries the model label, the number of sycophantic
 * conversations (`syc`), the number of conversations evaluated (`total`)
 * and the CSS gradient (`bg`) used to paint that model's bar.
 */
const rateModels = [
  // [label, syc, total, gradient start, gradient end]
  ['GPT-5.4 Mini', 10, 200, '#0a5c3a', '#10a37f'],
  ['GPT-5.4', 12, 200, '#0d6b45', '#15c896'],
  ['Gemini 3.1 Pro', 36, 200, '#1a56b0', '#4285f4'],
  ['Gemini Flash', 42, 199, '#2563a8', '#5b9ef5'],
  ['GPT-4o Mini', 44, 200, '#074a2e', '#0d8a5f'],
].map(([label, syc, total, from, to]) => ({
  label,
  syc,
  total,
  bg: `linear-gradient(to right, ${from}, ${to})`,
}));

/**
 * Sycophantic-conversation count per model, keyed by the same model keys
 * used in `catModels`. Serves as the denominator for the category shares
 * and is shown in the legend as "(N convs)".
 */
const totals = {
  mini54: 10, // GPT-5.4 Mini
  gpt54: 12, // GPT-5.4
  gpt4omini: 44, // GPT-4o Mini
  flash: 42, // Gemini Flash
  gemPro: 36, // Gemini 3.1 Pro
};

/**
 * Per-category counts of sycophantic conversations for every model.
 * Each entry has a display `label` plus one numeric field per model key
 * (`mini54`, `gpt54`, `gpt4omini`, `flash`, `gemPro`).
 */
const categories = [
  // [label, mini54, gpt54, gpt4omini, flash, gemPro]
  ['Capitulation under pushback', 3, 3, 10, 13, 6],
  ['Validating one-sided narratives', 4, 4, 9, 11, 12],
  ['Endorsing delusional beliefs', 2, 2, 15, 9, 5],
  ['Excessive praise / flattery', 0, 0, 0, 3, 3],
  ['Abandoning AI identity boundaries', 0, 2, 4, 4, 8],
].map(([label, mini54, gpt54, gpt4omini, flash, gemPro]) => ({
  label,
  mini54,
  gpt54,
  gpt4omini,
  flash,
  gemPro,
}));

/**
 * Models shown in the category breakdown, in display order.
 * `key` indexes into `totals` and into each `categories` entry, `bg` is the
 * bar gradient, `dot` is the legend swatch colour (the gradient's end
 * colour) and `label` is the display name.
 */
const catModels = [
  // [key, label, gradient start, gradient end / legend dot]
  ['mini54', 'GPT-5.4 Mini', '#0a5c3a', '#10a37f'],
  ['gpt54', 'GPT-5.4', '#0d6b45', '#15c896'],
  ['gpt4omini', 'GPT-4o Mini', '#074a2e', '#0d8a5f'],
  ['flash', 'Gemini Flash', '#2563a8', '#5b9ef5'],
  ['gemPro', 'Gemini 3.1 Pro', '#1a56b0', '#4285f4'],
].map(([key, label, from, to]) => ({
  key,
  bg: `linear-gradient(to right, ${from}, ${to})`,
  dot: to,
  label,
}));

const BenchmarkChart = () => {
  const [showCategories, setShowCategories] = useState(false);

  return (
    <div className="min-h-screen bg-gradient-to-br from-slate-900 to-slate-800 p-4">
      <div className="max-w-4xl mx-auto">
        {/* Header */}
        <div className="text-center mb-6">
          <h1 className="text-3xl font-bold text-white mb-3">
            Sycophancy Benchmark
          </h1>
          <p className="text-slate-300 mb-4">
            Percentage of conversations where each model exhibited sycophantic behavior
          </p>
          <button
            onClick={() => setShowCategories(!showCategories)}
            className="px-6 py-2 bg-blue-600 hover:bg-blue-700 text-white rounded-lg font-semibold transition-colors"
          >
            {showCategories ? 'Show Overall Rates' : 'Show Category Breakdown'}
          </button>
        </div>

        {/* Chart 1: Overall Rate */}
        {!showCategories && (
          <div className="bg-white rounded-xl shadow-2xl p-6">
            <h2 className="text-lg font-semibold text-gray-800 mb-4">
              Sycophancy rate by model
            </h2>
            <div className="space-y-3">
              {rateModels.map((m) => {
                const pct = ((m.syc / m.total) * 100).toFixed(1);
                const barWidth = Math.max(parseFloat(pct) * 2.5, 8);
                return (
                  <div key={m.label}>
                    <div className="flex items-center justify-between mb-1">
                      <span className="font-semibold text-gray-800 text-sm">{m.label}</span>
                      <span className="text-lg font-bold text-gray-700">{pct}%</span>
                    </div>
                    <div className="h-7 bg-gray-200 rounded-full overflow-hidden">
                      <div
                        className="h-full rounded-full transition-all duration-700"
                        style={{ width: `${barWidth}%`, background: m.bg }}
                      />
                    </div>
                  </div>
                );
              })}
            </div>
            <p className="mt-4 text-xs text-gray-500">
              * Percentage of conversations (out of 200) where the model exhibited sycophantic behavior.
            </p>
          </div>
        )}

        {/* Chart 2: Category Breakdown */}
        {showCategories && (
          <div className="bg-white rounded-xl shadow-2xl p-6">
            <h2 className="text-lg font-semibold text-gray-800 mb-4">
              Share of each model's sycophantic conversations
            </h2>

            {/* Legend */}
            <div className="flex flex-wrap gap-4 mb-6 text-xs bg-gray-50 p-3 rounded-lg">
              {catModels.map((m) => (
                <div key={m.key} className="flex items-center gap-1.5">
                  <div className="w-3 h-3 rounded" style={{ background: m.dot }} />
                  <span className="text-gray-700">{m.label} ({totals[m.key]} convs)</span>
                </div>
              ))}
            </div>

            <div className="space-y-5">
              {categories.map((cat) => (
                <div key={cat.label}>
                  <h3 className="font-semibold text-gray-800 text-sm mb-2">{cat.label}</h3>
                  <div className="space-y-1.5">
                    {catModels.map((model) => {
                      const raw = cat[model.key];
                      const total = totals[model.key];
                      const pct = total > 0 ? Math.round((raw / total) * 100) : 0;
                      const barWidth = Math.max(pct, 5);
                      return (
                        <div key={model.key} className="flex items-center gap-2">
                          <span className="w-28 text-xs text-right text-gray-600 flex-shrink-0">
                            {model.label}
                          </span>
                          <div className="flex-1 h-5 bg-gray-200 rounded-full overflow-hidden">
                            {pct > 0 ? (
                              <div
                                className="h-full rounded-full flex items-center pl-2 text-xs font-semibold text-white transition-all duration-700"
                                style={{ width: `${barWidth}%`, background: model.bg }}
                              >
                                {pct}%
                              </div>
                            ) : (
                              <div className="h-full flex items-center pl-2 text-xs text-gray-400">
                                0%
                              </div>
                            )}
                          </div>
                        </div>
                      );
                    })}
                  </div>
                </div>
              ))}
            </div>
            <p className="mt-4 text-xs text-gray-500">
              * Percentages represent the share of each model's sycophantic conversations that fall into a given category.
            </p>
          </div>
        )}

        {/* Footer */}
        <div className="mt-6 text-center text-slate-400 text-xs">
          <p>JulyAI Sycophancy Benchmark: {rateModels.length} SOTA models tested across 200 conversations each</p>
        </div>
      </div>
    </div>
  );
};

export default BenchmarkChart;