Spaces:
Sleeping
Sleeping
File size: 7,125 Bytes
import React, { useState } from 'react';
/**
 * Overall results per model, in the order the bars are rendered.
 *
 * Each entry carries:
 *   - label: display name of the model
 *   - syc:   numerator of the displayed sycophancy rate
 *   - total: denominator of the displayed rate (per model; not every
 *            model has the same total, e.g. Gemini Flash has 199)
 *   - bg:    CSS gradient used to fill the model's bar
 */
const rateModels = [
  // [label, syc, total, gradient start, gradient end]
  ['GPT-5.4 Mini', 10, 200, '#0a5c3a', '#10a37f'],
  ['GPT-5.4', 12, 200, '#0d6b45', '#15c896'],
  ['Gemini 3.1 Pro', 36, 200, '#1a56b0', '#4285f4'],
  ['Gemini Flash', 42, 199, '#2563a8', '#5b9ef5'],
  ['GPT-4o Mini', 44, 200, '#074a2e', '#0d8a5f'],
].map(([label, syc, total, gradientFrom, gradientTo]) => ({
  label,
  syc,
  total,
  bg: `linear-gradient(to right, ${gradientFrom}, ${gradientTo})`,
}));
/**
 * Per-model denominators for the category breakdown, keyed by the short
 * model key used in `categories` and `catModels`. Each value is also shown
 * in the legend as "<n> convs".
 */
const totals = {
  mini54: 10,
  gpt54: 12,
  gpt4omini: 44,
  flash: 42,
  gemPro: 36,
};
/**
 * Category breakdown table: one row per behaviour category, with a raw
 * count per model key. Counts are divided by the matching entry in
 * `totals` when rendered.
 */
const categories = [
  // [label, mini54, gpt54, gpt4omini, flash, gemPro]
  ['Capitulation under pushback', 3, 3, 10, 13, 6],
  ['Validating one-sided narratives', 4, 4, 9, 11, 12],
  ['Endorsing delusional beliefs', 2, 2, 15, 9, 5],
  ['Excessive praise / flattery', 0, 0, 0, 3, 3],
  ['Abandoning AI identity boundaries', 0, 2, 4, 4, 8],
].map(([label, mini54, gpt54, gpt4omini, flash, gemPro]) => ({
  label,
  mini54,
  gpt54,
  gpt4omini,
  flash,
  gemPro,
}));
/**
 * Models shown in the category breakdown, in render order.
 *
 * Each entry carries:
 *   - key:   property name used to look the model up in `categories` rows
 *            and in `totals`
 *   - bg:    CSS gradient used to fill the model's bars
 *   - dot:   solid legend swatch colour (the gradient's end colour)
 *   - label: display name of the model
 */
const catModels = [
  // [key, label, gradient start, gradient end / legend dot]
  ['mini54', 'GPT-5.4 Mini', '#0a5c3a', '#10a37f'],
  ['gpt54', 'GPT-5.4', '#0d6b45', '#15c896'],
  ['gpt4omini', 'GPT-4o Mini', '#074a2e', '#0d8a5f'],
  ['flash', 'Gemini Flash', '#2563a8', '#5b9ef5'],
  ['gemPro', 'Gemini 3.1 Pro', '#1a56b0', '#4285f4'],
].map(([key, label, gradientFrom, gradientTo]) => ({
  key,
  bg: `linear-gradient(to right, ${gradientFrom}, ${gradientTo})`,
  dot: gradientTo,
  label,
}));
const BenchmarkChart = () => {
const [showCategories, setShowCategories] = useState(false);
return (
<div className="min-h-screen bg-gradient-to-br from-slate-900 to-slate-800 p-4">
<div className="max-w-4xl mx-auto">
{/* Header */}
<div className="text-center mb-6">
<h1 className="text-3xl font-bold text-white mb-3">
Sycophancy Benchmark
</h1>
<p className="text-slate-300 mb-4">
Percentage of conversations where each model exhibited sycophantic behavior
</p>
<button
onClick={() => setShowCategories(!showCategories)}
className="px-6 py-2 bg-blue-600 hover:bg-blue-700 text-white rounded-lg font-semibold transition-colors"
>
{showCategories ? 'Show Overall Rates' : 'Show Category Breakdown'}
</button>
</div>
{/* Chart 1: Overall Rate */}
{!showCategories && (
<div className="bg-white rounded-xl shadow-2xl p-6">
<h2 className="text-lg font-semibold text-gray-800 mb-4">
Sycophancy rate by model
</h2>
<div className="space-y-3">
{rateModels.map((m) => {
const pct = ((m.syc / m.total) * 100).toFixed(1);
const barWidth = Math.max(parseFloat(pct) * 2.5, 8);
return (
<div key={m.label}>
<div className="flex items-center justify-between mb-1">
<span className="font-semibold text-gray-800 text-sm">{m.label}</span>
<span className="text-lg font-bold text-gray-700">{pct}%</span>
</div>
<div className="h-7 bg-gray-200 rounded-full overflow-hidden">
<div
className="h-full rounded-full transition-all duration-700"
style={{ width: `${barWidth}%`, background: m.bg }}
/>
</div>
</div>
);
})}
</div>
<p className="mt-4 text-xs text-gray-500">
* Percentage of conversations (out of 200) where the model exhibited sycophantic behavior.
</p>
</div>
)}
{/* Chart 2: Category Breakdown */}
{showCategories && (
<div className="bg-white rounded-xl shadow-2xl p-6">
<h2 className="text-lg font-semibold text-gray-800 mb-4">
Share of each model's sycophantic conversations
</h2>
{/* Legend */}
<div className="flex flex-wrap gap-4 mb-6 text-xs bg-gray-50 p-3 rounded-lg">
{catModels.map((m) => (
<div key={m.key} className="flex items-center gap-1.5">
<div className="w-3 h-3 rounded" style={{ background: m.dot }} />
<span className="text-gray-700">{m.label} ({totals[m.key]} convs)</span>
</div>
))}
</div>
<div className="space-y-5">
{categories.map((cat) => (
<div key={cat.label}>
<h3 className="font-semibold text-gray-800 text-sm mb-2">{cat.label}</h3>
<div className="space-y-1.5">
{catModels.map((model) => {
const raw = cat[model.key];
const total = totals[model.key];
const pct = total > 0 ? Math.round((raw / total) * 100) : 0;
const barWidth = Math.max(pct, 5);
return (
<div key={model.key} className="flex items-center gap-2">
<span className="w-28 text-xs text-right text-gray-600 flex-shrink-0">
{model.label}
</span>
<div className="flex-1 h-5 bg-gray-200 rounded-full overflow-hidden">
{pct > 0 ? (
<div
className="h-full rounded-full flex items-center pl-2 text-xs font-semibold text-white transition-all duration-700"
style={{ width: `${barWidth}%`, background: model.bg }}
>
{pct}%
</div>
) : (
<div className="h-full flex items-center pl-2 text-xs text-gray-400">
0%
</div>
)}
</div>
</div>
);
})}
</div>
</div>
))}
</div>
<p className="mt-4 text-xs text-gray-500">
* Percentages represent the share of each model's sycophantic conversations that fall into a given category.
</p>
</div>
)}
{/* Footer */}
<div className="mt-6 text-center text-slate-400 text-xs">
<p>JulyAI Sycophancy Benchmark: {rateModels.length} SOTA models tested across 200 conversations each</p>
</div>
</div>
</div>
);
};
export default BenchmarkChart;
|