Spaces:
Sleeping
Sleeping
| import React, { useState } from 'react'; | |
/**
 * Single source of truth for every benchmarked model.
 *
 * Previously the sycophantic-conversation counts, labels and gradients were
 * copied by hand into `rateModels`, `totals` and `catModels`; a change to one
 * copy could silently disagree with the others. Those three constants are now
 * derived from this table.
 *
 * Fields:
 *   key    - identifier used to look the model up in `totals` and `categories`
 *   label  - display name
 *   syc    - number of conversations flagged as sycophantic
 *   total  - number of conversations run for this model
 *   shade  - start colour of the bar gradient
 *   accent - end colour of the bar gradient; also used for the legend dot
 */
const benchmarkModels = [
  { key: 'mini54', label: 'GPT-5.4 Mini', syc: 10, total: 200, shade: '#0a5c3a', accent: '#10a37f' },
  { key: 'gpt54', label: 'GPT-5.4', syc: 12, total: 200, shade: '#0d6b45', accent: '#15c896' },
  { key: 'gpt4omini', label: 'GPT-4o Mini', syc: 44, total: 200, shade: '#074a2e', accent: '#0d8a5f' },
  { key: 'flash', label: 'Gemini Flash', syc: 42, total: 199, shade: '#2563a8', accent: '#5b9ef5' },
  { key: 'gemPro', label: 'Gemini 3.1 Pro', syc: 36, total: 200, shade: '#1a56b0', accent: '#4285f4' },
];

/** Builds the left-to-right CSS gradient used as a bar background. */
const toBarGradient = ({ shade, accent }) => `linear-gradient(to right, ${shade}, ${accent})`;

/**
 * Rows for the overall-rate chart, ordered from lowest to highest sycophancy
 * rate (syc / total). Sorting a copy keeps `benchmarkModels` in legend order.
 */
const rateModels = [...benchmarkModels]
  .sort((a, b) => a.syc / a.total - b.syc / b.total)
  .map((model) => ({
    label: model.label,
    syc: model.syc,
    total: model.total,
    bg: toBarGradient(model),
  }));

/** Sycophantic-conversation count per model key; denominator for category shares. */
const totals = Object.fromEntries(benchmarkModels.map(({ key, syc }) => [key, syc]));

/**
 * Per-category counts of sycophantic conversations, keyed by model key.
 * The per-model columns do not add up to `totals`, so the category shares
 * shown in the chart will not sum to 100%.
 */
const categories = [
  { label: 'Capitulation under pushback', mini54: 3, gpt54: 3, gpt4omini: 10, flash: 13, gemPro: 6 },
  { label: 'Validating one-sided narratives', mini54: 4, gpt54: 4, gpt4omini: 9, flash: 11, gemPro: 12 },
  { label: 'Endorsing delusional beliefs', mini54: 2, gpt54: 2, gpt4omini: 15, flash: 9, gemPro: 5 },
  { label: 'Excessive praise / flattery', mini54: 0, gpt54: 0, gpt4omini: 0, flash: 3, gemPro: 3 },
  { label: 'Abandoning AI identity boundaries', mini54: 0, gpt54: 2, gpt4omini: 4, flash: 4, gemPro: 8 },
];

/** Rows and legend entries for the category chart, in `benchmarkModels` order. */
const catModels = benchmarkModels.map((model) => ({
  key: model.key,
  bg: toBarGradient(model),
  dot: model.accent,
  label: model.label,
}));
| const BenchmarkChart = () => { | |
| const [showCategories, setShowCategories] = useState(false); | |
| return ( | |
| <div className="min-h-screen bg-gradient-to-br from-slate-900 to-slate-800 p-4"> | |
| <div className="max-w-4xl mx-auto"> | |
| {/* Header */} | |
| <div className="text-center mb-6"> | |
| <h1 className="text-3xl font-bold text-white mb-3"> | |
| Sycophancy Benchmark | |
| </h1> | |
| <p className="text-slate-300 mb-4"> | |
| Percentage of conversations where each model exhibited sycophantic behavior | |
| </p> | |
| <button | |
| onClick={() => setShowCategories(!showCategories)} | |
| className="px-6 py-2 bg-blue-600 hover:bg-blue-700 text-white rounded-lg font-semibold transition-colors" | |
| > | |
| {showCategories ? 'Show Overall Rates' : 'Show Category Breakdown'} | |
| </button> | |
| </div> | |
| {/* Chart 1: Overall Rate */} | |
| {!showCategories && ( | |
| <div className="bg-white rounded-xl shadow-2xl p-6"> | |
| <h2 className="text-lg font-semibold text-gray-800 mb-4"> | |
| Sycophancy rate by model | |
| </h2> | |
| <div className="space-y-3"> | |
| {rateModels.map((m) => { | |
| const pct = ((m.syc / m.total) * 100).toFixed(1); | |
| const barWidth = Math.max(parseFloat(pct) * 2.5, 8); | |
| return ( | |
| <div key={m.label}> | |
| <div className="flex items-center justify-between mb-1"> | |
| <span className="font-semibold text-gray-800 text-sm">{m.label}</span> | |
| <span className="text-lg font-bold text-gray-700">{pct}%</span> | |
| </div> | |
| <div className="h-7 bg-gray-200 rounded-full overflow-hidden"> | |
| <div | |
| className="h-full rounded-full transition-all duration-700" | |
| style={{ width: `${barWidth}%`, background: m.bg }} | |
| /> | |
| </div> | |
| </div> | |
| ); | |
| })} | |
| </div> | |
| <p className="mt-4 text-xs text-gray-500"> | |
| * Percentage of conversations (out of 200) where the model exhibited sycophantic behavior. | |
| </p> | |
| </div> | |
| )} | |
| {/* Chart 2: Category Breakdown */} | |
| {showCategories && ( | |
| <div className="bg-white rounded-xl shadow-2xl p-6"> | |
| <h2 className="text-lg font-semibold text-gray-800 mb-4"> | |
| Share of each model's sycophantic conversations | |
| </h2> | |
| {/* Legend */} | |
| <div className="flex flex-wrap gap-4 mb-6 text-xs bg-gray-50 p-3 rounded-lg"> | |
| {catModels.map((m) => ( | |
| <div key={m.key} className="flex items-center gap-1.5"> | |
| <div className="w-3 h-3 rounded" style={{ background: m.dot }} /> | |
| <span className="text-gray-700">{m.label} ({totals[m.key]} convs)</span> | |
| </div> | |
| ))} | |
| </div> | |
| <div className="space-y-5"> | |
| {categories.map((cat) => ( | |
| <div key={cat.label}> | |
| <h3 className="font-semibold text-gray-800 text-sm mb-2">{cat.label}</h3> | |
| <div className="space-y-1.5"> | |
| {catModels.map((model) => { | |
| const raw = cat[model.key]; | |
| const total = totals[model.key]; | |
| const pct = total > 0 ? Math.round((raw / total) * 100) : 0; | |
| const barWidth = Math.max(pct, 5); | |
| return ( | |
| <div key={model.key} className="flex items-center gap-2"> | |
| <span className="w-28 text-xs text-right text-gray-600 flex-shrink-0"> | |
| {model.label} | |
| </span> | |
| <div className="flex-1 h-5 bg-gray-200 rounded-full overflow-hidden"> | |
| {pct > 0 ? ( | |
| <div | |
| className="h-full rounded-full flex items-center pl-2 text-xs font-semibold text-white transition-all duration-700" | |
| style={{ width: `${barWidth}%`, background: model.bg }} | |
| > | |
| {pct}% | |
| </div> | |
| ) : ( | |
| <div className="h-full flex items-center pl-2 text-xs text-gray-400"> | |
| 0% | |
| </div> | |
| )} | |
| </div> | |
| </div> | |
| ); | |
| })} | |
| </div> | |
| </div> | |
| ))} | |
| </div> | |
| <p className="mt-4 text-xs text-gray-500"> | |
| * Percentages represent the share of each model's sycophantic conversations that fall into a given category. | |
| </p> | |
| </div> | |
| )} | |
| {/* Footer */} | |
| <div className="mt-6 text-center text-slate-400 text-xs"> | |
| <p>JulyAI Sycophancy Benchmark: {rateModels.length} SOTA models tested across 200 conversations each</p> | |
| </div> | |
| </div> | |
| </div> | |
| ); | |
| }; | |
| export default BenchmarkChart; | |