// EpisodeComparison — side-by-side seeded benchmark runner.
// Holds three configurable runs (label/seed/difficulty/template), auto-plays
// each episode to completion through the API client (resetEpisode + repeated
// stepEpisode, accepting on the final round), and renders a comparison table
// of the resulting scores plus per-metric bars.
//
// NOTE(review): this file appears to have been mangled in extraction — all
// angle-bracket content has been stripped: generic type arguments (e.g.
// `useState(` with no `<ComparisonRun[]>`, bare `Partial` with no type
// argument in updateRun) and every JSX element tag in the return block below.
// The code as shown is not compilable; comments here record apparent intent.
// Confirm everything against the original source before acting on it.
import { useState, useCallback } from 'react'; import { motion, AnimatePresence } from 'framer-motion'; import { GitCompareArrows, Play, Loader2 } from 'lucide-react'; import type { EpisodeState, ScenarioTemplate, Difficulty } from '@/types'; import { resetEpisode, stepEpisode, buildDefaultScientistAction, buildAcceptAction } from '@/lib/api'; import { cn, formatScore, formatReward } from '@/lib/utils'; interface ComparisonRun { label: string; seed: number; difficulty: Difficulty; template: ScenarioTemplate; state: EpisodeState | null; status: 'idle' | 'running' | 'done' | 'error'; error?: string; } export default function EpisodeComparison({ className }: { className?: string }) { const [runs, setRuns] = useState([ { label: 'Math', seed: 42, difficulty: 'medium', template: 'math_reasoning', state: null, status: 'idle' }, { label: 'ML', seed: 42, difficulty: 'medium', template: 'ml_benchmark', state: null, status: 'idle' }, { label: 'Finance', seed: 42, difficulty: 'medium', template: 'finance_trading', state: null, status: 'idle' }, ]); const [running, setRunning] = useState(false); const updateRun = useCallback((index: number, patch: Partial) => { setRuns((prev) => prev.map((r, i) => (i === index ? { ...r, ...patch } : r))); }, []); async function runSingleEpisode(index: number) { const run = runs[index]; updateRun(index, { status: 'running', state: null, error: undefined }); try { let state = await resetEpisode({ seed: run.seed, template: run.template, difficulty: run.difficulty, }); // Auto-play through all rounds while (!state.done) { const isLastRound = state.round >= state.max_rounds - 1; const action = isLastRound ? buildAcceptAction() : buildDefaultScientistAction(state); state = await stepEpisode(state.session_id, action, state); } updateRun(index, { state, status: 'done' }); } catch (err) { updateRun(index, { status: 'error', error: err instanceof Error ?
// (continuation of the catch clause: store err.message for Error instances,
// a generic fallback string otherwise). runAll fires every run concurrently
// via Promise.all and toggles the global `running` flag around the batch so
// the config inputs can be disabled while episodes are in flight.
// NOTE(review): runSingleEpisode reads `runs[index]` from the render closure,
// so a seed/difficulty edited after the last render could be read stale when
// triggered programmatically — TODO confirm this cannot happen via the UI.
err.message : 'Unknown error', }); } } async function runAll() { setRunning(true); await Promise.all(runs.map((_, i) => runSingleEpisode(i))); setRunning(false); } return (
{/* NOTE(review): from here down the JSX element tags were lost in
    extraction; only text children, expressions and stray attribute
    fragments remain. */}

{/* Header: icon + title */}
Seeded Benchmark Comparison

Compare one seeded case from each domain, or edit the rows below to stress-test the same family across seeds and difficulties.

{/* Config rows */}
{/* One row per run: label, difficulty toggle, seed input */}
{runs.map((run, i) => (
{run.label}
{/* Difficulty selector: one button per level, current one highlighted */}
{(['easy', 'medium', 'hard'] as const).map((d) => ( ))}
{/* Seed input. NOTE(review): parseInt is called without an explicit radix
    and falls back to 0 on NaN — presumably intentional; verify. */}
updateRun(i, { seed: parseInt(e.target.value) || 0 })} disabled={running} className="w-full rounded border border-border bg-background px-1.5 py-0.5 text-[10px]" placeholder="Seed" />
))}
{/* Results comparison */} {runs.some((r) => r.status === 'done') && (

Results

{/* Metric table: one column per run, one row per metric. Scores are read
    defensively (?. with ?? 0 fallback) since state may be partial. */}
{runs.map((r, i) => ( ))} {[ { label: 'Reward', fn: (s: EpisodeState) => formatReward(s.scores?.total_reward ?? 0) }, { label: 'Rigor', fn: (s: EpisodeState) => formatScore(s.scores?.rigor ?? 0) }, { label: 'Feasibility', fn: (s: EpisodeState) => formatScore(s.scores?.feasibility ?? 0) }, { label: 'Fidelity', fn: (s: EpisodeState) => formatScore(s.scores?.fidelity ?? 0) }, { label: 'Rounds', fn: (s: EpisodeState) => `${s.round}/${s.max_rounds}` }, { label: 'Verdict', fn: (s: EpisodeState) => s.judge_audit?.verdict ?? '-' }, ].map((metric) => ( {runs.map((r, i) => ( ))} ))}
Metric{r.label}
{/* Cell body: value when done, spinner when running, error marker on
    failure, dash otherwise.
    NOTE(review): the error branch renders the literal text `err` — this
    looks like a lost expression (presumably r.error or a short error
    label); verify against the original source. */}
{metric.label} {r.state ? ( {metric.fn(r.state)} ) : r.status === 'running' ? ( ) : r.status === 'error' ? ( err ) : ( - )}
{/* Visual score bars comparison */}
{/* For each of the three judged dimensions, draw one labelled bar per run;
    runs without a state yet show '-' instead of a score. */}
{['rigor', 'feasibility', 'fidelity'].map((metric) => (
{metric}
{runs.map((r, i) => { const val = r.state?.scores?.[metric as keyof typeof r.state.scores] as number | undefined; return (
{r.label}
{val !== undefined ? formatScore(val) : '-'}
); })}
))}
)}
); }