// Page-extraction residue (not part of the module): Spaces: Running / Running; File size: 9,884 Bytes; 80d8c84
import { useState, useCallback } from 'react';
import { motion, AnimatePresence } from 'framer-motion';
import { GitCompareArrows, Play, Loader2 } from 'lucide-react';
import type { EpisodeState, ScenarioTemplate, Difficulty } from '@/types';
import { resetEpisode, stepEpisode, buildDefaultScientistAction, buildAcceptAction } from '@/lib/api';
import { cn, formatScore, formatReward } from '@/lib/utils';
/** Lifecycle phases a comparison run moves through. */
type ComparisonRunStatus = 'idle' | 'running' | 'done' | 'error';

/**
 * One column of the benchmark comparison: the editable episode settings
 * together with the most recent outcome of playing that episode.
 */
interface ComparisonRun {
  /** Short heading shown for this run in the config card and result columns. */
  label: string;
  /** Seed forwarded to `resetEpisode` when the run starts. */
  seed: number;
  /** Difficulty forwarded to `resetEpisode` when the run starts. */
  difficulty: Difficulty;
  /** Scenario template forwarded to `resetEpisode` when the run starts. */
  template: ScenarioTemplate;
  /** Final episode state once the run finished; `null` before and during a run. */
  state: EpisodeState | null;
  /** Current phase of the run. */
  status: ComparisonRunStatus;
  /** Message captured from the thrown error when the run failed. */
  error?: string;
}
/**
 * Panel that plays three configurable episodes in parallel and compares
 * their scores side by side.
 *
 * Each run has an editable template, difficulty and seed. "Run Bench" resets
 * and auto-plays every run concurrently; finished runs are summarised in a
 * metrics table and in per-metric score bars.
 *
 * @param className Optional extra classes merged onto the outer container.
 */
export default function EpisodeComparison({ className }: { className?: string }) {
  const [runs, setRuns] = useState<ComparisonRun[]>([
    { label: 'Math', seed: 42, difficulty: 'medium', template: 'math_reasoning', state: null, status: 'idle' },
    { label: 'ML', seed: 42, difficulty: 'medium', template: 'ml_benchmark', state: null, status: 'idle' },
    { label: 'Finance', seed: 42, difficulty: 'medium', template: 'finance_trading', state: null, status: 'idle' },
  ]);
  // True while a "Run Bench" batch is in flight; disables every control.
  const [running, setRunning] = useState(false);

  /** Shallow-merges `patch` into the run at `index`, leaving other runs untouched. */
  const updateRun = useCallback((index: number, patch: Partial<ComparisonRun>) => {
    setRuns((prev) => prev.map((r, i) => (i === index ? { ...r, ...patch } : r)));
  }, []);

  /**
   * Resets the episode for the run at `index` and steps it until the returned
   * state reports `done`, then stores the final state. Any thrown error is
   * captured on the run instead of being re-thrown.
   */
  async function runSingleEpisode(index: number) {
    const run = runs[index];
    // Out-of-range index: nothing to run.
    if (!run) return;

    updateRun(index, { status: 'running', state: null, error: undefined });
    try {
      let state = await resetEpisode({
        seed: run.seed,
        template: run.template,
        difficulty: run.difficulty,
      });
      // Auto-play through all rounds.
      // NOTE(review): this loop relies on the backend eventually returning
      // `done`; there is no client-side step limit — confirm that is intended.
      while (!state.done) {
        // Switch to the accept action once the final round is reached.
        const isLastRound = state.round >= state.max_rounds - 1;
        const action = isLastRound ? buildAcceptAction() : buildDefaultScientistAction(state);
        state = await stepEpisode(state.session_id, action, state);
      }
      updateRun(index, { state, status: 'done' });
    } catch (err) {
      updateRun(index, {
        status: 'error',
        error: err instanceof Error ? err.message : 'Unknown error',
      });
    }
  }

  /** Runs every configured episode concurrently and tracks the batch-level busy flag. */
  async function runAll() {
    setRunning(true);
    try {
      await Promise.all(runs.map((_, i) => runSingleEpisode(i)));
    } finally {
      // Always re-enable the controls, even if something unexpected throws.
      setRunning(false);
    }
  }

  return (
    <div className={cn('rounded-xl border border-border bg-card p-6', className)}>
      <div className="mb-4 flex items-center justify-between">
        <div className="flex items-center gap-2">
          <GitCompareArrows className="h-5 w-5 text-primary" />
          <h2 className="text-base font-bold">Seeded Benchmark Comparison</h2>
        </div>
        {/* type="button" keeps the click from submitting an enclosing form. */}
        <button
          type="button"
          onClick={runAll}
          disabled={running}
          className="flex items-center gap-1.5 rounded-md bg-primary px-3 py-1.5 text-xs font-medium text-primary-foreground hover:bg-primary/90 disabled:opacity-50 transition-colors"
        >
          {running ? <Loader2 className="h-3.5 w-3.5 animate-spin" /> : <Play className="h-3.5 w-3.5" />}
          Run Bench
        </button>
      </div>
      <p className="mb-4 text-sm text-muted-foreground">
        Compare one seeded case from each domain, or edit the rows below to stress-test the same family across seeds and difficulties.
      </p>
      {/* Config rows */}
      <div className="mb-4 grid grid-cols-3 gap-3">
        {runs.map((run, i) => (
          <div key={i} className="rounded-lg border border-border p-3 space-y-2">
            <div className="flex items-center gap-1.5">
              <div className={cn(
                'h-2 w-2 rounded-full',
                run.status === 'done' ? 'bg-lab-manager' :
                run.status === 'running' ? 'bg-primary animate-pulse' :
                run.status === 'error' ? 'bg-destructive' : 'bg-muted-foreground',
              )} />
              <span className="text-xs font-bold">{run.label}</span>
            </div>
            <div className="space-y-1">
              <select
                value={run.template}
                onChange={(e) => updateRun(i, { template: e.target.value as ScenarioTemplate })}
                disabled={running}
                className="w-full rounded border border-border bg-background px-1.5 py-0.5 text-[10px]"
              >
                <option value="math_reasoning">Math Reasoning</option>
                <option value="ml_benchmark">ML Benchmark</option>
                <option value="finance_trading">Finance Trading</option>
              </select>
              <div className="flex gap-1">
                {(['easy', 'medium', 'hard'] as const).map((d) => (
                  <button
                    key={d}
                    type="button"
                    onClick={() => updateRun(i, { difficulty: d })}
                    disabled={running}
                    className={cn(
                      'flex-1 rounded border px-1 py-0.5 text-[9px] font-medium transition-colors',
                      run.difficulty === d ? 'border-primary bg-primary/10 text-primary' : 'border-border text-muted-foreground',
                    )}
                  >
                    {d}
                  </button>
                ))}
              </div>
              <input
                type="number"
                value={run.seed}
                // Empty or non-numeric input parses to NaN and falls back to seed 0.
                onChange={(e) => updateRun(i, { seed: Number.parseInt(e.target.value, 10) || 0 })}
                disabled={running}
                className="w-full rounded border border-border bg-background px-1.5 py-0.5 text-[10px]"
                placeholder="Seed"
              />
            </div>
            {/* Surface the captured failure reason; otherwise it is never visible. */}
            {run.status === 'error' && run.error && (
              <p className="break-words text-[10px] text-destructive" title={run.error}>
                {run.error}
              </p>
            )}
          </div>
        ))}
      </div>
      {/* Results comparison: shown as soon as at least one run has finished */}
      <AnimatePresence>
        {runs.some((r) => r.status === 'done') && (
          <motion.div
            initial={{ opacity: 0, y: 10 }}
            animate={{ opacity: 1, y: 0 }}
          >
            <h3 className="mb-3 text-xs font-semibold text-muted-foreground">Results</h3>
            <div className="overflow-x-auto">
              <table className="w-full text-xs">
                <thead>
                  <tr className="border-b border-border text-muted-foreground">
                    <th className="py-1.5 pr-3 text-left font-medium">Metric</th>
                    {runs.map((r, i) => (
                      <th key={i} className="py-1.5 px-3 text-center font-medium">{r.label}</th>
                    ))}
                  </tr>
                </thead>
                <tbody>
                  {[
                    { label: 'Reward', fn: (s: EpisodeState) => formatReward(s.scores?.total_reward ?? 0) },
                    { label: 'Rigor', fn: (s: EpisodeState) => formatScore(s.scores?.rigor ?? 0) },
                    { label: 'Feasibility', fn: (s: EpisodeState) => formatScore(s.scores?.feasibility ?? 0) },
                    { label: 'Fidelity', fn: (s: EpisodeState) => formatScore(s.scores?.fidelity ?? 0) },
                    { label: 'Rounds', fn: (s: EpisodeState) => `${s.round}/${s.max_rounds}` },
                    { label: 'Verdict', fn: (s: EpisodeState) => s.judge_audit?.verdict ?? '-' },
                  ].map((metric) => (
                    <tr key={metric.label} className="border-b border-border/50">
                      <td className="py-1.5 pr-3 font-medium text-muted-foreground">{metric.label}</td>
                      {runs.map((r, i) => (
                        <td key={i} className="py-1.5 px-3 text-center">
                          {r.state ? (
                            <span className="font-medium">{metric.fn(r.state)}</span>
                          ) : r.status === 'running' ? (
                            <Loader2 className="inline h-3 w-3 animate-spin text-primary" />
                          ) : r.status === 'error' ? (
                            <span className="text-destructive" title={r.error}>err</span>
                          ) : (
                            <span className="text-muted-foreground">-</span>
                          )}
                        </td>
                      ))}
                    </tr>
                  ))}
                </tbody>
              </table>
            </div>
            {/* Visual score bars comparison */}
            <div className="mt-4 space-y-2">
              {['rigor', 'feasibility', 'fidelity'].map((metric) => (
                <div key={metric}>
                  <div className="mb-1 text-[10px] font-medium capitalize text-muted-foreground">{metric}</div>
                  <div className="space-y-0.5">
                    {runs.map((r, i) => {
                      // Undefined until this run has a final state with scores.
                      const val = r.state?.scores?.[metric as keyof typeof r.state.scores] as number | undefined;
                      return (
                        <div key={i} className="flex items-center gap-2">
                          <span className="w-10 text-[9px] font-medium">{r.label}</span>
                          <div className="h-1.5 flex-1 overflow-hidden rounded-full bg-muted">
                            <motion.div
                              className={cn(
                                'h-full rounded-full',
                                metric === 'rigor' ? 'bg-scientist' : metric === 'feasibility' ? 'bg-lab-manager' : 'bg-judge',
                              )}
                              animate={{ width: `${(val ?? 0) * 100}%` }}
                              transition={{ duration: 0.5 }}
                            />
                          </div>
                          <span className="w-8 text-right text-[9px] font-bold">
                            {val !== undefined ? formatScore(val) : '-'}
                          </span>
                        </div>
                      );
                    })}
                  </div>
                </div>
              ))}
            </div>
          </motion.div>
        )}
      </AnimatePresence>
    </div>
  );
}
|