File size: 9,884 Bytes
80d8c84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import { useState, useCallback } from 'react';
import { motion, AnimatePresence } from 'framer-motion';
import { GitCompareArrows, Play, Loader2 } from 'lucide-react';
import type { EpisodeState, ScenarioTemplate, Difficulty } from '@/types';
import { resetEpisode, stepEpisode, buildDefaultScientistAction, buildAcceptAction } from '@/lib/api';
import { cn, formatScore, formatReward } from '@/lib/utils';

/** Lifecycle of one comparison row: not yet run, in flight, finished, or failed. */
type ComparisonRunStatus = 'idle' | 'running' | 'done' | 'error';

/**
 * One configurable row of the comparison bench together with the outcome of
 * its most recent run.
 */
interface ComparisonRun {
  /** Display name shown on the config card and as the results column header. */
  label: string;
  /** Seed forwarded to `resetEpisode` when the row is run. */
  seed: number;
  /** Difficulty forwarded to `resetEpisode` when the row is run. */
  difficulty: Difficulty;
  /** Scenario template forwarded to `resetEpisode` when the row is run. */
  template: ScenarioTemplate;
  /** Final episode state of the last completed run; `null` before and during a run. */
  state: EpisodeState | null;
  /** Where the row currently is in its run lifecycle. */
  status: ComparisonRunStatus;
  /** Failure message recorded when `status` is `'error'`; cleared when a run starts. */
  error?: string;
}

/** Difficulty choices rendered as toggle buttons on every config card. */
const DIFFICULTY_OPTIONS = ['easy', 'medium', 'hard'] as const;

/** Score keys drawn as horizontal bars underneath the results table. */
const SCORE_BAR_METRICS = ['rigor', 'feasibility', 'fidelity'] as const;

/** Rows of the results table: a row label plus a formatter applied to a finished episode. */
const RESULT_ROWS = [
  { label: 'Reward', fn: (s: EpisodeState) => formatReward(s.scores?.total_reward ?? 0) },
  { label: 'Rigor', fn: (s: EpisodeState) => formatScore(s.scores?.rigor ?? 0) },
  { label: 'Feasibility', fn: (s: EpisodeState) => formatScore(s.scores?.feasibility ?? 0) },
  { label: 'Fidelity', fn: (s: EpisodeState) => formatScore(s.scores?.fidelity ?? 0) },
  { label: 'Rounds', fn: (s: EpisodeState) => `${s.round}/${s.max_rounds}` },
  { label: 'Verdict', fn: (s: EpisodeState) => s.judge_audit?.verdict ?? '-' },
];

/**
 * Benchmark panel that auto-plays one seeded episode per configured row and
 * compares the resulting scores side by side.
 *
 * The panel starts with three rows (Math, ML, Finance), each using seed 42 and
 * medium difficulty. Every row's template, difficulty and seed can be edited
 * while no run is in progress. "Run Bench" plays all rows concurrently; the
 * results table and score bars appear once at least one row has finished.
 *
 * @param className - Optional extra classes merged onto the root container.
 */
export default function EpisodeComparison({ className }: { className?: string }) {
  const [runs, setRuns] = useState<ComparisonRun[]>([
    { label: 'Math', seed: 42, difficulty: 'medium', template: 'math_reasoning', state: null, status: 'idle' },
    { label: 'ML', seed: 42, difficulty: 'medium', template: 'ml_benchmark', state: null, status: 'idle' },
    { label: 'Finance', seed: 42, difficulty: 'medium', template: 'finance_trading', state: null, status: 'idle' },
  ]);
  const [running, setRunning] = useState(false);

  // Shallow-merges `patch` into the run at `index`. Uses the functional setter
  // so concurrent updates from parallel runs never overwrite each other.
  const updateRun = useCallback((index: number, patch: Partial<ComparisonRun>) => {
    setRuns((prev) => prev.map((r, i) => (i === index ? { ...r, ...patch } : r)));
  }, []);

  /**
   * Plays the episode configured in row `index` to completion and stores the
   * final state on that row. Failures are recorded on the row (status
   * `'error'` plus a message) instead of being rethrown.
   */
  async function runSingleEpisode(index: number): Promise<void> {
    // Config comes from the `runs` snapshot of the render that started the run;
    // every config control is disabled while `running`, so it cannot change mid-run.
    const run = runs[index];
    if (!run) return;
    updateRun(index, { status: 'running', state: null, error: undefined });

    try {
      let state = await resetEpisode({
        seed: run.seed,
        template: run.template,
        difficulty: run.difficulty,
      });

      // Auto-play through all rounds: send the default scientist action until
      // the last round, then send an accept action.
      while (!state.done) {
        const isLastRound = state.round >= state.max_rounds - 1;
        const action = isLastRound ? buildAcceptAction() : buildDefaultScientistAction(state);
        state = await stepEpisode(state.session_id, action, state);
      }

      updateRun(index, { state, status: 'done' });
    } catch (err) {
      updateRun(index, {
        status: 'error',
        error: err instanceof Error ? err.message : 'Unknown error',
      });
    }
  }

  /** Runs every configured row concurrently and locks the controls meanwhile. */
  async function runAll(): Promise<void> {
    setRunning(true);
    try {
      await Promise.all(runs.map((_, i) => runSingleEpisode(i)));
    } finally {
      // Always unlock the UI, even if something unexpected rejects.
      setRunning(false);
    }
  }

  return (
    <div className={cn('rounded-xl border border-border bg-card p-6', className)}>
      <div className="mb-4 flex items-center justify-between">
        <div className="flex items-center gap-2">
          <GitCompareArrows className="h-5 w-5 text-primary" />
          <h2 className="text-base font-bold">Seeded Benchmark Comparison</h2>
        </div>
        <button
          type="button"
          onClick={() => void runAll()}
          disabled={running}
          className="flex items-center gap-1.5 rounded-md bg-primary px-3 py-1.5 text-xs font-medium text-primary-foreground hover:bg-primary/90 disabled:opacity-50 transition-colors"
        >
          {running ? <Loader2 className="h-3.5 w-3.5 animate-spin" /> : <Play className="h-3.5 w-3.5" />}
          Run Bench
        </button>
      </div>
      <p className="mb-4 text-sm text-muted-foreground">
        Compare one seeded case from each domain, or edit the rows below to stress-test the same family across seeds and difficulties.
      </p>

      {/* Config rows */}
      <div className="mb-4 grid grid-cols-3 gap-3">
        {runs.map((run, i) => (
          <div key={i} className="rounded-lg border border-border p-3 space-y-2">
            <div className="flex items-center gap-1.5">
              <div className={cn(
                'h-2 w-2 rounded-full',
                run.status === 'done' ? 'bg-lab-manager' :
                run.status === 'running' ? 'bg-primary animate-pulse' :
                run.status === 'error' ? 'bg-destructive' : 'bg-muted-foreground',
              )} />
              <span className="text-xs font-bold">{run.label}</span>
            </div>
            <div className="space-y-1">
              <select
                aria-label="Scenario template"
                value={run.template}
                onChange={(e) => updateRun(i, { template: e.target.value as ScenarioTemplate })}
                disabled={running}
                className="w-full rounded border border-border bg-background px-1.5 py-0.5 text-[10px]"
              >
                <option value="math_reasoning">Math Reasoning</option>
                <option value="ml_benchmark">ML Benchmark</option>
                <option value="finance_trading">Finance Trading</option>
              </select>
              <div className="flex gap-1">
                {DIFFICULTY_OPTIONS.map((d) => (
                  <button
                    key={d}
                    type="button"
                    aria-pressed={run.difficulty === d}
                    onClick={() => updateRun(i, { difficulty: d })}
                    disabled={running}
                    className={cn(
                      'flex-1 rounded border px-1 py-0.5 text-[9px] font-medium transition-colors',
                      run.difficulty === d ? 'border-primary bg-primary/10 text-primary' : 'border-border text-muted-foreground',
                    )}
                  >
                    {d}
                  </button>
                ))}
              </div>
              <input
                type="number"
                aria-label="Seed"
                value={run.seed}
                // Empty or unparsable input falls back to seed 0.
                onChange={(e) => updateRun(i, { seed: Number.parseInt(e.target.value, 10) || 0 })}
                disabled={running}
                className="w-full rounded border border-border bg-background px-1.5 py-0.5 text-[10px]"
                placeholder="Seed"
              />
            </div>
            {/* Show the failure reason here: the results table only appears once some run succeeds. */}
            {run.status === 'error' && run.error && (
              <p role="alert" className="break-words text-[9px] text-destructive">{run.error}</p>
            )}
          </div>
        ))}
      </div>

      {/* Results comparison */}
      <AnimatePresence>
        {runs.some((r) => r.status === 'done') && (
          <motion.div
            initial={{ opacity: 0, y: 10 }}
            animate={{ opacity: 1, y: 0 }}
          >
            <h3 className="mb-3 text-xs font-semibold text-muted-foreground">Results</h3>
            <div className="overflow-x-auto">
              <table className="w-full text-xs">
                <thead>
                  <tr className="border-b border-border text-muted-foreground">
                    <th className="py-1.5 pr-3 text-left font-medium">Metric</th>
                    {runs.map((r, i) => (
                      <th key={i} className="py-1.5 px-3 text-center font-medium">{r.label}</th>
                    ))}
                  </tr>
                </thead>
                <tbody>
                  {RESULT_ROWS.map((metric) => (
                    <tr key={metric.label} className="border-b border-border/50">
                      <td className="py-1.5 pr-3 font-medium text-muted-foreground">{metric.label}</td>
                      {runs.map((r, i) => (
                        <td key={i} className="py-1.5 px-3 text-center">
                          {r.state ? (
                            <span className="font-medium">{metric.fn(r.state)}</span>
                          ) : r.status === 'running' ? (
                            <Loader2 className="inline h-3 w-3 animate-spin text-primary" />
                          ) : r.status === 'error' ? (
                            <span className="text-destructive" title={r.error}>err</span>
                          ) : (
                            <span className="text-muted-foreground">-</span>
                          )}
                        </td>
                      ))}
                    </tr>
                  ))}
                </tbody>
              </table>
            </div>

            {/* Visual score bars comparison */}
            <div className="mt-4 space-y-2">
              {SCORE_BAR_METRICS.map((metric) => (
                <div key={metric}>
                  <div className="mb-1 text-[10px] font-medium capitalize text-muted-foreground">{metric}</div>
                  <div className="space-y-0.5">
                    {runs.map((r, i) => {
                      // Undefined while the row has no finished episode (or no scores yet).
                      const val = r.state?.scores?.[metric] as number | undefined;
                      return (
                        <div key={i} className="flex items-center gap-2">
                          <span className="w-10 text-[9px] font-medium">{r.label}</span>
                          <div className="h-1.5 flex-1 overflow-hidden rounded-full bg-muted">
                            <motion.div
                              className={cn(
                                'h-full rounded-full',
                                metric === 'rigor' ? 'bg-scientist' : metric === 'feasibility' ? 'bg-lab-manager' : 'bg-judge',
                              )}
                              animate={{ width: `${(val ?? 0) * 100}%` }}
                              transition={{ duration: 0.5 }}
                            />
                          </div>
                          <span className="w-8 text-right text-[9px] font-bold">
                            {val !== undefined ? formatScore(val) : '-'}
                          </span>
                        </div>
                      );
                    })}
                  </div>
                </div>
              ))}
            </div>
          </motion.div>
        )}
      </AnimatePresence>
    </div>
  );
}