// Source: Hugging Face Space openenv-community/replicalab (status: Running)
// File size: 9,884 bytes · revision 80d8c84

import { useState, useCallback } from 'react';
import { motion, AnimatePresence } from 'framer-motion';
import { GitCompareArrows, Play, Loader2 } from 'lucide-react';
import type { EpisodeState, ScenarioTemplate, Difficulty } from '@/types';
import { resetEpisode, stepEpisode, buildDefaultScientistAction, buildAcceptAction } from '@/lib/api';
import { cn, formatScore, formatReward } from '@/lib/utils';

/** Lifecycle of a single comparison column. */
type ComparisonRunStatus = 'idle' | 'running' | 'done' | 'error';

/**
 * One column of the benchmark comparison: the episode configuration the user
 * can edit, plus the outcome of the most recent run for that configuration.
 */
interface ComparisonRun {
  // --- configuration ---
  /** Short heading shown on the config card, table header and score bars. */
  label: string;
  /** Scenario family sent to `resetEpisode`. */
  template: ScenarioTemplate;
  /** Difficulty sent to `resetEpisode`. */
  difficulty: Difficulty;
  /** Seed sent to `resetEpisode`. */
  seed: number;

  // --- outcome ---
  /** Where this column is in its run lifecycle. */
  status: ComparisonRunStatus;
  /** Final episode state once the run has finished; `null` otherwise. */
  state: EpisodeState | null;
  /** Failure message captured when the run ends with `status: 'error'`. */
  error?: string;
}

/**
 * Rows of the results table. Each entry turns a finished episode state into
 * the cell content for one metric. Hoisted so the array and its closures are
 * not rebuilt on every render.
 */
const RESULT_ROWS = [
  { label: 'Reward', fn: (s: EpisodeState) => formatReward(s.scores?.total_reward ?? 0) },
  { label: 'Rigor', fn: (s: EpisodeState) => formatScore(s.scores?.rigor ?? 0) },
  { label: 'Feasibility', fn: (s: EpisodeState) => formatScore(s.scores?.feasibility ?? 0) },
  { label: 'Fidelity', fn: (s: EpisodeState) => formatScore(s.scores?.fidelity ?? 0) },
  { label: 'Rounds', fn: (s: EpisodeState) => `${s.round}/${s.max_rounds}` },
  { label: 'Verdict', fn: (s: EpisodeState) => s.judge_audit?.verdict ?? '-' },
];

/**
 * Score dimensions drawn as horizontal bars, paired with their bar colour.
 * `as const` keeps the keys literal so `scores` can be indexed without a cast.
 */
const SCORE_BARS = [
  { key: 'rigor', barClass: 'bg-scientist' },
  { key: 'feasibility', barClass: 'bg-lab-manager' },
  { key: 'fidelity', barClass: 'bg-judge' },
] as const;

/**
 * Side-by-side benchmark panel.
 *
 * Lets the user configure three episode runs (template, difficulty, seed),
 * auto-plays all of them in parallel when "Run Bench" is pressed, and renders
 * a results table plus per-metric score bars for comparison.
 *
 * @param className - Optional extra classes merged onto the outer card.
 */
export default function EpisodeComparison({ className }: { className?: string }) {
  const [runs, setRuns] = useState<ComparisonRun[]>([
    { label: 'Math', seed: 42, difficulty: 'medium', template: 'math_reasoning', state: null, status: 'idle' },
    { label: 'ML', seed: 42, difficulty: 'medium', template: 'ml_benchmark', state: null, status: 'idle' },
    { label: 'Finance', seed: 42, difficulty: 'medium', template: 'finance_trading', state: null, status: 'idle' },
  ]);
  const [running, setRunning] = useState(false);

  /** Shallow-merges `patch` into the run at `index`, leaving the other runs untouched. */
  const updateRun = useCallback((index: number, patch: Partial<ComparisonRun>) => {
    setRuns((prev) => prev.map((r, i) => (i === index ? { ...r, ...patch } : r)));
  }, []);

  /**
   * Resets an episode for `run` and auto-plays it to completion, writing the
   * final state (or the failure message) back into slot `index`.
   *
   * Never rejects: every failure is captured into the run's `error` field.
   * The configuration is passed in explicitly rather than re-read from
   * `runs[index]`, so the function does not depend on an index lookup that
   * could be `undefined`.
   */
  async function runSingleEpisode(index: number, run: ComparisonRun): Promise<void> {
    updateRun(index, { status: 'running', state: null, error: undefined });

    try {
      let state = await resetEpisode({
        seed: run.seed,
        template: run.template,
        difficulty: run.difficulty,
      });

      // Auto-play: submit the default scientist action each round and switch
      // to an accept action once the last round is reached.
      // NOTE(review): termination relies on the backend eventually reporting
      // `done`; no client-side step cap is enforced here.
      while (!state.done) {
        const isLastRound = state.round >= state.max_rounds - 1;
        const action = isLastRound ? buildAcceptAction() : buildDefaultScientistAction(state);
        state = await stepEpisode(state.session_id, action, state);
      }

      updateRun(index, { state, status: 'done' });
    } catch (err: unknown) {
      updateRun(index, {
        status: 'error',
        error: err instanceof Error ? err.message : 'Unknown error',
      });
    }
  }

  /** Runs every configured episode in parallel and locks the controls meanwhile. */
  async function runAll(): Promise<void> {
    setRunning(true);
    try {
      await Promise.all(runs.map((run, i) => runSingleEpisode(i, run)));
    } finally {
      // Always release the controls, even if something unexpected throws.
      setRunning(false);
    }
  }

  // Show the results panel as soon as anything has been attempted. Gating on
  // "some run is done" hid the panel (and every error) when all runs failed,
  // and made the in-table spinners unreachable until the first run finished.
  const hasActivity = runs.some((r) => r.status !== 'idle');

  return (
    <div className={cn('rounded-xl border border-border bg-card p-6', className)}>
      <div className="mb-4 flex items-center justify-between">
        <div className="flex items-center gap-2">
          <GitCompareArrows className="h-5 w-5 text-primary" />
          <h2 className="text-base font-bold">Seeded Benchmark Comparison</h2>
        </div>
        <button
          type="button"
          onClick={() => void runAll()}
          disabled={running}
          className="flex items-center gap-1.5 rounded-md bg-primary px-3 py-1.5 text-xs font-medium text-primary-foreground hover:bg-primary/90 disabled:opacity-50 transition-colors"
        >
          {running ? <Loader2 className="h-3.5 w-3.5 animate-spin" /> : <Play className="h-3.5 w-3.5" />}
          Run Bench
        </button>
      </div>
      <p className="mb-4 text-sm text-muted-foreground">
        Compare one seeded case from each domain, or edit the rows below to stress-test the same family across seeds and difficulties.
      </p>

      {/* Config rows */}
      <div className="mb-4 grid grid-cols-3 gap-3">
        {runs.map((run, i) => (
          <div key={i} className="rounded-lg border border-border p-3 space-y-2">
            <div className="flex items-center gap-1.5">
              <div className={cn(
                'h-2 w-2 rounded-full',
                run.status === 'done' ? 'bg-lab-manager' :
                run.status === 'running' ? 'bg-primary animate-pulse' :
                run.status === 'error' ? 'bg-destructive' : 'bg-muted-foreground',
              )} />
              <span className="text-xs font-bold">{run.label}</span>
            </div>
            <div className="space-y-1">
              <select
                aria-label={`${run.label} template`}
                value={run.template}
                onChange={(e) => updateRun(i, { template: e.target.value as ScenarioTemplate })}
                disabled={running}
                className="w-full rounded border border-border bg-background px-1.5 py-0.5 text-[10px]"
              >
                <option value="math_reasoning">Math Reasoning</option>
                <option value="ml_benchmark">ML Benchmark</option>
                <option value="finance_trading">Finance Trading</option>
              </select>
              <div className="flex gap-1">
                {(['easy', 'medium', 'hard'] as const).map((d) => (
                  <button
                    key={d}
                    type="button"
                    aria-pressed={run.difficulty === d}
                    onClick={() => updateRun(i, { difficulty: d })}
                    disabled={running}
                    className={cn(
                      'flex-1 rounded border px-1 py-0.5 text-[9px] font-medium transition-colors',
                      run.difficulty === d ? 'border-primary bg-primary/10 text-primary' : 'border-border text-muted-foreground',
                    )}
                  >
                    {d}
                  </button>
                ))}
              </div>
              <input
                type="number"
                aria-label={`${run.label} seed`}
                value={run.seed}
                // Explicit radix; an empty or non-numeric field falls back to seed 0.
                onChange={(e) => updateRun(i, { seed: Number.parseInt(e.target.value, 10) || 0 })}
                disabled={running}
                className="w-full rounded border border-border bg-background px-1.5 py-0.5 text-[10px]"
                placeholder="Seed"
              />
            </div>
            {/* Surface the captured failure message instead of only a red dot. */}
            {run.status === 'error' && run.error && (
              <p role="alert" className="break-words text-[9px] text-destructive">
                {run.error}
              </p>
            )}
          </div>
        ))}
      </div>

      {/* Results comparison */}
      <AnimatePresence>
        {hasActivity && (
          <motion.div
            key="results"
            initial={{ opacity: 0, y: 10 }}
            animate={{ opacity: 1, y: 0 }}
          >
            <h3 className="mb-3 text-xs font-semibold text-muted-foreground">Results</h3>
            <div className="overflow-x-auto">
              <table className="w-full text-xs">
                <thead>
                  <tr className="border-b border-border text-muted-foreground">
                    <th className="py-1.5 pr-3 text-left font-medium">Metric</th>
                    {runs.map((r, i) => (
                      <th key={i} className="py-1.5 px-3 text-center font-medium">{r.label}</th>
                    ))}
                  </tr>
                </thead>
                <tbody>
                  {RESULT_ROWS.map((metric) => (
                    <tr key={metric.label} className="border-b border-border/50">
                      <td className="py-1.5 pr-3 font-medium text-muted-foreground">{metric.label}</td>
                      {runs.map((r, i) => (
                        <td key={i} className="py-1.5 px-3 text-center">
                          {r.state ? (
                            <span className="font-medium">{metric.fn(r.state)}</span>
                          ) : r.status === 'running' ? (
                            <Loader2 className="inline h-3 w-3 animate-spin text-primary" />
                          ) : r.status === 'error' ? (
                            <span className="text-destructive" title={r.error}>err</span>
                          ) : (
                            <span className="text-muted-foreground">-</span>
                          )}
                        </td>
                      ))}
                    </tr>
                  ))}
                </tbody>
              </table>
            </div>

            {/* Visual score bars comparison */}
            <div className="mt-4 space-y-2">
              {SCORE_BARS.map(({ key: metric, barClass }) => (
                <div key={metric}>
                  <div className="mb-1 text-[10px] font-medium capitalize text-muted-foreground">{metric}</div>
                  <div className="space-y-0.5">
                    {runs.map((r, i) => {
                      // Missing until the run has finished and produced scores.
                      const val = r.state?.scores?.[metric];
                      return (
                        <div key={i} className="flex items-center gap-2">
                          <span className="w-10 text-[9px] font-medium">{r.label}</span>
                          <div className="h-1.5 flex-1 overflow-hidden rounded-full bg-muted">
                            <motion.div
                              className={cn('h-full rounded-full', barClass)}
                              animate={{ width: `${(val ?? 0) * 100}%` }}
                              transition={{ duration: 0.5 }}
                            />
                          </div>
                          <span className="w-8 text-right text-[9px] font-bold">
                            {val != null ? formatScore(val) : '-'}
                          </span>
                        </div>
                      );
                    })}
                  </div>
                </div>
              ))}
            </div>
          </motion.div>
        )}
      </AnimatePresence>
    </div>
  );
}