/*
 * BaselineArenaTab — React component that loads baseline scores via
 * fetchBaseline() and renders one entry per task, followed by static
 * explanatory text about the grader formula and the shared step ceiling.
 *
 * State:
 *   - baseline: the value resolved by fetchBaseline(), or null before the
 *     first successful load. The code reads `baseline.scores` (iterated with
 *     Object.entries) and `baseline.max_steps` from it.
 *   - loading:  true while a fetch is in flight; starts as true.
 *   - error:    message string from the last failed fetch, or null.
 *
 * Behaviour:
 *   - loadBaseline() sets loading, clears any previous error, awaits
 *     fetchBaseline(), and stores the result. A thrown Error contributes its
 *     `.message`; any other thrown value is converted with String(). The
 *     `finally` clause clears loading on both success and failure.
 *   - The effect has an empty dependency array and calls loadBaseline();
 *     `void` marks the returned promise as intentionally not awaited.
 *     The same loader is wired to the "Refresh Baseline" control.
 *   - entries is memoized on `baseline`. Each item is
 *     { taskId, score, delta }, where delta = IDEAL_SCORE - score rounded to
 *     four decimals via toFixed(4). While baseline is null the list is empty
 *     because of the `?? {}` fallback.
 *   - TASK_CATALOG[taskId]?.story is looked up with optional chaining, so a
 *     task id missing from the catalog renders no story instead of throwing.
 *
 * IDEAL_SCORE (0.99) equals the sum of the four grader weights listed in the
 * rendered text (0.05 + 0.25 + 0.60 + 0.09); keep the two in sync.
 *
 * NOTE(review): this copy of the file appears to have lost its JSX element
 * tags and original line breaks — e.g. the line that begins with
 * `void loadBaseline()} className=` is only the tail of an attribute, and the
 * useState calls carry no type arguments even though BaselineResponse is
 * imported. Presumably the markup and generics were stripped by an
 * extraction step; confirm against version control before editing the
 * render section.
 */
import { useEffect, useMemo, useState } from 'react' import { fetchBaseline } from '../api/client' import { TASK_CATALOG } from '../data/taskCatalog' import type { BaselineResponse } from '../types' import { SectionCard } from './SectionCard' const IDEAL_SCORE = 0.99 export function BaselineArenaTab() { const [baseline, setBaseline] = useState(null) const [loading, setLoading] = useState(true) const [error, setError] = useState(null) const loadBaseline = async () => { setLoading(true) setError(null) try { const result = await fetchBaseline() setBaseline(result) } catch (loadError) { setError(loadError instanceof Error ? loadError.message : String(loadError)) } finally { setLoading(false) } } useEffect(() => { void loadBaseline() }, []) const entries = useMemo( () => Object.entries(baseline?.scores ?? {}).map(([taskId, score]) => ({ taskId, score, delta: Number((IDEAL_SCORE - score).toFixed(4)), })), [baseline], ) return (
void loadBaseline()} className="px-3 py-2 rounded-lg text-sm border border-zinc-800 bg-zinc-950/60 hover:bg-zinc-900/60" > Refresh Baseline } > {error && (
{error}
)} {loading && (
Collecting live baseline scores…
)} {!loading && baseline && (
{entries.map(({ taskId, score, delta }) => (

{taskId}

{TASK_CATALOG[taskId]?.story}

Score

{score.toFixed(4)}

Gap to a near-perfect solve: {delta.toFixed(4)}

))}

Grader formula

  • 0.05 for submitting any query
  • 0.25 when the last query executes without error
  • 0.60 when the result matches the expected rows
  • 0.09 efficiency bonus for faster perfect solves

Why this matters

The scores stay strictly inside the open interval (0, 1), while still leaving enough spread between broken-query baselines and reference solves for the validator to tell them apart.

Max steps

{baseline.max_steps}

Every task shares the same step ceiling, which keeps grading and agent runtime predictable for the OpenEnv portal.

)}
) }