/*
 * CodeCourt dashboard module: shared constants, formatting helpers and the
 * presentational building blocks used by the components further down.
 *
 * NOTE(review): in this copy of the file the JSX element tags appear to be
 * missing (only text and `{...}` expressions remain inside each `return (`),
 * so the comments below describe the visible logic only.
 */
import { useEffect, useState } from "react";
import { Activity, BadgeCheck, BarChart3, ChevronRight, CircleAlert, ClipboardList, Database, FlaskConical, Play, Server, TerminalSquare, } from "lucide-react";
/* Backend base URL; falls back to the local address when VITE_API_BASE_URL is null or undefined. */
const API_BASE = import.meta.env.VITE_API_BASE_URL ?? "http://127.0.0.1:7860";
/* Presumably the option lists for the episode form — they are not referenced in the visible code; TODO confirm. */
const ARCHETYPES = ["array", "graph", "dp"];
const DIFFICULTIES = [1, 2, 3];
/* Ordered step descriptions rendered by TrainingPipelineStatus. */
const pipelineSteps = [ "Create a seeded adversarial episode from the OpenEnv backend.", "Collect solver completions and send them to the Docker Oracle sandbox.", "Score correctness, hidden-test robustness, and anti-gaming signals.", "Apply GRPO reward shaping to improve the Solver policy.", "Track artifacts and compare baseline vs. trained behavior.", ];
/**
 * Joins the truthy arguments with a single space.
 *
 * @param {...*} values - Candidate class names; falsy entries are dropped.
 * @returns {string} The space-separated result ("" when nothing is truthy).
 */
function classNames(...values) { return values.filter(Boolean).join(" "); }
/**
 * Formats a fraction as a percentage with one decimal place.
 *
 * @param {*} value - Expected to be a number such as 0.5.
 * @returns {string} "n/a" for null, undefined or NaN; otherwise `value * 100`
 *   fixed to one decimal followed by "%".
 *
 * NOTE(review): Number.isNaN does not coerce, so a non-numeric string passes
 * the guard and is rendered as "NaN%".
 */
function formatPercent(value) { if (value == null || Number.isNaN(value)) return "n/a"; return `${(value * 100).toFixed(1)}%`; }
/**
 * Formats a reward with two decimal places.
 *
 * @param {*} value - Expected to be a number.
 * @returns {string} "n/a" for null, undefined or NaN; otherwise `value.toFixed(2)`.
 *
 * NOTE(review): any other non-number (e.g. a string) has no `toFixed` method
 * and would throw a TypeError here.
 */
function formatReward(value) { if (value == null || Number.isNaN(value)) return "n/a"; return value.toFixed(2); }
/**
 * Small label component with a `tone` prop that defaults to "default".
 *
 * `tones` maps each tone name to a class string; how it is applied is not
 * visible here because the wrapping markup is missing from this copy.
 */
function Pill({ children, tone = "default" }) { const tones = { default: "border-slate-700 bg-slate-900/70 text-slate-300", success: "border-emerald-500/30 bg-emerald-500/10 text-emerald-300", danger: "border-rose-500/30 bg-rose-500/10 text-rose-300", warning: "border-amber-500/30 bg-amber-500/10 text-amber-300", accent: "border-cyan-500/30 bg-cyan-500/10 text-cyan-300", }; return ( {children} ); }
/**
 * Card wrapper taking `icon` (aliased to `Icon`), `title`, `subtitle`,
 * `children` and `action` props.
 *
 * The visible body renders the title, the subtitle only when it is truthy,
 * then `action` and `children`.
 */
function SectionCard({ icon: Icon, title, subtitle, children, action }) { return (

{title}

{subtitle ?

{subtitle}

: null}

{action}

{children}

); }
/**
 * Page header with the product title, tagline and "Story Arc" summary.
 *
 * `healthy` is true only when `health?.status` is exactly "ok"; any other
 * value, including a null `health`, shows the API as "Unavailable".
 *
 * NOTE(review): the JSX element tags appear to be missing from this copy;
 * only the text content and expressions are visible.
 */
function Header({ health }) { const healthy = health?.status === "ok"; return (

CodeCourt OpenEnv Space v1.0.0 API {healthy ? "Healthy" : "Unavailable"}

Adversarial RL arena for code generation, hidden failures, and judge-ready proof.

CodeCourt visualizes the full OpenEnv loop: episode generation, secure code execution, GRPO reward shaping, and artifact tracking. This dashboard is designed to make failure, intervention, and improvement obvious in a single demo.

Story Arc

Before

Brute-force solvers fail on hidden adversarial cases.

Fix

Reward shaping, sandboxed Oracle, and seeded task variation.

After

Better pass rate, stronger reward, clearer training evidence.

); }
/**
 * Episode-creation panel taking `onCreateEpisode` and `loading` props.
 *
 * Keeps two pieces of local state: `archetype` (initially "array") and
 * `difficulty` (initially 1).
 *
 * NOTE(review): the JSX element tags appear to be missing from this copy, so
 * the controls that call the setters, `onCreateEpisode` and read `loading`
 * are not visible. Presumably `onCreateEpisode` is invoked with
 * `{ archetype, difficulty }` — confirm against the full file.
 */
function EnvironmentConsole({ onCreateEpisode, loading }) { const [archetype, setArchetype] = useState("array"); const [difficulty, setDifficulty] = useState(1); return (

Archetype Difficulty

OpenEnv Session

); }
/**
 * Benchmark panel taking `onRun`, `loading` and `benchmark` props.
 *
 * - The action label reads "Running..." while `loading` is truthy, otherwise
 *   "Run Benchmark".
 * - One row is rendered per entry of `benchmark.episodes`; when that list is
 *   missing or empty the "No benchmark run yet" message is shown instead.
 * - The summary figures are rendered only when `benchmark?.summary` is truthy.
 * - The episode number is displayed as `episode.episode + 1` (presumably a
 *   zero-based index from the backend — TODO confirm).
 *
 * NOTE(review): the JSX element tags appear to be missing from this copy, so
 * the table markup and the wiring of `onRun` are not visible.
 */
function BenchmarkSandbox({ onRun, loading, benchmark }) { return ( {loading ? "Running..." : "Run Benchmark"} } >

{(benchmark?.episodes ?? []).length ? ( benchmark.episodes.map((episode) => ( )) ) : ( )}

Episode	Task	Outcome	Pass Rate	Reward
{episode.episode + 1}	{episode.archetype} task {episode.task_id}	{episode.outcome}	{formatPercent(episode.solver_pass_rate)}	{formatReward(episode.solver_reward)}
No benchmark run yet. Trigger the sandbox to compare baseline vs. reference behavior.

{benchmark?.summary ? (

Avg Pass Rate

{formatPercent(benchmark.summary.avg_solver_pass_rate)}

Avg Reward

{formatReward(benchmark.summary.avg_solver_reward)}

Setter Win Rate

{formatPercent(benchmark.summary.setter_win_rate)}

) : null} ); }
/**
 * Shows the current problem statement and the figures of the last run.
 *
 * `resultTone` is "text-emerald-300" when the outcome is exactly
 * "solver_wins", "text-rose-300" for any other truthy outcome, and
 * "text-slate-500" when there is no outcome.
 *
 * The problem description falls back to a placeholder sentence while
 * `session?.problem?.description` is null or undefined.
 *
 * NOTE(review): the setter status reads "invalid" whenever
 * `lastRun?.info?.setter_valid` is falsy, which includes the case where no
 * run has happened yet (`lastRun` is null).
 * NOTE(review): the JSX element tags appear to be missing from this copy, so
 * where `resultTone` is applied is not visible.
 */
function LiveExecutionArena({ session, lastRun }) { const resultTone = lastRun?.info?.outcome === "solver_wins" ? "text-emerald-300" : lastRun?.info?.outcome ? "text-rose-300" : "text-slate-500"; return (

Current Problem

            {session?.problem?.description ??
              "Create an episode to view the generated problem statement and hidden-test-ready task."}

Run Results {lastRun?.info?.outcome ?? "No execution yet"}

Pass Rate

{formatPercent(lastRun?.info?.solver_pass_rate)}

Solver Reward

{formatReward(lastRun?.solver_reward_info?.reward)}

Status

setter {lastRun?.info?.setter_valid ? "valid" : "invalid"} hidden {formatPercent(lastRun?.info?.solver_hidden_pass_rate)}

); }
/**
 * Renders the module-level `pipelineSteps` list in order, numbering each
 * entry from 1 (`index + 1`) next to its description. Takes no props.
 *
 * NOTE(review): the JSX element tags appear to be missing from this copy, so
 * the list markup (and any `key` prop on the items) is not visible.
 */
function TrainingPipelineStatus() { return (

{pipelineSteps.map((step, index) => (

{index + 1}

{step}

))}

); }
/**
 * Summarises which training artifacts exist, from the `artifacts` prop.
 *
 * `statusItems` pairs a display label with the matching `*_available` flag
 * read from `artifacts`. Each pair is rendered as "present" when the flag is
 * truthy and "missing" otherwise, so a null `artifacts` shows every item as
 * missing. Two headline figures follow: the baseline average pass rate and
 * the latest training reward.
 *
 * NOTE(review): the JSX element tags appear to be missing from this copy; the
 * empty branches of `{present ? : }` presumably held status icons — confirm
 * against the full file.
 */
function ArtifactTracker({ artifacts }) { const statusItems = [ ["Baseline", artifacts?.baseline_available], ["Manifest", artifacts?.training_manifest_available], ["GRPO Logs", artifacts?.training_log_available], ["Plots", artifacts?.plots_available], ]; return (

{statusItems.map(([label, present]) => ( {present ? : } {label} {present ? "present" : "missing"} ))}

Baseline Pass Rate

{formatPercent(artifacts?.baseline_summary?.avg_solver_pass_rate)}

Latest Reward

{formatReward(artifacts?.latest_training_metrics?.reward)}

); }

/**
 * Top-level dashboard component (default export).
 *
 * Owns all remote state: API health, artifact status, the current session,
 * the last run and the latest benchmark, plus two in-flight flags. It loads
 * health and artifacts once on mount and defines the handlers that create an
 * episode and run a benchmark.
 *
 * NOTE(review): the JSX element tags appear to be missing from this copy, so
 * how the state and handlers are passed to child components is not visible.
 */
export default function CodeCourtDashboard() {
  const [health, setHealth] = useState(null);
  const [artifacts, setArtifacts] = useState(null);
  const [session, setSession] = useState(null);
  const [lastRun, setLastRun] = useState(null);
  const [benchmark, setBenchmark] = useState(null);
  const [creatingEpisode, setCreatingEpisode] = useState(false);
  const [runningBenchmark, setRunningBenchmark] = useState(false);
  // Runs once on mount (empty dependency list): fetch health and artifacts in parallel.
  useEffect(() => {
    async function bootstrap() {
      try {
        // NOTE(review): Promise.all rejects if either request fails, so a failing
        // /api/artifacts call also marks the API as unhealthy below.
        // NOTE(review): response.ok is not checked; a non-2xx JSON body is stored as-is.
        const [healthResponse, artifactsResponse] = await Promise.all([
          fetch(`${API_BASE}/api/health`).then((response) => response.json()),
          fetch(`${API_BASE}/api/artifacts`).then((response) => response.json()),
        ]);
        setHealth(healthResponse);
        setArtifacts(artifactsResponse);
      } catch (error) {
        // Any network or parse failure is surfaced through the health state.
        setHealth({ status: "error", message: error.message });
      }
    }
    bootstrap();
  }, []);
  // POSTs a new session request and stores the returned `state`; also clears the last run.
  // NOTE(review): the seed is fixed at 42, and there is no catch, so a failed
  // request rejects the promise returned to the caller. response.ok is not checked.
  async function handleCreateEpisode({ archetype, difficulty }) {
    setCreatingEpisode(true);
    try {
      const response = await fetch(`${API_BASE}/api/session`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ archetype, difficulty, seed: 42, }),
      });
      const data = await response.json();
      setSession(data.state);
      setLastRun(null);
    } finally {
      // Always clear the in-flight flag, even when the request throws.
      setCreatingEpisode(false);
    }
  }
  // POSTs a benchmark request with hard-coded parameters (6 episodes,
  // "brute_force" solver, seed 42) and stores the whole response body.
  // NOTE(review): same error-handling gaps as handleCreateEpisode (no catch,
  // no response.ok check).
  async function handleRunBenchmark() {
    setRunningBenchmark(true);
    try {
      const response = await fetch(`${API_BASE}/api/benchmark`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ episodes: 6, solver_mode: "brute_force", seed: 42, }),
      });
      const data = await response.json();
      setBenchmark(data);
    } finally {
      // Always clear the in-flight flag, even when the request throws.
      setRunningBenchmark(false);
    }
  }
  return (

); }