| """FastAPI application for the CleanOps data-cleaning environment.""" |
|
|
| from __future__ import annotations |
|
|
| import copy |
| import random |
|
|
| from openenv.core import create_app |
| from fastapi.responses import HTMLResponse, JSONResponse |
|
|
| from cleanops_env.environment import CleanOpsEnvironment |
| from cleanops_env.models import DataCleaningAction, DataCleaningObservation |
| from cleanops_env.tasks import first_table_name, get_task_spec, sorted_rows |
|
|
|
|
# Application object exposed to the ASGI server; the demo routes defined below
# are registered on it. It is built by the OpenEnv factory from the CleanOps
# environment class and its action/observation models.
# NOTE(review): max_concurrent_envs presumably caps simultaneously live
# environment instances -- confirm against the openenv documentation.
app = create_app(
    CleanOpsEnvironment,
    DataCleaningAction,
    DataCleaningObservation,
    max_concurrent_envs=4,
    env_name="cleanops_env",
)
|
|
|
|
@app.get("/demo/compare", include_in_schema=False)
def demo_compare(
    task_id: str = "customer_contacts_easy",
    table_name: str | None = None,
    seed: int | None = None,
) -> JSONResponse:
    """Return a before/after preview of one table of a task as JSON.

    Args:
        task_id: Identifier passed to ``get_task_spec``.
        table_name: Table to preview. When it is not one of the task's dirty
            tables (including when it is ``None``), the table returned by
            ``first_table_name`` is used instead.
        seed: Optional seed forwarded to ``_seed_preview_rows`` to vary the
            row ordering of the preview; echoed back as ``requested_seed``.

    Returns:
        A ``JSONResponse`` carrying the task id/title, the selected table,
        the available table names, the sorted union of column names, at most
        four dirty ("before") and gold ("after") rows, the full row counts of
        both tables, and the task's solution operation ids.
    """
    spec = get_task_spec(task_id)
    dirty_tables = spec.dirty_tables

    # Fall back to the task's first table when the requested one is unknown.
    if table_name in dirty_tables:
        chosen_table = table_name
    else:
        chosen_table = first_table_name(spec)

    key_column = spec.primary_keys[chosen_table]
    dirty_preview = _seed_preview_rows(dirty_tables[chosen_table], key_column, chosen_table, seed)
    gold_preview = _seed_preview_rows(spec.gold_tables[chosen_table], key_column, chosen_table, seed)

    # Union of the keys of every row on either side, so both tables can be
    # rendered against one shared header.
    seen_columns: set[str] = set()
    for row in dirty_preview + gold_preview:
        seen_columns.update(row)

    payload = {
        "task_id": spec.task_id,
        "task_title": spec.title,
        "table_name": chosen_table,
        "requested_seed": seed,
        "available_tables": list(dirty_tables.keys()),
        "columns": sorted(seen_columns),
        # Only the first four rows are shipped; the counts describe the full tables.
        "before_rows": dirty_preview[:4],
        "after_rows": gold_preview[:4],
        "before_row_count": len(dirty_preview),
        "after_row_count": len(gold_preview),
        "solution_operation_ids": list(spec.solution_operation_ids),
    }
    return JSONResponse(payload)
|
|
|
|
def _seed_preview_rows(
    rows: list[dict[str, str]],
    primary_key: str,
    table_name: str,
    seed: int | None,
) -> list[dict[str, str]]:
    """Order ``rows`` for the preview, optionally shuffling them by ``seed``.

    The rows are first ordered with ``sorted_rows(rows, primary_key)``. With
    no seed, or with at most one row, that ordering is returned as-is.
    Otherwise a deep copy of it is shuffled with a ``random.Random`` seeded
    from the (non-negative) seed plus the sum of the code points of
    ``table_name``, so the result is reproducible for a given seed and table
    name.
    """
    baseline = sorted_rows(rows, primary_key)
    if seed is not None and len(baseline) > 1:
        # Shuffle a copy so the list handed back by sorted_rows is untouched.
        preview = copy.deepcopy(baseline)
        # Negative seeds are clamped to zero; the table name shifts the seed.
        table_offset = sum(map(ord, table_name))
        rng = random.Random(max(0, int(seed)) + table_offset)
        rng.shuffle(preview)
        return preview
    return baseline
|
|
|
|
@app.get("/", include_in_schema=False)
def root() -> HTMLResponse:
    """Serve the single-page demo UI for the CleanOps environment.

    The page is a static HTML document with inline CSS and JavaScript. Its
    script calls ``GET /health`` for the status pill, ``POST /reset`` to load
    a task snapshot, and ``GET /demo/compare`` for the before/after tables.

    Returns:
        An ``HTMLResponse`` whose body is the complete page.
    """
    return HTMLResponse(
        """
<!doctype html>
<html lang="en">
  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>CleanOps OpenEnv</title>
    <style>
      :root {
        color-scheme: light;
        --bg: #f8fbff;
        --bg-2: #eef6ff;
        --panel: rgba(255, 255, 255, 0.92);
        --panel-strong: #ffffff;
        --panel-soft: #f4f8ff;
        --line: rgba(148, 163, 184, 0.24);
        --line-strong: rgba(148, 163, 184, 0.34);
        --text: #22314d;
        --muted: #64748b;
        --accent: #4f9cf9;
        --accent-2: #f59ac2;
        --success: #22c55e;
        --accent-3: #7dd3fc;
        --danger: #dc6b8a;
        --warning: #d9a441;
        --shadow: 0 24px 80px rgba(113, 140, 177, 0.20);
        --radius: 22px;
      }
      html,
      body {
        font-family: Inter, Arial, sans-serif;
        margin: 0;
        min-height: 100vh;
        background:
          radial-gradient(circle at top left, rgba(79, 156, 249, 0.18), transparent 28%),
          radial-gradient(circle at top right, rgba(245, 154, 194, 0.16), transparent 24%),
          linear-gradient(180deg, #fdfcff 0%, #f4f9ff 48%, #eef6ff 100%);
        color: var(--text);
        overflow-x: hidden;
      }
      * {
        box-sizing: border-box;
      }
      .shell {
        width: min(1180px, 100%);
        max-width: 1180px;
        margin: 0 auto;
        padding: 28px 20px 56px;
      }
      .topbar {
        display: flex;
        justify-content: space-between;
        align-items: center;
        gap: 18px;
        margin-bottom: 18px;
      }
      .brand {
        display: flex;
        align-items: center;
        gap: 14px;
      }
      .brand-mark {
        width: 42px;
        height: 42px;
        border-radius: 14px;
        display: grid;
        place-items: center;
        font-size: 20px;
        background: linear-gradient(135deg, rgba(79, 156, 249, 0.18), rgba(245, 154, 194, 0.18));
        border: 1px solid rgba(148, 163, 184, 0.20);
      }
      .brand-copy strong {
        display: block;
        font-size: 15px;
        letter-spacing: 0.01em;
      }
      .brand-copy span {
        color: var(--muted);
        font-size: 13px;
      }
      .health {
        display: inline-flex;
        align-items: center;
        gap: 10px;
        padding: 10px 14px;
        border-radius: 999px;
        background: rgba(255, 255, 255, 0.72);
        border: 1px solid var(--line);
        color: var(--muted);
        font-size: 13px;
        box-shadow: 0 8px 24px rgba(148, 163, 184, 0.10);
      }
      .health-dot {
        width: 10px;
        height: 10px;
        border-radius: 999px;
        background: var(--warning);
        box-shadow: 0 0 0 5px rgba(251, 191, 36, 0.10);
      }
      .health.ready .health-dot {
        background: var(--success);
        box-shadow: 0 0 0 5px rgba(34, 197, 94, 0.12);
      }
      .health.error .health-dot {
        background: var(--danger);
        box-shadow: 0 0 0 5px rgba(248, 113, 113, 0.10);
      }
      .hero {
        position: relative;
        overflow: hidden;
        border-radius: 30px;
        padding: 34px;
        background:
          linear-gradient(135deg, rgba(79, 156, 249, 0.20), rgba(245, 154, 194, 0.16)),
          rgba(255, 255, 255, 0.82);
        border: 1px solid var(--line);
        box-shadow: var(--shadow);
        display: grid;
        grid-template-columns: 1.15fr 0.85fr;
        gap: 26px;
      }
      .hero::after {
        content: "";
        position: absolute;
        inset: auto -10% -35% auto;
        width: 360px;
        height: 360px;
        border-radius: 999px;
        background: radial-gradient(circle, rgba(245, 154, 194, 0.20), transparent 60%);
        pointer-events: none;
      }
      .badge-row {
        display: flex;
        gap: 9px;
        flex-wrap: wrap;
        margin-bottom: 18px;
      }
      .badge {
        display: inline-flex;
        align-items: center;
        gap: 7px;
        font-size: 12px;
        padding: 7px 11px;
        border-radius: 999px;
        background: rgba(255, 255, 255, 0.72);
        border: 1px solid rgba(148, 163, 184, 0.18);
        color: #35527d;
      }
      h1 {
        margin: 0 0 14px;
        font-size: 46px;
        line-height: 1.02;
        letter-spacing: -0.03em;
      }
      .hero p {
        margin: 0;
        font-size: 17px;
        color: #5a708d;
        max-width: 58ch;
      }
      .hero-actions {
        display: flex;
        gap: 12px;
        flex-wrap: wrap;
        margin-top: 22px;
      }
      .demo-controls {
        margin-top: 18px;
        display: grid;
        grid-template-columns: 1fr auto auto;
        gap: 10px;
        align-items: end;
      }
      .field {
        display: grid;
        gap: 6px;
      }
      .field label {
        font-size: 12px;
        color: var(--muted);
        font-weight: 600;
      }
      .field input,
      .field select {
        width: 100%;
        appearance: none;
        border: 1px solid var(--line);
        border-radius: 14px;
        background: rgba(255, 255, 255, 0.88);
        color: #294668;
        padding: 12px 14px;
        font-size: 14px;
        font-family: inherit;
      }
      .field input:focus,
      .field select:focus {
        outline: 2px solid rgba(79, 156, 249, 0.20);
        border-color: rgba(79, 156, 249, 0.35);
      }
      .run-feedback {
        margin-top: 12px;
        display: inline-flex;
        align-items: center;
        gap: 10px;
        min-height: 20px;
        color: #426284;
        font-size: 13px;
        font-weight: 600;
      }
      .run-feedback.loading::before,
      .run-feedback.success::before,
      .run-feedback.error::before {
        content: "";
        width: 10px;
        height: 10px;
        border-radius: 999px;
        display: inline-block;
      }
      .run-feedback.loading::before {
        background: var(--accent);
        box-shadow: 0 0 0 5px rgba(79, 156, 249, 0.10);
      }
      .run-feedback.success::before {
        background: #4fb58d;
        box-shadow: 0 0 0 5px rgba(79, 181, 141, 0.10);
      }
      .run-feedback.error::before {
        background: var(--danger);
        box-shadow: 0 0 0 5px rgba(220, 107, 138, 0.10);
      }
      .hero-note {
        margin-top: 18px;
        color: var(--muted);
        font-size: 13px;
      }
      .hero-card {
        background: rgba(255, 255, 255, 0.82);
        border: 1px solid var(--line);
        border-radius: 24px;
        padding: 20px;
        display: flex;
        flex-direction: column;
        gap: 16px;
      }
      .hero-card h2 {
        margin: 0;
        font-size: 16px;
        color: #26436b;
      }
      .hero-card p,
      .hero-card li {
        font-size: 14px;
        color: var(--muted);
      }
      .stat-grid {
        display: grid;
        grid-template-columns: repeat(2, minmax(0, 1fr));
        gap: 12px;
      }
      .stat {
        background: rgba(244, 248, 255, 0.92);
        border: 1px solid var(--line);
        border-radius: 18px;
        padding: 14px;
      }
      .stat span {
        display: block;
        font-size: 12px;
        color: var(--muted);
        margin-bottom: 6px;
      }
      .stat strong {
        font-size: 18px;
        color: #2b466e;
      }
      .grid {
        display: grid;
        grid-template-columns: 1fr;
        gap: 22px;
        margin-top: 22px;
        align-items: start;
      }
      .hero > *,
      .grid > *,
      .subgrid > *,
      .compare-grid > *,
      .panel,
      .mini-panel,
      .table-wrap,
      .compare-card {
        min-width: 0;
      }
      .panel {
        background: var(--panel);
        border: 1px solid var(--line);
        border-radius: 24px;
        box-shadow: var(--shadow);
        padding: 24px;
        backdrop-filter: blur(10px);
      }
      .panel h2 {
        margin: 0 0 10px;
        font-size: 21px;
        letter-spacing: -0.02em;
      }
      .panel p {
        margin: 0 0 14px;
        color: var(--muted);
      }
      .subgrid {
        display: grid;
        grid-template-columns: 1fr;
        gap: 16px;
        margin-top: 18px;
        align-items: start;
      }
      .button-row {
        display: flex;
        gap: 10px;
        flex-wrap: wrap;
        margin: 18px 0 18px;
      }
      button,
      .link-btn {
        appearance: none;
        border: 1px solid transparent;
        border-radius: 14px;
        padding: 11px 15px;
        font-size: 14px;
        font-weight: 600;
        cursor: pointer;
        text-decoration: none;
        transition: transform 0.15s ease, opacity 0.15s ease, box-shadow 0.15s ease;
        display: inline-flex;
        align-items: center;
        justify-content: center;
      }
      button:hover,
      .link-btn:hover {
        transform: translateY(-1px);
      }
      button:disabled {
        cursor: not-allowed;
        opacity: 0.7;
        transform: none;
      }
      .primary {
        background: linear-gradient(135deg, rgba(79, 156, 249, 0.18), rgba(245, 154, 194, 0.18));
        color: #244267;
        border-color: rgba(79, 156, 249, 0.22);
        box-shadow: 0 8px 24px rgba(79, 156, 249, 0.12);
      }
      .primary.is-loading {
        background: linear-gradient(135deg, rgba(79, 156, 249, 0.24), rgba(245, 154, 194, 0.24));
        border-color: rgba(79, 156, 249, 0.32);
      }
      .secondary {
        background: rgba(244, 248, 255, 0.92);
        color: #32527d;
        border-color: var(--line);
      }
      .button-row .primary.active {
        background: linear-gradient(135deg, rgba(79, 156, 249, 0.24), rgba(245, 154, 194, 0.24));
        border-color: rgba(79, 156, 249, 0.32);
      }
      .status {
        display: inline-flex;
        align-items: center;
        gap: 8px;
        background: rgba(255, 255, 255, 0.94);
        color: #2d4a73;
        border: 1px solid var(--line);
        padding: 10px 12px;
        border-radius: 12px;
        font-size: 14px;
        font-weight: 600;
      }
      .status.error {
        color: var(--danger);
        border-color: rgba(248, 113, 113, 0.28);
      }
      .kpis {
        display: grid;
        grid-template-columns: repeat(auto-fit, minmax(180px, 1fr));
        gap: 12px;
        margin-top: 16px;
      }
      .kpi {
        background: rgba(244, 248, 255, 0.92);
        border: 1px solid var(--line);
        border-radius: 18px;
        padding: 14px;
      }
      .kpi span {
        display: block;
        font-size: 12px;
        color: var(--muted);
        margin-bottom: 6px;
      }
      .kpi strong {
        font-size: 24px;
        letter-spacing: -0.02em;
        display: block;
        line-height: 1.05;
        word-break: break-word;
        overflow-wrap: anywhere;
      }
      .kpi.task-card strong {
        font-size: clamp(18px, 2vw, 28px);
        line-height: 1.15;
      }
      .kpi small {
        display: block;
        margin-top: 8px;
        color: var(--muted);
        font-size: 12px;
        word-break: break-word;
        overflow-wrap: anywhere;
      }
      pre {
        margin: 0;
        background: #f5f8ff;
        color: #29405e;
        border-radius: 18px;
        padding: 16px;
        overflow-x: auto;
        white-space: pre-wrap;
        overflow-wrap: anywhere;
        font-size: 13px;
        line-height: 1.5;
        border: 1px solid var(--line);
      }
      code {
        background: rgba(79, 156, 249, 0.10);
        color: #35527d;
        border-radius: 8px;
        padding: 2px 7px;
      }
      .mini-panel {
        background: rgba(255, 255, 255, 0.80);
        border: 1px solid var(--line);
        border-radius: 18px;
        padding: 16px;
      }
      .mini-panel h3 {
        margin: 0 0 10px;
        font-size: 14px;
        color: #294668;
      }
      .chips {
        display: flex;
        flex-wrap: wrap;
        gap: 8px;
      }
      .chip {
        display: inline-flex;
        align-items: center;
        gap: 6px;
        padding: 8px 10px;
        border-radius: 999px;
        background: rgba(79, 156, 249, 0.08);
        border: 1px solid var(--line);
        color: #35527d;
        font-size: 12px;
      }
      .chip.warn {
        color: #9b5a72;
        border-color: rgba(245, 154, 194, 0.24);
        background: rgba(245, 154, 194, 0.10);
      }
      .table-wrap {
        overflow: auto;
        border: 1px solid var(--line);
        border-radius: 18px;
        background: rgba(255, 255, 255, 0.84);
        width: 100%;
        max-width: 100%;
      }
      .compare-grid {
        display: grid;
        grid-template-columns: 1fr;
        gap: 12px;
      }
      .compare-card {
        background: rgba(255, 255, 255, 0.82);
        border: 1px solid var(--line);
        border-radius: 16px;
        overflow: hidden;
      }
      .compare-card header {
        display: flex;
        justify-content: space-between;
        gap: 10px;
        align-items: center;
        padding: 12px 14px;
        border-bottom: 1px solid var(--line);
        background: rgba(244, 248, 255, 0.92);
      }
      .compare-card strong {
        color: #294668;
        font-size: 13px;
      }
      .compare-card span {
        color: var(--muted);
        font-size: 12px;
      }
      table {
        width: max-content;
        min-width: 100%;
        border-collapse: collapse;
      }
      th, td {
        padding: 12px 14px;
        text-align: left;
        border-bottom: 1px solid rgba(148, 163, 184, 0.12);
        font-size: 13px;
        vertical-align: top;
      }
      th {
        position: sticky;
        top: 0;
        background: rgba(244, 248, 255, 0.98);
        color: #294668;
        font-weight: 600;
      }
      td {
        color: #49617f;
      }
      .endpoint-list {
        display: grid;
        gap: 10px;
      }
      .endpoint {
        display: flex;
        justify-content: space-between;
        gap: 10px;
        align-items: center;
        background: rgba(244, 248, 255, 0.92);
        border: 1px solid var(--line);
        border-radius: 16px;
        padding: 12px 14px;
      }
      .endpoint small {
        color: var(--muted);
      }
      a {
        color: var(--accent);
      }
      .footer {
        margin-top: 20px;
        color: var(--muted);
        font-size: 13px;
      }
      .stack {
        display: grid;
        gap: 16px;
      }
      @media (max-width: 860px) {
        .topbar,
        .hero,
        .grid,
        .subgrid {
          grid-template-columns: 1fr;
        }
        .demo-controls {
          grid-template-columns: 1fr;
        }
        .kpis,
        .stat-grid {
          grid-template-columns: 1fr;
        }
        .compare-grid {
          grid-template-columns: 1fr;
        }
        h1 {
          font-size: 34px;
        }
      }
    </style>
  </head>
  <body>
    <div class="shell">
      <div class="topbar">
        <div class="brand">
          <div class="brand-mark">🧹</div>
          <div class="brand-copy">
            <strong>CleanOps OpenEnv</strong>
            <span>Operational data cleaning benchmark</span>
          </div>
        </div>
        <div id="health" class="health">
          <span class="health-dot"></span>
          <span id="healthText">Checking live API status...</span>
        </div>
      </div>

      <section class="hero">
        <div>
          <div class="badge-row">
            <div class="badge">OpenEnv Benchmark</div>
            <div class="badge">Real-world Data Cleaning</div>
            <div class="badge">Deterministic Graders</div>
          </div>
          <h1>See real data cleaning tasks working live.</h1>
          <p>
            CleanOps simulates the kind of operational cleanup analysts
            actually do before data reaches a CRM, warehouse, or billing
            system. The UI below runs the same hosted benchmark API used
            by the evaluator.
          </p>
          <div class="hero-actions">
            <button class="primary active" data-task="customer_contacts_easy">Try Easy Task</button>
            <button class="primary" data-task="orders_reconciliation_medium">Try Medium Task</button>
            <button class="primary" data-task="crm_migration_hard">Try Hard Task</button>
          </div>
          <div class="demo-controls">
            <div class="field">
              <label for="taskSelect">Choose task</label>
              <select id="taskSelect">
                <option value="customer_contacts_easy">customer_contacts_easy</option>
                <option value="orders_reconciliation_medium">orders_reconciliation_medium</option>
                <option value="crm_migration_hard">crm_migration_hard</option>
              </select>
            </div>
            <div class="field">
              <label for="seedInput">Seed</label>
              <input id="seedInput" type="number" min="0" step="1" value="7" />
            </div>
            <button id="runCustomTask" class="secondary" type="button">Run Selected Task</button>
          </div>
          <div id="runFeedback" class="run-feedback">Ready to run a live benchmark task.</div>
          <div class="hero-note">
            Changing the seed changes the visible preview ordering and compare view. It does not change the task score itself.
          </div>
          <div class="hero-note">
            Fixed tasks, typed actions, shaped rewards, and reproducible graders.
          </div>
        </div>
        <div class="hero-card">
          <h2>At a glance</h2>
          <div class="stat-grid">
            <div class="stat">
              <span>Task ladder</span>
              <strong>Easy → Hard</strong>
            </div>
            <div class="stat">
              <span>Core API</span>
              <strong>/reset /step /state</strong>
            </div>
            <div class="stat">
              <span>Domain</span>
              <strong>CRM + Orders + Billing</strong>
            </div>
            <div class="stat">
              <span>Reward signal</span>
              <strong>Dense + partial progress</strong>
            </div>
          </div>
          <p>
            This homepage is a thin demo over the live environment. It
            doesn’t fake results: every task button calls the deployed API.
          </p>
        </div>
      </section>

      <section class="grid">
        <div class="panel">
          <h2>Live Task Snapshot</h2>
          <p>
            The cards and table below are populated from a real
            <code>POST /reset</code> response. Use the task buttons above to
            switch between benchmark scenarios, or choose your own task and seed.
          </p>

          <div class="kpis">
            <div class="kpi task-card">
              <span>Task</span>
              <strong id="taskId">-</strong>
              <small id="taskMeta">-</small>
            </div>
            <div class="kpi">
              <span>Seed Used</span>
              <strong id="seedUsed">-</strong>
            </div>
            <div class="kpi">
              <span>Initial Score</span>
              <strong id="score">-</strong>
            </div>
            <div class="kpi">
              <span>Validation Issues</span>
              <strong id="issues">-</strong>
            </div>
            <div class="kpi">
              <span>Focus Table Rows</span>
              <strong id="rowCount">-</strong>
            </div>
          </div>

          <div class="subgrid">
            <div class="stack">
              <div class="mini-panel">
                <h3>Objective</h3>
                <div id="objective" style="color: var(--text); line-height: 1.55;">
                  Loading...
                </div>
              </div>
              <div class="mini-panel">
                <h3>Validation Issues</h3>
                <div id="issueChips" class="chips"></div>
              </div>
              <div class="mini-panel">
                <h3>Available Operations</h3>
                <div id="operationChips" class="chips"></div>
              </div>
            </div>
            <div class="stack">
              <div class="mini-panel">
                <h3>Before / After Cleaning</h3>
                <div id="compareMeta" style="color: var(--muted); margin-bottom: 12px;">
                  Loading compare view...
                </div>
                <div id="solutionChips" class="chips" style="margin-bottom: 12px;"></div>
                <div class="compare-grid">
                  <div class="compare-card">
                    <header>
                      <strong>Dirty input</strong>
                      <span id="beforeMeta">-</span>
                    </header>
                    <div class="table-wrap" style="border: none; border-radius: 0; background: transparent;">
                      <table>
                        <thead>
                          <tr id="beforeHeadRow"></tr>
                        </thead>
                        <tbody id="beforeBody"></tbody>
                      </table>
                    </div>
                  </div>
                  <div class="compare-card">
                    <header>
                      <strong>Expected clean output</strong>
                      <span id="afterMeta">-</span>
                    </header>
                    <div class="table-wrap" style="border: none; border-radius: 0; background: transparent;">
                      <table>
                        <thead>
                          <tr id="afterHeadRow"></tr>
                        </thead>
                        <tbody id="afterBody"></tbody>
                      </table>
                    </div>
                  </div>
                </div>
              </div>
              <div class="mini-panel">
                <h3>Focus Table Preview</h3>
                <div class="table-wrap">
                  <table>
                    <thead>
                      <tr id="tableHeadRow"></tr>
                    </thead>
                    <tbody id="tableBody"></tbody>
                  </table>
                </div>
              </div>
              <div class="mini-panel">
                <h3>Raw Demo Payload</h3>
                <pre id="output">Loading live task data...</pre>
              </div>
            </div>
          </div>
        </div>

        <div class="panel">
          <h2>API &amp; Submission Notes</h2>
          <p>
            The evaluator checks these endpoints directly. This page exists
            to make the environment easier to inspect visually.
          </p>
          <div class="endpoint-list">
            <div class="endpoint">
              <div>
                <strong>GET /health</strong><br />
                <small>Service liveness check</small>
              </div>
              <a href="/health">Open</a>
            </div>
            <div class="endpoint">
              <div>
                <strong>GET /schema</strong><br />
                <small>Typed OpenEnv schema</small>
              </div>
              <a href="/schema">Open</a>
            </div>
            <div class="endpoint">
              <div>
                <strong>GET /docs</strong><br />
                <small>Interactive FastAPI docs</small>
              </div>
              <a href="/docs">Open</a>
            </div>
            <div class="endpoint">
              <div>
                <strong>POST /reset</strong><br />
                <small>Start a task episode</small>
              </div>
              <code>live</code>
            </div>
            <div class="endpoint">
              <div>
                <strong>POST /step</strong><br />
                <small>Apply a typed action</small>
              </div>
              <code>live</code>
            </div>
            <div class="endpoint">
              <div>
                <strong>GET /state</strong><br />
                <small>Inspect current environment state</small>
              </div>
              <code>live</code>
            </div>
          </div>

          <div class="mini-panel" style="margin-top: 18px;">
            <h3>Sample curl</h3>
            <pre>curl -X POST /reset -H "Content-Type: application/json" -d '{"task_id":"customer_contacts_easy","seed":7}'</pre>
          </div>

          <div class="footer">
            Fixed tasks plus deterministic graders keep evaluation reproducible.
          </div>
        </div>
      </section>
    </div>

    <script>
      const healthEl = document.getElementById("health");
      const healthTextEl = document.getElementById("healthText");
      const outputEl = document.getElementById("output");
      const taskEl = document.getElementById("taskId");
      const taskMetaEl = document.getElementById("taskMeta");
      const seedUsedEl = document.getElementById("seedUsed");
      const taskSelectEl = document.getElementById("taskSelect");
      const seedInputEl = document.getElementById("seedInput");
      const runCustomTaskEl = document.getElementById("runCustomTask");
      const runFeedbackEl = document.getElementById("runFeedback");
      const scoreEl = document.getElementById("score");
      const issuesEl = document.getElementById("issues");
      const rowCountEl = document.getElementById("rowCount");
      const objectiveEl = document.getElementById("objective");
      const issueChipsEl = document.getElementById("issueChips");
      const operationChipsEl = document.getElementById("operationChips");
      const tableHeadRowEl = document.getElementById("tableHeadRow");
      const tableBodyEl = document.getElementById("tableBody");
      const compareMetaEl = document.getElementById("compareMeta");
      const solutionChipsEl = document.getElementById("solutionChips");
      const beforeMetaEl = document.getElementById("beforeMeta");
      const afterMetaEl = document.getElementById("afterMeta");
      const beforeHeadRowEl = document.getElementById("beforeHeadRow");
      const beforeBodyEl = document.getElementById("beforeBody");
      const afterHeadRowEl = document.getElementById("afterHeadRow");
      const afterBodyEl = document.getElementById("afterBody");
      const taskButtons = Array.from(document.querySelectorAll("button[data-task]"));
      let isRunning = false;

      function setHealth(kind, message) {
        healthEl.className = `health ${kind}`;
        healthTextEl.textContent = message;
      }

      function setRunFeedback(kind, message) {
        runFeedbackEl.className = `run-feedback ${kind}`;
        runFeedbackEl.textContent = message;
      }

      function clearChildren(node) {
        while (node.firstChild) {
          node.removeChild(node.firstChild);
        }
      }

      function chip(text, className = "chip") {
        const el = document.createElement("div");
        el.className = className;
        el.textContent = text;
        return el;
      }

      function renderTableTo(headEl, bodyEl, columns, rows) {
        clearChildren(headEl);
        clearChildren(bodyEl);
        columns.forEach((column) => {
          const th = document.createElement("th");
          th.textContent = column;
          headEl.appendChild(th);
        });
        rows.forEach((row) => {
          const tr = document.createElement("tr");
          columns.forEach((column) => {
            const td = document.createElement("td");
            td.textContent = row[column] ?? "";
            tr.appendChild(td);
          });
          bodyEl.appendChild(tr);
        });
      }

      function renderTable(columns, rows) {
        renderTableTo(tableHeadRowEl, tableBodyEl, columns, rows);
      }

      function setActiveTask(taskId) {
        taskSelectEl.value = taskId;
        taskButtons.forEach((button) => {
          button.classList.toggle("active", button.dataset.task === taskId);
        });
      }

      function setRunningState(taskId, running) {
        isRunning = running;
        taskButtons.forEach((button) => {
          const isSelected = button.dataset.task === taskId;
          button.disabled = running;
          button.classList.toggle("is-loading", running && isSelected);
          button.textContent = running && isSelected
            ? "Loading..."
            : button.dataset.task === "customer_contacts_easy"
              ? "Try Easy Task"
              : button.dataset.task === "orders_reconciliation_medium"
                ? "Try Medium Task"
                : "Try Hard Task";
        });
        taskSelectEl.disabled = running;
        seedInputEl.disabled = running;
        runCustomTaskEl.disabled = running;
        runCustomTaskEl.textContent = running ? "Running..." : "Run Selected Task";
      }

      async function loadHealth() {
        try {
          const response = await fetch("/health");
          if (!response.ok) throw new Error(`HTTP ${response.status}`);
          const data = await response.json();
          setHealth("ready", `API healthy: ${data.status}`);
        } catch (error) {
          setHealth("error", `API check failed: ${error.message}`);
        }
      }

      async function runTask(taskId, seed = 7) {
        if (isRunning) {
          return;
        }
        setActiveTask(taskId);
        setRunningState(taskId, true);
        setRunFeedback("loading", `Running ${taskId} with seed ${seed}...`);
        outputEl.textContent = "Loading...";
        objectiveEl.textContent = "Loading...";
        clearChildren(issueChipsEl);
        clearChildren(operationChipsEl);
        clearChildren(solutionChipsEl);
        try {
          const response = await fetch("/reset", {
            method: "POST",
            headers: { "Content-Type": "application/json" },
            body: JSON.stringify({ task_id: taskId, seed }),
          });
          if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
          }
          const payload = await response.json();
          const observation = payload.observation || {};
          const usedSeed = observation.requested_seed ?? seed;
          taskEl.textContent = observation.task_title || observation.task_id || taskId;
          taskMetaEl.textContent = observation.task_id || taskId;
          seedUsedEl.textContent = String(usedSeed);
          scoreEl.textContent = String(observation.quality_score ?? "-");
          issuesEl.textContent = String((observation.validation_issues || []).length);
          rowCountEl.textContent = String((observation.focus_table?.rows || []).length);
          objectiveEl.textContent = observation.objective || "-";

          const validationIssues = observation.validation_issues || [];
          if (validationIssues.length === 0) {
            issueChipsEl.appendChild(chip("No validation issues", "chip"));
          } else {
            validationIssues.slice(0, 6).forEach((issue) => {
              issueChipsEl.appendChild(chip(`${issue.table_name}.${issue.column_name}: ${(issue.row_ids || []).length}`, "chip warn"));
            });
          }

          const operations = observation.available_operations || [];
          operations.slice(0, 8).forEach((operation) => {
            operationChipsEl.appendChild(chip(operation.operation_id));
          });

          const columns = observation.focus_table?.columns || [];
          const rows = (observation.focus_table?.rows || []).slice(0, 4);
          renderTable(columns, rows);

          const compareResponse = await fetch(`/demo/compare?task_id=${encodeURIComponent(taskId)}&table_name=${encodeURIComponent(observation.focus_table?.name || "")}&seed=${encodeURIComponent(String(seed))}`);
          if (!compareResponse.ok) {
            throw new Error(`Compare HTTP ${compareResponse.status}`);
          }
          const comparePayload = await compareResponse.json();
          compareMetaEl.textContent = `${comparePayload.task_title} • table: ${comparePayload.table_name} • seed: ${comparePayload.requested_seed ?? usedSeed}`;
          beforeMetaEl.textContent = `${comparePayload.before_row_count} rows`;
          afterMetaEl.textContent = `${comparePayload.after_row_count} rows`;
          renderTableTo(beforeHeadRowEl, beforeBodyEl, comparePayload.columns || [], comparePayload.before_rows || []);
          renderTableTo(afterHeadRowEl, afterBodyEl, comparePayload.columns || [], comparePayload.after_rows || []);
          (comparePayload.solution_operation_ids || []).forEach((operationId) => {
            solutionChipsEl.appendChild(chip(operationId));
          });

          outputEl.textContent = JSON.stringify(
            {
              task_id: observation.task_id,
              requested_seed: usedSeed,
              difficulty: observation.difficulty,
              objective: observation.objective,
              quality_score: observation.quality_score,
              remaining_steps: observation.remaining_steps,
              validation_issue_count: (observation.validation_issues || []).length,
              focus_table: observation.focus_table?.name,
              available_operations: (observation.available_operations || []).map((item) => item.operation_id).slice(0, 8),
            },
            null,
            2
          );
          setRunFeedback("success", `Loaded ${observation.task_title || taskId} successfully with seed ${usedSeed}.`);
        } catch (error) {
          outputEl.textContent = `Request failed: ${error.message}`;
          objectiveEl.textContent = "Request failed.";
          taskEl.textContent = "Unavailable";
          taskMetaEl.textContent = taskId;
          seedUsedEl.textContent = "-";
          scoreEl.textContent = "-";
          issuesEl.textContent = "-";
          rowCountEl.textContent = "-";
          clearChildren(tableHeadRowEl);
          clearChildren(tableBodyEl);
          clearChildren(beforeHeadRowEl);
          clearChildren(beforeBodyEl);
          clearChildren(afterHeadRowEl);
          clearChildren(afterBodyEl);
          compareMetaEl.textContent = "Compare view unavailable.";
          beforeMetaEl.textContent = "-";
          afterMetaEl.textContent = "-";
          setRunFeedback("error", `Run failed: ${error.message}`);
        } finally {
          setRunningState(taskId, false);
        }
      }

      taskButtons.forEach((button) => {
        button.addEventListener("click", () => {
          const seed = Number.parseInt(seedInputEl.value || "7", 10);
          runTask(button.dataset.task, Number.isNaN(seed) ? 7 : seed);
        });
      });

      runCustomTaskEl.addEventListener("click", () => {
        const seed = Number.parseInt(seedInputEl.value || "7", 10);
        runTask(taskSelectEl.value, Number.isNaN(seed) ? 7 : seed);
      });

      loadHealth();
      runTask("customer_contacts_easy", 7);
    </script>
  </body>
</html>
"""
    )
|
|
|
|
def main(host: str = "0.0.0.0", port: int = 8000) -> None:
    """Serve ``app`` with uvicorn on the given interface and port.

    Args:
        host: Address to bind to; the default listens on all interfaces.
        port: TCP port to listen on.
    """
    # Imported here rather than at module level, so uvicorn is only needed
    # when the server is actually started through this function.
    from uvicorn import run as serve

    serve(app, host=host, port=port)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse --host/--port and start the server.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=8000)
    args = parser.parse_args()
    # The CLI defaults equal main()'s own defaults, so the parsed values can
    # always be forwarded; no separate "all defaults" branch is needed.
    main(host=args.host, port=args.port)
|
|