Spaces:
Sleeping
Sleeping
| """ | |
| AutoClean-Ai v1.0.0 — Production FastAPI Server | |
| Endpoints: | |
| Standard : POST /reset POST /step GET /state GET /health | |
| OpenEnv : GET /tasks POST /grader POST /baseline | |
| Extra : GET /leaderboard POST /leaderboard/submit GET /datasets | |
| """ | |
# Standard-library imports.
| import sys, os, uuid, logging, dataclasses, enum, time, threading | |
# Prepend the parent of this file's directory to sys.path (index 0, so it
# wins over other entries). This must run before the bare-name imports of
# models / environment / metrics / tasks further down.
# NOTE(review): presumably those modules live one level above this file --
# confirm against the repository layout.
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
# FastAPI application, response, and middleware classes.
| from fastapi import FastAPI, HTTPException, Header, Request | |
| from fastapi.responses import JSONResponse, RedirectResponse, HTMLResponse, Response, FileResponse | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from contextlib import asynccontextmanager | |
| from typing import Dict, Any, Optional, List | |
# Imports below are resolved by bare module name; keep them after the
# sys.path.insert call above.
| from models import DataCleaningAction, DataCleaningObservation, DataCleaningState | |
| from environment import DataCleaningEnvironment | |
| from metrics import get_tracker | |
| from tasks import ( | |
| ALL_TASKS, get_task, task_id_for_difficulty, compute_task_score, ACTION_SCHEMA, | |
| ) | |
# Configure root logging at INFO with "time - logger - level - message" lines.
# logging.basicConfig does nothing if the root logger already has handlers.
| logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s") | |
# Module-level logger named after this module.
| logger = logging.getLogger(__name__) | |
| # ─────────────────────────────────────────────────────────────────────────────── | |
| # ─────────────────────────────────────────────────────────────────────────────── | |
| # POLISHED DEMO PAGE | |
| # ─────────────────────────────────────────────────────────────────────────────── | |
| STUNNING_DOCS_HTML = """<!DOCTYPE html> | |
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>DataQualityGuard-Env Β· OpenEnv</title> | |
| <link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 100 100'><rect width='100' height='100' rx='20' fill='%23080c14'/><text x='50' y='68' font-size='55' text-anchor='middle' fill='%23f59e0b' font-family='sans-serif' font-weight='bold'>H</text></svg>"> | |
| <link href="https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@300;400;500;600;700&family=Fira+Code:wght@400;500&display=swap" rel="stylesheet"> | |
| <style> | |
| :root { | |
| --bg: #080c14; | |
| --surface: #0d1321; | |
| --surface2: #111827; | |
| --border: #1e2d45; | |
| --border2: #243450; | |
| --text: #e2eaf5; | |
| --muted: #6b8aad; | |
| --amber: #f59e0b; | |
| --amber-dim: #78490a; | |
| --teal: #2dd4bf; | |
| --teal-dim: #0f4f49; | |
| --red: #f87171; | |
| --red-dim: #4c1515; | |
| --blue: #60a5fa; | |
| --blue-dim: #1e3a5f; | |
| --green: #4ade80; | |
| --green-dim: #14532d; | |
| --font: 'Space Grotesk', system-ui, sans-serif; | |
| --mono: 'Fira Code', monospace; | |
| } | |
| *, *::before, *::after { margin: 0; padding: 0; box-sizing: border-box; } | |
| html { scroll-behavior: smooth; } | |
| body { | |
| font-family: var(--font); | |
| background: var(--bg); | |
| color: var(--text); | |
| line-height: 1.6; | |
| min-height: 100vh; | |
| overflow-x: hidden; | |
| } | |
| /* ββ NOISE TEXTURE OVERLAY ββ */ | |
| body::before { | |
| content: ''; | |
| position: fixed; | |
| inset: 0; | |
| background-image: url("data:image/svg+xml,%3Csvg viewBox='0 0 256 256' xmlns='http://www.w3.org/2000/svg'%3E%3Cfilter id='noise'%3E%3CfeTurbulence type='fractalNoise' baseFrequency='0.9' numOctaves='4' stitchTiles='stitch'/%3E%3C/filter%3E%3Crect width='100%25' height='100%25' filter='url(%23noise)' opacity='0.04'/%3E%3C/svg%3E"); | |
| pointer-events: none; | |
| z-index: 0; | |
| opacity: 0.4; | |
| } | |
| /* ββ HERO ββ */ | |
| .hero { | |
| position: relative; | |
| padding: 64px 40px 56px; | |
| background: linear-gradient(135deg, #0a1628 0%, #080c14 50%, #0a1420 100%); | |
| border-bottom: 1px solid var(--border); | |
| overflow: hidden; | |
| } | |
| .hero::after { | |
| content: ''; | |
| position: absolute; | |
| top: -60px; right: -60px; | |
| width: 400px; height: 400px; | |
| background: radial-gradient(circle, rgba(245,158,11,0.07) 0%, transparent 70%); | |
| pointer-events: none; | |
| } | |
| .hero::before { | |
| content: ''; | |
| position: absolute; | |
| bottom: -80px; left: 30%; | |
| width: 500px; height: 300px; | |
| background: radial-gradient(ellipse, rgba(45,212,191,0.05) 0%, transparent 70%); | |
| pointer-events: none; | |
| } | |
| .hero-inner { max-width: 1100px; margin: 0 auto; position: relative; z-index: 1; } | |
| .hero-badge { | |
| display: inline-flex; align-items: center; gap: 6px; | |
| background: rgba(245,158,11,0.1); | |
| border: 1px solid rgba(245,158,11,0.25); | |
| color: var(--amber); | |
| padding: 4px 12px; border-radius: 100px; | |
| font-size: 11px; font-weight: 600; letter-spacing: 1px; | |
| text-transform: uppercase; margin-bottom: 20px; | |
| } | |
| .hero-badge::before { content: 'β'; font-size: 8px; animation: pulse 2s infinite; } | |
| @keyframes pulse { 0%,100%{opacity:1} 50%{opacity:0.3} } | |
| .hero h1 { | |
| font-size: clamp(28px, 5vw, 48px); | |
| font-weight: 700; | |
| letter-spacing: -1px; | |
| line-height: 1.1; | |
| margin-bottom: 16px; | |
| } | |
| .hero h1 .accent { color: var(--amber); } | |
| .hero h1 .accent2 { color: var(--teal); } | |
| .hero-sub { | |
| font-size: 16px; color: var(--muted); max-width: 600px; | |
| margin-bottom: 36px; font-weight: 400; line-height: 1.7; | |
| } | |
| .stats-row { | |
| display: flex; gap: 32px; flex-wrap: wrap; | |
| } | |
| .stat-pill { | |
| display: flex; flex-direction: column; | |
| animation: fadeUp 0.6s ease both; | |
| } | |
| .stat-pill:nth-child(2) { animation-delay: 0.1s; } | |
| .stat-pill:nth-child(3) { animation-delay: 0.2s; } | |
| .stat-pill:nth-child(4) { animation-delay: 0.3s; } | |
| @keyframes fadeUp { from{opacity:0;transform:translateY(12px)} to{opacity:1;transform:none} } | |
| .stat-num { | |
| font-size: 32px; font-weight: 700; | |
| background: linear-gradient(135deg, var(--amber), var(--teal)); | |
| -webkit-background-clip: text; -webkit-text-fill-color: transparent; | |
| background-clip: text; | |
| line-height: 1; | |
| } | |
| .stat-lbl { font-size: 12px; color: var(--muted); margin-top: 4px; font-weight: 500; letter-spacing: 0.5px; } | |
| .ver-chip { | |
| position: absolute; top: 24px; right: 40px; | |
| font-family: var(--mono); font-size: 11px; | |
| color: var(--muted); border: 1px solid var(--border2); | |
| padding: 4px 10px; border-radius: 4px; | |
| background: rgba(13,19,33,0.8); | |
| } | |
| /* ββ NAV ββ */ | |
| nav { | |
| position: sticky; top: 0; z-index: 100; | |
| background: rgba(8,12,20,0.95); | |
| backdrop-filter: blur(12px); | |
| border-bottom: 1px solid var(--border); | |
| padding: 0 40px; | |
| } | |
| .nav-inner { | |
| max-width: 1100px; margin: 0 auto; | |
| display: flex; gap: 0; | |
| } | |
| .tab { | |
| padding: 14px 20px; | |
| font-size: 13px; font-weight: 500; | |
| color: var(--muted); cursor: pointer; | |
| border-bottom: 2px solid transparent; | |
| transition: all 0.2s; letter-spacing: 0.3px; | |
| user-select: none; | |
| } | |
| .tab:hover { color: var(--text); } | |
| .tab.active { color: var(--amber); border-bottom-color: var(--amber); } | |
| /* ββ MAIN ββ */ | |
| main { max-width: 1100px; margin: 0 auto; padding: 40px 40px 80px; } | |
| .panel { display: none; animation: fadeIn 0.3s ease; } | |
| .panel.active { display: block; } | |
| @keyframes fadeIn { from{opacity:0;transform:translateY(8px)} to{opacity:1;transform:none} } | |
| /* ββ CARDS ββ */ | |
| .card { | |
| background: var(--surface); | |
| border: 1px solid var(--border); | |
| border-radius: 12px; | |
| padding: 24px; | |
| margin-bottom: 16px; | |
| transition: border-color 0.2s; | |
| } | |
| .card:hover { border-color: var(--border2); } | |
| .card h3 { font-size: 15px; font-weight: 600; margin-bottom: 10px; } | |
| .card p { font-size: 14px; color: var(--muted); line-height: 1.7; } | |
| .card p strong { color: var(--text); } | |
| /* ββ HOW IT WORKS ββ */ | |
| .steps { display: grid; grid-template-columns: repeat(3, 1fr); gap: 16px; margin-bottom: 24px; } | |
| .step { | |
| background: var(--surface); | |
| border: 1px solid var(--border); | |
| border-radius: 12px; | |
| padding: 24px 20px; | |
| text-align: center; | |
| position: relative; | |
| } | |
| .step-icon { | |
| width: 48px; height: 48px; | |
| border-radius: 12px; | |
| display: flex; align-items: center; justify-content: center; | |
| font-size: 22px; | |
| margin: 0 auto 14px; | |
| } | |
| .step:nth-child(1) .step-icon { background: rgba(245,158,11,0.12); border: 1px solid rgba(245,158,11,0.2); } | |
| .step:nth-child(2) .step-icon { background: rgba(45,212,191,0.12); border: 1px solid rgba(45,212,191,0.2); } | |
| .step:nth-child(3) .step-icon { background: rgba(96,165,250,0.12); border: 1px solid rgba(96,165,250,0.2); } | |
| .step-num { | |
| position: absolute; top: 12px; right: 12px; | |
| font-family: var(--mono); font-size: 10px; | |
| color: var(--border2); font-weight: 500; | |
| } | |
| .step h4 { font-size: 14px; font-weight: 600; margin-bottom: 6px; } | |
| .step p { font-size: 12px; color: var(--muted); line-height: 1.6; } | |
| .step-arrow { display: flex; align-items: center; justify-content: center; color: var(--border2); font-size: 20px; } | |
| /* ββ TASK CARDS ββ */ | |
| .task-card { | |
| background: var(--surface); | |
| border: 1px solid var(--border); | |
| border-left: 3px solid; | |
| border-radius: 12px; | |
| padding: 20px 24px; | |
| margin-bottom: 14px; | |
| display: flex; gap: 16px; align-items: flex-start; | |
| } | |
| .task-card.beginner { border-left-color: var(--green); } | |
| .task-card.intermediate { border-left-color: var(--blue); } | |
| .task-card.advanced { border-left-color: var(--red); } | |
| .task-icon { font-size: 28px; flex-shrink: 0; margin-top: 2px; } | |
| .task-body { flex: 1; } | |
| .task-head { display: flex; align-items: center; gap: 10px; margin-bottom: 8px; flex-wrap: wrap; } | |
| .task-head h3 { font-size: 15px; font-weight: 600; } | |
| .diff-badge { | |
| font-size: 10px; font-weight: 700; padding: 2px 8px; | |
| border-radius: 100px; text-transform: uppercase; letter-spacing: 1px; | |
| } | |
| .diff-badge.beginner { background: var(--green-dim); color: var(--green); } | |
| .diff-badge.intermediate { background: var(--blue-dim); color: var(--blue); } | |
| .diff-badge.advanced { background: var(--red-dim); color: var(--red); } | |
| .data-count { | |
| font-family: var(--mono); font-size: 11px; | |
| color: var(--muted); margin-left: auto; | |
| } | |
| .task-body p { font-size: 13px; color: var(--muted); line-height: 1.6; } | |
| .dataset-chips { display: flex; gap: 6px; flex-wrap: wrap; margin-top: 10px; } | |
| .ds-chip { | |
| font-size: 10px; font-family: var(--mono); | |
| padding: 2px 8px; border-radius: 4px; | |
| background: var(--surface2); border: 1px solid var(--border); | |
| color: var(--muted); | |
| } | |
| /* ββ API TABLE ββ */ | |
| .api-table-wrap { overflow: auto; border-radius: 10px; border: 1px solid var(--border); } | |
| table { width: 100%; border-collapse: collapse; font-size: 13px; } | |
| th { | |
| text-align: left; padding: 12px 16px; | |
| background: var(--surface2); | |
| border-bottom: 1px solid var(--border); | |
| font-size: 11px; color: var(--muted); | |
| font-weight: 600; text-transform: uppercase; letter-spacing: 0.8px; | |
| } | |
| td { padding: 12px 16px; border-bottom: 1px solid var(--border); color: var(--text); } | |
| tr:last-child td { border-bottom: none; } | |
| tr:hover td { background: rgba(255,255,255,0.015); } | |
| .method { | |
| font-family: var(--mono); font-size: 10px; font-weight: 600; | |
| padding: 3px 8px; border-radius: 4px; letter-spacing: 0.5px; | |
| } | |
| .method.get { background: var(--green-dim); color: var(--green); } | |
| .method.post { background: var(--blue-dim); color: var(--blue); } | |
| .endpoint { font-family: var(--mono); font-size: 12px; color: var(--amber); } | |
| .td-desc { color: var(--muted); font-size: 12px; } | |
| /* ββ CODE BLOCK ββ */ | |
| .code-block { | |
| background: #050810; | |
| border: 1px solid var(--border); | |
| border-radius: 10px; | |
| overflow: hidden; | |
| margin-bottom: 16px; | |
| } | |
| .code-header { | |
| display: flex; align-items: center; justify-content: space-between; | |
| padding: 10px 16px; | |
| background: var(--surface2); | |
| border-bottom: 1px solid var(--border); | |
| } | |
| .code-lang { font-size: 11px; color: var(--muted); font-family: var(--mono); } | |
| .copy-btn { | |
| font-size: 11px; color: var(--muted); | |
| background: none; border: 1px solid var(--border); | |
| padding: 3px 10px; border-radius: 4px; cursor: pointer; | |
| font-family: var(--font); transition: all 0.15s; | |
| } | |
| .copy-btn:hover { color: var(--text); border-color: var(--border2); } | |
| .code-body { | |
| padding: 20px; | |
| font-family: var(--mono); font-size: 12px; | |
| line-height: 1.8; color: #c9d4e8; | |
| overflow-x: auto; white-space: pre; | |
| } | |
| .code-body .cm { color: #4a6a8a; } | |
| .code-body .kw { color: var(--amber); } | |
| .code-body .st { color: var(--teal); } | |
| .code-body .fn { color: var(--blue); } | |
| .code-body .hl { color: var(--green); } | |
| /* ββ PLAYGROUND ββ */ | |
| .pg-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 20px; } | |
| .pg-label { | |
| font-size: 11px; font-weight: 600; color: var(--muted); | |
| text-transform: uppercase; letter-spacing: 0.8px; | |
| margin-bottom: 8px; display: block; | |
| } | |
| .pg-select, .pg-input, .pg-textarea { | |
| width: 100%; padding: 10px 14px; | |
| background: var(--surface2); border: 1px solid var(--border2); | |
| border-radius: 8px; font-size: 13px; font-family: var(--font); | |
| color: var(--text); outline: none; transition: border-color 0.2s; | |
| appearance: none; | |
| } | |
| .pg-select:focus, .pg-input:focus, .pg-textarea:focus { | |
| border-color: var(--amber); | |
| } | |
| .pg-textarea { min-height: 80px; resize: vertical; line-height: 1.5; } | |
| .pg-select option { background: var(--surface2); } | |
| .slider-wrap { margin-top: 4px; } | |
| .slider-row { display: flex; align-items: center; gap: 12px; } | |
| input[type=range] { | |
| flex: 1; height: 4px; border-radius: 2px; | |
| background: var(--border2); outline: none; cursor: pointer; | |
| accent-color: var(--amber); | |
| } | |
| .slider-val { | |
| font-family: var(--mono); font-size: 13px; font-weight: 600; | |
| color: var(--amber); min-width: 30px; text-align: right; | |
| } | |
| .btn-row { display: flex; gap: 10px; margin-top: 16px; } | |
| .btn { | |
| padding: 10px 20px; border-radius: 8px; | |
| font-size: 13px; font-weight: 600; cursor: pointer; | |
| transition: all 0.15s; border: 1px solid; | |
| font-family: var(--font); letter-spacing: 0.3px; | |
| } | |
| .btn-primary { | |
| background: var(--amber); color: #0a0a0a; | |
| border-color: var(--amber); | |
| } | |
| .btn-primary:hover { background: #fbbf24; } | |
| .btn-primary:active { transform: scale(0.97); } | |
| .btn-secondary { | |
| background: transparent; color: var(--text); | |
| border-color: var(--border2); | |
| } | |
| .btn-secondary:hover { background: var(--surface2); border-color: var(--muted); } | |
| .btn-secondary:disabled { opacity: 0.4; cursor: not-allowed; } | |
| .input-group { margin-bottom: 14px; } | |
| /* ββ CONTEXT DISPLAY ββ */ | |
| .context-box { | |
| background: var(--surface2); | |
| border: 1px solid var(--border); | |
| border-radius: 8px; padding: 16px; | |
| font-size: 13px; line-height: 1.7; | |
| color: var(--muted); min-height: 80px; | |
| max-height: 180px; overflow-y: auto; | |
| margin-bottom: 4px; | |
| } | |
| .context-box .q-highlight { | |
| background: rgba(245,158,11,0.15); | |
| border: 1px solid rgba(245,158,11,0.2); | |
| border-radius: 4px; padding: 2px 6px; | |
| color: var(--text); font-weight: 500; | |
| } | |
| .empty-hint { color: var(--border2); font-style: italic; font-size: 12px; } | |
| /* ββ EPISODE PROGRESS ββ */ | |
| .ep-progress { margin-bottom: 16px; } | |
| .ep-bar-bg { | |
| height: 4px; background: var(--border); | |
| border-radius: 2px; overflow: hidden; margin-top: 6px; | |
| } | |
| .ep-bar-fill { | |
| height: 100%; | |
| background: linear-gradient(90deg, var(--amber), var(--teal)); | |
| border-radius: 2px; | |
| transition: width 0.4s ease; | |
| } | |
| .ep-meta { display: flex; justify-content: space-between; align-items: center; } | |
| .ep-step { font-size: 11px; color: var(--muted); font-family: var(--mono); } | |
| .cleanc-badge { | |
| display: none; | |
| font-size: 11px; font-weight: 700; letter-spacing: 0.5px; | |
| padding: 3px 10px; border-radius: 100px; | |
| } | |
| .cleanc-badge.show { display: inline-block; } | |
| .cleanc-badge.yes { background: var(--red-dim); color: var(--red); border: 1px solid rgba(248,113,113,0.3); } | |
| .cleanc-badge.no { background: var(--green-dim); color: var(--green); border: 1px solid rgba(74,222,128,0.3); } | |
| /* ββ REWARD BREAKDOWN ββ */ | |
| .reward-section { margin-top: 16px; } | |
| .reward-title { | |
| font-size: 11px; font-weight: 600; color: var(--muted); | |
| text-transform: uppercase; letter-spacing: 0.8px; margin-bottom: 12px; | |
| display: flex; justify-content: space-between; align-items: center; | |
| } | |
| .total-reward-val { | |
| font-family: var(--mono); font-size: 16px; | |
| font-weight: 700; color: var(--amber); | |
| } | |
| .reward-bars { display: flex; flex-direction: column; gap: 8px; } | |
| .reward-bar-row { display: flex; align-items: center; gap: 10px; } | |
| .rb-label { font-size: 11px; color: var(--muted); width: 130px; flex-shrink: 0; font-family: var(--mono); } | |
| .rb-track { | |
| flex: 1; height: 6px; background: var(--border); | |
| border-radius: 3px; overflow: hidden; | |
| } | |
| .rb-fill { | |
| height: 100%; border-radius: 3px; | |
| transition: width 0.5s cubic-bezier(.4,0,.2,1); | |
| width: 0%; | |
| } | |
| .rb-val { font-family: var(--mono); font-size: 11px; color: var(--text); width: 36px; text-align: right; flex-shrink: 0; } | |
| /* ββ RAW RESPONSE ββ */ | |
| .raw-toggle { | |
| font-size: 11px; color: var(--muted); cursor: pointer; | |
| display: flex; align-items: center; gap: 6px; margin-top: 12px; | |
| user-select: none; | |
| } | |
| .raw-toggle:hover { color: var(--text); } | |
| .raw-box { | |
| display: none; margin-top: 8px; | |
| background: #050810; border: 1px solid var(--border); | |
| border-radius: 8px; padding: 14px; | |
| font-family: var(--mono); font-size: 11px; | |
| color: var(--muted); white-space: pre-wrap; | |
| max-height: 200px; overflow-y: auto; line-height: 1.6; | |
| } | |
| .raw-box.open { display: block; } | |
| /* ββ SECTION HEADING ββ */ | |
| .section-head { margin-bottom: 20px; } | |
| .section-head h2 { font-size: 20px; font-weight: 700; letter-spacing: -0.3px; } | |
| .section-head p { font-size: 13px; color: var(--muted); margin-top: 4px; } | |
| /* ββ REWARD COLORS ββ */ | |
| .rc-0 { background: var(--green); } | |
| .rc-1 { background: var(--teal); } | |
| .rc-2 { background: var(--blue); } | |
| .rc-3 { background: var(--amber); } | |
| .rc-4 { background: #a78bfa; } | |
| .rc-5 { background: #fb923c; } | |
| .rc-6 { background: #34d399; } | |
| .rc-7 { background: #f472b6; } | |
| .rc-8 { background: #e879f9; } | |
| /* ββ STATUS INDICATOR ββ */ | |
| .status-dot { | |
| width: 8px; height: 8px; border-radius: 50%; | |
| display: inline-block; background: var(--muted); | |
| } | |
| .status-dot.ready { background: var(--amber); box-shadow: 0 0 6px var(--amber); } | |
| .status-dot.done { background: var(--teal); box-shadow: 0 0 6px var(--teal); } | |
| /* ββ RESPONSIVE ββ */ | |
| @media(max-width: 768px) { | |
| .hero { padding: 48px 20px 40px; } | |
| .ver-chip { display: none; } | |
| main { padding: 24px 20px 60px; } | |
| nav { padding: 0 20px; } | |
| .stats-row { gap: 20px; } | |
| .steps { grid-template-columns: 1fr; } | |
| .step-arrow { display: none; } | |
| .pg-grid { grid-template-columns: 1fr; } | |
| .tab { padding: 12px 14px; font-size: 12px; } | |
| } | |
| </style> | |
| </head> | |
| <body> | |
| <!-- ββ HERO ββ --> | |
| <section class="hero"> | |
| <div class="hero-inner"> | |
| <div class="hero-badge">OpenEnv Β· RL Environment</div> | |
| <div class="ver-chip">v4.2.0</div> | |
| <h1> | |
| <span class="accent">DataQuality</span><span class="accent2">Guard</span>βEnv | |
| </h1> | |
| <p class="hero-sub"> | |
| Train AI models to answer <strong>only from verified context</strong> β with a 9-component reward system that penalizes fabrication and rewards factual grounding, citation accuracy, and calibrated confidence. | |
| </p> | |
| <div class="stats-row"> | |
| <div class="stat-pill"> | |
| <span class="stat-num" data-target="1000000" data-suffix="M+">1M+</span> | |
| <span class="stat-lbl">Examples</span> | |
| </div> | |
| <div class="stat-pill"> | |
| <span class="stat-num">38</span> | |
| <span class="stat-lbl">Datasets</span> | |
| </div> | |
| <div class="stat-pill"> | |
| <span class="stat-num">3</span> | |
| <span class="stat-lbl">Task Tiers</span> | |
| </div> | |
| <div class="stat-pill"> | |
| <span class="stat-num">9</span> | |
| <span class="stat-lbl">Reward Components</span> | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- ββ NAV ββ --> | |
| <nav> | |
| <div class="nav-inner"> | |
| <div class="tab active" onclick="showTab('overview',this)">Overview</div> | |
| <div class="tab" onclick="showTab('tasks',this)">Tasks</div> | |
| <div class="tab" onclick="showTab('api',this)">API Reference</div> | |
| <div class="tab" onclick="showTab('playground',this)">Playground</div> | |
| </div> | |
| </nav> | |
| <!-- ββ MAIN ββ --> | |
| <main> | |
| <!-- β OVERVIEW β --> | |
| <div id="overview" class="panel active"> | |
| <div class="section-head"> | |
| <h2>How it works</h2> | |
| <p>Three primitives. Nine reward signals. One goal: no data_qualitys.</p> | |
| </div> | |
| <div class="steps"> | |
| <div class="step"> | |
| <span class="step-num">01</span> | |
| <div class="step-icon">π</div> | |
| <h4>reset()</h4> | |
| <p>Sample a question + context document from one of 38 curated datasets, stratified by difficulty tier.</p> | |
| </div> | |
| <div class="step"> | |
| <span class="step-num">02</span> | |
| <div class="step-icon">π€</div> | |
| <h4>step(answer)</h4> | |
| <p>Submit your answer with confidence and a source quote. Receive a dense reward signal across all 9 components.</p> | |
| </div> | |
| <div class="step"> | |
| <span class="step-num">03</span> | |
| <div class="step-icon">π</div> | |
| <h4>grade()</h4> | |
| <p>Aggregate episode rewards into a task score. Track accuracy, data_quality rate, and skill rating over time.</p> | |
| </div> | |
| </div> | |
| <div class="card"> | |
| <h3>9-Component Reward System</h3> | |
| <p>Every answer is graded on <strong>factual correctness</strong>, <strong>source grounding</strong>, <strong>citation accuracy</strong>, <strong>confidence calibration</strong>, <strong>semantic consistency</strong>, <strong>data_quality detection</strong>, <strong>ROUGE-L</strong>, <strong>BERTScore</strong>, and <strong>AlignScore</strong>. Each component is weighted and combined into a single scalar reward in <strong>[0, 1]</strong>. Confident wrong answers are penalized harder than uncertain ones.</p> | |
| </div> | |
| <div class="card"> | |
| <h3>Curriculum Progression</h3> | |
| <p>Episodes advance from <strong>Beginner</strong> (single-hop factual QA with unambiguous ground-truth) through <strong>Intermediate</strong> (multi-hop synthesis across multiple context sentences) to <strong>Advanced</strong> (adversarial prompts where confident refusals score best). The environment tracks a live <strong>skill rating</strong> and adjusts difficulty sampling accordingly.</p> | |
| </div> | |
| </div> | |
| <!-- β TASKS β --> | |
| <div id="tasks" class="panel"> | |
| <div class="section-head"> | |
| <h2>Task Tiers</h2> | |
| <p>Three progressively harder tasks drawn from 38 datasets with 1M+ examples.</p> | |
| </div> | |
| <div class="task-card beginner"> | |
| <div class="task-icon">π’</div> | |
| <div class="task-body"> | |
| <div class="task-head"> | |
| <h3>Factual Grounding</h3> | |
| <span class="diff-badge beginner">Beginner</span> | |
| <span class="data-count">~450K examples</span> | |
| </div> | |
| <p>Answer straightforward factual questions from a short context passage. Single-hop retrieval with unambiguous ground truth. The grader rewards precise citation and heavily penalizes adding information not found in the context.</p> | |
| <div class="dataset-chips"> | |
| <span class="ds-chip">SQuAD</span> | |
| <span class="ds-chip">BoolQ</span> | |
| <span class="ds-chip">OpenBookQA</span> | |
| <span class="ds-chip">ARC</span> | |
| <span class="ds-chip">TriviaQA</span> | |
| <span class="ds-chip">+8 more</span> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="task-card intermediate"> | |
| <div class="task-icon">π΅</div> | |
| <div class="task-body"> | |
| <div class="task-head"> | |
| <h3>Multi-Hop Synthesis</h3> | |
| <span class="diff-badge intermediate">Intermediate</span> | |
| <span class="data-count">~380K examples</span> | |
| </div> | |
| <p>Synthesize evidence from multiple context sentences to reach an answer. Requires connecting disparate facts without fabricating bridge claims. AlignScore and BERTScore are weighted more heavily at this tier.</p> | |
| <div class="dataset-chips"> | |
| <span class="ds-chip">HotpotQA</span> | |
| <span class="ds-chip">CoQA</span> | |
| <span class="ds-chip">NQ-Open</span> | |
| <span class="ds-chip">MS-MARCO</span> | |
| <span class="ds-chip">MuSiQue</span> | |
| <span class="ds-chip">+7 more</span> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="task-card advanced"> | |
| <div class="task-icon">π΄</div> | |
| <div class="task-body"> | |
| <div class="task-head"> | |
| <h3>Adversarial Resistance</h3> | |
| <span class="diff-badge advanced">Advanced</span> | |
| <span class="data-count">~210K examples</span> | |
| </div> | |
| <p>Resist adversarial prompts designed to elicit data_qualitys. Many questions are deliberately unanswerable β confident refusals with low confidence score better than fabricated plausible-sounding answers.</p> | |
| <div class="dataset-chips"> | |
| <span class="ds-chip">DataQualityEval</span> | |
| <span class="ds-chip">TruthfulQA</span> | |
| <span class="ds-chip">FEVER</span> | |
| <span class="ds-chip">Climate-FEVER</span> | |
| <span class="ds-chip">WittyQA</span> | |
| <span class="ds-chip">+6 more</span> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- β API β --> | |
| <div id="api" class="panel"> | |
| <div class="section-head"> | |
| <h2>API Reference</h2> | |
| <p>RESTful JSON API. All endpoints accept and return <code style="font-family:var(--mono);font-size:12px;color:var(--amber)">application/json</code>. No auth required.</p> | |
| </div> | |
| <div class="api-table-wrap"> | |
| <table> | |
| <thead> | |
| <tr> | |
| <th>Method</th> | |
| <th>Endpoint</th> | |
| <th>Description</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr><td><span class="method post">POST</span></td><td class="endpoint">/reset</td><td class="td-desc">Start episode β returns question, context, difficulty, episode_id</td></tr> | |
| <tr><td><span class="method post">POST</span></td><td class="endpoint">/step</td><td class="td-desc">Submit answer with confidence + source_quote, receive reward breakdown</td></tr> | |
| <tr><td><span class="method get">GET</span></td><td class="endpoint">/state</td><td class="td-desc">Current episode metadata β accuracy, data_quality_rate, skill_rating</td></tr> | |
| <tr><td><span class="method get">GET</span></td><td class="endpoint">/tasks</td><td class="td-desc">List all 3 tasks with action schema</td></tr> | |
| <tr><td><span class="method post">POST</span></td><td class="endpoint">/grader</td><td class="td-desc">Score a completed episode (0.0 β 1.0) from rewards + infos</td></tr> | |
| <tr><td><span class="method post">POST</span></td><td class="endpoint">/baseline</td><td class="td-desc">Run heuristic baseline across all 3 tasks</td></tr> | |
| <tr><td><span class="method get">GET</span></td><td class="endpoint">/metadata</td><td class="td-desc">Environment name, version, license</td></tr> | |
| <tr><td><span class="method get">GET</span></td><td class="endpoint">/schema</td><td class="td-desc">Full JSON schemas for action, observation, state</td></tr> | |
| <tr><td><span class="method get">GET</span></td><td class="endpoint">/health</td><td class="td-desc">Health check β returns {"status":"healthy"}</td></tr> | |
| <tr><td><span class="method post">POST</span></td><td class="endpoint">/mcp</td><td class="td-desc">JSON-RPC 2.0 tool discovery for MCP clients</td></tr> | |
| <tr><td><span class="method get">GET</span></td><td class="endpoint">/leaderboard</td><td class="td-desc">Ranked leaderboard by avg_reward</td></tr> | |
| <tr><td><span class="method post">POST</span></td><td class="endpoint">/leaderboard/submit</td><td class="td-desc">Submit model results for ranking</td></tr> | |
| </tbody> | |
| </table> | |
| </div> | |
| <div style="margin-top:28px"> | |
| <div class="section-head"><h2>Quick Start</h2><p>Three commands to run your first episode.</p></div> | |
| <div class="code-block"> | |
| <div class="code-header"> | |
| <span class="code-lang">bash</span> | |
| <button class="copy-btn" onclick="copyCode(this,'install')">Copy</button> | |
| </div> | |
| <div class="code-body" id="install"><span class="cm"># Install and launch</span> | |
| <span class="kw">pip</span> install -e . | |
| <span class="kw">uvicorn</span> server.app:app --port 7860 | |
| <span class="cm"># Run heuristic baseline</span> | |
| <span class="kw">python</span> inference.py --heuristic --env-url http://localhost:7860</div> | |
| </div> | |
| <div class="code-block"> | |
| <div class="code-header"> | |
| <span class="code-lang">python</span> | |
| <button class="copy-btn" onclick="copyCode(this,'pycode')">Copy</button> | |
| </div> | |
| <div class="code-body" id="pycode"><span class="kw">import</span> requests | |
| BASE = <span class="st">"http://localhost:7860"</span> | |
| <span class="cm"># 1. Reset β get a question + context</span> | |
| obs = requests.<span class="fn">post</span>(<span class="st">f"{BASE}/reset"</span>, json={<span class="st">"difficulty"</span>: <span class="st">"beginner"</span>}).json() | |
| session_id = obs[<span class="st">"session_id"</span>] | |
| <span class="fn">print</span>(obs[<span class="st">"question"</span>]) | |
| <span class="cm"># 2. Step β submit your answer</span> | |
| result = requests.<span class="fn">post</span>(<span class="st">f"{BASE}/step"</span>, json={ | |
| <span class="st">"answer"</span>: <span class="st">"Based on the context, ..."</span>, | |
| <span class="st">"confidence"</span>: <span class="hl">0.85</span>, | |
| <span class="st">"source_quote"</span>: <span class="st">"verbatim text from context"</span>, | |
| <span class="st">"session_id"</span>: session_id, | |
| }).json() | |
| <span class="fn">print</span>(result[<span class="st">"reward"</span>]) <span class="cm"># scalar in [0, 1]</span> | |
| <span class="fn">print</span>(result[<span class="st">"is_data_quality"</span>]) <span class="cm"># bool</span></div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- β PLAYGROUND β --> | |
| <div id="playground" class="panel"> | |
| <div class="section-head"> | |
| <h2>Interactive Playground</h2> | |
| <p>Reset an episode, read the context, craft your answer, and see the live reward breakdown.</p> | |
| </div> | |
| <div class="pg-grid"> | |
| <!-- LEFT: Controls --> | |
| <div> | |
| <div class="card"> | |
| <div class="ep-progress"> | |
| <div class="ep-meta"> | |
| <span class="ep-step" id="ep-step-label">No episode active</span> | |
| <span class="cleanc-badge" id="cleanc-badge"></span> | |
| </div> | |
| <div class="ep-bar-bg"><div class="ep-bar-fill" id="ep-bar" style="width:0%"></div></div> | |
| </div> | |
| <div class="input-group"> | |
| <label class="pg-label">Difficulty</label> | |
| <select class="pg-select" id="difficulty"> | |
| <option value="beginner">π’ Beginner β Factual Grounding</option> | |
| <option value="intermediate">π΅ Intermediate β Multi-Hop Synthesis</option> | |
| <option value="advanced">π΄ Advanced β Adversarial Resistance</option> | |
| </select> | |
| </div> | |
| <div class="input-group"> | |
| <label class="pg-label">Question & Context</label> | |
| <div class="context-box" id="ctx-box"> | |
| <span class="empty-hint">Click Reset to load a question and context...</span> | |
| </div> | |
| </div> | |
| <div class="input-group"> | |
| <label class="pg-label">Your Answer</label> | |
| <textarea class="pg-textarea" id="answer" placeholder="Write an answer derived only from the context above..."></textarea> | |
| </div> | |
| <div class="input-group"> | |
| <label class="pg-label">Confidence</label> | |
| <div class="slider-wrap"> | |
| <div class="slider-row"> | |
| <input type="range" min="0" max="1" step="0.05" value="0.7" id="confidence" | |
| oninput="document.getElementById('conf-val').textContent=parseFloat(this.value).toFixed(2)"> | |
| <span class="slider-val" id="conf-val">0.70</span> | |
| </div> | |
| </div> | |
| </div> | |
| <div class="input-group"> | |
| <label class="pg-label">Source Quote <span style="color:var(--border2);font-weight:400;text-transform:none;letter-spacing:0">(verbatim from context)</span></label> | |
| <input class="pg-input" id="source_quote" type="text" placeholder="Paste a verbatim phrase from the context..."> | |
| </div> | |
| <div class="btn-row"> | |
| <button class="btn btn-primary" onclick="doReset()">β³ Reset</button> | |
| <button class="btn btn-secondary" id="step-btn" onclick="doStep()" disabled>β Step</button> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- RIGHT: Results --> | |
| <div> | |
| <div class="card" id="reward-card"> | |
| <div class="reward-title"> | |
| <span>Reward Breakdown</span> | |
| <span class="total-reward-val" id="total-reward">β</span> | |
| </div> | |
| <div class="reward-bars" id="reward-bars"> | |
| <!-- placeholder bars --> | |
| <div style="text-align:center;padding:20px 0;color:var(--border2);font-size:13px;"> | |
| Submit an answer to see the 9-component reward breakdown | |
| </div> | |
| </div> | |
| <div class="raw-toggle" onclick="toggleRaw()"> | |
| <span id="raw-arrow">βΆ</span> Raw JSON response | |
| </div> | |
| <div class="raw-box" id="raw-box"></div> | |
| </div> | |
| <div class="card"> | |
| <div class="reward-title" style="margin-bottom:14px"> | |
| <span>Observation</span> | |
| <span style="display:flex;align-items:center;gap:6px"> | |
| <span class="status-dot" id="status-dot"></span> | |
| <span id="status-text" style="font-size:11px;color:var(--muted);font-weight:500"></span> | |
| </span> | |
| </div> | |
| <div class="raw-box open" id="obs-box" style="max-height:220px">Click Reset to start an episode.</div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </main> | |
| <!-- ββ FOOTER ββ --> | |
| <footer style="text-align:center;padding:32px 40px 24px;border-top:1px solid var(--border);color:var(--muted);font-size:12px;"> | |
| DataQualityGuard-Env v4.2.0 · OpenEnv · <a href="/swagger" style="color:var(--amber);text-decoration:none">Swagger Docs</a> · <a href="/redoc" style="color:var(--amber);text-decoration:none">ReDoc</a> | |
| </footer> | |
| <script> | |
| let sessionId = null; | |
| let stepCount = 0; | |
| const MAX_STEPS = 10; | |
| const REWARD_KEYS = [ | |
| {key:'correctness', label:'Factual Correctness', css:'rc-0'}, | |
| {key:'grounding', label:'Source Grounding', css:'rc-1'}, | |
| {key:'citation', label:'Citation Accuracy', css:'rc-2'}, | |
| {key:'calibration', label:'Confidence Calibr.', css:'rc-3'}, | |
| {key:'consistency', label:'Semantic Consistency', css:'rc-4'}, | |
| {key:'cleanc_detect', label:'DataQuality Detect.', css:'rc-5'}, | |
| {key:'rouge_l', label:'ROUGE-L', css:'rc-6'}, | |
| {key:'bert_score', label:'BERTScore', css:'rc-7'}, | |
| {key:'align_score', label:'AlignScore', css:'rc-8'}, | |
| // Also accept alternate key names from the grader | |
| {key:'factual_correctness', label:'Factual Correctness', css:'rc-0'}, | |
| {key:'source_grounding', label:'Source Grounding', css:'rc-1'}, | |
| {key:'citation_accuracy', label:'Citation Accuracy', css:'rc-2'}, | |
| {key:'confidence_calibration', label:'Confidence Calibr.', css:'rc-3'}, | |
| {key:'semantic_consistency', label:'Semantic Consistency', css:'rc-4'}, | |
| {key:'data_quality_penalty', label:'DataQuality Detect.', css:'rc-5'}, | |
| {key:'rouge_score', label:'ROUGE-L', css:'rc-6'}, | |
| {key:'bertscore', label:'BERTScore', css:'rc-7'}, | |
| {key:'alignscore', label:'AlignScore', css:'rc-8'}, | |
| ]; | |
| function showTab(id, el) { | |
| document.querySelectorAll('.tab').forEach(t => t.classList.remove('active')); | |
| document.querySelectorAll('.panel').forEach(p => p.classList.remove('active')); | |
| el.classList.add('active'); | |
| document.getElementById(id).classList.add('active'); | |
| if (id === 'playground') { | |
| document.getElementById('playground').scrollIntoView({behavior:'smooth', block:'start'}); | |
| } | |
| } | |
| function setStatus(state) { | |
| const dot = document.getElementById('status-dot'); | |
| const txt = document.getElementById('status-text'); | |
| dot.className = 'status-dot ' + state; | |
| txt.textContent = state === 'ready' ? 'Ready' : state === 'done' ? 'Episode done' : ''; | |
| } | |
| function updateProgress() { | |
| const pct = sessionId ? (stepCount / MAX_STEPS) * 100 : 0; | |
| document.getElementById('ep-bar').style.width = pct + '%'; | |
| document.getElementById('ep-step-label').textContent = sessionId | |
| ? `Step ${stepCount} / ${MAX_STEPS}` | |
| : 'No episode active'; | |
| } | |
| function renderContext(question, context) { | |
| const box = document.getElementById('ctx-box'); | |
| box.innerHTML = `<div style="margin-bottom:10px"><span style="font-size:10px;font-weight:700;letter-spacing:1px;color:var(--amber);text-transform:uppercase">Question</span><br><span class="q-highlight">${escHtml(question)}</span></div><div><span style="font-size:10px;font-weight:700;letter-spacing:1px;color:var(--muted);text-transform:uppercase">Context</span><br><span style="font-size:12px;color:var(--muted)">${escHtml(context)}</span></div>`; | |
| } | |
| function escHtml(s) { | |
| return String(s||'').replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>'); | |
| } | |
| function renderRewards(data) { | |
| const container = document.getElementById('reward-bars'); | |
| const reward = data.reward != null ? data.reward : data.total_reward; | |
| const totalEl = document.getElementById('total-reward'); | |
| if (reward != null) { | |
| const r = parseFloat(reward); | |
| totalEl.textContent = (r >= 0 ? '+' : '') + r.toFixed(3); | |
| totalEl.style.color = r >= 0 ? 'var(--teal)' : 'var(--red)'; | |
| } | |
| const info = data.info || data.reward_breakdown || data.breakdown || data.metadata || {}; | |
| let html = ''; | |
| let foundAny = false; | |
| const seen = new Set(); | |
| REWARD_KEYS.forEach(({key, label, css}) => { | |
| if (seen.has(label)) return; | |
| let v = info[key] != null ? info[key] : (data[key] != null ? data[key] : null); | |
| if (v == null) return; | |
| seen.add(label); | |
| foundAny = true; | |
| const pct = Math.min(100, Math.max(0, Math.round(Math.abs(parseFloat(v)) * 100))); | |
| const display = parseFloat(v).toFixed(3); | |
| html += `<div class="reward-bar-row"> | |
| <span class="rb-label">${label}</span> | |
| <div class="rb-track"><div class="rb-fill ${css}" style="width:${pct}%"></div></div> | |
| <span class="rb-val">${display}</span> | |
| </div>`; | |
| }); | |
| if (!foundAny && reward != null) { | |
| const pct = Math.min(100, Math.max(0, Math.round(parseFloat(reward)*100))); | |
| html = `<div class="reward-bar-row"> | |
| <span class="rb-label">total_reward</span> | |
| <div class="rb-track"><div class="rb-fill rc-0" style="width:${pct}%"></div></div> | |
| <span class="rb-val">${parseFloat(reward).toFixed(3)}</span> | |
| </div>`; | |
| } | |
| container.innerHTML = html || '<div style="color:var(--border2);font-size:12px;text-align:center;padding:12px">No breakdown data in response</div>'; | |
| // data_quality badge | |
| const badge = document.getElementById('cleanc-badge'); | |
| if (data.is_data_quality != null) { | |
| badge.className = 'cleanc-badge show ' + (data.is_data_quality ? 'yes' : 'no'); | |
| badge.textContent = data.is_data_quality ? 'β DataQuality' : 'β Grounded'; | |
| } | |
| } | |
| async function doReset() { | |
| const diff = document.getElementById('difficulty').value; | |
| const resetBtn = document.querySelector('.btn-primary'); | |
| resetBtn.disabled = true; resetBtn.textContent = 'Loading...'; | |
| document.getElementById('ctx-box').innerHTML = '<span class="empty-hint">Loading...</span>'; | |
| document.getElementById('obs-box').textContent = 'Loading...'; | |
| document.getElementById('step-btn').disabled = true; | |
| setStatus(''); | |
| try { | |
| const r = await fetch('/reset', { | |
| method: 'POST', | |
| headers: {'Content-Type':'application/json'}, | |
| body: JSON.stringify({difficulty: diff, seed: Math.floor(Math.random()*9999)}) | |
| }); | |
| const data = await r.json(); | |
| sessionId = data.session_id || null; | |
| stepCount = 0; | |
| updateProgress(); | |
| renderContext(data.question || '(no question)', data.context || '(no context)'); | |
| document.getElementById('obs-box').textContent = JSON.stringify(data, null, 2); | |
| document.getElementById('step-btn').disabled = false; | |
| document.getElementById('reward-bars').innerHTML = '<div style="text-align:center;padding:20px 0;color:var(--border2);font-size:13px;">Submit an answer to see the 9-component reward breakdown</div>'; | |
| document.getElementById('total-reward').textContent = 'β'; | |
| document.getElementById('total-reward').style.color = 'var(--amber)'; | |
| document.getElementById('cleanc-badge').className = 'cleanc-badge'; | |
| setStatus('ready'); | |
| } catch(e) { | |
| document.getElementById('ctx-box').innerHTML = '<span style="color:var(--red)">Error: ' + escHtml(e.message) + '</span>'; | |
| setStatus(''); | |
| } finally { | |
| resetBtn.disabled = false; resetBtn.textContent = 'β³ Reset'; | |
| } | |
| } | |
| async function doStep() { | |
| if (!sessionId) { alert('Reset first!'); return; } | |
| const stepBtn = document.getElementById('step-btn'); | |
| const body = { | |
| answer: document.getElementById('answer').value, | |
| confidence: parseFloat(document.getElementById('confidence').value) || 0.5, | |
| source_quote: document.getElementById('source_quote').value, | |
| session_id: sessionId, | |
| }; | |
| stepBtn.disabled = true; stepBtn.textContent = 'Submitting...'; | |
| try { | |
| const r = await fetch('/step', { | |
| method: 'POST', | |
| headers: {'Content-Type':'application/json'}, | |
| body: JSON.stringify(body) | |
| }); | |
| const data = await r.json(); | |
| stepCount++; | |
| updateProgress(); | |
| renderRewards(data); | |
| document.getElementById('raw-box').textContent = JSON.stringify(data, null, 2); | |
| if (data.done) { | |
| sessionId = null; | |
| stepBtn.textContent = 'β Step'; | |
| document.getElementById('ctx-box').innerHTML = '<span class="empty-hint">Episode complete. Click Reset for a new episode.</span>'; | |
| setStatus('done'); | |
| } else { | |
| stepBtn.disabled = false; stepBtn.textContent = 'β Step'; | |
| setStatus('ready'); | |
| if (data.question) renderContext(data.question, data.context || ''); | |
| } | |
| } catch(e) { | |
| document.getElementById('raw-box').textContent = 'Error: ' + e.message; | |
| stepBtn.disabled = false; stepBtn.textContent = 'β Step'; | |
| setStatus(''); | |
| } | |
| } | |
| function toggleRaw() { | |
| const box = document.getElementById('raw-box'); | |
| const arr = document.getElementById('raw-arrow'); | |
| box.classList.toggle('open'); | |
| arr.textContent = box.classList.contains('open') ? 'βΌ' : 'βΆ'; | |
| } | |
| function copyCode(btn, id) { | |
| const text = document.getElementById(id).textContent; | |
| const doCopy = (txt) => { | |
| if (navigator.clipboard && window.isSecureContext) { | |
| return navigator.clipboard.writeText(txt); | |
| } | |
| const ta = document.createElement('textarea'); | |
| ta.value = txt; ta.style.position = 'fixed'; ta.style.opacity = '0'; | |
| document.body.appendChild(ta); ta.select(); | |
| document.execCommand('copy'); document.body.removeChild(ta); | |
| return Promise.resolve(); | |
| }; | |
| doCopy(text).then(() => { | |
| btn.textContent = 'Copied!'; | |
| setTimeout(() => btn.textContent = 'Copy', 1800); | |
| }).catch(() => { | |
| btn.textContent = 'Failed'; setTimeout(() => btn.textContent = 'Copy', 1800); | |
| }); | |
| } | |
| </script> | |
| </body> | |
| </html>""" | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # FASTAPI APP — session-isolated environments for thread safety | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
# Shared environment whose dataset loader is reused by every session.
# It is built lazily, at most once, while holding _env_lock.
_default_env: Optional[DataCleaningEnvironment] = None
_env_loading = False  # True only while the first construction is running
_env_lock = threading.Lock()


def _placeholder_obs(**values):
    """Build a dataclass observation carrying *values* as its fields.

    A dataclass instance is used (rather than an ad-hoc class with class
    attributes) so that `_safe_dict` serialises it field by field instead of
    falling through to `str(obj)`.
    """
    return dataclasses.make_dataclass("Obs", list(values))(**values)


def _build_minimal_env():
    """Create the stand-in environment used when the real one cannot be built.

    The stand-in owns a `DatasetLoader` whose `examples` list is emptied and
    offers placeholder `reset`/`step`/`state`/`close` methods.
    """
    from dataset_loader import DatasetLoader

    class MinimalEnv:
        def __init__(self):
            self.dataset_loader = DatasetLoader()
            self.dataset_loader.examples = []

        def reset(self, **kwargs):
            return _placeholder_obs(question='Placeholder', context='Context', reward=0.0, done=False, info={})

        def step(self, action):
            return _placeholder_obs(reward=0.0, done=False, is_data_quality=False, info={})

        def state(self):
            return {}

        def close(self):
            pass

    return MinimalEnv()


def _get_default_env() -> DataCleaningEnvironment:
    """Return the shared dataset-loader environment, building it on first use.

    Construction happens once, under `_env_lock`. If building the real
    environment raises, the failure is logged with its traceback and a
    placeholder from `_build_minimal_env` is cached instead.

    Returns:
        The cached environment (real or placeholder).
    """
    global _default_env, _env_loading
    if _default_env is not None:
        return _default_env
    with _env_lock:
        # Another thread may have finished the build while this one waited.
        if _default_env is not None:
            return _default_env
        _env_loading = True
        try:
            logger.info("Creating DataCleaningEnvironment (dataset loader)...")
            env = DataCleaningEnvironment()
        except Exception as e:
            logger.exception(f"Failed to create environment: {e}")
            env = _build_minimal_env()
        else:
            # The example count is informational only; a failure while
            # reporting it must not discard an environment that was built.
            try:
                logger.info(f"Environment ready β {env.dataset_loader.get_total_examples():,} examples loaded.")
            except Exception:
                logger.warning("Environment built, but the example count is unavailable", exc_info=True)
        finally:
            _env_loading = False
        _default_env = env
        return _default_env
def _create_session_env(session_id: str) -> DataCleaningEnvironment:
    """Build a new environment instance for one session.

    The dataset loader is borrowed from the shared default environment and
    handed to the constructor, so the new instance does not have to construct
    and load its own; everything else belongs to the new instance.

    Args:
        session_id: Identifier forwarded to the environment constructor.

    Returns:
        A newly constructed `DataCleaningEnvironment`.
    """
    shared_loader = _get_default_env().dataset_loader
    return DataCleaningEnvironment(session_id=session_id, dataset_loader=shared_loader)
# Registry of live per-session environments; every access holds _session_lock.
_session_lock = threading.Lock()
_sessions: Dict[str, DataCleaningEnvironment] = {}


def _get_session(session_id: str) -> Optional[DataCleaningEnvironment]:
    """Return the environment registered under *session_id*, or None if absent."""
    _session_lock.acquire()
    try:
        return _sessions.get(session_id)
    finally:
        _session_lock.release()
def _cleanup_session(session_id: str):
    """Unregister the environment stored under *session_id* and close it.

    `reset` registers one environment under two keys (its session id and its
    episode id), so every key that maps to the same environment object is
    removed; popping only the requested key would leave the alias behind and
    keep the closed environment reachable forever.

    Args:
        session_id: Any key under which the environment is registered.
            Unknown keys are ignored.
    """
    with _session_lock:
        env = _sessions.pop(session_id, None)
        if env is not None:
            aliases = [key for key, value in _sessions.items() if value is env]
            for key in aliases:
                del _sessions[key]
    if env is None:
        return
    # Closing is best-effort: a failing close() must not break the request
    # that triggered the cleanup, but it should not vanish silently either.
    try:
        env.close()
    except Exception:
        logger.warning("Failed to close environment for session %s", session_id, exc_info=True)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: start background warm-up, close the shared env on exit.

    On entry two daemon threads are started, one that imports and instantiates
    the ML models and one that loads the dataset through `_get_default_env`;
    control is yielded right away without waiting for either. On exit the
    shared default environment, if one was created, is closed.

    The function is wrapped with `asynccontextmanager` because it is passed to
    `FastAPI(lifespan=...)`, which expects an async context manager factory.
    """

    def preload_models():
        # Every step here is an optional warm-up; any failure is only logged.
        try:
            logger.info("Preloading ML models...")
            import transformers
            transformers.logging.set_verbosity_error()
            from sentence_transformers import SentenceTransformer, CrossEncoder
            SentenceTransformer('all-MiniLM-L6-v2')
            CrossEncoder('cross-encoder/nli-deberta-v3-small')
            from rouge_score import rouge_scorer
            rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
            try:
                from grader import _get_bert_scorer
                _get_bert_scorer()
            except Exception:
                logger.debug("BERT scorer preload skipped", exc_info=True)
            logger.info("All ML models preloaded!")
        except Exception as e:
            logger.error(f"Model preload failed: {e}")

    def background_load():
        try:
            logger.info("Background dataset loading...")
            env = _get_default_env()
            logger.info(f"Loaded {env.dataset_loader.get_total_examples():,} examples.")
        except Exception as e:
            logger.error(f"Background loading failed: {e}")

    threading.Thread(target=preload_models, daemon=True).start()
    threading.Thread(target=background_load, daemon=True).start()
    try:
        yield
    finally:
        # Runs on normal shutdown and when the context is torn down by an error.
        if _default_env:
            try:
                _default_env.close()
            except Exception:
                logger.warning("Error while closing the shared environment", exc_info=True)
# The built-in interactive docs are moved to /swagger and /redoc.
app = FastAPI(
    title="DataQualityGuard-Env",
    version="4.2.0",
    docs_url="/swagger",
    redoc_url="/redoc",
    lifespan=lifespan,
)

# CORS is fully open: any origin, method and header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
import json as _json

# Location of the persisted leaderboard (a single JSON object keyed by model name).
_LEADERBOARD_FILE = "/tmp/data_quality_guard_leaderboard.json"


def _load_leaderboard():
    """Read the persisted leaderboard from `_LEADERBOARD_FILE`.

    Returns:
        The stored mapping, or an empty dict when the file is missing,
        unreadable, not valid JSON, or does not contain a JSON object.
        Every case except a missing file is logged as a warning.
    """
    try:
        with open(_LEADERBOARD_FILE, "r", encoding="utf-8") as f:
            data = _json.load(f)
    except FileNotFoundError:
        return {}
    except (OSError, ValueError) as exc:
        logger.warning("Could not read leaderboard file %s: %s", _LEADERBOARD_FILE, exc)
        return {}
    # Callers use dict methods on the result, so anything else is unusable.
    if not isinstance(data, dict):
        logger.warning("Ignoring leaderboard file %s: top-level value is not an object", _LEADERBOARD_FILE)
        return {}
    return data


def _save_leaderboard(lb):
    """Persist *lb* to `_LEADERBOARD_FILE` as indented JSON (best effort).

    The data is written to a sibling temporary file and then moved over the
    real file with `os.replace`, so a failed or interrupted write leaves the
    previous leaderboard intact instead of a truncated file. Failures are
    logged and otherwise ignored.
    """
    tmp_path = f"{_LEADERBOARD_FILE}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            _json.dump(lb, f, indent=2)
        os.replace(tmp_path, _LEADERBOARD_FILE)
    except (OSError, TypeError, ValueError) as exc:
        logger.warning("Could not persist leaderboard to %s: %s", _LEADERBOARD_FILE, exc)
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # nothing was written, or it is already gone


# In-memory leaderboard, seeded from disk at import time.
_leaderboard: Dict[str, Dict[str, Any]] = _load_leaderboard()
def _safe_key(key):
    """Return *key* in a form `json.dumps` accepts as an object key."""
    if isinstance(key, enum.Enum):
        key = key.value
    if key is None or isinstance(key, (str, int, float, bool)):
        return key
    return str(key)


def _safe_dict(obj):
    """Recursively convert *obj* into JSON-serialisable builtins.

    Handled inputs, in order:
      * enum members -> their (converted) value
      * non-finite floats -> None (NaN/inf are not valid JSON)
      * str / int / float / bool / None -> unchanged
      * instances exposing `model_dump()` or `dict()` -> the converted result
      * dataclass instances -> dict of converted fields
      * dicts -> dict with converted values and JSON-compatible keys
      * lists, tuples, sets, frozensets -> list of converted items
      * anything else (including classes) -> `str(obj)`
    """
    import math  # local import: only needed here

    if isinstance(obj, enum.Enum):
        return _safe_dict(obj.value)
    if isinstance(obj, float) and not math.isfinite(obj):
        return None
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj
    # The instance-only branches below would fail on a class object
    # (unbound method call / missing field attribute), so skip them for types.
    if not isinstance(obj, type):
        for dump_name in ('model_dump', 'dict'):
            dump = getattr(obj, dump_name, None)
            if callable(dump):
                return _safe_dict(dump())
        if dataclasses.is_dataclass(obj):
            return {f.name: _safe_dict(getattr(obj, f.name)) for f in dataclasses.fields(obj)}
    if isinstance(obj, dict):
        return {_safe_key(k): _safe_dict(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [_safe_dict(i) for i in obj]
    return str(obj)
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # ROUTES | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
async def root():
    """Return the landing-page markup stored in `STUNNING_DOCS_HTML`."""
    page = STUNNING_DOCS_HTML
    return page
async def favicon():
    """Answer favicon requests with an empty 204 response.

    The icon itself is embedded in the HTML page as a data URI; replying 204
    here just stops browsers from logging 404s for it.
    """
    no_content = Response(status_code=204)
    return no_content
async def docs():
    """Return the same landing-page markup as `root` (`STUNNING_DOCS_HTML`)."""
    page = STUNNING_DOCS_HTML
    return page
async def reset(body: Optional[Dict[str, Any]] = None):
    """Start a new episode in a freshly created per-session environment.

    Args:
        body: Optional request payload. Recognised keys are `session_id`
            (generated when absent or empty) and the reset options `seed`,
            `episode_id` and `difficulty`, which are forwarded to `env.reset`.

    Returns:
        A `JSONResponse` holding the serialised observation plus `session_id`.

    Raises:
        HTTPException: 500 with the error text if anything fails.
    """
    # None instead of a shared `{}` default: a mutable default is one object
    # reused across calls.
    body = body or {}
    try:
        session_id = body.get("session_id") or f"ses_{uuid.uuid4().hex[:8]}"
        env = _create_session_env(session_id)
        reset_kwargs = {k: v for k, v in body.items() if k in ("seed", "episode_id", "difficulty")}
        obs = env.reset(**reset_kwargs)
        # Register the env under both ids so /step can find it by either one.
        episode_id = getattr(obs, 'episode_id', None) or body.get("episode_id") or session_id
        with _session_lock:
            _sessions[episode_id] = env
            _sessions[session_id] = env
        result = _safe_dict(obs)
        result["session_id"] = session_id
        return JSONResponse(content=result)
    except Exception as e:
        # logger.exception records the active traceback automatically.
        logger.exception(f"Reset error: {e}")
        raise HTTPException(500, str(e)) from e
async def step(action_data: Dict[str, Any]):
    """Apply one action to a session's environment and return the observation.

    The environment is looked up by `session_id` (or, if that is missing or
    empty, `episode_id`), both of which are removed from *action_data*. When
    no id is given or no environment matches, the shared default environment
    is used. Only keys that are fields of `DataCleaningAction` are passed to
    the action. When the result reports `done` and an id was supplied, the
    session is cleaned up.

    Raises:
        HTTPException: 500 with the error text if anything fails.
    """
    try:
        session_id = action_data.pop("session_id", None) or action_data.pop("episode_id", None)
        env = None
        if session_id:
            env = _get_session(session_id)
        if env is None:
            # Fallback: use default env (single-user mode)
            env = _get_default_env()

        # Field names live in `model_fields` or `__fields__` depending on the model API.
        if hasattr(DataCleaningAction, 'model_fields'):
            allowed = set(DataCleaningAction.model_fields.keys())
        else:
            allowed = set(DataCleaningAction.__fields__.keys())
        payload = {name: value for name, value in action_data.items() if name in allowed}

        outcome = _safe_dict(env.step(DataCleaningAction(**payload)))
        if outcome.get("done", False) and session_id:
            _cleanup_session(session_id)
        return JSONResponse(content=outcome)
    except Exception as e:
        import traceback
        logger.error(f"Step error: {e}\n{traceback.format_exc()}")
        raise HTTPException(500, str(e))
async def get_state(session_id: Optional[str] = None):
    """Return the serialised state of a session's environment.

    Args:
        session_id: Session to inspect. When omitted, the shared default
            environment is used.

    Returns:
        A `JSONResponse` with the result of `env.state()` run through `_safe_dict`.

    Raises:
        HTTPException: 404 when *session_id* is given but not registered;
            500 with the error text for any other failure.
    """
    try:
        if session_id:
            env = _get_session(session_id)
            # Without this check an unknown id would surface as an opaque
            # 500 ("'NoneType' object has no attribute 'state'").
            if env is None:
                raise HTTPException(404, f"session_id '{session_id}' not found")
        else:
            env = _get_default_env()
        return JSONResponse(content=_safe_dict(env.state()))
    except HTTPException:
        raise  # keep the intended status code
    except Exception as e:
        raise HTTPException(500, str(e)) from e
async def list_tasks():
    """Return the known tasks, in a fixed order, together with the action schema.

    Task ids that are not present in `ALL_TASKS` are skipped.
    """
    display_order = (
        "task_1_factual_grounding",
        "task_2_multi_hop_synthesis",
        "task_3_adversarial_resistance",
    )
    task_dicts = []
    for task_id in display_order:
        if task_id in ALL_TASKS:
            task_dicts.append(ALL_TASKS[task_id].to_dict())
    return {"tasks": task_dicts, "action_schema": ACTION_SCHEMA}
async def grade_episode(body: Dict[str, Any]):
    """Score an episode for the task named in *body*.

    Args:
        body: Must contain `task_id`; may contain `step_rewards` and
            `step_infos` (both default to empty lists).

    Returns:
        When rewards are given without infos, a dict with `task_id` and the
        mean reward rounded to 4 places as `score`; otherwise whatever
        `compute_task_score` returns.

    Raises:
        HTTPException: 422 when `task_id` is missing, 404 when it is unknown.
    """
    task_id = body.get("task_id")
    if not task_id:
        raise HTTPException(422, "'task_id' required")

    task = get_task(task_id)
    if not task:
        raise HTTPException(404, f"task_id '{task_id}' not found")

    rewards = body.get("step_rewards", [])
    infos = body.get("step_infos", [])
    if rewards and not infos:
        mean_reward = sum(rewards) / len(rewards)
        return {"task_id": task_id, "score": round(mean_reward, 4)}
    return compute_task_score(task, rewards, infos)
def _baseline_step_info(obs_dict):
    """Collect the per-step grading fields from one serialised observation.

    Always includes correctness, grounding, calibration and the two
    data-quality flags; when the observation's `metadata` is a dict, its
    correctness/calibration values and four `reward_breakdown` metrics are
    used as well.
    """
    info = {
        "correctness": 0.0,
        "grounding": obs_dict.get("grounding_score", 0),
        "calibration": 0.6,
        "data_quality_score": 1.0 if obs_dict.get("is_data_quality") else 0.0,
        "is_data_quality": bool(obs_dict.get("is_data_quality", False)),
    }
    meta = obs_dict.get("metadata", {})
    if isinstance(meta, dict):
        breakdown = meta.get("reward_breakdown") or {}
        info["correctness"] = meta.get("correctness", 0.0)
        info["calibration"] = meta.get("calibration", 0.6)
        for key in ("semantic_consistency", "rouge_l", "bert_score", "align_score"):
            info[key] = breakdown.get(key, 0.0)
    return info


async def run_baseline(body: Optional[Dict[str, Any]] = None):
    """Run a simple echo-the-context baseline over the three tasks.

    For each task a fresh session environment is created and reset, then up
    to `steps_per_task` steps are taken in which the answer and source quote
    are prefixes of the observed context with a fixed confidence of 0.6.

    Args:
        body: Optional payload with `steps_per_task` (default 5, clamped to
            the range 3..10) and `seed` (default 42).

    Returns:
        A dict with per-task results under `tasks` and the mean task score
        (rounded to 4 places) under `summary.overall_score`.

    Raises:
        HTTPException: 422 when `steps_per_task` or `seed` is not an integer.
    """
    body = body or {}
    try:
        steps = max(3, min(10, int(body.get("steps_per_task", 5))))
        seed = int(body.get("seed", 42))
    except (TypeError, ValueError) as exc:
        raise HTTPException(422, "'steps_per_task' and 'seed' must be integers") from exc

    results = []
    for task_id, diff in [("task_1_factual_grounding","beginner"),("task_2_multi_hop_synthesis","intermediate"),("task_3_adversarial_resistance","advanced")]:
        task = get_task(task_id)
        if not task:
            continue
        sid = f"bl_{task_id}_{seed}"
        # Use session-based env with shared dataset loader
        env = _create_session_env(sid)
        try:
            obs_dict = _safe_dict(env.reset(seed=seed, difficulty=diff))
            rewards, infos = [], []
            for _ in range(steps):
                if obs_dict.get("done"):
                    break
                ctx = obs_dict.get("context") or ""
                # DataCleaningAction is the action model this module imports
                # and the one /step builds.
                action = DataCleaningAction(answer=ctx[:100], confidence=0.6, source_quote=ctx[:80])
                obs_dict = _safe_dict(env.step(action))
                rewards.append(float(obs_dict.get("reward") or 0))
                infos.append(_baseline_step_info(obs_dict))
            results.append(compute_task_score(task, rewards, infos))
        finally:
            # Always release the environment, even if a step raised.
            try:
                env.close()
            except Exception:
                logger.warning("Failed to close baseline environment %s", sid, exc_info=True)
    return {"tasks": results, "summary": {"overall_score": round(sum(r["score"] for r in results)/max(len(results),1), 4)}}
async def batch_evaluate(body: Dict[str, Any]):
    """Score a batch of answers with `calculate_reward`.

    Args:
        body: Must contain `items`, a non-empty list of objects. Each item may
            provide `answer`, `confidence` (default 0.5), `source_quote`,
            `context` and `ground_truth`; missing strings default to "".

    Returns:
        A dict with `total_items` and, per item, its index, the reward
        rounded to 4 places and the `is_data_quality` flag from the grader's
        info dict.

    Raises:
        HTTPException: 422 when `items` is missing/empty, is not a list, or
            contains an entry that is not an object.
    """
    items = body.get("items", [])
    if not items:
        raise HTTPException(422, "'items' required")
    # Reject malformed payloads up front; otherwise `item.get` below would
    # fail with an AttributeError and surface as an unexplained 500.
    if not isinstance(items, list) or not all(isinstance(item, dict) for item in items):
        raise HTTPException(422, "'items' must be a list of objects")
    from server.grader import calculate_reward
    results = []
    for i, item in enumerate(items):
        r, info = calculate_reward(item.get("answer",""), item.get("confidence",0.5), item.get("source_quote",""), item.get("context",""), item.get("ground_truth",""))
        results.append({"index": i, "reward": round(r,4), "is_data_quality": info.get("is_data_quality", False)})
    return {"total_items": len(results), "results": results}
async def leaderboard():
    """Return all submissions ranked by `avg_reward`, highest first.

    Each returned entry is a copy of the stored submission with a 1-based
    `rank` added. The stored entries themselves are left untouched, so the
    computed rank is never written back into `_leaderboard` (and from there
    into the persisted file).

    Returns:
        `{"leaderboard": [...]}`; when there are no submissions the list is
        empty and a `message` is included.
    """
    if not _leaderboard:
        return {"leaderboard": [], "message": "No submissions"}
    ordered = sorted(_leaderboard.values(), key=lambda x: x.get("avg_reward", 0), reverse=True)
    ranked = [{**entry, "rank": position} for position, entry in enumerate(ordered, start=1)]
    return {"leaderboard": ranked}
async def submit_leaderboard(data: Dict[str, Any]):
    """Store a leaderboard submission under its model name and persist it.

    Args:
        data: Submission payload; must contain `model_name`, `avg_reward`,
            `avg_accuracy`, `data_quality_rate`, `total_episodes` and
            `total_steps`. An existing entry for the same model is replaced.

    Returns:
        A confirmation dict with `status` and `model_name`.

    Raises:
        HTTPException: 422 listing the required fields that are missing.
    """
    required = ["model_name", "avg_reward", "avg_accuracy", "data_quality_rate", "total_episodes", "total_steps"]
    missing = [field for field in required if field not in data]
    if missing:
        raise HTTPException(422, f"Missing: {missing}")

    model_name = data["model_name"]
    entry = dict(data)
    entry["submitted_at"] = time.time()
    _leaderboard[model_name] = entry
    _save_leaderboard(_leaderboard)
    return {"status": "submitted", "model_name": model_name}
async def health():
    """Return a fixed health payload with the service version."""
    payload = {"status": "healthy", "version": "4.2.0"}
    return payload
async def metadata():
    """Return static descriptive metadata: name, version, license, description."""
    description_parts = [
        "An OpenEnv RL environment that trains AI models to answer questions ",
        "ONLY from verified context documents β penalizing data_quality and ",
        "rewarding factual grounding.",
    ]
    info = {}
    info["name"] = "data_quality-guard-env"
    info["version"] = "4.2.0"
    info["license"] = "MIT"
    info["description"] = "".join(description_parts)
    return info
async def schema():
    """Return JSON-schema-style descriptions of the action, observation and state.

    The action schema carries descriptions/defaults/bounds; observation and
    state list each property with its type only.
    """

    def typed_properties(pairs):
        # (name, type) pairs -> {"name": {"type": type}}, keeping the given order.
        return {name: {"type": kind} for name, kind in pairs}

    action_schema = {
        "type": "object",
        "required": ["answer"],
        "properties": {
            "answer": {"type": "string", "description": "Answer derived ONLY from the provided context document."},
            "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0, "default": 0.5},
            "source_quote": {"type": "string", "default": ""},
            "reasoning": {"type": "string", "default": ""},
        },
    }
    observation_schema = {
        "type": "object",
        "properties": typed_properties([
            ("question", "string"),
            ("context", "string"),
            ("ground_truth", "string"),
            ("done", "boolean"),
            ("reward", "number"),
            ("feedback", "string"),
            ("is_data_quality", "boolean"),
            ("grounding_score", "number"),
            ("difficulty_level", "string"),
            ("attempts_remaining", "integer"),
        ]),
    }
    state_schema = {
        "type": "object",
        "properties": typed_properties([
            ("episode_id", "string"),
            ("step_count", "integer"),
            ("accuracy", "number"),
            ("data_quality_rate", "number"),
            ("average_reward", "number"),
            ("current_difficulty", "string"),
            ("skill_rating", "number"),
            ("current_streak", "integer"),
            ("best_streak", "integer"),
        ]),
    }
    return {"action": action_schema, "observation": observation_schema, "state": state_schema}
async def web():
    """Return the static index page as a file response.

    The path is relative, so it is resolved against the process's current
    working directory.
    """
    index_path = "server/static/index.html"
    return FileResponse(index_path)
async def datasets():
    """Report how many examples the shared dataset loader holds.

    Returns:
        `{"total_examples": n}`; `n` is 0 when the count cannot be obtained
        (the failure is logged rather than silently dropped).
    """
    try:
        total = _get_default_env().dataset_loader.get_total_examples()
    except Exception:
        # `except Exception` rather than a bare `except`, so interpreter-exit
        # and cancellation signals are not swallowed.
        logger.warning("Could not determine dataset size", exc_info=True)
        total = 0
    return {"total_examples": total}
async def mcp(body: Dict[str, Any]):
    """Answer a minimal JSON-RPC 2.0 style request.

    A `tools/list` method gets the two tool stubs (`reset`, `step`); any
    other request gets the server's name and version. The reply echoes the
    request `id`, defaulting to 1.
    """
    request_id = body.get("id", 1)
    if body.get("method") == "tools/list":
        tools = [{"name": tool, "inputSchema": {"type": "object"}} for tool in ("reset", "step")]
        result = {"tools": tools}
    else:
        result = {"name": "data_quality-guard-env", "version": "4.2.0"}
    return {"jsonrpc": "2.0", "id": request_id, "result": result}
async def log_req(request, call_next):
    """Pass the request on, then log its method, path and response status.

    Args:
        request: Incoming request; its `method` and `url.path` are logged.
        call_next: Awaitable callable that produces the response.

    Returns:
        The response from `call_next`, unchanged.
    """
    response = await call_next(request)
    method, path, status = request.method, request.url.path, response.status_code
    logger.info(f"{method} {path} β {status}")
    return response
def main():
    """Serve `app` with uvicorn on all interfaces, port 7860."""
    from uvicorn import run

    run(app, host="0.0.0.0", port=7860)


if __name__ == "__main__":
    main()