sairaj2 committed on
Commit
d137754
Β·
verified Β·
1 Parent(s): 8d340f1

Upload folder using huggingface_hub

Browse files
Files changed (7) hide show
  1. Dockerfile +2 -2
  2. openenv.yaml +1 -1
  3. server/Dockerfile +1 -1
  4. server/__init__.py +1 -1
  5. server/app.py +39 -39
  6. server/metrics.py +71 -71
  7. server/tasks.py +22 -22
Dockerfile CHANGED
@@ -1,4 +1,4 @@
1
- # HallucinationGuard-Env Dockerfile - HF Spaces optimized
2
  # Single-stage build: avoids broken --target copy with compiled packages (torch, etc.)
3
  FROM python:3.10-slim
4
 
@@ -28,7 +28,7 @@ COPY . .
28
  RUN pip install --no-cache-dir -e .
29
 
30
  # Cache directory for datasets
31
- RUN mkdir -p /tmp/halluguard_cache /tmp/transformers_cache /tmp/hf_cache
32
 
33
  # HF Spaces default port
34
  EXPOSE 7860
 
1
+ # DataQualityGuard-Env Dockerfile - HF Spaces optimized
2
  # Single-stage build: avoids broken --target copy with compiled packages (torch, etc.)
3
  FROM python:3.10-slim
4
 
 
28
  RUN pip install --no-cache-dir -e .
29
 
30
  # Cache directory for datasets
31
+ RUN mkdir -p /tmp/cleanguard_cache /tmp/transformers_cache /tmp/hf_cache
32
 
33
  # HF Spaces default port
34
  EXPOSE 7860
openenv.yaml CHANGED
@@ -78,7 +78,7 @@ datasets:
78
  - squad
79
  - squad_v2
80
  - trivia_qa
81
- - halueval
82
  - truthful_qa
83
  - hotpotqa
84
  - boolq
 
78
  - squad
79
  - squad_v2
80
  - trivia_qa
81
+ - data_quality_eval
82
  - truthful_qa
83
  - hotpotqa
84
  - boolq
server/Dockerfile CHANGED
@@ -1,4 +1,4 @@
1
- # HallucinationGuard-Env Dockerfile
2
  FROM python:3.10-slim
3
 
4
  WORKDIR /app
 
1
+ # DataQualityGuard-Env Dockerfile
2
  FROM python:3.10-slim
3
 
4
  WORKDIR /app
server/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- """Server module for HallucinationGuard-Env."""
2
 
3
  import sys
4
  import os
 
1
+ """Server module for DataQualityGuard-Env."""
2
 
3
  import sys
4
  import os
server/app.py CHANGED
@@ -38,7 +38,7 @@ STUNNING_DOCS_HTML = """<!DOCTYPE html>
38
  <head>
39
  <meta charset="UTF-8">
40
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
41
- <title>HallucinationGuard-Env Β· OpenEnv</title>
42
  <link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 100 100'><rect width='100' height='100' rx='20' fill='%23080c14'/><text x='50' y='68' font-size='55' text-anchor='middle' fill='%23f59e0b' font-family='sans-serif' font-weight='bold'>H</text></svg>">
43
  <link href="https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@300;400;500;600;700&family=Fira+Code:wght@400;500&display=swap" rel="stylesheet">
44
  <style>
@@ -409,14 +409,14 @@ input[type=range] {
409
  }
410
  .ep-meta { display: flex; justify-content: space-between; align-items: center; }
411
  .ep-step { font-size: 11px; color: var(--muted); font-family: var(--mono); }
412
- .halluc-badge {
413
  display: none;
414
  font-size: 11px; font-weight: 700; letter-spacing: 0.5px;
415
  padding: 3px 10px; border-radius: 100px;
416
  }
417
- .halluc-badge.show { display: inline-block; }
418
- .halluc-badge.yes { background: var(--red-dim); color: var(--red); border: 1px solid rgba(248,113,113,0.3); }
419
- .halluc-badge.no { background: var(--green-dim); color: var(--green); border: 1px solid rgba(74,222,128,0.3); }
420
 
421
  /* ── REWARD BREAKDOWN ── */
422
  .reward-section { margin-top: 16px; }
@@ -506,7 +506,7 @@ input[type=range] {
506
  <div class="hero-badge">OpenEnv Β· RL Environment</div>
507
  <div class="ver-chip">v4.2.0</div>
508
  <h1>
509
- <span class="accent">Hallucination</span><span class="accent2">Guard</span>‑Env
510
  </h1>
511
  <p class="hero-sub">
512
  Train AI models to answer <strong>only from verified context</strong> β€” with a 9-component reward system that penalizes fabrication and rewards factual grounding, citation accuracy, and calibrated confidence.
@@ -549,7 +549,7 @@ input[type=range] {
549
  <div id="overview" class="panel active">
550
  <div class="section-head">
551
  <h2>How it works</h2>
552
- <p>Three primitives. Nine reward signals. One goal: no hallucinations.</p>
553
  </div>
554
  <div class="steps">
555
  <div class="step">
@@ -568,13 +568,13 @@ input[type=range] {
568
  <span class="step-num">03</span>
569
  <div class="step-icon">πŸ“Š</div>
570
  <h4>grade()</h4>
571
- <p>Aggregate episode rewards into a task score. Track accuracy, hallucination rate, and skill rating over time.</p>
572
  </div>
573
  </div>
574
 
575
  <div class="card">
576
  <h3>9-Component Reward System</h3>
577
- <p>Every answer is graded on <strong>factual correctness</strong>, <strong>source grounding</strong>, <strong>citation accuracy</strong>, <strong>confidence calibration</strong>, <strong>semantic consistency</strong>, <strong>hallucination detection</strong>, <strong>ROUGE-L</strong>, <strong>BERTScore</strong>, and <strong>AlignScore</strong>. Each component is weighted and combined into a single scalar reward in <strong>[0, 1]</strong>. Confident wrong answers are penalized harder than uncertain ones.</p>
578
  </div>
579
  <div class="card">
580
  <h3>Curriculum Progression</h3>
@@ -634,9 +634,9 @@ input[type=range] {
634
  <span class="diff-badge advanced">Advanced</span>
635
  <span class="data-count">~210K examples</span>
636
  </div>
637
- <p>Resist adversarial prompts designed to elicit hallucinations. Many questions are deliberately unanswerable β€” confident refusals with low confidence score better than fabricated plausible-sounding answers.</p>
638
  <div class="dataset-chips">
639
- <span class="ds-chip">HaluEval</span>
640
  <span class="ds-chip">TruthfulQA</span>
641
  <span class="ds-chip">FEVER</span>
642
  <span class="ds-chip">Climate-FEVER</span>
@@ -665,7 +665,7 @@ input[type=range] {
665
  <tbody>
666
  <tr><td><span class="method post">POST</span></td><td class="endpoint">/reset</td><td class="td-desc">Start episode β€” returns question, context, difficulty, episode_id</td></tr>
667
  <tr><td><span class="method post">POST</span></td><td class="endpoint">/step</td><td class="td-desc">Submit answer with confidence + source_quote, receive reward breakdown</td></tr>
668
- <tr><td><span class="method get">GET</span></td><td class="endpoint">/state</td><td class="td-desc">Current episode metadata β€” accuracy, hallucination_rate, skill_rating</td></tr>
669
  <tr><td><span class="method get">GET</span></td><td class="endpoint">/tasks</td><td class="td-desc">List all 3 tasks with action schema</td></tr>
670
  <tr><td><span class="method post">POST</span></td><td class="endpoint">/grader</td><td class="td-desc">Score a completed episode (0.0 – 1.0) from rewards + infos</td></tr>
671
  <tr><td><span class="method post">POST</span></td><td class="endpoint">/baseline</td><td class="td-desc">Run heuristic baseline across all 3 tasks</td></tr>
@@ -716,7 +716,7 @@ result = requests.<span class="fn">post</span>(<span class="st">f"{BASE}/step"</
716
  }).json()
717
 
718
  <span class="fn">print</span>(result[<span class="st">"reward"</span>]) <span class="cm"># scalar in [0, 1]</span>
719
- <span class="fn">print</span>(result[<span class="st">"is_hallucination"</span>]) <span class="cm"># bool</span></div>
720
  </div>
721
  </div>
722
  </div>
@@ -735,7 +735,7 @@ result = requests.<span class="fn">post</span>(<span class="st">f"{BASE}/step"</
735
  <div class="ep-progress">
736
  <div class="ep-meta">
737
  <span class="ep-step" id="ep-step-label">No episode active</span>
738
- <span class="halluc-badge" id="halluc-badge"></span>
739
  </div>
740
  <div class="ep-bar-bg"><div class="ep-bar-fill" id="ep-bar" style="width:0%"></div></div>
741
  </div>
@@ -822,7 +822,7 @@ result = requests.<span class="fn">post</span>(<span class="st">f"{BASE}/step"</
822
 
823
  <!-- ══ FOOTER ══ -->
824
  <footer style="text-align:center;padding:32px 40px 24px;border-top:1px solid var(--border);color:var(--muted);font-size:12px;">
825
- HallucinationGuard-Env v4.2.0 &middot; OpenEnv &middot; <a href="/swagger" style="color:var(--amber);text-decoration:none">Swagger Docs</a> &middot; <a href="/redoc" style="color:var(--amber);text-decoration:none">ReDoc</a>
826
  </footer>
827
 
828
  <script>
@@ -836,7 +836,7 @@ const REWARD_KEYS = [
836
  {key:'citation', label:'Citation Accuracy', css:'rc-2'},
837
  {key:'calibration', label:'Confidence Calibr.', css:'rc-3'},
838
  {key:'consistency', label:'Semantic Consistency', css:'rc-4'},
839
- {key:'halluc_detect', label:'Hallucination Detect.', css:'rc-5'},
840
  {key:'rouge_l', label:'ROUGE-L', css:'rc-6'},
841
  {key:'bert_score', label:'BERTScore', css:'rc-7'},
842
  {key:'align_score', label:'AlignScore', css:'rc-8'},
@@ -846,7 +846,7 @@ const REWARD_KEYS = [
846
  {key:'citation_accuracy', label:'Citation Accuracy', css:'rc-2'},
847
  {key:'confidence_calibration', label:'Confidence Calibr.', css:'rc-3'},
848
  {key:'semantic_consistency', label:'Semantic Consistency', css:'rc-4'},
849
- {key:'hallucination_penalty', label:'Hallucination Detect.', css:'rc-5'},
850
  {key:'rouge_score', label:'ROUGE-L', css:'rc-6'},
851
  {key:'bertscore', label:'BERTScore', css:'rc-7'},
852
  {key:'alignscore', label:'AlignScore', css:'rc-8'},
@@ -926,11 +926,11 @@ function renderRewards(data) {
926
 
927
  container.innerHTML = html || '<div style="color:var(--border2);font-size:12px;text-align:center;padding:12px">No breakdown data in response</div>';
928
 
929
- // hallucination badge
930
- const badge = document.getElementById('halluc-badge');
931
- if (data.is_hallucination != null) {
932
- badge.className = 'halluc-badge show ' + (data.is_hallucination ? 'yes' : 'no');
933
- badge.textContent = data.is_hallucination ? '⚠ Hallucination' : 'βœ“ Grounded';
934
  }
935
  }
936
 
@@ -958,7 +958,7 @@ async function doReset() {
958
  document.getElementById('reward-bars').innerHTML = '<div style="text-align:center;padding:20px 0;color:var(--border2);font-size:13px;">Submit an answer to see the 9-component reward breakdown</div>';
959
  document.getElementById('total-reward').textContent = 'β€”';
960
  document.getElementById('total-reward').style.color = 'var(--amber)';
961
- document.getElementById('halluc-badge').className = 'halluc-badge';
962
  setStatus('ready');
963
  } catch(e) {
964
  document.getElementById('ctx-box').innerHTML = '<span style="color:var(--red)">Error: ' + escHtml(e.message) + '</span>';
@@ -1068,7 +1068,7 @@ def _get_default_env() -> DataCleaningEnvironment:
1068
  def reset(self, **kwargs):
1069
  return type('Obs', (), {'question': 'Placeholder', 'context': 'Context', 'reward': 0.0, 'done': False, 'info': {}})()
1070
  def step(self, action):
1071
- return type('Obs', (), {'reward': 0.0, 'done': False, 'is_hallucination': False, 'info': {}})()
1072
  def state(self): return {}
1073
  def close(self): pass
1074
  _default_env = MinimalEnv()
@@ -1083,7 +1083,7 @@ def _create_session_env(session_id: str) -> DataCleaningEnvironment:
1083
  loader_env = _get_default_env()
1084
  # Pass the shared loader directly into __init__ so we skip the expensive
1085
  # DatasetLoader() construction and dataset loading that would otherwise
1086
- # happen inside HallucinationEnvironment.__init__
1087
  env = DataCleaningEnvironment(session_id=session_id, dataset_loader=loader_env.dataset_loader)
1088
  return env
1089
 
@@ -1147,7 +1147,7 @@ async def lifespan(app: FastAPI):
1147
 
1148
  app = FastAPI(
1149
  lifespan=lifespan,
1150
- title="HallucinationGuard-Env",
1151
  version="4.2.0",
1152
  docs_url="/swagger",
1153
  redoc_url="/redoc",
@@ -1156,7 +1156,7 @@ app = FastAPI(
1156
  app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
1157
 
1158
  import json as _json
1159
- _LEADERBOARD_FILE = "/tmp/hallucination_guard_leaderboard.json"
1160
 
1161
  def _load_leaderboard():
1162
  if os.path.exists(_LEADERBOARD_FILE):
@@ -1281,7 +1281,7 @@ async def run_baseline(body: Dict[str, Any] = {}):
1281
  for _ in range(steps):
1282
  if obs_dict.get("done"): break
1283
  ctx = obs_dict.get("context", "")
1284
- action = HallucinationAction(answer=ctx[:100], confidence=0.6, source_quote=ctx[:80])
1285
  obs_dict = _safe_dict(env.step(action))
1286
  rewards.append(float(obs_dict.get("reward") or 0))
1287
  obs_meta = obs_dict.get("metadata", {})
@@ -1293,8 +1293,8 @@ async def run_baseline(body: Dict[str, Any] = {}):
1293
  "correctness": obs_correctness,
1294
  "grounding": obs_dict.get("grounding_score", 0),
1295
  "calibration": obs_calibration,
1296
- "hallucination_score": 1.0 if obs_dict.get("is_hallucination") else 0.0,
1297
- "is_hallucination": bool(obs_dict.get("is_hallucination", False)),
1298
  "semantic_consistency": rb.get("semantic_consistency", 0.0),
1299
  "rouge_l": rb.get("rouge_l", 0.0),
1300
  "bert_score": rb.get("bert_score", 0.0),
@@ -1305,8 +1305,8 @@ async def run_baseline(body: Dict[str, Any] = {}):
1305
  "correctness": 0.0,
1306
  "grounding": obs_dict.get("grounding_score", 0),
1307
  "calibration": 0.6,
1308
- "hallucination_score": 1.0 if obs_dict.get("is_hallucination") else 0.0,
1309
- "is_hallucination": bool(obs_dict.get("is_hallucination", False)),
1310
  })
1311
  results.append(compute_task_score(task, rewards, infos))
1312
  try: env.close()
@@ -1321,7 +1321,7 @@ async def batch_evaluate(body: Dict[str, Any]):
1321
  results = []
1322
  for i, item in enumerate(items):
1323
  r, info = calculate_reward(item.get("answer",""), item.get("confidence",0.5), item.get("source_quote",""), item.get("context",""), item.get("ground_truth",""))
1324
- results.append({"index": i, "reward": round(r,4), "is_hallucination": info.get("is_hallucination", False)})
1325
  return {"total_items": len(results), "results": results}
1326
 
1327
  @app.get("/leaderboard", tags=["Leaderboard"])
@@ -1333,7 +1333,7 @@ async def leaderboard():
1333
 
1334
  @app.post("/leaderboard/submit", tags=["Leaderboard"])
1335
  async def submit_leaderboard(data: Dict[str, Any]):
1336
- required = ["model_name", "avg_reward", "avg_accuracy", "hallucination_rate", "total_episodes", "total_steps"]
1337
  if missing := [f for f in required if f not in data]: raise HTTPException(422, f"Missing: {missing}")
1338
  _leaderboard[data["model_name"]] = {**data, "submitted_at": time.time()}
1339
  _save_leaderboard(_leaderboard)
@@ -1345,12 +1345,12 @@ async def health(): return {"status": "healthy", "version": "4.2.0"}
1345
  @app.get("/metadata", tags=["OpenEnv"])
1346
  async def metadata():
1347
  return {
1348
- "name": "hallucination-guard-env",
1349
  "version": "4.2.0",
1350
  "license": "MIT",
1351
  "description": (
1352
  "An OpenEnv RL environment that trains AI models to answer questions "
1353
- "ONLY from verified context documents β€” penalizing hallucination and "
1354
  "rewarding factual grounding."
1355
  ),
1356
  }
@@ -1377,7 +1377,7 @@ async def schema():
1377
  "done": {"type": "boolean"},
1378
  "reward": {"type": "number"},
1379
  "feedback": {"type": "string"},
1380
- "is_hallucination": {"type": "boolean"},
1381
  "grounding_score": {"type": "number"},
1382
  "difficulty_level": {"type": "string"},
1383
  "attempts_remaining": {"type": "integer"},
@@ -1389,7 +1389,7 @@ async def schema():
1389
  "episode_id": {"type": "string"},
1390
  "step_count": {"type": "integer"},
1391
  "accuracy": {"type": "number"},
1392
- "hallucination_rate": {"type": "number"},
1393
  "average_reward": {"type": "number"},
1394
  "current_difficulty": {"type": "string"},
1395
  "skill_rating": {"type": "number"},
@@ -1408,7 +1408,7 @@ async def datasets():
1408
  async def mcp(body: Dict[str, Any]):
1409
  if body.get("method") == "tools/list":
1410
  return {"jsonrpc": "2.0", "id": body.get("id",1), "result": {"tools": [{"name": "reset", "inputSchema": {"type": "object"}}, {"name": "step", "inputSchema": {"type": "object"}}]}}
1411
- return {"jsonrpc": "2.0", "id": body.get("id",1), "result": {"name": "hallucination-guard-env", "version": "4.2.0"}}
1412
 
1413
  @app.middleware("http")
1414
  async def log_req(request, call_next):
 
38
  <head>
39
  <meta charset="UTF-8">
40
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
41
+ <title>DataQualityGuard-Env Β· OpenEnv</title>
42
  <link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 100 100'><rect width='100' height='100' rx='20' fill='%23080c14'/><text x='50' y='68' font-size='55' text-anchor='middle' fill='%23f59e0b' font-family='sans-serif' font-weight='bold'>H</text></svg>">
43
  <link href="https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@300;400;500;600;700&family=Fira+Code:wght@400;500&display=swap" rel="stylesheet">
44
  <style>
 
409
  }
410
  .ep-meta { display: flex; justify-content: space-between; align-items: center; }
411
  .ep-step { font-size: 11px; color: var(--muted); font-family: var(--mono); }
412
+ .cleanc-badge {
413
  display: none;
414
  font-size: 11px; font-weight: 700; letter-spacing: 0.5px;
415
  padding: 3px 10px; border-radius: 100px;
416
  }
417
+ .cleanc-badge.show { display: inline-block; }
418
+ .cleanc-badge.yes { background: var(--red-dim); color: var(--red); border: 1px solid rgba(248,113,113,0.3); }
419
+ .cleanc-badge.no { background: var(--green-dim); color: var(--green); border: 1px solid rgba(74,222,128,0.3); }
420
 
421
  /* ── REWARD BREAKDOWN ── */
422
  .reward-section { margin-top: 16px; }
 
506
  <div class="hero-badge">OpenEnv Β· RL Environment</div>
507
  <div class="ver-chip">v4.2.0</div>
508
  <h1>
509
+ <span class="accent">DataQuality</span><span class="accent2">Guard</span>‑Env
510
  </h1>
511
  <p class="hero-sub">
512
  Train AI models to answer <strong>only from verified context</strong> β€” with a 9-component reward system that penalizes fabrication and rewards factual grounding, citation accuracy, and calibrated confidence.
 
549
  <div id="overview" class="panel active">
550
  <div class="section-head">
551
  <h2>How it works</h2>
552
+ <p>Three primitives. Nine reward signals. One goal: no data quality issues.</p>
553
  </div>
554
  <div class="steps">
555
  <div class="step">
 
568
  <span class="step-num">03</span>
569
  <div class="step-icon">πŸ“Š</div>
570
  <h4>grade()</h4>
571
+ <p>Aggregate episode rewards into a task score. Track accuracy, data quality issue rate, and skill rating over time.</p>
572
  </div>
573
  </div>
574
 
575
  <div class="card">
576
  <h3>9-Component Reward System</h3>
577
+ <p>Every answer is graded on <strong>factual correctness</strong>, <strong>source grounding</strong>, <strong>citation accuracy</strong>, <strong>confidence calibration</strong>, <strong>semantic consistency</strong>, <strong>data quality detection</strong>, <strong>ROUGE-L</strong>, <strong>BERTScore</strong>, and <strong>AlignScore</strong>. Each component is weighted and combined into a single scalar reward in <strong>[0, 1]</strong>. Confident wrong answers are penalized harder than uncertain ones.</p>
578
  </div>
579
  <div class="card">
580
  <h3>Curriculum Progression</h3>
 
634
  <span class="diff-badge advanced">Advanced</span>
635
  <span class="data-count">~210K examples</span>
636
  </div>
637
+ <p>Resist adversarial prompts designed to elicit data quality issues. Many questions are deliberately unanswerable β€” confident refusals with low confidence score better than fabricated plausible-sounding answers.</p>
638
  <div class="dataset-chips">
639
+ <span class="ds-chip">DataQualityEval</span>
640
  <span class="ds-chip">TruthfulQA</span>
641
  <span class="ds-chip">FEVER</span>
642
  <span class="ds-chip">Climate-FEVER</span>
 
665
  <tbody>
666
  <tr><td><span class="method post">POST</span></td><td class="endpoint">/reset</td><td class="td-desc">Start episode β€” returns question, context, difficulty, episode_id</td></tr>
667
  <tr><td><span class="method post">POST</span></td><td class="endpoint">/step</td><td class="td-desc">Submit answer with confidence + source_quote, receive reward breakdown</td></tr>
668
+ <tr><td><span class="method get">GET</span></td><td class="endpoint">/state</td><td class="td-desc">Current episode metadata β€” accuracy, data_quality_rate, skill_rating</td></tr>
669
  <tr><td><span class="method get">GET</span></td><td class="endpoint">/tasks</td><td class="td-desc">List all 3 tasks with action schema</td></tr>
670
  <tr><td><span class="method post">POST</span></td><td class="endpoint">/grader</td><td class="td-desc">Score a completed episode (0.0 – 1.0) from rewards + infos</td></tr>
671
  <tr><td><span class="method post">POST</span></td><td class="endpoint">/baseline</td><td class="td-desc">Run heuristic baseline across all 3 tasks</td></tr>
 
716
  }).json()
717
 
718
  <span class="fn">print</span>(result[<span class="st">"reward"</span>]) <span class="cm"># scalar in [0, 1]</span>
719
+ <span class="fn">print</span>(result[<span class="st">"is_data_quality"</span>]) <span class="cm"># bool</span></div>
720
  </div>
721
  </div>
722
  </div>
 
735
  <div class="ep-progress">
736
  <div class="ep-meta">
737
  <span class="ep-step" id="ep-step-label">No episode active</span>
738
+ <span class="cleanc-badge" id="cleanc-badge"></span>
739
  </div>
740
  <div class="ep-bar-bg"><div class="ep-bar-fill" id="ep-bar" style="width:0%"></div></div>
741
  </div>
 
822
 
823
  <!-- ══ FOOTER ══ -->
824
  <footer style="text-align:center;padding:32px 40px 24px;border-top:1px solid var(--border);color:var(--muted);font-size:12px;">
825
+ DataQualityGuard-Env v4.2.0 &middot; OpenEnv &middot; <a href="/swagger" style="color:var(--amber);text-decoration:none">Swagger Docs</a> &middot; <a href="/redoc" style="color:var(--amber);text-decoration:none">ReDoc</a>
826
  </footer>
827
 
828
  <script>
 
836
  {key:'citation', label:'Citation Accuracy', css:'rc-2'},
837
  {key:'calibration', label:'Confidence Calibr.', css:'rc-3'},
838
  {key:'consistency', label:'Semantic Consistency', css:'rc-4'},
839
+ {key:'cleanc_detect', label:'DataQuality Detect.', css:'rc-5'},
840
  {key:'rouge_l', label:'ROUGE-L', css:'rc-6'},
841
  {key:'bert_score', label:'BERTScore', css:'rc-7'},
842
  {key:'align_score', label:'AlignScore', css:'rc-8'},
 
846
  {key:'citation_accuracy', label:'Citation Accuracy', css:'rc-2'},
847
  {key:'confidence_calibration', label:'Confidence Calibr.', css:'rc-3'},
848
  {key:'semantic_consistency', label:'Semantic Consistency', css:'rc-4'},
849
+ {key:'data_quality_penalty', label:'DataQuality Detect.', css:'rc-5'},
850
  {key:'rouge_score', label:'ROUGE-L', css:'rc-6'},
851
  {key:'bertscore', label:'BERTScore', css:'rc-7'},
852
  {key:'alignscore', label:'AlignScore', css:'rc-8'},
 
926
 
927
  container.innerHTML = html || '<div style="color:var(--border2);font-size:12px;text-align:center;padding:12px">No breakdown data in response</div>';
928
 
929
+ // data_quality badge
930
+ const badge = document.getElementById('cleanc-badge');
931
+ if (data.is_data_quality != null) {
932
+ badge.className = 'cleanc-badge show ' + (data.is_data_quality ? 'yes' : 'no');
933
+ badge.textContent = data.is_data_quality ? '⚠ Quality Issue' : 'βœ“ Grounded';
934
  }
935
  }
936
 
 
958
  document.getElementById('reward-bars').innerHTML = '<div style="text-align:center;padding:20px 0;color:var(--border2);font-size:13px;">Submit an answer to see the 9-component reward breakdown</div>';
959
  document.getElementById('total-reward').textContent = 'β€”';
960
  document.getElementById('total-reward').style.color = 'var(--amber)';
961
+ document.getElementById('cleanc-badge').className = 'cleanc-badge';
962
  setStatus('ready');
963
  } catch(e) {
964
  document.getElementById('ctx-box').innerHTML = '<span style="color:var(--red)">Error: ' + escHtml(e.message) + '</span>';
 
1068
  def reset(self, **kwargs):
1069
  return type('Obs', (), {'question': 'Placeholder', 'context': 'Context', 'reward': 0.0, 'done': False, 'info': {}})()
1070
  def step(self, action):
1071
+ return type('Obs', (), {'reward': 0.0, 'done': False, 'is_data_quality': False, 'info': {}})()
1072
  def state(self): return {}
1073
  def close(self): pass
1074
  _default_env = MinimalEnv()
 
1083
  loader_env = _get_default_env()
1084
  # Pass the shared loader directly into __init__ so we skip the expensive
1085
  # DatasetLoader() construction and dataset loading that would otherwise
1086
+ # happen inside DataQualityEnvironment.__init__
1087
  env = DataCleaningEnvironment(session_id=session_id, dataset_loader=loader_env.dataset_loader)
1088
  return env
1089
 
 
1147
 
1148
  app = FastAPI(
1149
  lifespan=lifespan,
1150
+ title="DataQualityGuard-Env",
1151
  version="4.2.0",
1152
  docs_url="/swagger",
1153
  redoc_url="/redoc",
 
1156
  app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
1157
 
1158
  import json as _json
1159
+ _LEADERBOARD_FILE = "/tmp/data_quality_guard_leaderboard.json"
1160
 
1161
  def _load_leaderboard():
1162
  if os.path.exists(_LEADERBOARD_FILE):
 
1281
  for _ in range(steps):
1282
  if obs_dict.get("done"): break
1283
  ctx = obs_dict.get("context", "")
1284
+ action = DataQualityAction(answer=ctx[:100], confidence=0.6, source_quote=ctx[:80])
1285
  obs_dict = _safe_dict(env.step(action))
1286
  rewards.append(float(obs_dict.get("reward") or 0))
1287
  obs_meta = obs_dict.get("metadata", {})
 
1293
  "correctness": obs_correctness,
1294
  "grounding": obs_dict.get("grounding_score", 0),
1295
  "calibration": obs_calibration,
1296
+ "data_quality_score": 1.0 if obs_dict.get("is_data_quality") else 0.0,
1297
+ "is_data_quality": bool(obs_dict.get("is_data_quality", False)),
1298
  "semantic_consistency": rb.get("semantic_consistency", 0.0),
1299
  "rouge_l": rb.get("rouge_l", 0.0),
1300
  "bert_score": rb.get("bert_score", 0.0),
 
1305
  "correctness": 0.0,
1306
  "grounding": obs_dict.get("grounding_score", 0),
1307
  "calibration": 0.6,
1308
+ "data_quality_score": 1.0 if obs_dict.get("is_data_quality") else 0.0,
1309
+ "is_data_quality": bool(obs_dict.get("is_data_quality", False)),
1310
  })
1311
  results.append(compute_task_score(task, rewards, infos))
1312
  try: env.close()
 
1321
  results = []
1322
  for i, item in enumerate(items):
1323
  r, info = calculate_reward(item.get("answer",""), item.get("confidence",0.5), item.get("source_quote",""), item.get("context",""), item.get("ground_truth",""))
1324
+ results.append({"index": i, "reward": round(r,4), "is_data_quality": info.get("is_data_quality", False)})
1325
  return {"total_items": len(results), "results": results}
1326
 
1327
  @app.get("/leaderboard", tags=["Leaderboard"])
 
1333
 
1334
  @app.post("/leaderboard/submit", tags=["Leaderboard"])
1335
  async def submit_leaderboard(data: Dict[str, Any]):
1336
+ required = ["model_name", "avg_reward", "avg_accuracy", "data_quality_rate", "total_episodes", "total_steps"]
1337
  if missing := [f for f in required if f not in data]: raise HTTPException(422, f"Missing: {missing}")
1338
  _leaderboard[data["model_name"]] = {**data, "submitted_at": time.time()}
1339
  _save_leaderboard(_leaderboard)
 
1345
  @app.get("/metadata", tags=["OpenEnv"])
1346
  async def metadata():
1347
  return {
1348
+ "name": "data_quality-guard-env",
1349
  "version": "4.2.0",
1350
  "license": "MIT",
1351
  "description": (
1352
  "An OpenEnv RL environment that trains AI models to answer questions "
1353
+ "ONLY from verified context documents β€” penalizing data quality issues and "
1354
  "rewarding factual grounding."
1355
  ),
1356
  }
 
1377
  "done": {"type": "boolean"},
1378
  "reward": {"type": "number"},
1379
  "feedback": {"type": "string"},
1380
+ "is_data_quality": {"type": "boolean"},
1381
  "grounding_score": {"type": "number"},
1382
  "difficulty_level": {"type": "string"},
1383
  "attempts_remaining": {"type": "integer"},
 
1389
  "episode_id": {"type": "string"},
1390
  "step_count": {"type": "integer"},
1391
  "accuracy": {"type": "number"},
1392
+ "data_quality_rate": {"type": "number"},
1393
  "average_reward": {"type": "number"},
1394
  "current_difficulty": {"type": "string"},
1395
  "skill_rating": {"type": "number"},
 
1408
  async def mcp(body: Dict[str, Any]):
1409
  if body.get("method") == "tools/list":
1410
  return {"jsonrpc": "2.0", "id": body.get("id",1), "result": {"tools": [{"name": "reset", "inputSchema": {"type": "object"}}, {"name": "step", "inputSchema": {"type": "object"}}]}}
1411
+ return {"jsonrpc": "2.0", "id": body.get("id",1), "result": {"name": "data_quality-guard-env", "version": "4.2.0"}}
1412
 
1413
  @app.middleware("http")
1414
  async def log_req(request, call_next):
server/metrics.py CHANGED
@@ -1,9 +1,9 @@
1
- """Professional-grade metrics and visualization for HallucinationGuard-Env.
2
 
3
  This module provides:
4
  - Real-time metrics tracking
5
  - Training curve visualization
6
- - Hallucination heatmaps
7
  - Comprehensive logging
8
  - Export capabilities for analysis
9
  """
@@ -29,8 +29,8 @@ class StepMetrics:
29
  correctness: float
30
  grounding: float
31
  calibration: float
32
- hallucination_score: float
33
- is_hallucination: bool
34
  confidence: float
35
  difficulty: str
36
  timestamp: float = field(default_factory=time.time)
@@ -42,8 +42,8 @@ class EpisodeMetrics:
42
  episode_id: str
43
  total_steps: int
44
  average_reward: float
45
- total_hallucinations: int
46
- hallucination_rate: float
47
  accuracy: float
48
  average_confidence: float
49
  calibration_error: float
@@ -69,13 +69,13 @@ class TrainingSession:
69
 
70
  # Aggregated metrics
71
  overall_accuracy: float = 0.0
72
- overall_hallucination_rate: float = 0.0
73
  average_reward: float = 0.0
74
  skill_rating_progress: List[float] = field(default_factory=list)
75
 
76
  # Trend analysis
77
  reward_trend: str = "stable" # improving, stable, declining
78
- hallucination_trend: str = "stable"
79
 
80
  def to_dict(self) -> Dict[str, Any]:
81
  """Convert to dictionary for serialization."""
@@ -86,11 +86,11 @@ class TrainingSession:
86
  "total_episodes": self.total_episodes,
87
  "total_steps": self.total_steps,
88
  "overall_accuracy": self.overall_accuracy,
89
- "overall_hallucination_rate": self.overall_hallucination_rate,
90
  "average_reward": self.average_reward,
91
  "skill_rating_progress": self.skill_rating_progress,
92
  "reward_trend": self.reward_trend,
93
- "hallucination_trend": self.hallucination_trend,
94
  }
95
 
96
 
@@ -116,13 +116,13 @@ class MetricsTracker:
116
 
117
  # Rolling windows for trend analysis
118
  self.reward_window: List[float] = []
119
- self.hallucination_window: List[bool] = []
120
  self.window_size = 10
121
 
122
  # Real-time aggregates
123
  self.running_reward_sum = 0.0
124
  self.running_reward_count = 0
125
- self.running_hallucination_count = 0
126
  self.running_step_count = 0
127
 
128
  logger.info(f"Initialized MetricsTracker (session={self.session_id})")
@@ -136,8 +136,8 @@ class MetricsTracker:
136
  correctness=step_data.get("correctness", 0.0),
137
  grounding=step_data.get("grounding", 0.0),
138
  calibration=step_data.get("calibration", 0.0),
139
- hallucination_score=step_data.get("hallucination_score", 0.0),
140
- is_hallucination=step_data.get("is_hallucination", False),
141
  confidence=step_data.get("confidence", 0.5),
142
  difficulty=step_data.get("difficulty", "intermediate"),
143
  )
@@ -150,16 +150,16 @@ class MetricsTracker:
150
  self.running_reward_count += 1
151
  self.running_step_count += 1
152
 
153
- if step_metrics.is_hallucination:
154
- self.running_hallucination_count += 1
155
 
156
  # Update rolling windows
157
  self.reward_window.append(step_metrics.reward)
158
- self.hallucination_window.append(step_metrics.is_hallucination)
159
 
160
  if len(self.reward_window) > self.window_size:
161
  self.reward_window.pop(0)
162
- self.hallucination_window.pop(0)
163
 
164
  return step_metrics
165
 
@@ -169,8 +169,8 @@ class MetricsTracker:
169
  episode_id=episode_data.get("episode_id", ""),
170
  total_steps=episode_data.get("total_steps", len(self.current_episode_data)),
171
  average_reward=episode_data.get("average_reward", 0.0),
172
- total_hallucinations=episode_data.get("total_hallucinations", 0),
173
- hallucination_rate=episode_data.get("hallucination_rate", 0.0),
174
  accuracy=episode_data.get("accuracy", 0.0),
175
  average_confidence=episode_data.get("average_confidence", 0.5),
176
  calibration_error=episode_data.get("calibration_error", 0.0),
@@ -196,7 +196,7 @@ class MetricsTracker:
196
  self.current_episode_data = []
197
 
198
  logger.info(f"Episode {episode_metrics.episode_id} completed: reward={episode_metrics.average_reward:.3f}, "
199
- f"hallucination_rate={episode_metrics.hallucination_rate:.3f}")
200
 
201
  return episode_metrics
202
 
@@ -209,9 +209,9 @@ class MetricsTracker:
209
  total_correct = sum(ep.accuracy * ep.total_steps for ep in self.current_session.episode_metrics)
210
  self.current_session.overall_accuracy = total_correct / max(1, self.current_session.total_steps)
211
 
212
- # Overall hallucination rate
213
- total_hallucinations = sum(ep.total_hallucinations for ep in self.current_session.episode_metrics)
214
- self.current_session.overall_hallucination_rate = total_hallucinations / max(1, self.current_session.total_steps)
215
 
216
  # Average reward
217
  total_reward = sum(ep.average_reward * ep.total_steps for ep in self.current_session.episode_metrics)
@@ -238,17 +238,17 @@ class MetricsTracker:
238
  else:
239
  self.current_session.reward_trend = "stable"
240
 
241
- # Hallucination trend
242
- if len(self.hallucination_window) >= 5:
243
- recent_hallucination_rate = sum(self.hallucination_window[-5:]) / 5
244
- older_hallucination_rate = sum(self.hallucination_window[:-5]) / max(1, len(self.hallucination_window) - 5)
245
 
246
- if recent_hallucination_rate < older_hallucination_rate - 0.1:
247
- self.current_session.hallucination_trend = "improving"
248
- elif recent_hallucination_rate > older_hallucination_rate + 0.1:
249
- self.current_session.hallucination_trend = "worsening"
250
  else:
251
- self.current_session.hallucination_trend = "stable"
252
 
253
  def get_real_time_metrics(self) -> Dict[str, Any]:
254
  """Get current real-time metrics."""
@@ -257,18 +257,18 @@ class MetricsTracker:
257
  "episodes_completed": self.current_session.total_episodes,
258
  "total_steps": self.current_session.total_steps,
259
  "overall_accuracy": self.current_session.overall_accuracy,
260
- "overall_hallucination_rate": self.current_session.overall_hallucination_rate,
261
  "average_reward": self.current_session.average_reward,
262
  "reward_trend": self.current_session.reward_trend,
263
- "hallucination_trend": self.current_session.hallucination_trend,
264
  "recent_reward_avg": sum(self.reward_window) / max(1, len(self.reward_window)),
265
- "recent_hallucination_rate": sum(self.hallucination_window) / max(1, len(self.hallucination_window)),
266
  }
267
 
268
  def get_training_curve_data(self) -> Dict[str, List[Any]]:
269
  """Get data for plotting training curves."""
270
  episode_rewards = [ep.average_reward for ep in self.current_session.episode_metrics]
271
- hallucination_rates = [ep.hallucination_rate for ep in self.current_session.episode_metrics]
272
  accuracies = [ep.accuracy for ep in self.current_session.episode_metrics]
273
  skill_ratings = self.current_session.skill_rating_progress
274
 
@@ -282,15 +282,15 @@ class MetricsTracker:
282
  "episodes": list(range(1, len(episode_rewards) + 1)),
283
  "rewards": episode_rewards,
284
  "rewards_smooth": moving_average(episode_rewards),
285
- "hallucination_rates": hallucination_rates,
286
- "hallucination_rates_smooth": moving_average(hallucination_rates),
287
  "accuracies": accuracies,
288
  "skill_ratings": skill_ratings,
289
  }
290
 
291
- def get_hallucination_heatmap_data(self) -> Dict[str, Any]:
292
- """Get data for hallucination heatmap visualization."""
293
- # Group by difficulty and hallucination type
294
  heatmap_data = {}
295
 
296
  for step in self.current_session.step_metrics:
@@ -298,19 +298,19 @@ class MetricsTracker:
298
  if difficulty not in heatmap_data:
299
  heatmap_data[difficulty] = {
300
  "total": 0,
301
- "hallucinations": 0,
302
  "by_type": {}
303
  }
304
 
305
  heatmap_data[difficulty]["total"] += 1
306
- if step.is_hallucination:
307
- heatmap_data[difficulty]["hallucinations"] += 1
308
 
309
  # Calculate rates
310
  for difficulty in heatmap_data:
311
  total = heatmap_data[difficulty]["total"]
312
- hallucs = heatmap_data[difficulty]["hallucinations"]
313
- heatmap_data[difficulty]["rate"] = hallucs / max(1, total)
314
 
315
  return heatmap_data
316
 
@@ -324,14 +324,14 @@ class MetricsTracker:
324
  "correctness": [],
325
  "grounding": [],
326
  "calibration": [],
327
- "hallucination_score": [],
328
  }
329
 
330
  for step in self.current_session.step_metrics:
331
  components["correctness"].append(step.correctness)
332
  components["grounding"].append(step.grounding)
333
  components["calibration"].append(step.calibration)
334
- components["hallucination_score"].append(step.hallucination_score)
335
 
336
  # Calculate statistics
337
  analysis = {}
@@ -366,14 +366,14 @@ class MetricsTracker:
366
  "episode_id": ep.episode_id,
367
  "total_steps": ep.total_steps,
368
  "average_reward": ep.average_reward,
369
- "hallucination_rate": ep.hallucination_rate,
370
  "accuracy": ep.accuracy,
371
  "duration": ep.duration,
372
  }
373
  for ep in self.current_session.episode_metrics
374
  ],
375
  "training_curves": self.get_training_curve_data(),
376
- "heatmap_data": self.get_hallucination_heatmap_data(),
377
  "reward_analysis": self.get_reward_breakdown_analysis(),
378
  }
379
 
@@ -390,12 +390,12 @@ class MetricsTracker:
390
 
391
  with open(filepath, 'w', encoding='utf-8') as f:
392
  # Header
393
- f.write("step,episode_id,reward,correctness,grounding,calibration,hallucination_score,is_hallucination,confidence,difficulty,timestamp\n")
394
 
395
  # Data
396
  for step in self.current_session.step_metrics:
397
  f.write(f"{step.step},{step.episode_id},{step.reward},{step.correctness},{step.grounding},"
398
- f"{step.calibration},{step.hallucination_score},{int(step.is_hallucination)},"
399
  f"{step.confidence},{step.difficulty},{step.timestamp}\n")
400
 
401
  logger.info(f"Exported CSV to {filepath}")
@@ -407,7 +407,7 @@ class MetricsTracker:
407
 
408
  report = f"""
409
  ╔══════════════════════════════════════════════════════════╗
410
- β•‘ HallucinationGuard-Env Training Summary β•‘
411
  ╠══════════════════════════════════════════════════════════╣
412
 
413
  Session: {self.current_session.session_id}
@@ -419,15 +419,15 @@ PERFORMANCE METRICS
419
  ────────────────────────────────────────────────────────────
420
  Overall Accuracy: {metrics['overall_accuracy']:.1%}
421
  Average Reward: {metrics['average_reward']:.3f}
422
- Hallucination Rate: {metrics['overall_hallucination_rate']:.1%}
423
 
424
  ───────���────────────────────────────────────────────────────
425
  TREND ANALYSIS
426
  ────────────────────────────────────────────────────────────
427
  Reward Trend: {metrics['reward_trend'].upper()}
428
- Hallucination Trend: {metrics['hallucination_trend'].upper()}
429
  Recent Reward Avg: {metrics['recent_reward_avg']:.3f}
430
- Recent Hallucination Rate: {metrics['recent_hallucination_rate']:.1%}
431
 
432
  ────────────────────────────────────────────────────────────
433
  INTERPRETATION
@@ -442,12 +442,12 @@ INTERPRETATION
442
  else:
443
  report += "β†’ Model performance is STABLE\n"
444
 
445
- if metrics['hallucination_trend'] == "improving":
446
- report += "βœ“ Hallucination rate is DECREASING\n"
447
- elif metrics['hallucination_trend'] == "worsening":
448
- report += "⚠ Hallucination rate is INCREASING - review training data\n"
449
  else:
450
- report += "β†’ Hallucination rate is STABLE\n"
451
 
452
  if metrics['overall_accuracy'] > 0.8:
453
  report += "\nβ˜… EXCELLENT: Model is performing at expert level\n"
@@ -505,10 +505,10 @@ class VisualizationDataGenerator:
505
  "line": {"dash": "dash"},
506
  },
507
  {
508
- "name": "Hallucination Rate",
509
  "type": "scatter",
510
  "x": curve_data["episodes"],
511
- "y": curve_data["hallucination_rates"],
512
  "mode": "lines+markers",
513
  "yaxis": "y2",
514
  },
@@ -526,21 +526,21 @@ class VisualizationDataGenerator:
526
  "xaxis": {"title": "Episode"},
527
  "yaxis": {"title": "Reward / Accuracy"},
528
  "yaxis2": {
529
- "title": "Hallucination Rate",
530
  "overlaying": "y",
531
  "side": "right",
532
  },
533
  }
534
  }
535
 
536
- def get_hallucination_type_distribution(self) -> Dict[str, Any]:
537
- """Get hallucination type distribution for pie chart."""
538
  type_counts = {}
539
 
540
  for step in self.tracker.current_session.step_metrics:
541
- if step.is_hallucination:
542
  # In a full implementation, track specific types
543
- type_key = "hallucination"
544
  type_counts[type_key] = type_counts.get(type_key, 0) + 1
545
 
546
  return {
@@ -550,7 +550,7 @@ class VisualizationDataGenerator:
550
 
551
  def get_difficulty_performance_comparison(self) -> Dict[str, Any]:
552
  """Get performance comparison across difficulties."""
553
- heatmap_data = self.tracker.get_hallucination_heatmap_data()
554
 
555
  difficulties = list(heatmap_data.keys())
556
  rates = [heatmap_data[d]["rate"] for d in difficulties]
@@ -558,7 +558,7 @@ class VisualizationDataGenerator:
558
 
559
  return {
560
  "difficulties": difficulties,
561
- "hallucination_rates": rates,
562
  "sample_sizes": totals,
563
  }
564
 
 
1
+ """Professional-grade metrics and visualization for DataQualityGuard-Env.
2
 
3
  This module provides:
4
  - Real-time metrics tracking
5
  - Training curve visualization
6
+ - DataQuality heatmaps
7
  - Comprehensive logging
8
  - Export capabilities for analysis
9
  """
 
29
  correctness: float
30
  grounding: float
31
  calibration: float
32
+ data_quality_score: float
33
+ is_data_quality: bool
34
  confidence: float
35
  difficulty: str
36
  timestamp: float = field(default_factory=time.time)
 
42
  episode_id: str
43
  total_steps: int
44
  average_reward: float
45
+ total_data_qualitys: int
46
+ data_quality_rate: float
47
  accuracy: float
48
  average_confidence: float
49
  calibration_error: float
 
69
 
70
  # Aggregated metrics
71
  overall_accuracy: float = 0.0
72
+ overall_data_quality_rate: float = 0.0
73
  average_reward: float = 0.0
74
  skill_rating_progress: List[float] = field(default_factory=list)
75
 
76
  # Trend analysis
77
  reward_trend: str = "stable" # improving, stable, declining
78
+ data_quality_trend: str = "stable"
79
 
80
  def to_dict(self) -> Dict[str, Any]:
81
  """Convert to dictionary for serialization."""
 
86
  "total_episodes": self.total_episodes,
87
  "total_steps": self.total_steps,
88
  "overall_accuracy": self.overall_accuracy,
89
+ "overall_data_quality_rate": self.overall_data_quality_rate,
90
  "average_reward": self.average_reward,
91
  "skill_rating_progress": self.skill_rating_progress,
92
  "reward_trend": self.reward_trend,
93
+ "data_quality_trend": self.data_quality_trend,
94
  }
95
 
96
 
 
116
 
117
  # Rolling windows for trend analysis
118
  self.reward_window: List[float] = []
119
+ self.data_quality_window: List[bool] = []
120
  self.window_size = 10
121
 
122
  # Real-time aggregates
123
  self.running_reward_sum = 0.0
124
  self.running_reward_count = 0
125
+ self.running_data_quality_count = 0
126
  self.running_step_count = 0
127
 
128
  logger.info(f"Initialized MetricsTracker (session={self.session_id})")
 
136
  correctness=step_data.get("correctness", 0.0),
137
  grounding=step_data.get("grounding", 0.0),
138
  calibration=step_data.get("calibration", 0.0),
139
+ data_quality_score=step_data.get("data_quality_score", 0.0),
140
+ is_data_quality=step_data.get("is_data_quality", False),
141
  confidence=step_data.get("confidence", 0.5),
142
  difficulty=step_data.get("difficulty", "intermediate"),
143
  )
 
150
  self.running_reward_count += 1
151
  self.running_step_count += 1
152
 
153
+ if step_metrics.is_data_quality:
154
+ self.running_data_quality_count += 1
155
 
156
  # Update rolling windows
157
  self.reward_window.append(step_metrics.reward)
158
+ self.data_quality_window.append(step_metrics.is_data_quality)
159
 
160
  if len(self.reward_window) > self.window_size:
161
  self.reward_window.pop(0)
162
+ self.data_quality_window.pop(0)
163
 
164
  return step_metrics
165
 
 
169
  episode_id=episode_data.get("episode_id", ""),
170
  total_steps=episode_data.get("total_steps", len(self.current_episode_data)),
171
  average_reward=episode_data.get("average_reward", 0.0),
172
+ total_data_qualitys=episode_data.get("total_data_qualitys", 0),
173
+ data_quality_rate=episode_data.get("data_quality_rate", 0.0),
174
  accuracy=episode_data.get("accuracy", 0.0),
175
  average_confidence=episode_data.get("average_confidence", 0.5),
176
  calibration_error=episode_data.get("calibration_error", 0.0),
 
196
  self.current_episode_data = []
197
 
198
  logger.info(f"Episode {episode_metrics.episode_id} completed: reward={episode_metrics.average_reward:.3f}, "
199
+ f"data_quality_rate={episode_metrics.data_quality_rate:.3f}")
200
 
201
  return episode_metrics
202
 
 
209
  total_correct = sum(ep.accuracy * ep.total_steps for ep in self.current_session.episode_metrics)
210
  self.current_session.overall_accuracy = total_correct / max(1, self.current_session.total_steps)
211
 
212
+ # Overall data_quality rate
213
+ total_data_qualitys = sum(ep.total_data_qualitys for ep in self.current_session.episode_metrics)
214
+ self.current_session.overall_data_quality_rate = total_data_qualitys / max(1, self.current_session.total_steps)
215
 
216
  # Average reward
217
  total_reward = sum(ep.average_reward * ep.total_steps for ep in self.current_session.episode_metrics)
 
238
  else:
239
  self.current_session.reward_trend = "stable"
240
 
241
+ # DataQuality trend
242
+ if len(self.data_quality_window) >= 5:
243
+ recent_data_quality_rate = sum(self.data_quality_window[-5:]) / 5
244
+ older_data_quality_rate = sum(self.data_quality_window[:-5]) / max(1, len(self.data_quality_window) - 5)
245
 
246
+ if recent_data_quality_rate < older_data_quality_rate - 0.1:
247
+ self.current_session.data_quality_trend = "improving"
248
+ elif recent_data_quality_rate > older_data_quality_rate + 0.1:
249
+ self.current_session.data_quality_trend = "worsening"
250
  else:
251
+ self.current_session.data_quality_trend = "stable"
252
 
253
  def get_real_time_metrics(self) -> Dict[str, Any]:
254
  """Get current real-time metrics."""
 
257
  "episodes_completed": self.current_session.total_episodes,
258
  "total_steps": self.current_session.total_steps,
259
  "overall_accuracy": self.current_session.overall_accuracy,
260
+ "overall_data_quality_rate": self.current_session.overall_data_quality_rate,
261
  "average_reward": self.current_session.average_reward,
262
  "reward_trend": self.current_session.reward_trend,
263
+ "data_quality_trend": self.current_session.data_quality_trend,
264
  "recent_reward_avg": sum(self.reward_window) / max(1, len(self.reward_window)),
265
+ "recent_data_quality_rate": sum(self.data_quality_window) / max(1, len(self.data_quality_window)),
266
  }
267
 
268
  def get_training_curve_data(self) -> Dict[str, List[Any]]:
269
  """Get data for plotting training curves."""
270
  episode_rewards = [ep.average_reward for ep in self.current_session.episode_metrics]
271
+ data_quality_rates = [ep.data_quality_rate for ep in self.current_session.episode_metrics]
272
  accuracies = [ep.accuracy for ep in self.current_session.episode_metrics]
273
  skill_ratings = self.current_session.skill_rating_progress
274
 
 
282
  "episodes": list(range(1, len(episode_rewards) + 1)),
283
  "rewards": episode_rewards,
284
  "rewards_smooth": moving_average(episode_rewards),
285
+ "data_quality_rates": data_quality_rates,
286
+ "data_quality_rates_smooth": moving_average(data_quality_rates),
287
  "accuracies": accuracies,
288
  "skill_ratings": skill_ratings,
289
  }
290
 
291
+ def get_data_quality_heatmap_data(self) -> Dict[str, Any]:
292
+ """Get data for data_quality heatmap visualization."""
293
+ # Group by difficulty and data_quality type
294
  heatmap_data = {}
295
 
296
  for step in self.current_session.step_metrics:
 
298
  if difficulty not in heatmap_data:
299
  heatmap_data[difficulty] = {
300
  "total": 0,
301
+ "data_qualitys": 0,
302
  "by_type": {}
303
  }
304
 
305
  heatmap_data[difficulty]["total"] += 1
306
+ if step.is_data_quality:
307
+ heatmap_data[difficulty]["data_qualitys"] += 1
308
 
309
  # Calculate rates
310
  for difficulty in heatmap_data:
311
  total = heatmap_data[difficulty]["total"]
312
+ cleancs = heatmap_data[difficulty]["data_qualitys"]
313
+ heatmap_data[difficulty]["rate"] = cleancs / max(1, total)
314
 
315
  return heatmap_data
316
 
 
324
  "correctness": [],
325
  "grounding": [],
326
  "calibration": [],
327
+ "data_quality_score": [],
328
  }
329
 
330
  for step in self.current_session.step_metrics:
331
  components["correctness"].append(step.correctness)
332
  components["grounding"].append(step.grounding)
333
  components["calibration"].append(step.calibration)
334
+ components["data_quality_score"].append(step.data_quality_score)
335
 
336
  # Calculate statistics
337
  analysis = {}
 
366
  "episode_id": ep.episode_id,
367
  "total_steps": ep.total_steps,
368
  "average_reward": ep.average_reward,
369
+ "data_quality_rate": ep.data_quality_rate,
370
  "accuracy": ep.accuracy,
371
  "duration": ep.duration,
372
  }
373
  for ep in self.current_session.episode_metrics
374
  ],
375
  "training_curves": self.get_training_curve_data(),
376
+ "heatmap_data": self.get_data_quality_heatmap_data(),
377
  "reward_analysis": self.get_reward_breakdown_analysis(),
378
  }
379
 
 
390
 
391
  with open(filepath, 'w', encoding='utf-8') as f:
392
  # Header
393
+ f.write("step,episode_id,reward,correctness,grounding,calibration,data_quality_score,is_data_quality,confidence,difficulty,timestamp\n")
394
 
395
  # Data
396
  for step in self.current_session.step_metrics:
397
  f.write(f"{step.step},{step.episode_id},{step.reward},{step.correctness},{step.grounding},"
398
+ f"{step.calibration},{step.data_quality_score},{int(step.is_data_quality)},"
399
  f"{step.confidence},{step.difficulty},{step.timestamp}\n")
400
 
401
  logger.info(f"Exported CSV to {filepath}")
 
407
 
408
  report = f"""
409
  ╔══════════════════════════════════════════════════════════╗
410
+ β•‘ DataQualityGuard-Env Training Summary β•‘
411
  ╠══════════════════════════════════════════════════════════╣
412
 
413
  Session: {self.current_session.session_id}
 
419
  ────────────────────────────────────────────────────────────
420
  Overall Accuracy: {metrics['overall_accuracy']:.1%}
421
  Average Reward: {metrics['average_reward']:.3f}
422
+ DataQuality Rate: {metrics['overall_data_quality_rate']:.1%}
423
 
424
  ───────���────────────────────────────────────────────────────
425
  TREND ANALYSIS
426
  ────────────────────────────────────────────────────────────
427
  Reward Trend: {metrics['reward_trend'].upper()}
428
+ DataQuality Trend: {metrics['data_quality_trend'].upper()}
429
  Recent Reward Avg: {metrics['recent_reward_avg']:.3f}
430
+ Recent DataQuality Rate: {metrics['recent_data_quality_rate']:.1%}
431
 
432
  ────────────────────────────────────────────────────────────
433
  INTERPRETATION
 
442
  else:
443
  report += "β†’ Model performance is STABLE\n"
444
 
445
+ if metrics['data_quality_trend'] == "improving":
446
+ report += "βœ“ DataQuality rate is DECREASING\n"
447
+ elif metrics['data_quality_trend'] == "worsening":
448
+ report += "⚠ DataQuality rate is INCREASING - review training data\n"
449
  else:
450
+ report += "β†’ DataQuality rate is STABLE\n"
451
 
452
  if metrics['overall_accuracy'] > 0.8:
453
  report += "\nβ˜… EXCELLENT: Model is performing at expert level\n"
 
505
  "line": {"dash": "dash"},
506
  },
507
  {
508
+ "name": "DataQuality Rate",
509
  "type": "scatter",
510
  "x": curve_data["episodes"],
511
+ "y": curve_data["data_quality_rates"],
512
  "mode": "lines+markers",
513
  "yaxis": "y2",
514
  },
 
526
  "xaxis": {"title": "Episode"},
527
  "yaxis": {"title": "Reward / Accuracy"},
528
  "yaxis2": {
529
+ "title": "DataQuality Rate",
530
  "overlaying": "y",
531
  "side": "right",
532
  },
533
  }
534
  }
535
 
536
+ def get_data_quality_type_distribution(self) -> Dict[str, Any]:
537
+ """Get data_quality type distribution for pie chart."""
538
  type_counts = {}
539
 
540
  for step in self.tracker.current_session.step_metrics:
541
+ if step.is_data_quality:
542
  # In a full implementation, track specific types
543
+ type_key = "data_quality"
544
  type_counts[type_key] = type_counts.get(type_key, 0) + 1
545
 
546
  return {
 
550
 
551
  def get_difficulty_performance_comparison(self) -> Dict[str, Any]:
552
  """Get performance comparison across difficulties."""
553
+ heatmap_data = self.tracker.get_data_quality_heatmap_data()
554
 
555
  difficulties = list(heatmap_data.keys())
556
  rates = [heatmap_data[d]["rate"] for d in difficulties]
 
558
 
559
  return {
560
  "difficulties": difficulties,
561
+ "data_quality_rates": rates,
562
  "sample_sizes": totals,
563
  }
564
 
server/tasks.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- HallucinationGuard-Env β€” Task Registry v4.0
3
 
4
  Defines the 3 required OpenEnv tasks, each with:
5
  - A unique task_id and human description
@@ -11,7 +11,7 @@ Task hierarchy
11
  --------------
12
  task_1_factual_grounding BEGINNER SQuAD, BoolQ, OpenBookQA, ARC
13
  task_2_multi_hop_synthesis INTERMEDIATE HotpotQA, CoQA, NQ-Open, MS-MARCO
14
- task_3_adversarial_resistance ADVANCED HaluEval, TruthfulQA, FEVER,
15
  Climate-FEVER, Adversarial-QA
16
  """
17
 
@@ -73,7 +73,7 @@ class TaskDefinition:
73
  action_schema: Dict[str, Any]
74
 
75
  # Scoring thresholds used by the task grader
76
- hallucination_penalty_weight: float = 0.25
77
  correctness_weight: float = 0.40
78
  grounding_weight: float = 0.20
79
  calibration_weight: float = 0.15
@@ -93,7 +93,7 @@ class TaskDefinition:
93
  "correctness_weight": self.correctness_weight,
94
  "grounding_weight": self.grounding_weight,
95
  "calibration_weight": self.calibration_weight,
96
- "hallucination_penalty_weight": self.hallucination_penalty_weight,
97
  "range": [0.0, 1.0],
98
  },
99
  "scoring_notes": self.scoring_notes,
@@ -117,10 +117,10 @@ TASK_1 = TaskDefinition(
117
  correctness_weight=0.45,
118
  grounding_weight=0.25,
119
  calibration_weight=0.10,
120
- hallucination_penalty_weight=0.20,
121
  scoring_notes=(
122
  "Scored 0.0–1.0. Full marks require: correct answer, quote from context, "
123
- "appropriate confidence. Hallucination causes a hard penalty of up to -0.4 "
124
  "applied after the weighted sum. Partial credit awarded for near-correct answers."
125
  ),
126
  )
@@ -142,11 +142,11 @@ TASK_2 = TaskDefinition(
142
  correctness_weight=0.40,
143
  grounding_weight=0.25,
144
  calibration_weight=0.10,
145
- hallucination_penalty_weight=0.25,
146
  scoring_notes=(
147
  "Scored 0.0–1.0. Answers must integrate evidence from multiple context spans. "
148
  "Fabricating a 'bridge' fact that is not in the context is penalised as "
149
- "hallucination even if the final answer happens to be correct. "
150
  "ROUGE-L and BERTScore contribute to correctness assessment."
151
  ),
152
  )
@@ -154,25 +154,25 @@ TASK_2 = TaskDefinition(
154
  # ── Task 3 β€” Adversarial Resistance (ADVANCED) ────────────────────────────────
155
  TASK_3 = TaskDefinition(
156
  task_id="task_3_adversarial_resistance",
157
- name="Adversarial Hallucination Resistance",
158
  difficulty="advanced",
159
  description=(
160
- "Resist adversarially-crafted prompts designed to elicit hallucinations. "
161
- "Questions come from HaluEval, TruthfulQA, FEVER, Climate-FEVER, and "
162
  "AdversarialQA β€” datasets built specifically to expose overconfident or "
163
  "fabricated responses. Many questions have misleading preambles or are "
164
  "unanswerable from the given context. The agent must refuse to answer, "
165
  "flag uncertainty, or correctly debunk false premises."
166
  ),
167
- datasets=["halueval", "truthful_qa", "fever", "climate_fever", "adversarial_qa"],
168
  action_schema=ACTION_SCHEMA,
169
  correctness_weight=0.30,
170
  grounding_weight=0.20,
171
  calibration_weight=0.20,
172
- hallucination_penalty_weight=0.30,
173
  scoring_notes=(
174
  "Scored 0.0–1.0. The hardest task: adversarial questions specifically target "
175
- "common hallucination failure modes. High calibration is rewarded β€” correctly "
176
  "expressing low confidence on unanswerable questions scores up to 0.6. "
177
  "A confident wrong answer on an adversarial question can score as low as 0.0. "
178
  "Frontier models (GPT-4o, Claude 3.5) typically score 0.55–0.75 on this task."
@@ -242,12 +242,12 @@ def compute_task_score(
242
  avg_correctness = _avg("correctness")
243
  avg_grounding = _avg("grounding")
244
  avg_calibration = _avg("calibration")
245
- avg_hallucination = _avg("hallucination_score")
246
- hallucination_rate = sum(1 for i in step_infos if i.get("is_hallucination")) / n
247
 
248
- # Primary score = mean per-step reward minus hallucination penalty
249
- hallucination_penalty = task.hallucination_penalty_weight * avg_hallucination
250
- base_score = max(0.0, avg_step_reward - hallucination_penalty)
251
 
252
  # Small completion bonus for finishing all steps
253
  completion_bonus = 0.02 if n >= 5 else 0.0
@@ -256,7 +256,7 @@ def compute_task_score(
256
 
257
  # Task-3: extra penalty for overconfident wrong answers
258
  if task.task_id == TASK_3.task_id:
259
- overconfidence_penalty = max(0.0, avg_calibration - 0.7) * avg_hallucination * 0.1
260
  raw_score = max(0.0, raw_score - overconfidence_penalty)
261
 
262
  return {
@@ -265,8 +265,8 @@ def compute_task_score(
265
  "avg_correctness": round(avg_correctness, 4),
266
  "avg_grounding": round(avg_grounding, 4),
267
  "avg_calibration": round(avg_calibration, 4),
268
- "avg_hallucination": round(avg_hallucination, 4),
269
- "hallucination_rate": round(hallucination_rate, 4),
270
  "completion_bonus": round(completion_bonus, 4),
271
  "avg_step_reward": round(avg_step_reward, 4),
272
  },
 
1
  """
2
+ DataQualityGuard-Env β€” Task Registry v4.0
3
 
4
  Defines the 3 required OpenEnv tasks, each with:
5
  - A unique task_id and human description
 
11
  --------------
12
  task_1_factual_grounding BEGINNER SQuAD, BoolQ, OpenBookQA, ARC
13
  task_2_multi_hop_synthesis INTERMEDIATE HotpotQA, CoQA, NQ-Open, MS-MARCO
14
+ task_3_adversarial_resistance ADVANCED DataQualityEval, TruthfulQA, FEVER,
15
  Climate-FEVER, Adversarial-QA
16
  """
17
 
 
73
  action_schema: Dict[str, Any]
74
 
75
  # Scoring thresholds used by the task grader
76
+ data_quality_penalty_weight: float = 0.25
77
  correctness_weight: float = 0.40
78
  grounding_weight: float = 0.20
79
  calibration_weight: float = 0.15
 
93
  "correctness_weight": self.correctness_weight,
94
  "grounding_weight": self.grounding_weight,
95
  "calibration_weight": self.calibration_weight,
96
+ "data_quality_penalty_weight": self.data_quality_penalty_weight,
97
  "range": [0.0, 1.0],
98
  },
99
  "scoring_notes": self.scoring_notes,
 
117
  correctness_weight=0.45,
118
  grounding_weight=0.25,
119
  calibration_weight=0.10,
120
+ data_quality_penalty_weight=0.20,
121
  scoring_notes=(
122
  "Scored 0.0–1.0. Full marks require: correct answer, quote from context, "
123
+ "appropriate confidence. DataQuality causes a hard penalty of up to -0.4 "
124
  "applied after the weighted sum. Partial credit awarded for near-correct answers."
125
  ),
126
  )
 
142
  correctness_weight=0.40,
143
  grounding_weight=0.25,
144
  calibration_weight=0.10,
145
+ data_quality_penalty_weight=0.25,
146
  scoring_notes=(
147
  "Scored 0.0–1.0. Answers must integrate evidence from multiple context spans. "
148
  "Fabricating a 'bridge' fact that is not in the context is penalised as "
149
+ "data_quality even if the final answer happens to be correct. "
150
  "ROUGE-L and BERTScore contribute to correctness assessment."
151
  ),
152
  )
 
154
  # ── Task 3 β€” Adversarial Resistance (ADVANCED) ────────────────────────────────
155
  TASK_3 = TaskDefinition(
156
  task_id="task_3_adversarial_resistance",
157
+ name="Adversarial DataQuality Resistance",
158
  difficulty="advanced",
159
  description=(
160
+ "Resist adversarially-crafted prompts designed to elicit data_qualitys. "
161
+ "Questions come from DataQualityEval, TruthfulQA, FEVER, Climate-FEVER, and "
162
  "AdversarialQA β€” datasets built specifically to expose overconfident or "
163
  "fabricated responses. Many questions have misleading preambles or are "
164
  "unanswerable from the given context. The agent must refuse to answer, "
165
  "flag uncertainty, or correctly debunk false premises."
166
  ),
167
+ datasets=["data_quality_eval", "truthful_qa", "fever", "climate_fever", "adversarial_qa"],
168
  action_schema=ACTION_SCHEMA,
169
  correctness_weight=0.30,
170
  grounding_weight=0.20,
171
  calibration_weight=0.20,
172
+ data_quality_penalty_weight=0.30,
173
  scoring_notes=(
174
  "Scored 0.0–1.0. The hardest task: adversarial questions specifically target "
175
+ "common data_quality failure modes. High calibration is rewarded β€” correctly "
176
  "expressing low confidence on unanswerable questions scores up to 0.6. "
177
  "A confident wrong answer on an adversarial question can score as low as 0.0. "
178
  "Frontier models (GPT-4o, Claude 3.5) typically score 0.55–0.75 on this task."
 
242
  avg_correctness = _avg("correctness")
243
  avg_grounding = _avg("grounding")
244
  avg_calibration = _avg("calibration")
245
+ avg_data_quality = _avg("data_quality_score")
246
+ data_quality_rate = sum(1 for i in step_infos if i.get("is_data_quality")) / n
247
 
248
+ # Primary score = mean per-step reward minus data_quality penalty
249
+ data_quality_penalty = task.data_quality_penalty_weight * avg_data_quality
250
+ base_score = max(0.0, avg_step_reward - data_quality_penalty)
251
 
252
  # Small completion bonus for finishing all steps
253
  completion_bonus = 0.02 if n >= 5 else 0.0
 
256
 
257
  # Task-3: extra penalty for overconfident wrong answers
258
  if task.task_id == TASK_3.task_id:
259
+ overconfidence_penalty = max(0.0, avg_calibration - 0.7) * avg_data_quality * 0.1
260
  raw_score = max(0.0, raw_score - overconfidence_penalty)
261
 
262
  return {
 
265
  "avg_correctness": round(avg_correctness, 4),
266
  "avg_grounding": round(avg_grounding, 4),
267
  "avg_calibration": round(avg_calibration, 4),
268
+ "avg_data_quality": round(avg_data_quality, 4),
269
+ "data_quality_rate": round(data_quality_rate, 4),
270
  "completion_bonus": round(completion_bonus, 4),
271
  "avg_step_reward": round(avg_step_reward, 4),
272
  },