Spaces:

kumar6591
/

data-quality-env

Sleeping

App Files Files Community

Hemanth Kunta commited on Apr 8

Commit

cb70147

1 Parent(s): 94595e2

final fix for phase 2 stdout score range

Browse files

Files changed (1) hide show

inference.py +42 -18

inference.py CHANGED Viewed

@@ -29,6 +29,7 @@ MAX_TOKENS = 1000
 MAX_AUDIT_STEPS = 9
 FIX_STEPS = 3
 WALL_LIMIT = 15 * 60
 SYSTEM_PROMPT = """You are a SQL Data Auditor.
@@ -110,6 +111,24 @@ def emit_block(kind: str, **fields) -> None:
     print(" ".join(parts), flush=True)
 def parse_action(text: str) -> dict:
     raw = (text or "").strip()
     raw = raw.replace("```json", "").replace("```", "").strip()
@@ -476,24 +495,26 @@ def run_task_hybrid(task_id: int, global_start: float) -> float:
     print(f"Tables: {list(obs['tables'].keys())} | Credits: {obs['query_credits_remaining']}")
     if time.time() - global_start > WALL_LIMIT - 60:
-        return 0.0
     evidence, base_report = build_probe_report(task_id)
     final_report = llm_refine_report(task_id, obs, evidence, base_report)
     final_report = normalize_report(final_report)
     out = submit(final_report)
-    score = float(out.get("reward", {}).get("value", 0.0))
     emit_block("STEP", task=task_id, step=1, reward=score, action="submit_report")
     # Optional harmless fix step for bonus phase behavior parity.
     try:
         fix = call_env("step", {"action": {"action_type": "fix_sql", "sql": "UPDATE orders SET order_total = order_total WHERE 1=0"}})
-        score = float(fix.get("reward", {}).get("value", score))
         emit_block("STEP", task=task_id, step=2, reward=score, action="fix_sql")
     except Exception:
         pass
-    print(f"  Episode done. Final score: {score:.3f}")
     emit_block("END", task=task_id, score=score, steps=2)
     return score
@@ -598,17 +619,17 @@ def run_task_heuristic(task_id: int) -> float:
         }
     out = submit(report)
-    score = float(out.get("reward", {}).get("value", 0.0))
-    print(f"  audit score: {score:.3f}")
     emit_block("STEP", task=task_id, step=1, reward=score, action="submit_report")
     # One no-op fix to demonstrate fix phase behavior.
     try:
         fix = call_env("step", {"action": {"action_type": "fix_sql", "sql": "UPDATE orders SET order_total = order_total WHERE 1=0"}})
-        score = float(fix.get("reward", {}).get("value", score))
         emit_block("STEP", task=task_id, step=2, reward=score, action="fix_sql")
     except Exception:
         pass
-    print(f"  final score: {score:.3f}")
     emit_block("END", task=task_id, score=score, steps=2)
     return score
@@ -623,7 +644,7 @@ def run_task(task_id: int, global_start: float) -> float:
     print(f"Tables: {list(obs['tables'].keys())} | Credits: {obs['query_credits_remaining']}")
     history = []
-    final_score = 0.0
     total_steps = MAX_AUDIT_STEPS + FIX_STEPS
     for step in range(1, total_steps + 1):
@@ -682,11 +703,11 @@ Return next action JSON only."""
         reward = step_result.get("reward", {})
         history.append({"step": step, "action": action.get("action_type", "unknown")})
-        final_score = float(reward.get("value", final_score))
         emit_block("STEP", task=task_id, step=step, reward=final_score, action=action.get("action_type", "unknown"))
         if reward.get("done"):
-            print(f"  Episode done. Final score: {final_score:.3f}")
             emit_block("END", task=task_id, score=final_score, steps=step)
             return final_score
@@ -704,7 +725,7 @@ Return next action JSON only."""
     }
     try:
         result = call_env("step", {"action": empty_report})
-        final_score = float(result.get("reward", {}).get("value", final_score))
     except Exception:
         pass
     emit_block("END", task=task_id, score=final_score, steps=total_steps)
@@ -741,20 +762,23 @@ def main():
     for task_id in [1, 2, 3, 4]:
         if time.time() - global_start > WALL_LIMIT - 120:
-            scores[f"task_{task_id}"] = 0.0
             continue
         if use_heuristic:
-            scores[f"task_{task_id}"] = run_task_heuristic(task_id)
         else:
-            scores[f"task_{task_id}"] = run_task_hybrid(task_id, global_start)
     print("\n" + "=" * 60)
     print("BASELINE RESULTS (seed=42)")
     print("=" * 60)
     for k, v in scores.items():
-        print(f"  {k}: {v:.3f}")
-    mean = sum(scores.values()) / max(len(scores), 1)
-    print(f"  mean: {mean:.3f}")
     print(f"  total wall time: {(time.time() - global_start) / 60:.1f} min")
     if not use_heuristic and all(v <= 0.0 for v in scores.values()):
         print("WARNING: LLM mode ran but all scores are 0.0. Check model connectivity and prompt behavior.")

 MAX_AUDIT_STEPS = 9
 FIX_STEPS = 3
 WALL_LIMIT = 15 * 60
+SCORE_EPS = 1e-6
 SYSTEM_PROMPT = """You are a SQL Data Auditor.
     print(" ".join(parts), flush=True)
+def strict_score(value: float | int | str | None, default: float = SCORE_EPS) -> float:
+    """Clamp score into strict open interval (0,1) for validator compatibility."""
+    try:
+        v = float(value)
+    except Exception:
+        v = float(default)
+    if not (v > 0.0):
+        return SCORE_EPS
+    if not (v < 1.0):
+        return 1.0 - SCORE_EPS
+    return v
+def score_text(value: float | int | str | None, default: float = SCORE_EPS) -> str:
+    """Stable string formatting for printed score lines without rounding to 1.000."""
+    return f"{strict_score(value, default=default):.6f}"
 def parse_action(text: str) -> dict:
     raw = (text or "").strip()
     raw = raw.replace("```json", "").replace("```", "").strip()
     print(f"Tables: {list(obs['tables'].keys())} | Credits: {obs['query_credits_remaining']}")
     if time.time() - global_start > WALL_LIMIT - 60:
+        score = strict_score(0.0)
+        emit_block("END", task=task_id, score=score, steps=0)
+        return score
     evidence, base_report = build_probe_report(task_id)
     final_report = llm_refine_report(task_id, obs, evidence, base_report)
     final_report = normalize_report(final_report)
     out = submit(final_report)
+    score = strict_score(out.get("reward", {}).get("value", 0.0))
     emit_block("STEP", task=task_id, step=1, reward=score, action="submit_report")
     # Optional harmless fix step for bonus phase behavior parity.
     try:
         fix = call_env("step", {"action": {"action_type": "fix_sql", "sql": "UPDATE orders SET order_total = order_total WHERE 1=0"}})
+        score = strict_score(fix.get("reward", {}).get("value", score), default=score)
         emit_block("STEP", task=task_id, step=2, reward=score, action="fix_sql")
     except Exception:
         pass
+    print(f"  Episode done. Final score: {score_text(score, default=score)}")
     emit_block("END", task=task_id, score=score, steps=2)
     return score
         }
     out = submit(report)
+    score = strict_score(out.get("reward", {}).get("value", 0.0))
+    print(f"  audit score: {score_text(score, default=score)}")
     emit_block("STEP", task=task_id, step=1, reward=score, action="submit_report")
     # One no-op fix to demonstrate fix phase behavior.
     try:
         fix = call_env("step", {"action": {"action_type": "fix_sql", "sql": "UPDATE orders SET order_total = order_total WHERE 1=0"}})
+        score = strict_score(fix.get("reward", {}).get("value", score), default=score)
         emit_block("STEP", task=task_id, step=2, reward=score, action="fix_sql")
     except Exception:
         pass
+    print(f"  final score: {score_text(score, default=score)}")
     emit_block("END", task=task_id, score=score, steps=2)
     return score
     print(f"Tables: {list(obs['tables'].keys())} | Credits: {obs['query_credits_remaining']}")
     history = []
+    final_score = strict_score(0.0)
     total_steps = MAX_AUDIT_STEPS + FIX_STEPS
     for step in range(1, total_steps + 1):
         reward = step_result.get("reward", {})
         history.append({"step": step, "action": action.get("action_type", "unknown")})
+        final_score = strict_score(reward.get("value", final_score), default=final_score)
         emit_block("STEP", task=task_id, step=step, reward=final_score, action=action.get("action_type", "unknown"))
         if reward.get("done"):
+            print(f"  Episode done. Final score: {score_text(final_score, default=final_score)}")
             emit_block("END", task=task_id, score=final_score, steps=step)
             return final_score
     }
     try:
         result = call_env("step", {"action": empty_report})
+        final_score = strict_score(result.get("reward", {}).get("value", final_score), default=final_score)
     except Exception:
         pass
     emit_block("END", task=task_id, score=final_score, steps=total_steps)
     for task_id in [1, 2, 3, 4]:
         if time.time() - global_start > WALL_LIMIT - 120:
+            score = strict_score(0.0)
+            emit_block("START", task=task_id, mode="skipped", seed=SEED)
+            emit_block("END", task=task_id, score=score, steps=0)
+            scores[f"task_{task_id}"] = score
             continue
         if use_heuristic:
+            scores[f"task_{task_id}"] = strict_score(run_task_heuristic(task_id))
         else:
+            scores[f"task_{task_id}"] = strict_score(run_task_hybrid(task_id, global_start))
     print("\n" + "=" * 60)
     print("BASELINE RESULTS (seed=42)")
     print("=" * 60)
     for k, v in scores.items():
+        print(f"  {k}: {score_text(v, default=v)}")
+    mean = strict_score(sum(scores.values()) / max(len(scores), 1))
+    print(f"  mean: {score_text(mean, default=mean)}")
     print(f"  total wall time: {(time.time() - global_start) / 60:.1f} min")
     if not use_heuristic and all(v <= 0.0 for v in scores.values()):
         print("WARNING: LLM mode ran but all scores are 0.0. Check model connectivity and prompt behavior.")