Spaces:
Sleeping
Sleeping
Update api/server.py
Browse files- api/server.py +130 -43
api/server.py
CHANGED
|
@@ -19,7 +19,7 @@ from env.models import (
|
|
| 19 |
HealthResponse, TaskInfo, ProgressResponse
|
| 20 |
)
|
| 21 |
from env.tasks import task_manager, ACTION_SCHEMA
|
| 22 |
-
from env.graders import grade
|
| 23 |
|
| 24 |
|
| 25 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -143,8 +143,8 @@ async def step(action: Action):
|
|
| 143 |
return StepResponse(
|
| 144 |
observation = environment._build_observation(),
|
| 145 |
reward = Reward(
|
| 146 |
-
score = 0.
|
| 147 |
-
breakdown = {"validation_error": 0.
|
| 148 |
feedback = f"Malformed action: {str(e)}"
|
| 149 |
),
|
| 150 |
done = False,
|
|
@@ -183,67 +183,154 @@ async def tasks():
|
|
| 183 |
|
| 184 |
|
| 185 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 186 |
-
# 6. /grader β POST
|
| 187 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 188 |
|
| 189 |
@app.post("/grader", response_model=GraderResponse, tags=["Grading"])
|
| 190 |
async def grader(request: GraderRequest):
|
| 191 |
"""
|
| 192 |
-
Grades a
|
| 193 |
-
|
| 194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
"""
|
| 196 |
try:
|
| 197 |
if request.action is None:
|
| 198 |
return GraderResponse(
|
| 199 |
-
score = 0.
|
| 200 |
feedback = "No action provided for grading.",
|
| 201 |
breakdown = {"error": "null_action"}
|
| 202 |
)
|
| 203 |
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
current = perf_history[-1] if perf_history else 0.0
|
| 211 |
-
max_possible = max(1.0, 100.0 - baseline)
|
| 212 |
-
|
| 213 |
-
perf_improvement = (current - baseline) / max_possible
|
| 214 |
-
step_efficiency = 1.0 - (ep_state.step_count / max(1, 50))
|
| 215 |
-
score = round(
|
| 216 |
-
(perf_improvement * 0.60) + (step_efficiency * 0.20) + 0.10, 4
|
| 217 |
-
)
|
| 218 |
-
score = max(0.0, min(1.0, score))
|
| 219 |
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
}
|
| 232 |
-
)
|
| 233 |
|
| 234 |
-
#
|
| 235 |
-
score, breakdown, feedback = grade(request.action,
|
| 236 |
-
score = max(0.
|
| 237 |
return GraderResponse(score=score, feedback=feedback, breakdown=breakdown)
|
| 238 |
|
| 239 |
except Exception as e:
|
| 240 |
return GraderResponse(
|
| 241 |
-
score = 0.
|
| 242 |
feedback = f"Grader error: {str(e)}",
|
| 243 |
breakdown = {"error": str(e)}
|
| 244 |
)
|
| 245 |
|
| 246 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 248 |
# 7. /baseline — POST
|
| 249 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -280,7 +367,7 @@ async def baseline():
|
|
| 280 |
|
| 281 |
|
| 282 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 283 |
-
# 8. /progress β GET (Round 2
|
| 284 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 285 |
|
| 286 |
@app.get("/progress", response_model=ProgressResponse, tags=["Training"])
|
|
@@ -291,7 +378,7 @@ async def progress():
|
|
| 291 |
Shows improvement from baseline to current score.
|
| 292 |
"""
|
| 293 |
ep_state = environment.state()
|
| 294 |
-
ac = ep_state.action_counts
|
| 295 |
perf_history = ac.get("_perf_history", [])
|
| 296 |
milestones = ac.get("_milestones", [])
|
| 297 |
baseline = ac.get("_baseline_score", 0.0)
|
|
@@ -331,4 +418,4 @@ async def root():
|
|
| 331 |
"tasks_count": 30,
|
| 332 |
"max_steps": 50,
|
| 333 |
"themes": ["Long-Horizon Planning", "World Modeling", "Self-Improvement", "Wildcard"],
|
| 334 |
-
}
|
|
|
|
| 19 |
HealthResponse, TaskInfo, ProgressResponse
|
| 20 |
)
|
| 21 |
from env.tasks import task_manager, ACTION_SCHEMA
|
| 22 |
+
from env.graders import grade, grade_db_action, _is_scenario_task, _get_scenario
|
| 23 |
|
| 24 |
|
| 25 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 143 |
return StepResponse(
|
| 144 |
observation = environment._build_observation(),
|
| 145 |
reward = Reward(
|
| 146 |
+
score = 0.001,
|
| 147 |
+
breakdown = {"validation_error": 0.001},
|
| 148 |
feedback = f"Malformed action: {str(e)}"
|
| 149 |
),
|
| 150 |
done = False,
|
|
|
|
| 183 |
|
| 184 |
|
| 185 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 186 |
+
# 6. /grader — POST (FIXED)
|
| 187 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 188 |
|
| 189 |
@app.post("/grader", response_model=GraderResponse, tags=["Grading"])
|
| 190 |
async def grader(request: GraderRequest):
|
| 191 |
"""
|
| 192 |
+
Grades an action for a given task_id. STATELESS — does not change episode state.
|
| 193 |
+
|
| 194 |
+
Routing:
|
| 195 |
+
Round 2 scenario IDs (easy_s001, medium_s002, hard_s003):
|
| 196 |
+
- submit_report → computes score from current DB performance delta
|
| 197 |
+
- all other types → grade_db_action() scores action quality vs scenario
|
| 198 |
+
|
| 199 |
+
Round 1 task IDs (easy_001, medium_001, hard_001):
|
| 200 |
+
→ grade() → grade_easy/medium/hard() (original Round 1 graders)
|
| 201 |
+
|
| 202 |
+
Score is ALWAYS strictly between 0.001 and 0.999.
|
| 203 |
+
NEVER crashes — all exceptions are caught and returned as a low fallback score (0.001 here; 0.10 from the submit_report helper).
|
| 204 |
+
|
| 205 |
+
FIXES applied vs original:
|
| 206 |
+
- Round 2 non-terminal actions now route to grade_db_action() instead of
|
| 207 |
+
grade_easy() which was looking for "fixed_query" in Round 2 payloads
|
| 208 |
+
and returning 0.001 for every create_index / analyze_indexes / inspect_query
|
| 209 |
+
- submit_report score reads episode data from environment.state().action_counts
|
| 210 |
+
and falls back to report-quality-only scoring when that data is empty or stale
|
| 211 |
"""
|
| 212 |
try:
|
| 213 |
if request.action is None:
|
| 214 |
return GraderResponse(
|
| 215 |
+
score = 0.001,
|
| 216 |
feedback = "No action provided for grading.",
|
| 217 |
breakdown = {"error": "null_action"}
|
| 218 |
)
|
| 219 |
|
| 220 |
+
task_id = request.task_id or ""
|
| 221 |
+
action_type = (
|
| 222 |
+
request.action.action_type.value
|
| 223 |
+
if hasattr(request.action.action_type, "value")
|
| 224 |
+
else str(request.action.action_type)
|
| 225 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
+
# ── ROUND 2: DB ENGINEERING SCENARIO ─────────────────────
|
| 228 |
+
if _is_scenario_task(task_id):
|
| 229 |
+
|
| 230 |
+
# submit_report: use live DB state from environment simulator
|
| 231 |
+
if action_type == "submit_report":
|
| 232 |
+
return _grade_submit_report(request, task_id)
|
| 233 |
+
|
| 234 |
+
# All other Round 2 actions: stateless scenario-aware grading
|
| 235 |
+
score, breakdown, feedback = grade_db_action(request.action, task_id)
|
| 236 |
+
score = max(0.001, min(0.999, score))
|
| 237 |
+
return GraderResponse(score=score, feedback=feedback, breakdown=breakdown)
|
|
|
|
|
|
|
| 238 |
|
| 239 |
+
# ── ROUND 1: SQL DEBUGGING TASK ───────────────────────────
|
| 240 |
+
score, breakdown, feedback = grade(request.action, task_id)
|
| 241 |
+
score = max(0.001, min(0.999, score))
|
| 242 |
return GraderResponse(score=score, feedback=feedback, breakdown=breakdown)
|
| 243 |
|
| 244 |
except Exception as e:
|
| 245 |
return GraderResponse(
|
| 246 |
+
score = 0.001,
|
| 247 |
feedback = f"Grader error: {str(e)}",
|
| 248 |
breakdown = {"error": str(e)}
|
| 249 |
)
|
| 250 |
|
| 251 |
|
| 252 |
+
def _grade_submit_report(request: GraderRequest, task_id: str) -> GraderResponse:
|
| 253 |
+
"""
|
| 254 |
+
Grade a submit_report action for a Round 2 scenario.
|
| 255 |
+
|
| 256 |
+
Score components:
|
| 257 |
+
60% — performance improvement (baseline → current)
|
| 258 |
+
20% — step efficiency (fewer steps = higher bonus)
|
| 259 |
+
10% — base credit for submitting
|
| 260 |
+
10% — report summary quality
|
| 261 |
+
|
| 262 |
+
Falls back gracefully if DB simulator state is unavailable.
|
| 263 |
+
"""
|
| 264 |
+
try:
|
| 265 |
+
ep_state = environment.state()
|
| 266 |
+
|
| 267 |
+
# Get performance data from environment state
|
| 268 |
+
# Use action_counts as the store (set by environment.py during steps)
|
| 269 |
+
ac = ep_state.action_counts or {}
|
| 270 |
+
perf_history = ac.get("_perf_history", [])
|
| 271 |
+
baseline = float(ac.get("_baseline_score", 0.0))
|
| 272 |
+
current = float(perf_history[-1]) if perf_history else baseline
|
| 273 |
+
steps_used = ep_state.step_count
|
| 274 |
+
max_steps = 50 # Round 2 default
|
| 275 |
+
|
| 276 |
+
# If no perf history (called before reset, or env in wrong state):
|
| 277 |
+
# fall back to scenario-based quality score
|
| 278 |
+
if not perf_history or baseline == 0.0:
|
| 279 |
+
scenario = _get_scenario(task_id)
|
| 280 |
+
if scenario:
|
| 281 |
+
baseline = float(scenario.get("performance_score_baseline", 0.0))
|
| 282 |
+
target = float(scenario.get("target_score", 85.0))
|
| 283 |
+
# Score based on report quality only
|
| 284 |
+
summary = str((request.action.payload or {}).get("summary", ""))
|
| 285 |
+
base_score = 0.15 + min(len(summary) / 400, 0.25)
|
| 286 |
+
return GraderResponse(
|
| 287 |
+
score = round(max(0.001, min(0.999, base_score)), 4),
|
| 288 |
+
feedback = (
|
| 289 |
+
f"Report graded on quality only (episode state unavailable). "
|
| 290 |
+
f"Run a full episode via /reset then /step to get performance-based score."
|
| 291 |
+
),
|
| 292 |
+
breakdown = {"report_quality": round(base_score, 4), "note": "no_episode_state"}
|
| 293 |
+
)
|
| 294 |
+
|
| 295 |
+
max_possible = max(1.0, 100.0 - baseline)
|
| 296 |
+
perf_improvement = max(0.0, (current - baseline) / max_possible)
|
| 297 |
+
step_efficiency = max(0.0, 1.0 - (steps_used / max(1, max_steps)))
|
| 298 |
+
summary = str((request.action.payload or {}).get("summary", ""))
|
| 299 |
+
report_quality = min(len(summary) / 300, 0.10) if summary else 0.0
|
| 300 |
+
|
| 301 |
+
raw_score = (
|
| 302 |
+
perf_improvement * 0.60
|
| 303 |
+
+ step_efficiency * 0.20
|
| 304 |
+
+ 0.10 # base credit
|
| 305 |
+
+ report_quality # up to 0.10
|
| 306 |
+
)
|
| 307 |
+
score = round(max(0.001, min(0.999, raw_score)), 4)
|
| 308 |
+
|
| 309 |
+
return GraderResponse(
|
| 310 |
+
score = score,
|
| 311 |
+
feedback = (
|
| 312 |
+
f"DB performance: {baseline:.1f} β {current:.1f} "
|
| 313 |
+
f"(improvement: {perf_improvement*100:.1f}%). "
|
| 314 |
+
f"Steps used: {steps_used}/{max_steps}. "
|
| 315 |
+
f"Efficiency: {step_efficiency*100:.1f}%."
|
| 316 |
+
),
|
| 317 |
+
breakdown = {
|
| 318 |
+
"perf_improvement": round(perf_improvement, 4),
|
| 319 |
+
"step_efficiency": round(step_efficiency, 4),
|
| 320 |
+
"base_credit": 0.10,
|
| 321 |
+
"report_quality": round(report_quality, 4),
|
| 322 |
+
}
|
| 323 |
+
)
|
| 324 |
+
|
| 325 |
+
except Exception as e:
|
| 326 |
+
# Last resort — don't return an error, return a low but non-zero score
|
| 327 |
+
return GraderResponse(
|
| 328 |
+
score = 0.10,
|
| 329 |
+
feedback = f"Submit report scored with fallback (error: {str(e)}).",
|
| 330 |
+
breakdown = {"fallback": 0.10, "error": str(e)}
|
| 331 |
+
)
|
| 332 |
+
|
| 333 |
+
|
| 334 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 335 |
# 7. /baseline — POST
|
| 336 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 367 |
|
| 368 |
|
| 369 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 370 |
+
# 8. /progress — GET (Round 2)
|
| 371 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 372 |
|
| 373 |
@app.get("/progress", response_model=ProgressResponse, tags=["Training"])
|
|
|
|
| 378 |
Shows improvement from baseline to current score.
|
| 379 |
"""
|
| 380 |
ep_state = environment.state()
|
| 381 |
+
ac = ep_state.action_counts or {}
|
| 382 |
perf_history = ac.get("_perf_history", [])
|
| 383 |
milestones = ac.get("_milestones", [])
|
| 384 |
baseline = ac.get("_baseline_score", 0.0)
|
|
|
|
| 418 |
"tasks_count": 30,
|
| 419 |
"max_steps": 50,
|
| 420 |
"themes": ["Long-Horizon Planning", "World Modeling", "Self-Improvement", "Wildcard"],
|
| 421 |
+
}
|