Spaces:
Running
Running
Keep task scores strictly inside (0,1) in inference logs
Browse files - inference.py +8 -2
inference.py
CHANGED
|
@@ -37,6 +37,12 @@ TASK_CONFIGS = {
|
|
| 37 |
"medium_logic_fix": {"max_steps": 20, "success_threshold": 0.7},
|
| 38 |
"hard_multi_bug": {"max_steps": 30, "success_threshold": 0.5},
|
| 39 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
|
| 42 |
# ── Logging functions (EXACT FORMAT — DO NOT MODIFY) ────────────────────────
|
|
@@ -266,7 +272,7 @@ def run_task(
|
|
| 266 |
break
|
| 267 |
|
| 268 |
# Compute final score
|
| 269 |
-
score =
|
| 270 |
success = score >= success_threshold
|
| 271 |
|
| 272 |
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
|
@@ -310,7 +316,7 @@ def main():
|
|
| 310 |
all_results.append(result)
|
| 311 |
except Exception as e:
|
| 312 |
print(f"[DEBUG] Task {task_id} failed: {e}", flush=True)
|
| 313 |
-
log_end(success=False, steps=0, score=
|
| 314 |
|
| 315 |
# Small delay between tasks
|
| 316 |
time.sleep(2)
|
|
|
|
| 37 |
"medium_logic_fix": {"max_steps": 20, "success_threshold": 0.7},
|
| 38 |
"hard_multi_bug": {"max_steps": 30, "success_threshold": 0.5},
|
| 39 |
}
|
| 40 |
+
MIN_STRICT_SCORE = 0.001
|
| 41 |
+
MAX_STRICT_SCORE = 0.999
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def strict_score(value: float) -> float:
|
| 45 |
+
return min(MAX_STRICT_SCORE, max(MIN_STRICT_SCORE, value))
|
| 46 |
|
| 47 |
|
| 48 |
# ── Logging functions (EXACT FORMAT — DO NOT MODIFY) ────────────────────────
|
|
|
|
| 272 |
break
|
| 273 |
|
| 274 |
# Compute final score
|
| 275 |
+
score = strict_score(score)
|
| 276 |
success = score >= success_threshold
|
| 277 |
|
| 278 |
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
|
|
|
| 316 |
all_results.append(result)
|
| 317 |
except Exception as e:
|
| 318 |
print(f"[DEBUG] Task {task_id} failed: {e}", flush=True)
|
| 319 |
+
log_end(success=False, steps=0, score=MIN_STRICT_SCORE, rewards=[])
|
| 320 |
|
| 321 |
# Small delay between tasks
|
| 322 |
time.sleep(2)
|