Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- inference.py +17 -2
inference.py
CHANGED
|
@@ -47,6 +47,14 @@ BENCHMARK = os.getenv("FORENSIC_BENCHMARK", "forensic_shell")
|
|
| 47 |
MAX_STEPS_PER_TASK = 14
|
| 48 |
SUCCESS_THRESHOLD = 0.5
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
TASK_IDS: List[str] = ["t1_login", "t2_modified", "t3_timeline"]
|
| 51 |
|
| 52 |
|
|
@@ -168,8 +176,15 @@ async def _drive_one_task(
|
|
| 168 |
if done:
|
| 169 |
break
|
| 170 |
|
| 171 |
-
|
| 172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
success = score >= SUCCESS_THRESHOLD
|
| 174 |
except Exception as e:
|
| 175 |
print(
|
|
|
|
| 47 |
MAX_STEPS_PER_TASK = 14
|
| 48 |
SUCCESS_THRESHOLD = 0.5
|
| 49 |
|
| 50 |
+
# Phase 2 grader rejects scores of exactly 0.0 or 1.0. The spec language
|
| 51 |
+
# "[0, 1]" is interpreted as the OPEN interval (0, 1) by the hackathon
|
| 52 |
+
# validator, so every reported score is clamped into [SCORE_FLOOR, SCORE_CEIL].
|
| 53 |
+
# We use 0.01 / 0.99 (not 0.001 / 0.999) so the .2f-rounded entries in the
|
| 54 |
+
# rewards=... field never collapse back to 0.00 or 1.00 either.
|
| 55 |
+
SCORE_FLOOR = 0.01
|
| 56 |
+
SCORE_CEIL = 0.99
|
| 57 |
+
|
| 58 |
TASK_IDS: List[str] = ["t1_login", "t2_modified", "t3_timeline"]
|
| 59 |
|
| 60 |
|
|
|
|
| 176 |
if done:
|
| 177 |
break
|
| 178 |
|
| 179 |
+
# Ensure we always have at least one reward entry, and clamp
|
| 180 |
+
# the terminal reward into the strict open interval (0, 1)
|
| 181 |
+
# required by the Phase 2 validator. The terminal reward IS
|
| 182 |
+
# the task score, so clamping both the list entry and the
|
| 183 |
+
# reported score keeps them consistent.
|
| 184 |
+
if not rewards:
|
| 185 |
+
rewards.append(SCORE_FLOOR)
|
| 186 |
+
rewards[-1] = max(SCORE_FLOOR, min(SCORE_CEIL, rewards[-1]))
|
| 187 |
+
score = rewards[-1]
|
| 188 |
success = score >= SUCCESS_THRESHOLD
|
| 189 |
except Exception as e:
|
| 190 |
print(
|