Spaces:

yashppawar
/

forensic-shell

Sleeping

yashppawar committed on Apr 11

Commit

62567eb

verified ·

1 Parent(s): 8c6d68f

Upload folder using huggingface_hub

Files changed (1) hide show

inference.py CHANGED Viewed

@@ -47,6 +47,14 @@ BENCHMARK = os.getenv("FORENSIC_BENCHMARK", "forensic_shell")
 MAX_STEPS_PER_TASK = 14
 SUCCESS_THRESHOLD = 0.5
 TASK_IDS: List[str] = ["t1_login", "t2_modified", "t3_timeline"]
@@ -168,8 +176,15 @@ async def _drive_one_task(
                         if done:
                             break
-                score = rewards[-1] if rewards else 0.0
-                score = max(0.0, min(1.0, score))
                 success = score >= SUCCESS_THRESHOLD
             except Exception as e:
                 print(

 MAX_STEPS_PER_TASK = 14
 SUCCESS_THRESHOLD = 0.5
+# Phase 2 grader rejects scores of exactly 0.0 or 1.0. The spec language
+# "[0, 1]" is interpreted as the OPEN interval (0, 1) by the hackathon
+# validator, so every reported score is clamped into [SCORE_FLOOR, SCORE_CEIL].
+# We use 0.01 / 0.99 (not 0.001 / 0.999) so the .2f-rounded entries in the
+# rewards=... field never collapse back to 0.00 or 1.00 either.
+SCORE_FLOOR = 0.01
+SCORE_CEIL = 0.99
 TASK_IDS: List[str] = ["t1_login", "t2_modified", "t3_timeline"]
                         if done:
                             break
+                # Ensure we always have at least one reward entry, and clamp
+                # the terminal reward into the strict open interval (0, 1)
+                # required by the Phase 2 validator. The terminal reward IS
+                # the task score, so clamping both the list entry and the
+                # reported score keeps them consistent.
+                if not rewards:
+                    rewards.append(SCORE_FLOOR)
+                rewards[-1] = max(SCORE_FLOOR, min(SCORE_CEIL, rewards[-1]))
+                score = rewards[-1]
                 success = score >= SUCCESS_THRESHOLD
             except Exception as e:
                 print(