yashppawar committed on
Commit
62567eb
·
verified ·
1 Parent(s): 8c6d68f

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. inference.py +17 -2
inference.py CHANGED
@@ -47,6 +47,14 @@ BENCHMARK = os.getenv("FORENSIC_BENCHMARK", "forensic_shell")
47
  MAX_STEPS_PER_TASK = 14
48
  SUCCESS_THRESHOLD = 0.5
49
 
 
 
 
 
 
 
 
 
50
  TASK_IDS: List[str] = ["t1_login", "t2_modified", "t3_timeline"]
51
 
52
 
@@ -168,8 +176,15 @@ async def _drive_one_task(
168
  if done:
169
  break
170
 
171
- score = rewards[-1] if rewards else 0.0
172
- score = max(0.0, min(1.0, score))
 
 
 
 
 
 
 
173
  success = score >= SUCCESS_THRESHOLD
174
  except Exception as e:
175
  print(
 
47
  MAX_STEPS_PER_TASK = 14
48
  SUCCESS_THRESHOLD = 0.5
49
 
50
+ # Phase 2 grader rejects scores of exactly 0.0 or 1.0. The spec language
51
+ # "[0, 1]" is interpreted as the OPEN interval (0, 1) by the hackathon
52
+ # validator, so every reported score is clamped into [SCORE_FLOOR, SCORE_CEIL].
53
+ # We use 0.01 / 0.99 (not 0.001 / 0.999) so the .2f-rounded entries in the
54
+ # rewards=... field never collapse back to 0.00 or 1.00 either.
55
+ SCORE_FLOOR = 0.01
56
+ SCORE_CEIL = 0.99
57
+
58
  TASK_IDS: List[str] = ["t1_login", "t2_modified", "t3_timeline"]
59
 
60
 
 
176
  if done:
177
  break
178
 
179
+ # Ensure we always have at least one reward entry, and clamp
180
+ # the terminal reward into the strict open interval (0, 1)
181
+ # required by the Phase 2 validator. The terminal reward IS
182
+ # the task score, so clamping both the list entry and the
183
+ # reported score keeps them consistent.
184
+ if not rewards:
185
+ rewards.append(SCORE_FLOOR)
186
+ rewards[-1] = max(SCORE_FLOOR, min(SCORE_CEIL, rewards[-1]))
187
+ score = rewards[-1]
188
  success = score >= SUCCESS_THRESHOLD
189
  except Exception as e:
190
  print(