immortalindeed committed on
Commit
cd5104a
·
1 Parent(s): fe9aa5c

Fix state machine bugs and switch to average scoring for discriminative benchmarking

Browse files
Files changed (3) hide show
  1. inference.py +3 -3
  2. server/app.py +3 -3
  3. server/router.py +19 -16
inference.py CHANGED
@@ -301,9 +301,9 @@ def run_task(client: OpenAI, task_id: str) -> float:
301
  if done:
302
  break
303
 
304
- # Best single-step reward — discriminative for multi-turn tasks
305
- best_reward = max(rewards) if rewards else 0.01
306
- score = round(min(max(best_reward, 0.01), 0.99), 4)
307
  success = score > 0.0
308
  rewards_str = ",".join(f"{r:.2f}" for r in rewards)
309
 
 
301
  if done:
302
  break
303
 
304
+ # Average reward across trajectory — discriminative for multi-turn tasks
305
+ avg_reward = sum(rewards) / max(len(rewards), 1) if rewards else 0.01
306
+ score = round(min(max(avg_reward, 0.01), 0.99), 4)
307
  success = score > 0.0
308
  rewards_str = ",".join(f"{r:.2f}" for r in rewards)
309
 
server/app.py CHANGED
@@ -528,9 +528,9 @@ def _run_single_task_inline(task_id, api_base, api_key, model_id, system_prompt)
528
  logs.append(msg)
529
  yield {'type': 'log', 'level': 'info', 'msg': msg}
530
 
531
- # Best single-step reward — same logic as inference.py
532
- best_reward = max(rewards) if rewards else 0.01
533
- score = round(min(max(best_reward, 0.01), 0.99), 4)
534
  success = score > 0.0
535
  rewards_str = ','.join(f'{r:.2f}' for r in rewards)
536
 
 
528
  logs.append(msg)
529
  yield {'type': 'log', 'level': 'info', 'msg': msg}
530
 
531
+ # Average reward across trajectory — same logic as inference.py
532
+ avg_reward = sum(rewards) / max(len(rewards), 1) if rewards else 0.01
533
+ score = round(min(max(avg_reward, 0.01), 0.99), 4)
534
  success = score > 0.0
535
  rewards_str = ','.join(f'{r:.2f}' for r in rewards)
536
 
server/router.py CHANGED
@@ -52,39 +52,42 @@ def route_step(session: SessionState, action: Dict) -> Dict:
52
  def _check_done(session: SessionState, action: Dict, reward: float, max_steps: int) -> bool:
53
  """Data-driven done condition from case definition.
54
 
55
- Three triggers (from OpenEnv tech ref Section 7.2):
56
- 1. required_sequence complete (all required action types performed)
57
- 2. reward >= completion_threshold
58
- 3. max steps reached
 
 
59
  """
60
  next_step = session.step_count + 1
61
  case = session.task_case
62
-
63
- # Mastery condition: high performance -> early exit
64
- if next_step >= 2:
65
- avg_reward = (session.reward_acc + reward) / next_step
66
- if avg_reward >= 0.90:
67
- return True
68
 
69
  # Always done if max steps reached
70
  if next_step >= max_steps:
71
  return True
72
 
73
- # Check minimum actions before allowing completion by threshold
74
- done_conditions = case.get('done_conditions', {})
75
- min_actions = done_conditions.get('min_actions', 1)
76
  if next_step < min_actions:
77
  return False
78
 
 
 
 
 
 
 
79
  # Completion threshold from case
80
  threshold = case.get('completion_threshold', 0.85)
81
  if reward >= threshold:
82
  return True
83
 
84
- # Required sequence check — only end if agent is actually scoring well
85
- # This prevents premature termination when all action types are done but rewards are 0.0
86
  required_seq = done_conditions.get('required_sequence', [])
87
- if required_seq and reward >= 0.3:
88
  all_actions = session.last_actions + [action.get('action_type', '')]
89
  seq_complete = all(a in all_actions for a in required_seq)
90
  if seq_complete:
 
52
  def _check_done(session: SessionState, action: Dict, reward: float, max_steps: int) -> bool:
53
  """Data-driven done condition from case definition.
54
 
55
+ Priority order:
56
+ 1. max steps reached (hard limit)
57
+ 2. min_actions guard (workflow must complete before ANY early exit)
58
+ 3. mastery early-exit (high avg reward after min_actions met)
59
+ 4. completion_threshold met
60
+ 5. required_sequence complete
61
  """
62
  next_step = session.step_count + 1
63
  case = session.task_case
64
+ done_conditions = case.get('done_conditions', {})
65
+ min_actions = done_conditions.get('min_actions', 1)
 
 
 
 
66
 
67
  # Always done if max steps reached
68
  if next_step >= max_steps:
69
  return True
70
 
71
+ # Min actions guard — workflow MUST complete before any early exit
72
+ # This prevents mastery from short-circuiting cli_hard at step 2
 
73
  if next_step < min_actions:
74
  return False
75
 
76
+ # Mastery condition: high performance -> early exit (only after min_actions met)
77
+ if next_step >= 2:
78
+ avg_reward = (session.reward_acc + reward) / next_step
79
+ if avg_reward >= 0.90:
80
+ return True
81
+
82
  # Completion threshold from case
83
  threshold = case.get('completion_threshold', 0.85)
84
  if reward >= threshold:
85
  return True
86
 
87
+ # Required sequence check — once all required actions are done, episode ends
88
+ # The accumulated rewards already reflect quality; no need for a reward guard
89
  required_seq = done_conditions.get('required_sequence', [])
90
+ if required_seq:
91
  all_actions = session.last_actions + [action.get('action_type', '')]
92
  seq_complete = all(a in all_actions for a in required_seq)
93
  if seq_complete: