Spaces:
Running
Running
Commit ·
cd5104a
1
Parent(s): fe9aa5c
Fix state machine bugs and switch to average scoring for discriminative benchmarking
Browse files
- inference.py +3 -3
- server/app.py +3 -3
- server/router.py +19 -16
inference.py
CHANGED
|
@@ -301,9 +301,9 @@ def run_task(client: OpenAI, task_id: str) -> float:
|
|
| 301 |
if done:
|
| 302 |
break
|
| 303 |
|
| 304 |
-
#
|
| 305 |
-
|
| 306 |
-
score = round(min(max(
|
| 307 |
success = score > 0.0
|
| 308 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 309 |
|
|
|
|
| 301 |
if done:
|
| 302 |
break
|
| 303 |
|
| 304 |
+
# Average reward across trajectory — discriminative for multi-turn tasks
|
| 305 |
+
avg_reward = sum(rewards) / max(len(rewards), 1) if rewards else 0.01
|
| 306 |
+
score = round(min(max(avg_reward, 0.01), 0.99), 4)
|
| 307 |
success = score > 0.0
|
| 308 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 309 |
|
server/app.py
CHANGED
|
@@ -528,9 +528,9 @@ def _run_single_task_inline(task_id, api_base, api_key, model_id, system_prompt)
|
|
| 528 |
logs.append(msg)
|
| 529 |
yield {'type': 'log', 'level': 'info', 'msg': msg}
|
| 530 |
|
| 531 |
-
#
|
| 532 |
-
|
| 533 |
-
score = round(min(max(
|
| 534 |
success = score > 0.0
|
| 535 |
rewards_str = ','.join(f'{r:.2f}' for r in rewards)
|
| 536 |
|
|
|
|
| 528 |
logs.append(msg)
|
| 529 |
yield {'type': 'log', 'level': 'info', 'msg': msg}
|
| 530 |
|
| 531 |
+
# Average reward across trajectory — same logic as inference.py
|
| 532 |
+
avg_reward = sum(rewards) / max(len(rewards), 1) if rewards else 0.01
|
| 533 |
+
score = round(min(max(avg_reward, 0.01), 0.99), 4)
|
| 534 |
success = score > 0.0
|
| 535 |
rewards_str = ','.join(f'{r:.2f}' for r in rewards)
|
| 536 |
|
server/router.py
CHANGED
|
@@ -52,39 +52,42 @@ def route_step(session: SessionState, action: Dict) -> Dict:
|
|
| 52 |
def _check_done(session: SessionState, action: Dict, reward: float, max_steps: int) -> bool:
|
| 53 |
"""Data-driven done condition from case definition.
|
| 54 |
|
| 55 |
-
|
| 56 |
-
1.
|
| 57 |
-
2.
|
| 58 |
-
3.
|
|
|
|
|
|
|
| 59 |
"""
|
| 60 |
next_step = session.step_count + 1
|
| 61 |
case = session.task_case
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
if next_step >= 2:
|
| 65 |
-
avg_reward = (session.reward_acc + reward) / next_step
|
| 66 |
-
if avg_reward >= 0.90:
|
| 67 |
-
return True
|
| 68 |
|
| 69 |
# Always done if max steps reached
|
| 70 |
if next_step >= max_steps:
|
| 71 |
return True
|
| 72 |
|
| 73 |
-
#
|
| 74 |
-
|
| 75 |
-
min_actions = done_conditions.get('min_actions', 1)
|
| 76 |
if next_step < min_actions:
|
| 77 |
return False
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
# Completion threshold from case
|
| 80 |
threshold = case.get('completion_threshold', 0.85)
|
| 81 |
if reward >= threshold:
|
| 82 |
return True
|
| 83 |
|
| 84 |
-
# Required sequence check —
|
| 85 |
-
#
|
| 86 |
required_seq = done_conditions.get('required_sequence', [])
|
| 87 |
-
if required_seq
|
| 88 |
all_actions = session.last_actions + [action.get('action_type', '')]
|
| 89 |
seq_complete = all(a in all_actions for a in required_seq)
|
| 90 |
if seq_complete:
|
|
|
|
| 52 |
def _check_done(session: SessionState, action: Dict, reward: float, max_steps: int) -> bool:
|
| 53 |
"""Data-driven done condition from case definition.
|
| 54 |
|
| 55 |
+
Priority order:
|
| 56 |
+
1. max steps reached (hard limit)
|
| 57 |
+
2. min_actions guard (workflow must complete before ANY early exit)
|
| 58 |
+
3. mastery early-exit (high avg reward after min_actions met)
|
| 59 |
+
4. completion_threshold met
|
| 60 |
+
5. required_sequence complete
|
| 61 |
"""
|
| 62 |
next_step = session.step_count + 1
|
| 63 |
case = session.task_case
|
| 64 |
+
done_conditions = case.get('done_conditions', {})
|
| 65 |
+
min_actions = done_conditions.get('min_actions', 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
# Always done if max steps reached
|
| 68 |
if next_step >= max_steps:
|
| 69 |
return True
|
| 70 |
|
| 71 |
+
# Min actions guard — workflow MUST complete before any early exit
|
| 72 |
+
# This prevents mastery from short-circuiting cli_hard at step 2
|
|
|
|
| 73 |
if next_step < min_actions:
|
| 74 |
return False
|
| 75 |
|
| 76 |
+
# Mastery condition: high performance -> early exit (only after min_actions met)
|
| 77 |
+
if next_step >= 2:
|
| 78 |
+
avg_reward = (session.reward_acc + reward) / next_step
|
| 79 |
+
if avg_reward >= 0.90:
|
| 80 |
+
return True
|
| 81 |
+
|
| 82 |
# Completion threshold from case
|
| 83 |
threshold = case.get('completion_threshold', 0.85)
|
| 84 |
if reward >= threshold:
|
| 85 |
return True
|
| 86 |
|
| 87 |
+
# Required sequence check — once all required actions are done, episode ends
|
| 88 |
+
# The accumulated rewards already reflect quality; no need for a reward guard
|
| 89 |
required_seq = done_conditions.get('required_sequence', [])
|
| 90 |
+
if required_seq:
|
| 91 |
all_actions = session.last_actions + [action.get('action_type', '')]
|
| 92 |
seq_complete = all(a in all_actions for a in required_seq)
|
| 93 |
if seq_complete:
|