immortalindeed committed on
Commit
fc84271
·
1 Parent(s): d270d2a

Fix UI score accumulation logic and save benchmark history

Browse files
Files changed (1) hide show
  1. server/app.py +4 -3
server/app.py CHANGED
@@ -519,9 +519,9 @@ def _run_single_task_inline(task_id, api_base, api_key, model_id, system_prompt)
519
  logs.append(msg)
520
  yield {'type': 'log', 'level': 'info', 'msg': msg}
521
 
522
- # Score = max(rewards) — same logic as inference.py
523
- score = round(max(rewards) if rewards else 0.0, 2)
524
- score = min(max(score, 0.0), 1.0)
525
  success = score > 0.0
526
  rewards_str = ','.join(f'{r:.2f}' for r in rewards)
527
 
@@ -574,6 +574,7 @@ def run_benchmark(body: dict):
574
 
575
  # Persist to disk via benchmark_store
576
  try:
 
577
  append_result(model_name, model_id, scores)
578
  except Exception as e:
579
  print(f"Failed to append result: {e}", flush=True)
 
519
  logs.append(msg)
520
  yield {'type': 'log', 'level': 'info', 'msg': msg}
521
 
522
+ # Sum the rewards for multi-turn accumulation — same logic as inference.py
523
+ total_reward = sum(rewards) if rewards else 0.0
524
+ score = round(min(max(total_reward, 0.0), 1.0), 2)
525
  success = score > 0.0
526
  rewards_str = ','.join(f'{r:.2f}' for r in rewards)
527
 
 
574
 
575
  # Persist to disk via benchmark_store
576
  try:
577
+ from .benchmark_store import append_result
578
  append_result(model_name, model_id, scores)
579
  except Exception as e:
580
  print(f"Failed to append result: {e}", flush=True)