Spaces:
Running
Running
Commit ·
fc84271
1
Parent(s): d270d2a
Fix UI score accumulation logic and save benchmark history
Browse files
- server/app.py +4 -3
server/app.py
CHANGED
|
@@ -519,9 +519,9 @@ def _run_single_task_inline(task_id, api_base, api_key, model_id, system_prompt)
|
|
| 519 |
logs.append(msg)
|
| 520 |
yield {'type': 'log', 'level': 'info', 'msg': msg}
|
| 521 |
|
| 522 |
-
#
|
| 523 |
-
|
| 524 |
-
score = min(max(
|
| 525 |
success = score > 0.0
|
| 526 |
rewards_str = ','.join(f'{r:.2f}' for r in rewards)
|
| 527 |
|
|
@@ -574,6 +574,7 @@ def run_benchmark(body: dict):
|
|
| 574 |
|
| 575 |
# Persist to disk via benchmark_store
|
| 576 |
try:
|
|
|
|
| 577 |
append_result(model_name, model_id, scores)
|
| 578 |
except Exception as e:
|
| 579 |
print(f"Failed to append result: {e}", flush=True)
|
|
|
|
| 519 |
logs.append(msg)
|
| 520 |
yield {'type': 'log', 'level': 'info', 'msg': msg}
|
| 521 |
|
| 522 |
+
# Sum the rewards for multi-turn accumulation — same logic as inference.py
|
| 523 |
+
total_reward = sum(rewards) if rewards else 0.0
|
| 524 |
+
score = round(min(max(total_reward, 0.0), 1.0), 2)
|
| 525 |
success = score > 0.0
|
| 526 |
rewards_str = ','.join(f'{r:.2f}' for r in rewards)
|
| 527 |
|
|
|
|
| 574 |
|
| 575 |
# Persist to disk via benchmark_store
|
| 576 |
try:
|
| 577 |
+
from .benchmark_store import append_result
|
| 578 |
append_result(model_name, model_id, scores)
|
| 579 |
except Exception as e:
|
| 580 |
print(f"Failed to append result: {e}", flush=True)
|