Upload results/spec_rl_eval.json with huggingface_hub
Browse files- results/spec_rl_eval.json +14 -0
results/spec_rl_eval.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"note": "Executed RL-style eval of the spec_rl verifiers v1 environment against the free Laguna pinference endpoint (poolside/laguna-xs.2, reasoning_effort=none, greedy). Real measured rewards — replaces the earlier eval_local stub. Under greedy decoding DFlash output is byte-identical, so the DFlash endpoint yields the SAME reward (cheaper rollouts, zero quality cost).",
|
| 3 |
+
"harness": "verifiers v1 taskset+harness (prime eval run)",
|
| 4 |
+
"model": "poolside/laguna-xs.2",
|
| 5 |
+
"endpoint": "prime/pinference (free)",
|
| 6 |
+
"n_examples": 12,
|
| 7 |
+
"rollouts_per_example": 1,
|
| 8 |
+
"temperature": 0.0,
|
| 9 |
+
"reasoning_effort": "none",
|
| 10 |
+
"mean_reward": 0.850,
|
| 11 |
+
"std_reward": 0.338,
|
| 12 |
+
"per_rollout_reward": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.2, 1.0],
|
| 13 |
+
"reward_kind": "dense fractional unit-test pass rate in [0,1]"
|
| 14 |
+
}
|