art87able committed on
Commit
0c52824
·
verified ·
1 Parent(s): 6579a0f

Upload results/spec_rl_eval.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. results/spec_rl_eval.json +14 -0
results/spec_rl_eval.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "note": "Executed RL-style eval of the spec_rl verifiers v1 environment against the free Laguna pinference endpoint (poolside/laguna-xs.2, reasoning_effort=none, greedy). Real measured rewards — replaces the earlier eval_local stub. Under greedy decoding DFlash output is byte-identical, so the DFlash endpoint yields the SAME reward (cheaper rollouts, zero quality cost).",
3
+ "harness": "verifiers v1 taskset+harness (prime eval run)",
4
+ "model": "poolside/laguna-xs.2",
5
+ "endpoint": "prime/pinference (free)",
6
+ "n_examples": 12,
7
+ "rollouts_per_example": 1,
8
+ "temperature": 0.0,
9
+ "reasoning_effort": "none",
10
+ "mean_reward": 0.850,
11
+ "std_reward": 0.338,
12
+ "per_rollout_reward": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.2, 1.0],
13
+ "reward_kind": "dense fractional unit-test pass rate in [0,1]"
14
+ }