| { | |
| "note": "Executed RL-style eval of the spec_rl verifiers v1 environment against the free Laguna pinference endpoint (poolside/laguna-xs.2, reasoning_effort=none, greedy decoding at temperature 0.0). These are real measured rewards and replace the earlier eval_local stub. Under greedy decoding DFlash output is byte-identical, so the DFlash endpoint yields the same reward (cheaper rollouts, zero quality cost). std_reward is the population standard deviation over the 12 rollouts.", | |
| "harness": "verifiers v1 taskset+harness (prime eval run)", | |
| "model": "poolside/laguna-xs.2", | |
| "endpoint": "prime/pinference (free)", | |
| "n_examples": 12, | |
| "rollouts_per_example": 1, | |
| "temperature": 0.0, | |
| "reasoning_effort": "none", | |
| "mean_reward": 0.850, | |
| "std_reward": 0.338, | |
| "per_rollout_reward": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.2, 1.0], | |
| "reward_kind": "dense fractional unit-test pass rate in [0,1]" | |
| } | |