| { | |
| "test_size": 200, | |
| "results": [ | |
| { | |
| "name": "SFT Model", | |
| "any_match": 0.85, | |
| "exact_match": 0.35, | |
| "precision": 0.6475, | |
| "recall": 0.6950000000000002, | |
| "f1": 0.6704096834264432 | |
| }, | |
| { | |
| "name": "RL Final (15 iters)", | |
| "any_match": 0.855, | |
| "exact_match": 0.4, | |
| "precision": 0.68, | |
| "recall": 0.7025, | |
| "f1": 0.6910669077757686 | |
| } | |
| ], | |
| "checkpoints": { | |
| "sft": "tinker://398393e1-7182-555d-aa1b-7ddf23892338:train:0/sampler_weights/sft_final_sampler", | |
| "rl_final": "tinker://b6c9686e-b64d-5cd9-b9e5-a882b0f69d6a:train:0/sampler_weights/rl_final" | |
| } | |
| } |