| { |
| "checkpoint": "models/sci-causal", |
| "dataset": "wls04/prm-scienceworld-gemini-mix:test", |
| "split": "test", |
| "step_score_token": "<|prm_step|>", |
| "num_steps": 50711, |
| "num_batches": 100, |
| "step_loss": 0.366407671184279, |
| "step_accuracy": 0.810179251050068, |
| "macro_f1": 76.61213730710607, |
| "balanced_accuracy": 77.840100607088, |
| "per_class_f1": { |
| "0": 91.23, |
| "1": 64.32, |
| "2": 74.29 |
| }, |
| "num_pos_labels": 11281, |
| "num_neg_labels": 26919, |
| "flip_pos_to_neg": 399, |
| "flip_neg_to_pos": 1491, |
| "flip_pos_to_neg_rate": 0.03536920485772538, |
| "flip_neg_to_pos_rate": 0.05538838738437535, |
| "flip_rate": 0.049476439790575914 |
| } |