{ "checkpoint": "models/sci-causal", "dataset": "wls04/prm-scienceworld-gemini-mix:test", "split": "test", "step_score_token": "<|prm_step|>", "num_steps": 50711, "num_batches": 100, "step_loss": 0.366407671184279, "step_accuracy": 0.810179251050068, "macro_f1": 76.61213730710607, "balanced_accuracy": 77.840100607088, "per_class_f1": { "0": 91.23, "1": 64.32, "2": 74.29 }, "num_pos_labels": 11281, "num_neg_labels": 26919, "flip_pos_to_neg": 399, "flip_neg_to_pos": 1491, "flip_pos_to_neg_rate": 0.03536920485772538, "flip_neg_to_pos_rate": 0.05538838738437535, "flip_rate": 0.049476439790575914 }