sci-causal2 / eval_test_results.json
wls04's picture
Add checkpoint
95e56e8 verified
{
"checkpoint": "models/sci-causal",
"dataset": "wls04/prm-scienceworld-gemini-mix:test",
"split": "test",
"step_score_token": "<|prm_step|>",
"num_steps": 50711,
"num_batches": 100,
"step_loss": 0.366407671184279,
"step_accuracy": 0.810179251050068,
"macro_f1": 76.61213730710607,
"balanced_accuracy": 77.840100607088,
"per_class_f1": {
"0": 91.23,
"1": 64.32,
"2": 74.29
},
"num_pos_labels": 11281,
"num_neg_labels": 26919,
"flip_pos_to_neg": 399,
"flip_neg_to_pos": 1491,
"flip_pos_to_neg_rate": 0.03536920485772538,
"flip_neg_to_pos_rate": 0.05538838738437535,
"flip_rate": 0.049476439790575914
}