sci-combine2 / eval_agent_test.json
wls04's picture
Add checkpoint
b838ce0 verified
{
"dataset": "wls04/prm-scienceworld-gemini-mix:test",
"n_examples_kept": 3200,
"n_examples_skipped": 0,
"n_steps": 50711,
"label_scheme": "agent",
"combine_mode": "concat",
"results": {
"combine": {
"acc": 86.98310031354144,
"macro_f1": 84.01496913424012,
"balanced_acc": 84.01278475222438,
"per_class_f1": {
"0": 94.0,
"1": 74.62,
"2": 83.42
}
},
"step": {
"acc": 84.9835341444657,
"macro_f1": 81.60372519964524,
"balanced_acc": 81.47710783650096,
"per_class_f1": {
"0": 92.74,
"1": 71.36,
"2": 80.71
}
},
"future": {
"acc": 87.21776340438959,
"macro_f1": 84.30195984740317,
"balanced_acc": 84.34468096714211,
"per_class_f1": {
"0": 94.15,
"1": 75.03,
"2": 83.72
}
}
}
}