alf-combine2 / eval_agent_test.json
wls04's picture
Add checkpoint
88d1266 verified
{
"dataset": "wls04/prm-alfworld-gemini-mix:test",
"n_examples_kept": 2144,
"n_examples_skipped": 0,
"n_steps": 35147,
"label_scheme": "agent",
"combine_mode": "concat",
"results": {
"combine": {
"acc": 91.98224599539078,
"macro_f1": 91.49491929565538,
"balanced_acc": 91.1784596987989,
"per_class_f1": {
"0": 94.55,
"1": 86.76,
"2": 93.17
}
},
"step": {
"acc": 87.1482630096452,
"macro_f1": 84.75697001493187,
"balanced_acc": 83.84656212095378,
"per_class_f1": {
"0": 93.57,
"1": 79.59,
"2": 81.11
}
},
"future": {
"acc": 92.25253933479387,
"macro_f1": 91.81233200670763,
"balanced_acc": 91.65293217311738,
"per_class_f1": {
"0": 94.69,
"1": 87.15,
"2": 93.6
}
}
}
}