alf-combine / eval_agent_test.json
wls04's picture
Add alfworld combine (bidirectional) PRM, Qwen3-1.7B, test macro-F1=91.4
8890dbe verified
{
"dataset": "wls04/prm-alfworld-gemini-mix:test",
"n_examples_kept": 2144,
"n_examples_skipped": 0,
"n_steps": 35147,
"label_scheme": "agent",
"combine_mode": "concat",
"results": {
"combine": {
"acc": 91.9708652232054,
"macro_f1": 91.40890341554893,
"balanced_acc": 90.74304346180321,
"per_class_f1": {
"0": 94.64,
"1": 86.75,
"2": 92.83
}
},
"step": {
"acc": 87.35311690898227,
"macro_f1": 84.90823345930534,
"balanced_acc": 83.57057256686144,
"per_class_f1": {
"0": 93.61,
"1": 80.1,
"2": 81.02
}
},
"future": {
"acc": 92.34074031923066,
"macro_f1": 91.88653762600303,
"balanced_acc": 91.41298489827267,
"per_class_f1": {
"0": 94.76,
"1": 87.26,
"2": 93.64
}
}
}
}