{ "dataset": "wls04/prm-alfworld-gemini-mix:test", "n_examples_kept": 2144, "n_examples_skipped": 0, "n_steps": 35147, "label_scheme": "agent", "combine_mode": "concat", "results": { "combine": { "acc": 91.98224599539078, "macro_f1": 91.49491929565538, "balanced_acc": 91.1784596987989, "per_class_f1": { "0": 94.55, "1": 86.76, "2": 93.17 } }, "step": { "acc": 87.1482630096452, "macro_f1": 84.75697001493187, "balanced_acc": 83.84656212095378, "per_class_f1": { "0": 93.57, "1": 79.59, "2": 81.11 } }, "future": { "acc": 92.25253933479387, "macro_f1": 91.81233200670763, "balanced_acc": 91.65293217311738, "per_class_f1": { "0": 94.69, "1": 87.15, "2": 93.6 } } } }