{ "dataset": "wls04/prm-alfworld-gemini-mix:test", "n_examples_kept": 2144, "n_examples_skipped": 0, "n_steps": 35147, "label_scheme": "agent", "combine_mode": "concat", "results": { "combine": { "acc": 91.9708652232054, "macro_f1": 91.40890341554893, "balanced_acc": 90.74304346180321, "per_class_f1": { "0": 94.64, "1": 86.75, "2": 92.83 } }, "step": { "acc": 87.35311690898227, "macro_f1": 84.90823345930534, "balanced_acc": 83.57057256686144, "per_class_f1": { "0": 93.61, "1": 80.1, "2": 81.02 } }, "future": { "acc": 92.34074031923066, "macro_f1": 91.88653762600303, "balanced_acc": 91.41298489827267, "per_class_f1": { "0": 94.76, "1": 87.26, "2": 93.64 } } } }