{ "dataset": "wls04/prm-scienceworld-gemini-mix:test", "n_examples_kept": 3200, "n_examples_skipped": 0, "n_steps": 50711, "label_scheme": "agent", "combine_mode": "concat", "results": { "combine": { "acc": 86.98310031354144, "macro_f1": 84.01496913424012, "balanced_acc": 84.01278475222438, "per_class_f1": { "0": 94.0, "1": 74.62, "2": 83.42 } }, "step": { "acc": 84.9835341444657, "macro_f1": 81.60372519964524, "balanced_acc": 81.47710783650096, "per_class_f1": { "0": 92.74, "1": 71.36, "2": 80.71 } }, "future": { "acc": 87.21776340438959, "macro_f1": 84.30195984740317, "balanced_acc": 84.34468096714211, "per_class_f1": { "0": 94.15, "1": 75.03, "2": 83.72 } } } }