| { | |
| "dataset": "wls04/prm-alfworld-gemini-mix:test", | |
| "n_examples_kept": 2144, | |
| "n_examples_skipped": 0, | |
| "n_steps": 35147, | |
| "label_scheme": "agent", | |
| "combine_mode": "concat", | |
| "results": { | |
| "combine": { | |
| "acc": 91.9708652232054, | |
| "macro_f1": 91.40890341554893, | |
| "balanced_acc": 90.74304346180321, | |
| "per_class_f1": { | |
| "0": 94.64, | |
| "1": 86.75, | |
| "2": 92.83 | |
| } | |
| }, | |
| "step": { | |
| "acc": 87.35311690898227, | |
| "macro_f1": 84.90823345930534, | |
| "balanced_acc": 83.57057256686144, | |
| "per_class_f1": { | |
| "0": 93.61, | |
| "1": 80.1, | |
| "2": 81.02 | |
| } | |
| }, | |
| "future": { | |
| "acc": 92.34074031923066, | |
| "macro_f1": 91.88653762600303, | |
| "balanced_acc": 91.41298489827267, | |
| "per_class_f1": { | |
| "0": 94.76, | |
| "1": 87.26, | |
| "2": 93.64 | |
| } | |
| } | |
| } | |
| } |