{ "schema": "autodatalab-plus.headline_benchmark.v2", "config": { "tasks": [ "hard_brief", "expert_brief", "crisis_brief" ], "policies": [ "base_naive", "base_roundrobin", "trained_mlp", "oracle_router" ], "policy_info": { "base_naive": "untrained baseline (analyst-only)", "base_roundrobin": "untrained baseline (fixed order)", "trained_mlp": "trained MLP CoS (REINFORCE, 600 ep) - cos_final.pt (ok)", "oracle_router": "oracle router (upper bound, handcoded canonical sequence)" }, "seeds": [ 11, 23, 47, 91, 137 ], "rag_settings": [ false, true ], "auto_fill_required": false, "shaping": "strict" }, "cells": [ { "task": "hard_brief", "policy": "base_naive", "use_rag": false, "terminal": { "mean": 0.2646, "std": 0.0, "n": 5 }, "cumulative": { "mean": 0.1346, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.2646, "cumulative": 0.1346, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 23, "terminal": 0.2646, "cumulative": 0.1346, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 47, "terminal": 0.2646, "cumulative": 0.1346, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 91, "terminal": 0.2646, "cumulative": 0.1346, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 137, "terminal": 0.2646, "cumulative": 0.1346, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true } ] }, { "task": "hard_brief", "policy": "base_roundrobin", "use_rag": false, "terminal": { "mean": 0.8827, "std": 0.0, "n": 5 }, "cumulative": { "mean": 1.2627, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.8827, "cumulative": 1.2627, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 23, "terminal": 0.8827, "cumulative": 1.2627, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 47, "terminal": 0.8827, "cumulative": 1.2627, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 91, "terminal": 0.8827, "cumulative": 1.2627, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 137, "terminal": 0.8827, "cumulative": 1.2627, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true } ] }, { "task": "hard_brief", "policy": "trained_mlp", "use_rag": false, "terminal": { "mean": 0.7286, "std": 0.0, "n": 5 }, "cumulative": { "mean": -0.7314, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.7286, "cumulative": -0.7314, "steps": 12, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 23, "terminal": 0.7286, "cumulative": -0.7314, "steps": 12, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 47, "terminal": 0.7286, "cumulative": -0.7314, "steps": 12, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 91, "terminal": 0.7286, "cumulative": -0.7314, "steps": 12, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 137, "terminal": 0.7286, "cumulative": -0.7314, "steps": 12, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true } ] }, { "task": "hard_brief", "policy": "oracle_router", "use_rag": false, "terminal": { "mean": 0.8827, "std": 0.0, "n": 5 }, "cumulative": { "mean": 1.2627, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.8827, "cumulative": 1.2627, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 23, "terminal": 0.8827, "cumulative": 1.2627, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 47, "terminal": 0.8827, "cumulative": 1.2627, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 91, "terminal": 0.8827, "cumulative": 1.2627, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 137, "terminal": 0.8827, "cumulative": 1.2627, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true } ] }, { "task": "hard_brief", "policy": "base_naive", "use_rag": true, "terminal": { "mean": 0.3217, "std": 0.0, "n": 5 }, "cumulative": { "mean": 0.1917, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.3217, "cumulative": 0.1917, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 23, "terminal": 0.3217, "cumulative": 0.1917, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 47, "terminal": 0.3217, "cumulative": 0.1917, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 91, "terminal": 0.3217, "cumulative": 0.1917, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 137, "terminal": 0.3217, "cumulative": 0.1917, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true } ] }, { "task": "hard_brief", "policy": "base_roundrobin", "use_rag": true, "terminal": { "mean": 0.897, "std": 0.0, "n": 5 }, "cumulative": { "mean": 1.277, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.897, "cumulative": 1.277, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 23, "terminal": 0.897, "cumulative": 1.277, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 47, "terminal": 0.897, "cumulative": 1.277, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 91, "terminal": 0.897, "cumulative": 1.277, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 137, "terminal": 0.897, "cumulative": 1.277, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true } ] }, { "task": "hard_brief", "policy": "trained_mlp", "use_rag": true, "terminal": { "mean": 0.7429, "std": 0.0, "n": 5 }, "cumulative": { "mean": -0.7171, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.7429, "cumulative": -0.7171, "steps": 12, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 23, "terminal": 0.7429, "cumulative": -0.7171, "steps": 12, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 47, "terminal": 0.7429, "cumulative": -0.7171, "steps": 12, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 91, "terminal": 0.7429, "cumulative": -0.7171, "steps": 12, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 137, "terminal": 0.7429, "cumulative": -0.7171, "steps": 12, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true } ] }, { "task": "hard_brief", "policy": "oracle_router", "use_rag": true, "terminal": { "mean": 0.897, "std": 0.0, "n": 5 }, "cumulative": { "mean": 1.277, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.897, "cumulative": 1.277, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 23, "terminal": 0.897, "cumulative": 1.277, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 47, "terminal": 0.897, "cumulative": 1.277, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 91, "terminal": 0.897, "cumulative": 1.277, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 137, "terminal": 0.897, "cumulative": 1.277, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true } ] }, { "task": "expert_brief", "policy": "base_naive", "use_rag": false, "terminal": { "mean": 0.2646, "std": 0.0, "n": 5 }, "cumulative": { "mean": 0.1346, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.2646, "cumulative": 0.1346, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 23, "terminal": 0.2646, "cumulative": 0.1346, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 47, "terminal": 0.2646, "cumulative": 0.1346, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 91, "terminal": 0.2646, "cumulative": 0.1346, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 137, "terminal": 0.2646, "cumulative": 0.1346, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true } ] }, { "task": "expert_brief", "policy": "base_roundrobin", "use_rag": false, "terminal": { "mean": 0.8827, "std": 0.0, "n": 5 }, "cumulative": { "mean": 1.2827, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.8827, "cumulative": 1.2827, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 23, "terminal": 0.8827, "cumulative": 1.2827, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 47, "terminal": 0.8827, "cumulative": 1.2827, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 91, "terminal": 0.8827, "cumulative": 1.2827, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 137, "terminal": 0.8827, "cumulative": 1.2827, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true } ] }, { "task": "expert_brief", "policy": "trained_mlp", "use_rag": false, "terminal": { "mean": 0.7286, "std": 0.0, "n": 5 }, "cumulative": { "mean": -1.1314, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.7286, "cumulative": -1.1314, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 23, "terminal": 0.7286, "cumulative": -1.1314, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 47, "terminal": 0.7286, "cumulative": -1.1314, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 91, "terminal": 0.7286, "cumulative": -1.1314, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 137, "terminal": 0.7286, "cumulative": -1.1314, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true } ] }, { "task": "expert_brief", "policy": "oracle_router", "use_rag": false, "terminal": { "mean": 0.8827, "std": 0.0, "n": 5 }, "cumulative": { "mean": 1.2827, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.8827, "cumulative": 1.2827, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 23, "terminal": 0.8827, "cumulative": 1.2827, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 47, "terminal": 0.8827, "cumulative": 1.2827, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 91, "terminal": 0.8827, "cumulative": 1.2827, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 137, "terminal": 0.8827, "cumulative": 1.2827, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true } ] }, { "task": "expert_brief", "policy": "base_naive", "use_rag": true, "terminal": { "mean": 0.3217, "std": 0.0, "n": 5 }, "cumulative": { "mean": 0.1917, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.3217, "cumulative": 0.1917, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 23, "terminal": 0.3217, "cumulative": 0.1917, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 47, "terminal": 0.3217, "cumulative": 0.1917, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 91, "terminal": 0.3217, "cumulative": 0.1917, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 137, "terminal": 0.3217, "cumulative": 0.1917, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true } ] }, { "task": "expert_brief", "policy": "base_roundrobin", "use_rag": true, "terminal": { "mean": 0.8924999999999998, "std": 0.0, "n": 5 }, "cumulative": { "mean": 1.2925, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.8925, "cumulative": 1.2925, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 23, "terminal": 0.8925, "cumulative": 1.2925, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 47, "terminal": 0.8925, "cumulative": 1.2925, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 91, "terminal": 0.8925, "cumulative": 1.2925, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 137, "terminal": 0.8925, "cumulative": 1.2925, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true } ] }, { "task": "expert_brief", "policy": "trained_mlp", "use_rag": true, "terminal": { "mean": 0.7429, "std": 0.0, "n": 5 }, "cumulative": { "mean": -1.1171, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.7429, "cumulative": -1.1171, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 23, "terminal": 0.7429, "cumulative": -1.1171, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 47, "terminal": 0.7429, "cumulative": -1.1171, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 91, "terminal": 0.7429, "cumulative": -1.1171, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 137, "terminal": 0.7429, "cumulative": -1.1171, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true } ] }, { "task": "expert_brief", "policy": "oracle_router", "use_rag": true, "terminal": { "mean": 0.8924999999999998, "std": 0.0, "n": 5 }, "cumulative": { "mean": 1.2925, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.8925, "cumulative": 1.2925, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 23, "terminal": 0.8925, "cumulative": 1.2925, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 47, "terminal": 0.8925, "cumulative": 1.2925, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 91, "terminal": 0.8925, "cumulative": 1.2925, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 137, "terminal": 0.8925, "cumulative": 1.2925, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true } ] }, { "task": "crisis_brief", "policy": "base_naive", "use_rag": false, "terminal": { "mean": 0.2646, "std": 0.0, "n": 5 }, "cumulative": { "mean": 0.1346, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.2646, "cumulative": 0.1346, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 23, "terminal": 0.2646, "cumulative": 0.1346, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 47, "terminal": 0.2646, "cumulative": 0.1346, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 91, "terminal": 0.2646, "cumulative": 0.1346, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 137, "terminal": 0.2646, "cumulative": 0.1346, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true } ] }, { "task": "crisis_brief", "policy": "base_roundrobin", "use_rag": false, "terminal": { "mean": 0.8805, "std": 0.0, "n": 5 }, "cumulative": { "mean": 1.2805, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.8805, "cumulative": 1.2805, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 23, "terminal": 0.8805, "cumulative": 1.2805, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 47, "terminal": 0.8805, "cumulative": 1.2805, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 91, "terminal": 0.8805, "cumulative": 1.2805, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 137, "terminal": 0.8805, "cumulative": 1.2805, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true } ] }, { "task": "crisis_brief", "policy": "trained_mlp", "use_rag": false, "terminal": { "mean": 0.7286, "std": 0.0, "n": 5 }, "cumulative": { "mean": -1.1314, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.7286, "cumulative": -1.1314, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 23, "terminal": 0.7286, "cumulative": -1.1314, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 47, "terminal": 0.7286, "cumulative": -1.1314, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 91, "terminal": 0.7286, "cumulative": -1.1314, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 137, "terminal": 0.7286, "cumulative": -1.1314, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true } ] }, { "task": "crisis_brief", "policy": "oracle_router", "use_rag": false, "terminal": { "mean": 0.8805, "std": 0.0, "n": 5 }, "cumulative": { "mean": 1.2805, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.8805, "cumulative": 1.2805, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 23, "terminal": 0.8805, "cumulative": 1.2805, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 47, "terminal": 0.8805, "cumulative": 1.2805, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 91, "terminal": 0.8805, "cumulative": 1.2805, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 137, "terminal": 0.8805, "cumulative": 1.2805, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true } ] }, { "task": "crisis_brief", "policy": "base_naive", "use_rag": true, "terminal": { "mean": 0.3217, "std": 0.0, "n": 5 }, "cumulative": { "mean": 0.1917, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.3217, "cumulative": 0.1917, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 23, "terminal": 0.3217, "cumulative": 0.1917, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 47, "terminal": 0.3217, "cumulative": 0.1917, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 91, "terminal": 0.3217, "cumulative": 0.1917, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true }, { "seed": 137, "terminal": 0.3217, "cumulative": 0.1917, "steps": 3, "consulted": [ "analyst" ], "path": [ "analyst" ], "submitted": true } ] }, { "task": "crisis_brief", "policy": "base_roundrobin", "use_rag": true, "terminal": { "mean": 0.8914, "std": 0.0, "n": 5 }, "cumulative": { "mean": 1.2914, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.8914, "cumulative": 1.2914, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 23, "terminal": 0.8914, "cumulative": 1.2914, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 47, "terminal": 0.8914, "cumulative": 1.2914, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 91, "terminal": 0.8914, "cumulative": 1.2914, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true }, { "seed": 137, "terminal": 0.8914, "cumulative": 1.2914, "steps": 6, "consulted": [ "finance", "analyst", "hr", "strategy" ], "path": [ "finance", "analyst", "hr", "strategy" ], "submitted": true } ] }, { "task": "crisis_brief", "policy": "trained_mlp", "use_rag": true, "terminal": { "mean": 0.7429, "std": 0.0, "n": 5 }, "cumulative": { "mean": -1.1171, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.7429, "cumulative": -1.1171, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 23, "terminal": 0.7429, "cumulative": -1.1171, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 47, "terminal": 0.7429, "cumulative": -1.1171, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 91, "terminal": 0.7429, "cumulative": -1.1171, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true }, { "seed": 137, "terminal": 0.7429, "cumulative": -1.1171, "steps": 14, "consulted": [ "analyst", "strategy", "finance" ], "path": [ "analyst", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance", "finance" ], "submitted": true } ] }, { "task": "crisis_brief", "policy": "oracle_router", "use_rag": true, "terminal": { "mean": 0.8914, "std": 0.0, "n": 5 }, "cumulative": { "mean": 1.2914, "std": 0.0, "n": 5 }, "runs": [ { "seed": 11, "terminal": 0.8914, "cumulative": 1.2914, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 23, "terminal": 0.8914, "cumulative": 1.2914, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 47, "terminal": 0.8914, "cumulative": 1.2914, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 91, "terminal": 0.8914, "cumulative": 1.2914, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true }, { "seed": 137, "terminal": 0.8914, "cumulative": 1.2914, "steps": 6, "consulted": [ "analyst", "finance", "strategy", "hr" ], "path": [ "analyst", "finance", "strategy", "hr" ], "submitted": true } ] } ] }