AutoDataLab2.0 / training /evidence /headline_benchmark.json
uchihamadara1816's picture
Upload 172 files
d02bacd verified
{
"schema": "autodatalab-plus.headline_benchmark.v2",
"config": {
"tasks": [
"hard_brief",
"expert_brief",
"crisis_brief"
],
"policies": [
"base_naive",
"base_roundrobin",
"trained_mlp",
"oracle_router"
],
"policy_info": {
"base_naive": "untrained baseline (analyst-only)",
"base_roundrobin": "untrained baseline (fixed order)",
"trained_mlp": "trained MLP CoS (REINFORCE, 600 ep) - cos_final.pt (ok)",
"oracle_router": "oracle router (upper bound, handcoded canonical sequence)"
},
"seeds": [
11,
23,
47,
91,
137
],
"rag_settings": [
false,
true
],
"auto_fill_required": false,
"shaping": "strict"
},
"cells": [
{
"task": "hard_brief",
"policy": "base_naive",
"use_rag": false,
"terminal": {
"mean": 0.2646,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 0.1346,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.2646,
"cumulative": 0.1346,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.2646,
"cumulative": 0.1346,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.2646,
"cumulative": 0.1346,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.2646,
"cumulative": 0.1346,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.2646,
"cumulative": 0.1346,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
}
]
},
{
"task": "hard_brief",
"policy": "base_roundrobin",
"use_rag": false,
"terminal": {
"mean": 0.8827,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 1.2627,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.8827,
"cumulative": 1.2627,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.8827,
"cumulative": 1.2627,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.8827,
"cumulative": 1.2627,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.8827,
"cumulative": 1.2627,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.8827,
"cumulative": 1.2627,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
}
]
},
{
"task": "hard_brief",
"policy": "trained_mlp",
"use_rag": false,
"terminal": {
"mean": 0.7286,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": -0.7314,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.7286,
"cumulative": -0.7314,
"steps": 12,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.7286,
"cumulative": -0.7314,
"steps": 12,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.7286,
"cumulative": -0.7314,
"steps": 12,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.7286,
"cumulative": -0.7314,
"steps": 12,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.7286,
"cumulative": -0.7314,
"steps": 12,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
}
]
},
{
"task": "hard_brief",
"policy": "oracle_router",
"use_rag": false,
"terminal": {
"mean": 0.8827,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 1.2627,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.8827,
"cumulative": 1.2627,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.8827,
"cumulative": 1.2627,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.8827,
"cumulative": 1.2627,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.8827,
"cumulative": 1.2627,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.8827,
"cumulative": 1.2627,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
}
]
},
{
"task": "hard_brief",
"policy": "base_naive",
"use_rag": true,
"terminal": {
"mean": 0.3217,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 0.1917,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.3217,
"cumulative": 0.1917,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.3217,
"cumulative": 0.1917,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.3217,
"cumulative": 0.1917,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.3217,
"cumulative": 0.1917,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.3217,
"cumulative": 0.1917,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
}
]
},
{
"task": "hard_brief",
"policy": "base_roundrobin",
"use_rag": true,
"terminal": {
"mean": 0.897,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 1.277,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.897,
"cumulative": 1.277,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.897,
"cumulative": 1.277,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.897,
"cumulative": 1.277,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.897,
"cumulative": 1.277,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.897,
"cumulative": 1.277,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
}
]
},
{
"task": "hard_brief",
"policy": "trained_mlp",
"use_rag": true,
"terminal": {
"mean": 0.7429,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": -0.7171,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.7429,
"cumulative": -0.7171,
"steps": 12,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.7429,
"cumulative": -0.7171,
"steps": 12,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.7429,
"cumulative": -0.7171,
"steps": 12,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.7429,
"cumulative": -0.7171,
"steps": 12,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.7429,
"cumulative": -0.7171,
"steps": 12,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
}
]
},
{
"task": "hard_brief",
"policy": "oracle_router",
"use_rag": true,
"terminal": {
"mean": 0.897,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 1.277,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.897,
"cumulative": 1.277,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.897,
"cumulative": 1.277,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.897,
"cumulative": 1.277,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.897,
"cumulative": 1.277,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.897,
"cumulative": 1.277,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
}
]
},
{
"task": "expert_brief",
"policy": "base_naive",
"use_rag": false,
"terminal": {
"mean": 0.2646,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 0.1346,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.2646,
"cumulative": 0.1346,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.2646,
"cumulative": 0.1346,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.2646,
"cumulative": 0.1346,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.2646,
"cumulative": 0.1346,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.2646,
"cumulative": 0.1346,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
}
]
},
{
"task": "expert_brief",
"policy": "base_roundrobin",
"use_rag": false,
"terminal": {
"mean": 0.8827,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 1.2827,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.8827,
"cumulative": 1.2827,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.8827,
"cumulative": 1.2827,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.8827,
"cumulative": 1.2827,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.8827,
"cumulative": 1.2827,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.8827,
"cumulative": 1.2827,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
}
]
},
{
"task": "expert_brief",
"policy": "trained_mlp",
"use_rag": false,
"terminal": {
"mean": 0.7286,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": -1.1314,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.7286,
"cumulative": -1.1314,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.7286,
"cumulative": -1.1314,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.7286,
"cumulative": -1.1314,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.7286,
"cumulative": -1.1314,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.7286,
"cumulative": -1.1314,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
}
]
},
{
"task": "expert_brief",
"policy": "oracle_router",
"use_rag": false,
"terminal": {
"mean": 0.8827,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 1.2827,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.8827,
"cumulative": 1.2827,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.8827,
"cumulative": 1.2827,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.8827,
"cumulative": 1.2827,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.8827,
"cumulative": 1.2827,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.8827,
"cumulative": 1.2827,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
}
]
},
{
"task": "expert_brief",
"policy": "base_naive",
"use_rag": true,
"terminal": {
"mean": 0.3217,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 0.1917,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.3217,
"cumulative": 0.1917,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.3217,
"cumulative": 0.1917,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.3217,
"cumulative": 0.1917,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.3217,
"cumulative": 0.1917,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.3217,
"cumulative": 0.1917,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
}
]
},
{
"task": "expert_brief",
"policy": "base_roundrobin",
"use_rag": true,
"terminal": {
"mean": 0.8924999999999998,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 1.2925,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.8925,
"cumulative": 1.2925,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.8925,
"cumulative": 1.2925,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.8925,
"cumulative": 1.2925,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.8925,
"cumulative": 1.2925,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.8925,
"cumulative": 1.2925,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
}
]
},
{
"task": "expert_brief",
"policy": "trained_mlp",
"use_rag": true,
"terminal": {
"mean": 0.7429,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": -1.1171,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.7429,
"cumulative": -1.1171,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.7429,
"cumulative": -1.1171,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.7429,
"cumulative": -1.1171,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.7429,
"cumulative": -1.1171,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.7429,
"cumulative": -1.1171,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
}
]
},
{
"task": "expert_brief",
"policy": "oracle_router",
"use_rag": true,
"terminal": {
"mean": 0.8924999999999998,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 1.2925,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.8925,
"cumulative": 1.2925,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.8925,
"cumulative": 1.2925,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.8925,
"cumulative": 1.2925,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.8925,
"cumulative": 1.2925,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.8925,
"cumulative": 1.2925,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
}
]
},
{
"task": "crisis_brief",
"policy": "base_naive",
"use_rag": false,
"terminal": {
"mean": 0.2646,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 0.1346,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.2646,
"cumulative": 0.1346,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.2646,
"cumulative": 0.1346,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.2646,
"cumulative": 0.1346,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.2646,
"cumulative": 0.1346,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.2646,
"cumulative": 0.1346,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
}
]
},
{
"task": "crisis_brief",
"policy": "base_roundrobin",
"use_rag": false,
"terminal": {
"mean": 0.8805,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 1.2805,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.8805,
"cumulative": 1.2805,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.8805,
"cumulative": 1.2805,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.8805,
"cumulative": 1.2805,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.8805,
"cumulative": 1.2805,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.8805,
"cumulative": 1.2805,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
}
]
},
{
"task": "crisis_brief",
"policy": "trained_mlp",
"use_rag": false,
"terminal": {
"mean": 0.7286,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": -1.1314,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.7286,
"cumulative": -1.1314,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.7286,
"cumulative": -1.1314,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.7286,
"cumulative": -1.1314,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.7286,
"cumulative": -1.1314,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.7286,
"cumulative": -1.1314,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
}
]
},
{
"task": "crisis_brief",
"policy": "oracle_router",
"use_rag": false,
"terminal": {
"mean": 0.8805,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 1.2805,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.8805,
"cumulative": 1.2805,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.8805,
"cumulative": 1.2805,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.8805,
"cumulative": 1.2805,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.8805,
"cumulative": 1.2805,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.8805,
"cumulative": 1.2805,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
}
]
},
{
"task": "crisis_brief",
"policy": "base_naive",
"use_rag": true,
"terminal": {
"mean": 0.3217,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 0.1917,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.3217,
"cumulative": 0.1917,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.3217,
"cumulative": 0.1917,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.3217,
"cumulative": 0.1917,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.3217,
"cumulative": 0.1917,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.3217,
"cumulative": 0.1917,
"steps": 3,
"consulted": [
"analyst"
],
"path": [
"analyst"
],
"submitted": true
}
]
},
{
"task": "crisis_brief",
"policy": "base_roundrobin",
"use_rag": true,
"terminal": {
"mean": 0.8914,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 1.2914,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.8914,
"cumulative": 1.2914,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.8914,
"cumulative": 1.2914,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.8914,
"cumulative": 1.2914,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.8914,
"cumulative": 1.2914,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.8914,
"cumulative": 1.2914,
"steps": 6,
"consulted": [
"finance",
"analyst",
"hr",
"strategy"
],
"path": [
"finance",
"analyst",
"hr",
"strategy"
],
"submitted": true
}
]
},
{
"task": "crisis_brief",
"policy": "trained_mlp",
"use_rag": true,
"terminal": {
"mean": 0.7429,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": -1.1171,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.7429,
"cumulative": -1.1171,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.7429,
"cumulative": -1.1171,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.7429,
"cumulative": -1.1171,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.7429,
"cumulative": -1.1171,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.7429,
"cumulative": -1.1171,
"steps": 14,
"consulted": [
"analyst",
"strategy",
"finance"
],
"path": [
"analyst",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance",
"finance"
],
"submitted": true
}
]
},
{
"task": "crisis_brief",
"policy": "oracle_router",
"use_rag": true,
"terminal": {
"mean": 0.8914,
"std": 0.0,
"n": 5
},
"cumulative": {
"mean": 1.2914,
"std": 0.0,
"n": 5
},
"runs": [
{
"seed": 11,
"terminal": 0.8914,
"cumulative": 1.2914,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 23,
"terminal": 0.8914,
"cumulative": 1.2914,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 47,
"terminal": 0.8914,
"cumulative": 1.2914,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 91,
"terminal": 0.8914,
"cumulative": 1.2914,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
},
{
"seed": 137,
"terminal": 0.8914,
"cumulative": 1.2914,
"steps": 6,
"consulted": [
"analyst",
"finance",
"strategy",
"hr"
],
"path": [
"analyst",
"finance",
"strategy",
"hr"
],
"submitted": true
}
]
}
]
}