Ray0202
update leaderboard
1dd52d9
[
{
"agent_name": "Single LLM",
"agent_type": "single-LLM",
"base_model": "gpt-4o",
"T1_acc": null,
"T2_acc": null,
"T3_acc": null,
"T4_acc": null,
"FreshRetailNet_T1_acc": 0.6364,
"FreshRetailNet_T2_acc": 0.5227,
"FreshRetailNet_T3_acc": 0.0289,
"FreshRetailNet_T4_acc": 0.1364,
"PSML_T1_acc": 0.675,
"PSML_T2_acc": 0.2067,
"PSML_T3_acc": 0.348,
"PSML_T4_acc": 0.36,
"CausalChambers_T1_acc": 0.1333,
"CausalChambers_T2_acc": 0.2733,
"CausalChambers_T3_acc": 0.352,
"CausalChambers_T4_acc": 0.26,
"MIMIC_T1_acc": 0.4681,
"MIMIC_T2_acc": 0.2128,
"MIMIC_T3_acc": 0.3661,
"MIMIC_T4_acc": 0.2979,
"T2_sMAPE": null,
"T2_MAE": null,
"T2_OW_sMAPE_MIMIC": null,
"T2_OW_RMSSE_MIMIC": null,
"T4_sMAPE": null,
"T4_MAE": null,
"T4_OW_sMAPE_MIMIC": null,
"T4_OW_RMSSE_MIMIC": null,
"FreshRetailNet_T2_MAE": 0.12,
"FreshRetailNet_T2_sMAPE": 1.27,
"FreshRetailNet_T4_MAE": 0.34,
"FreshRetailNet_T4_sMAPE": 1.29,
"PSML_T2_MAE": 0.61,
"PSML_T2_sMAPE": 0.6,
"PSML_T4_MAE": 0.44,
"PSML_T4_sMAPE": 0.37,
"CausalChambers_T2_MAE": 2.48,
"CausalChambers_T2_OW_RMSSE": 0.0000257,
"CausalChambers_T4_MAE": 2.58,
"CausalChambers_T4_OW_RMSSE": 0.0000269,
"MIMIC_T2_OW_sMAPE": 15.2,
"MIMIC_T2_OW_RMSSE": 0.55,
"MIMIC_T4_OW_sMAPE": 16.86,
"MIMIC_T4_OW_RMSSE": 0.63
},
{
"agent_name": "TimeSeries Scientist",
"agent_type": "time-series-specific agent",
"base_model": "gpt-4o",
"T1_acc": null,
"T2_acc": null,
"T3_acc": null,
"T4_acc": null,
"FreshRetailNet_T1_acc": 0.3352,
"FreshRetailNet_T2_acc": 0.5682,
"FreshRetailNet_T3_acc": 0.0341,
"FreshRetailNet_T4_acc": 0.5682,
"PSML_T1_acc": 0.28,
"PSML_T2_acc": 0.2667,
"PSML_T3_acc": 0.216,
"PSML_T4_acc": 0.2733,
"CausalChambers_T1_acc": 0.2867,
"CausalChambers_T2_acc": 0.0267,
"CausalChambers_T3_acc": 0.216,
"CausalChambers_T4_acc": 0.0267,
"MIMIC_T1_acc": 0.1011,
"MIMIC_T2_acc": 0.234,
"MIMIC_T3_acc": 0.2887,
"MIMIC_T4_acc": 0.234,
"T2_sMAPE": null,
"T2_MAE": null,
"T2_OW_sMAPE_MIMIC": null,
"T2_OW_RMSSE_MIMIC": null,
"T4_sMAPE": null,
"T4_MAE": null,
"T4_OW_sMAPE_MIMIC": null,
"T4_OW_RMSSE_MIMIC": null,
"FreshRetailNet_T2_MAE": 0.35,
"FreshRetailNet_T2_sMAPE": 1.27,
"FreshRetailNet_T4_MAE": 0.51,
"FreshRetailNet_T4_sMAPE": 1.4,
"PSML_T2_MAE": 1.53,
"PSML_T2_sMAPE": 0.65,
"PSML_T4_MAE": 0.84,
"PSML_T4_sMAPE": 0.48,
"CausalChambers_T2_MAE": 2.44,
"CausalChambers_T2_OW_RMSSE": 0.0000253,
"CausalChambers_T4_MAE": 2.94,
"CausalChambers_T4_OW_RMSSE": 0.0000306,
"MIMIC_T2_OW_sMAPE": 15.81,
"MIMIC_T2_OW_RMSSE": 0.52,
"MIMIC_T4_OW_sMAPE": 17.18,
"MIMIC_T4_OW_RMSSE": 0.64
},
{
"agent_name": "AgentScope",
"agent_type": "general agent",
"base_model": "gpt-4o",
"T1_acc": null,
"T2_acc": null,
"T3_acc": null,
"T4_acc": null,
"FreshRetailNet_T1_acc": 0.625,
"FreshRetailNet_T2_acc": 0.1212,
"FreshRetailNet_T3_acc": 0.1364,
"FreshRetailNet_T4_acc": 0.1894,
"PSML_T1_acc": 0.66,
"PSML_T2_acc": 0.2467,
"PSML_T3_acc": 0.272,
"PSML_T4_acc": 0.3533,
"CausalChambers_T1_acc": 0.12,
"CausalChambers_T2_acc": 0.46,
"CausalChambers_T3_acc": 0.44,
"CausalChambers_T4_acc": 0.32,
"MIMIC_T1_acc": 0.4468,
"MIMIC_T2_acc": 0.2128,
"MIMIC_T3_acc": 0.2395,
"MIMIC_T4_acc": 0.227,
"T2_sMAPE": null,
"T2_MAE": null,
"T2_OW_sMAPE_MIMIC": null,
"T2_OW_RMSSE_MIMIC": null,
"T4_sMAPE": null,
"T4_MAE": null,
"T4_OW_sMAPE_MIMIC": null,
"T4_OW_RMSSE_MIMIC": null,
"FreshRetailNet_T2_MAE": 0.12,
"FreshRetailNet_T2_sMAPE": 126.27,
"FreshRetailNet_T4_MAE": 0.2,
"FreshRetailNet_T4_sMAPE": 130.86,
"PSML_T2_MAE": 0.28,
"PSML_T2_sMAPE": 37.38,
"PSML_T4_MAE": 0.35,
"PSML_T4_sMAPE": 30.51,
"CausalChambers_T2_MAE": 2.76,
"CausalChambers_T2_OW_RMSSE": 0.00262,
"CausalChambers_T4_MAE": 2.66,
"CausalChambers_T4_OW_RMSSE": 0.00246,
"MIMIC_T2_OW_sMAPE": 11.05,
"MIMIC_T2_OW_RMSSE": 0.43,
"MIMIC_T4_OW_sMAPE": 12.02,
"MIMIC_T4_OW_RMSSE": 0.49
},
{
"agent_name": "MetaGPT",
"agent_type": "general agent",
"base_model": "gpt-4o",
"T1_acc": null,
"T2_acc": null,
"T3_acc": null,
"T4_acc": null,
"FreshRetailNet_T1_acc": 0.625,
"FreshRetailNet_T2_acc": 0.0909,
"FreshRetailNet_T3_acc": 0.0511,
"FreshRetailNet_T4_acc": 0.1439,
"PSML_T1_acc": 0.675,
"PSML_T2_acc": 0.2109,
"PSML_T3_acc": 0.22,
"PSML_T4_acc": 0.3133,
"CausalChambers_T1_acc": 0.1067,
"CausalChambers_T2_acc": 0.5933,
"CausalChambers_T3_acc": 0.452,
"CausalChambers_T4_acc": 0.16,
"MIMIC_T1_acc": 0.4574,
"MIMIC_T2_acc": 0.1702,
"MIMIC_T3_acc": 0.2897,
"MIMIC_T4_acc": 0.2553,
"T2_sMAPE": null,
"T2_MAE": null,
"T2_OW_sMAPE_MIMIC": null,
"T2_OW_RMSSE_MIMIC": null,
"T4_sMAPE": null,
"T4_MAE": null,
"T4_OW_sMAPE_MIMIC": null,
"T4_OW_RMSSE_MIMIC": null,
"FreshRetailNet_T2_MAE": 0.13,
"FreshRetailNet_T2_sMAPE": 126.59,
"FreshRetailNet_T4_MAE": 0.24,
"FreshRetailNet_T4_sMAPE": 127.22,
"PSML_T2_MAE": 0.34,
"PSML_T2_sMAPE": 24.74,
"PSML_T4_MAE": 0.4,
"PSML_T4_sMAPE": 43.47,
"CausalChambers_T2_MAE": 2.62,
"CausalChambers_T2_OW_RMSSE": 0.00272,
"CausalChambers_T4_MAE": 2.76,
"CausalChambers_T4_OW_RMSSE": 0.00287,
"MIMIC_T2_OW_sMAPE": 14.11,
"MIMIC_T2_OW_RMSSE": 0.53,
"MIMIC_T4_OW_sMAPE": 15.4,
"MIMIC_T4_OW_RMSSE": 0.63
},
{
"agent_name": "CAMEL",
"agent_type": "general agent",
"base_model": "gpt-4o",
"T1_acc": null,
"T2_acc": null,
"T3_acc": null,
"T4_acc": null,
"FreshRetailNet_T1_acc": 0.642,
"FreshRetailNet_T2_acc": 0.0076,
"FreshRetailNet_T3_acc": 0.0625,
"FreshRetailNet_T4_acc": 0.3106,
"PSML_T1_acc": 0.685,
"PSML_T2_acc": 0.14,
"PSML_T3_acc": 0.184,
"PSML_T4_acc": 0.3067,
"CausalChambers_T1_acc": 0.1,
"CausalChambers_T2_acc": 0.66,
"CausalChambers_T3_acc": 0.42,
"CausalChambers_T4_acc": 0.2667,
"MIMIC_T1_acc": 0.4681,
"MIMIC_T2_acc": 0.2057,
"MIMIC_T3_acc": 0.3014,
"MIMIC_T4_acc": 0.234,
"T2_sMAPE": null,
"T2_MAE": null,
"T2_OW_sMAPE_MIMIC": null,
"T2_OW_RMSSE_MIMIC": null,
"T4_sMAPE": null,
"T4_MAE": null,
"T4_OW_sMAPE_MIMIC": null,
"T4_OW_RMSSE_MIMIC": null,
"FreshRetailNet_T2_MAE": 0.13,
"FreshRetailNet_T2_sMAPE": 126.75,
"FreshRetailNet_T4_MAE": 0.28,
"FreshRetailNet_T4_sMAPE": 128.18,
"PSML_T2_MAE": 0.43,
"PSML_T2_sMAPE": 34.89,
"PSML_T4_MAE": 0.45,
"PSML_T4_sMAPE": 35.78,
"CausalChambers_T2_MAE": 2.99,
"CausalChambers_T2_OW_RMSSE": 0.00311,
"CausalChambers_T4_MAE": 2.5,
"CausalChambers_T4_OW_RMSSE": 0.0026,
"MIMIC_T2_OW_sMAPE": 12.02,
"MIMIC_T2_OW_RMSSE": 0.55,
"MIMIC_T4_OW_sMAPE": 15.74,
"MIMIC_T4_OW_RMSSE": 0.59
}
]