AEOLLM / data /leaderboard.json
陈俊杰
aeollm2
e737694
{
"metadata": {
"version": "2025-03",
"last_updated": "2025-03-17",
"description": "NTCIR-19 AEOLLM 2 Leaderboard Data"
},
"tasks": {
"Weighted Overall": {
"dataset": "DeepResearchBench"
},
"Comprehensiveness": {
"dataset": "DeepResearchBench"
},
"Insight": {
"dataset": "DeepResearchBench"
},
"Instruction Following": {
"dataset": "DeepResearchBench"
},
"Readability": {
"dataset": "DeepResearchBench"
}
},
"metrics": [
"Spearman",
"Kendall",
"Accuracy"
],
"entries": [
{
"team": "baseline",
"method": "deepseek-v3.2",
"results": {
"Weighted Overall": {
"Spearman": 0.697838,
"Kendall": 0.545161,
"Accuracy": 0.76
},
"Comprehensiveness": {
"Spearman": 0.617544,
"Kendall": 0.474721,
"Accuracy": 0.6875
},
"Insight": {
"Spearman": 0.653226,
"Kendall": 0.518985,
"Accuracy": 0.695
},
"Instruction Following": {
"Spearman": 0.524247,
"Kendall": 0.408872,
"Accuracy": 0.605833
},
"Readability": {
"Spearman": 0.381338,
"Kendall": 0.306877,
"Accuracy": 0.4325
}
}
},
{
"team": "baseline",
"method": "glm-4.7",
"results": {
"Weighted Overall": {
"Spearman": 0.602563,
"Kendall": 0.45113,
"Accuracy": 0.725
},
"Comprehensiveness": {
"Spearman": 0.627869,
"Kendall": 0.49387,
"Accuracy": 0.725833
},
"Insight": {
"Spearman": 0.595509,
"Kendall": 0.450274,
"Accuracy": 0.699167
},
"Instruction Following": {
"Spearman": 0.568339,
"Kendall": 0.46629,
"Accuracy": 0.5925
},
"Readability": {
"Spearman": 0.3441,
"Kendall": 0.272089,
"Accuracy": 0.581667
}
}
},
{
"team": "baseline",
"method": "gpt-4o-2024-11-20",
"results": {
"Weighted Overall": {
"Spearman": 0.64408,
"Kendall": 0.490928,
"Accuracy": 0.731667
},
"Comprehensiveness": {
"Spearman": 0.524506,
"Kendall": 0.414525,
"Accuracy": 0.595833
},
"Insight": {
"Spearman": 0.559674,
"Kendall": 0.435226,
"Accuracy": 0.618333
},
"Instruction Following": {
"Spearman": 0.402335,
"Kendall": 0.31764,
"Accuracy": 0.565833
},
"Readability": {
"Spearman": 0.304546,
"Kendall": 0.23432,
"Accuracy": 0.506667
}
}
},
{
"team": "baseline",
"method": "gpt-4o-mini",
"results": {
"Weighted Overall": {
"Spearman": 0.589317,
"Kendall": 0.44087,
"Accuracy": 0.71
},
"Comprehensiveness": {
"Spearman": 0.509257,
"Kendall": 0.419137,
"Accuracy": 0.616667
},
"Insight": {
"Spearman": 0.562408,
"Kendall": 0.454384,
"Accuracy": 0.568333
},
"Instruction Following": {
"Spearman": 0.351689,
"Kendall": 0.276535,
"Accuracy": 0.4975
},
"Readability": {
"Spearman": 0.297509,
"Kendall": 0.239408,
"Accuracy": 0.350833
}
}
},
{
"team": "baseline",
"method": "qwen-max",
"results": {
"Weighted Overall": {
"Spearman": 0.515201,
"Kendall": 0.374244,
"Accuracy": 0.669167
},
"Comprehensiveness": {
"Spearman": 0.456098,
"Kendall": 0.360907,
"Accuracy": 0.5175
},
"Insight": {
"Spearman": 0.536739,
"Kendall": 0.414278,
"Accuracy": 0.540833
},
"Instruction Following": {
"Spearman": 0.316772,
"Kendall": 0.254033,
"Accuracy": 0.515833
},
"Readability": {
"Spearman": 0.209253,
"Kendall": 0.166982,
"Accuracy": 0.4
}
}
},
{
"team": "baseline",
"method": "qwen3-max",
"results": {
"Weighted Overall": {
"Spearman": 0.598278,
"Kendall": 0.456276,
"Accuracy": 0.711392
},
"Comprehensiveness": {
"Spearman": 0.53795,
"Kendall": 0.419239,
"Accuracy": 0.592143
},
"Insight": {
"Spearman": 0.600424,
"Kendall": 0.481517,
"Accuracy": 0.629405
},
"Instruction Following": {
"Spearman": 0.376286,
"Kendall": 0.295089,
"Accuracy": 0.507619
},
"Readability": {
"Spearman": 0.335661,
"Kendall": 0.27048,
"Accuracy": 0.458095
}
}
}
]
}