{ "metadata": { "version": "2025-03", "last_updated": "2025-03-17", "description": "NTCIR-19 AEOLLM 2 Leaderboard Data" }, "tasks": { "Weighted Overall": { "dataset": "DeepResearchBench" }, "Comprehensiveness": { "dataset": "DeepResearchBench" }, "Insight": { "dataset": "DeepResearchBench" }, "Instruction Following": { "dataset": "DeepResearchBench" }, "Readability": { "dataset": "DeepResearchBench" } }, "metrics": [ "Spearman", "Kendall", "Accuracy" ], "entries": [ { "team": "baseline", "method": "deepseek-v3.2", "results": { "Weighted Overall": { "Spearman": 0.697838, "Kendall": 0.545161, "Accuracy": 0.76 }, "Comprehensiveness": { "Spearman": 0.617544, "Kendall": 0.474721, "Accuracy": 0.6875 }, "Insight": { "Spearman": 0.653226, "Kendall": 0.518985, "Accuracy": 0.695 }, "Instruction Following": { "Spearman": 0.524247, "Kendall": 0.408872, "Accuracy": 0.605833 }, "Readability": { "Spearman": 0.381338, "Kendall": 0.306877, "Accuracy": 0.4325 } } }, { "team": "baseline", "method": "glm-4.7", "results": { "Weighted Overall": { "Spearman": 0.602563, "Kendall": 0.45113, "Accuracy": 0.725 }, "Comprehensiveness": { "Spearman": 0.627869, "Kendall": 0.49387, "Accuracy": 0.725833 }, "Insight": { "Spearman": 0.595509, "Kendall": 0.450274, "Accuracy": 0.699167 }, "Instruction Following": { "Spearman": 0.568339, "Kendall": 0.46629, "Accuracy": 0.5925 }, "Readability": { "Spearman": 0.3441, "Kendall": 0.272089, "Accuracy": 0.581667 } } }, { "team": "baseline", "method": "gpt-4o-2024-11-20", "results": { "Weighted Overall": { "Spearman": 0.64408, "Kendall": 0.490928, "Accuracy": 0.731667 }, "Comprehensiveness": { "Spearman": 0.524506, "Kendall": 0.414525, "Accuracy": 0.595833 }, "Insight": { "Spearman": 0.559674, "Kendall": 0.435226, "Accuracy": 0.618333 }, "Instruction Following": { "Spearman": 0.402335, "Kendall": 0.31764, "Accuracy": 0.565833 }, "Readability": { "Spearman": 0.304546, "Kendall": 0.23432, "Accuracy": 0.506667 } } }, { "team": "baseline", "method": "gpt-4o-mini", "results": { "Weighted Overall": { "Spearman": 0.589317, "Kendall": 0.44087, "Accuracy": 0.71 }, "Comprehensiveness": { "Spearman": 0.509257, "Kendall": 0.419137, "Accuracy": 0.616667 }, "Insight": { "Spearman": 0.562408, "Kendall": 0.454384, "Accuracy": 0.568333 }, "Instruction Following": { "Spearman": 0.351689, "Kendall": 0.276535, "Accuracy": 0.4975 }, "Readability": { "Spearman": 0.297509, "Kendall": 0.239408, "Accuracy": 0.350833 } } }, { "team": "baseline", "method": "qwen-max", "results": { "Weighted Overall": { "Spearman": 0.515201, "Kendall": 0.374244, "Accuracy": 0.669167 }, "Comprehensiveness": { "Spearman": 0.456098, "Kendall": 0.360907, "Accuracy": 0.5175 }, "Insight": { "Spearman": 0.536739, "Kendall": 0.414278, "Accuracy": 0.540833 }, "Instruction Following": { "Spearman": 0.316772, "Kendall": 0.254033, "Accuracy": 0.515833 }, "Readability": { "Spearman": 0.209253, "Kendall": 0.166982, "Accuracy": 0.4 } } }, { "team": "baseline", "method": "qwen3-max", "results": { "Weighted Overall": { "Spearman": 0.598278, "Kendall": 0.456276, "Accuracy": 0.711392 }, "Comprehensiveness": { "Spearman": 0.53795, "Kendall": 0.419239, "Accuracy": 0.592143 }, "Insight": { "Spearman": 0.600424, "Kendall": 0.481517, "Accuracy": 0.629405 }, "Instruction Following": { "Spearman": 0.376286, "Kendall": 0.295089, "Accuracy": 0.507619 }, "Readability": { "Spearman": 0.335661, "Kendall": 0.27048, "Accuracy": 0.458095 } } } ] }