| { |
| "metadata": { |
| "version": "2025-03", |
| "last_updated": "2025-03-17", |
| "description": "NTCIR-19 AEOLLM 2 Leaderboard Data" |
| }, |
| "tasks": { |
| "Weighted Overall": { |
| "dataset": "DeepResearchBench" |
| }, |
| "Comprehensiveness": { |
| "dataset": "DeepResearchBench" |
| }, |
| "Insight": { |
| "dataset": "DeepResearchBench" |
| }, |
| "Instruction Following": { |
| "dataset": "DeepResearchBench" |
| }, |
| "Readability": { |
| "dataset": "DeepResearchBench" |
| } |
| }, |
| "metrics": [ |
| "Spearman", |
| "Kendall", |
| "Accuracy" |
| ], |
| "entries": [ |
| { |
| "team": "baseline", |
| "method": "deepseek-v3.2", |
| "results": { |
| "Weighted Overall": { |
| "Spearman": 0.697838, |
| "Kendall": 0.545161, |
| "Accuracy": 0.76 |
| }, |
| "Comprehensiveness": { |
| "Spearman": 0.617544, |
| "Kendall": 0.474721, |
| "Accuracy": 0.6875 |
| }, |
| "Insight": { |
| "Spearman": 0.653226, |
| "Kendall": 0.518985, |
| "Accuracy": 0.695 |
| }, |
| "Instruction Following": { |
| "Spearman": 0.524247, |
| "Kendall": 0.408872, |
| "Accuracy": 0.605833 |
| }, |
| "Readability": { |
| "Spearman": 0.381338, |
| "Kendall": 0.306877, |
| "Accuracy": 0.4325 |
| } |
| } |
| }, |
| { |
| "team": "baseline", |
| "method": "glm-4.7", |
| "results": { |
| "Weighted Overall": { |
| "Spearman": 0.602563, |
| "Kendall": 0.45113, |
| "Accuracy": 0.725 |
| }, |
| "Comprehensiveness": { |
| "Spearman": 0.627869, |
| "Kendall": 0.49387, |
| "Accuracy": 0.725833 |
| }, |
| "Insight": { |
| "Spearman": 0.595509, |
| "Kendall": 0.450274, |
| "Accuracy": 0.699167 |
| }, |
| "Instruction Following": { |
| "Spearman": 0.568339, |
| "Kendall": 0.46629, |
| "Accuracy": 0.5925 |
| }, |
| "Readability": { |
| "Spearman": 0.3441, |
| "Kendall": 0.272089, |
| "Accuracy": 0.581667 |
| } |
| } |
| }, |
| { |
| "team": "baseline", |
| "method": "gpt-4o-2024-11-20", |
| "results": { |
| "Weighted Overall": { |
| "Spearman": 0.64408, |
| "Kendall": 0.490928, |
| "Accuracy": 0.731667 |
| }, |
| "Comprehensiveness": { |
| "Spearman": 0.524506, |
| "Kendall": 0.414525, |
| "Accuracy": 0.595833 |
| }, |
| "Insight": { |
| "Spearman": 0.559674, |
| "Kendall": 0.435226, |
| "Accuracy": 0.618333 |
| }, |
| "Instruction Following": { |
| "Spearman": 0.402335, |
| "Kendall": 0.31764, |
| "Accuracy": 0.565833 |
| }, |
| "Readability": { |
| "Spearman": 0.304546, |
| "Kendall": 0.23432, |
| "Accuracy": 0.506667 |
| } |
| } |
| }, |
| { |
| "team": "baseline", |
| "method": "gpt-4o-mini", |
| "results": { |
| "Weighted Overall": { |
| "Spearman": 0.589317, |
| "Kendall": 0.44087, |
| "Accuracy": 0.71 |
| }, |
| "Comprehensiveness": { |
| "Spearman": 0.509257, |
| "Kendall": 0.419137, |
| "Accuracy": 0.616667 |
| }, |
| "Insight": { |
| "Spearman": 0.562408, |
| "Kendall": 0.454384, |
| "Accuracy": 0.568333 |
| }, |
| "Instruction Following": { |
| "Spearman": 0.351689, |
| "Kendall": 0.276535, |
| "Accuracy": 0.4975 |
| }, |
| "Readability": { |
| "Spearman": 0.297509, |
| "Kendall": 0.239408, |
| "Accuracy": 0.350833 |
| } |
| } |
| }, |
| { |
| "team": "baseline", |
| "method": "qwen-max", |
| "results": { |
| "Weighted Overall": { |
| "Spearman": 0.515201, |
| "Kendall": 0.374244, |
| "Accuracy": 0.669167 |
| }, |
| "Comprehensiveness": { |
| "Spearman": 0.456098, |
| "Kendall": 0.360907, |
| "Accuracy": 0.5175 |
| }, |
| "Insight": { |
| "Spearman": 0.536739, |
| "Kendall": 0.414278, |
| "Accuracy": 0.540833 |
| }, |
| "Instruction Following": { |
| "Spearman": 0.316772, |
| "Kendall": 0.254033, |
| "Accuracy": 0.515833 |
| }, |
| "Readability": { |
| "Spearman": 0.209253, |
| "Kendall": 0.166982, |
| "Accuracy": 0.4 |
| } |
| } |
| }, |
| { |
| "team": "baseline", |
| "method": "qwen3-max", |
| "results": { |
| "Weighted Overall": { |
| "Spearman": 0.598278, |
| "Kendall": 0.456276, |
| "Accuracy": 0.711392 |
| }, |
| "Comprehensiveness": { |
| "Spearman": 0.53795, |
| "Kendall": 0.419239, |
| "Accuracy": 0.592143 |
| }, |
| "Insight": { |
| "Spearman": 0.600424, |
| "Kendall": 0.481517, |
| "Accuracy": 0.629405 |
| }, |
| "Instruction Following": { |
| "Spearman": 0.376286, |
| "Kendall": 0.295089, |
| "Accuracy": 0.507619 |
| }, |
| "Readability": { |
| "Spearman": 0.335661, |
| "Kendall": 0.27048, |
| "Accuracy": 0.458095 |
| } |
| } |
| } |
| ] |
| } |