RPC-Bench / data /leaderboard.json
ZHANGYUXUAN-zR's picture
Upload RPC-Bench leaderboard Space
e9313f3 verified
Raw
History Blame Contribute Delete
12.9 kB
{
"benchmark": "rpc-bench",
"schema_version": "1.0",
"last_updated": "2026-06-17",
"source": "data/leaderboard_seed.csv",
"seasons": {
"default": {
"period": "rolling",
"dataset_version": "rpc-bench-test-v1",
"eval_version": "eval.py@rpc-leaderboard-v1",
"judges": {
"open_qa": [
"gpt-5-2025-08-07",
"gemini-2.5-pro"
],
"claim_verification": "exact-match"
},
"models": [
{
"rank": 1,
"name": "GPT-5",
"url": "https://openai.com/index/introducing-gpt-5/",
"org": "OpenAI",
"modality": "text",
"date": "2025-8-7",
"status": "published",
"conciseness": 54.93,
"correctness": 69.1,
"completeness": 67.33,
"f1_like": 68.2,
"info": 37.46,
"overall": 37.46
},
{
"rank": 2,
"name": "GPT-5.2",
"url": "https://openai.com/index/introducing-gpt-5-2/",
"org": "OpenAI",
"modality": "text",
"date": "2025-12-11",
"status": "published",
"conciseness": 53.81,
"correctness": 66.84,
"completeness": 64.03,
"f1_like": 65.4,
"info": 35.19,
"overall": 35.19
},
{
"rank": 3,
"name": "GPT-5",
"url": "https://openai.com/index/introducing-gpt-5/",
"org": "OpenAI",
"modality": "visual",
"date": "2025-8-7",
"status": "published",
"conciseness": 61.47,
"correctness": 58.9,
"completeness": 55.34,
"f1_like": 57.07,
"info": 35.08,
"overall": 35.08
},
{
"rank": 4,
"name": "Gemini-2.5-Pro",
"url": "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/",
"org": "Google",
"modality": "text",
"date": "2025-3-25",
"status": "published",
"conciseness": 54.87,
"correctness": 62.65,
"completeness": 59.03,
"f1_like": 60.79,
"info": 33.35,
"overall": 33.35
},
{
"rank": 5,
"name": "Gemini-3-Pro",
"url": "https://blog.google/products-and-platforms/products/gemini/gemini-3/",
"org": "Google",
"modality": "text",
"date": "2025-11-18",
"status": "published",
"conciseness": 52.81,
"correctness": 62.69,
"completeness": 60.28,
"f1_like": 61.46,
"info": 32.46,
"overall": 32.46
},
{
"rank": 6,
"name": "DeepSeek-V3.2",
"url": "https://api-docs.deepseek.com/news/news251201",
"org": "DeepSeek-AI",
"modality": "text",
"date": "2025-12-1",
"status": "published",
"conciseness": 56.31,
"correctness": 58.73,
"completeness": 55.19,
"f1_like": 56.91,
"info": 32.04,
"overall": 32.04
},
{
"rank": 7,
"name": "GPT-5.2",
"url": "https://openai.com/index/introducing-gpt-5-2/",
"org": "OpenAI",
"modality": "visual",
"date": "2025-12-11",
"status": "published",
"conciseness": 56.43,
"correctness": 56.75,
"completeness": 52.82,
"f1_like": 54.72,
"info": 30.88,
"overall": 30.88
},
{
"rank": 8,
"name": "DeepSeek-V3.1",
"url": "https://api-docs.deepseek.com/news/news250821",
"org": "DeepSeek-AI",
"modality": "text",
"date": "2025-8-21",
"status": "published",
"conciseness": 54.76,
"correctness": 57.85,
"completeness": 54.85,
"f1_like": 56.31,
"info": 30.84,
"overall": 30.84
},
{
"rank": 9,
"name": "GLM-4.6V",
"url": "https://github.com/zai-org/GLM-V",
"org": "Z.ai",
"modality": "visual",
"date": "2025-12-8",
"status": "published",
"conciseness": 64.55,
"correctness": 47.32,
"completeness": 43.43,
"f1_like": 45.29,
"info": 29.23,
"overall": 29.23
},
{
"rank": 10,
"name": "GLM-4.7",
"url": "https://z.ai/blog/glm-4.7",
"org": "Z.ai",
"modality": "text",
"date": "2025-12-22",
"status": "published",
"conciseness": 54.34,
"correctness": 54.36,
"completeness": 51.75,
"f1_like": 53.02,
"info": 28.81,
"overall": 28.81
},
{
"rank": 11,
"name": "GLM-4.5V",
"url": "https://github.com/zai-org/GLM-V",
"org": "Z.ai",
"modality": "visual",
"date": "2025-8-11",
"status": "published",
"conciseness": 59.44,
"correctness": 48.79,
"completeness": 43.62,
"f1_like": 46.06,
"info": 27.38,
"overall": 27.38
},
{
"rank": 12,
"name": "gemini-3-pro",
"url": "https://blog.google/products-and-platforms/products/gemini/gemini-3/",
"org": "Google",
"modality": "visual",
"date": "2025-11-18",
"status": "published",
"conciseness": 50.22,
"correctness": 56.06,
"completeness": 52.69,
"f1_like": 54.32,
"info": 27.28,
"overall": 27.28
},
{
"rank": 13,
"name": "GLM-4.5",
"url": "https://z.ai/blog/glm-4.5",
"org": "Z.ai",
"modality": "text",
"date": "2025-7-28",
"status": "published",
"conciseness": 43.41,
"correctness": 58.95,
"completeness": 59.54,
"f1_like": 59.24,
"info": 25.72,
"overall": 25.72
},
{
"rank": 14,
"name": "gemini-2.5-pro",
"url": "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/",
"org": "Google",
"modality": "visual",
"date": "2025-3-25",
"status": "published",
"conciseness": 51.71,
"correctness": 48.39,
"completeness": 45.59,
"f1_like": 46.95,
"info": 24.28,
"overall": 24.28
},
{
"rank": 15,
"name": "Claude-Sonnet-4",
"url": "https://www.anthropic.com/news/claude-4",
"org": "Anthropic",
"modality": "text",
"date": "2025-5-23",
"status": "published",
"conciseness": 41.37,
"correctness": 58.53,
"completeness": 58.44,
"f1_like": 58.48,
"info": 24.19,
"overall": 24.19
},
{
"rank": 16,
"name": "Qwen3",
"url": "https://github.com/QwenLM/Qwen3",
"org": "Alibaba",
"modality": "text",
"date": "2025-7-21",
"status": "published",
"conciseness": 41.44,
"correctness": 55.88,
"completeness": 56.64,
"f1_like": 56.26,
"info": 23.31,
"overall": 23.31
},
{
"rank": 17,
"name": "Claude-Sonnet-4.5",
"url": "https://www.anthropic.com/news/claude-sonnet-4-5",
"org": "Anthropic",
"modality": "text",
"date": "2025-9-30",
"status": "published",
"conciseness": 31.02,
"correctness": 64.31,
"completeness": 64.97,
"f1_like": 64.64,
"info": 20.05,
"overall": 20.05
},
{
"rank": 18,
"name": "Claude-Sonnet-4.5",
"url": "https://www.anthropic.com/news/claude-sonnet-4-5",
"org": "Anthropic",
"modality": "visual",
"date": "2025-9-30",
"status": "published",
"conciseness": 31.95,
"correctness": 55.35,
"completeness": 54.45,
"f1_like": 54.89,
"info": 17.54,
"overall": 17.54
},
{
"rank": 19,
"name": "Claude-Sonnet-4",
"url": "https://www.anthropic.com/news/claude-4",
"org": "Anthropic",
"modality": "visual",
"date": "2025-5-23",
"status": "published",
"conciseness": 31.63,
"correctness": 54.16,
"completeness": 53.32,
"f1_like": 53.74,
"info": 16.99,
"overall": 16.99
},
{
"rank": 20,
"name": "HippoRAG2",
"url": "https://github.com/ianliuwd/HippoRAG2",
"org": "The Ohio State University",
"modality": "text",
"date": "2025-6-19",
"status": "published",
"conciseness": 45.77,
"correctness": 33.13,
"completeness": 27.88,
"f1_like": 30.28,
"info": 13.86,
"overall": 13.86
},
{
"rank": 21,
"name": "MemoRAG",
"url": "https://github.com/qhjqhj00/MemoRAG",
"org": "Peking University & Hong Kong Polytechnic University",
"modality": "text",
"date": "2025-4-9",
"status": "published",
"conciseness": 51.31,
"correctness": 24.19,
"completeness": 19.1,
"f1_like": 21.35,
"info": 10.96,
"overall": 10.96
},
{
"rank": 22,
"name": "VdocRAG",
"url": "https://vdocrag.github.io/",
"org": "NTT Corporation & Tohoku University",
"modality": "visual",
"date": "2025-4-14",
"status": "published",
"conciseness": 61.54,
"correctness": 21.17,
"completeness": 13.88,
"f1_like": 16.77,
"info": 10.32,
"overall": 10.32
},
{
"rank": 23,
"name": "VisRAG",
"url": "https://github.com/OpenBMB/VisRAG",
"org": "Tsinghua University & ModelBest Inc.",
"modality": "visual",
"date": "2025-3-2",
"status": "published",
"conciseness": 39.9,
"correctness": 26.24,
"completeness": 23.63,
"f1_like": 24.87,
"info": 9.92,
"overall": 9.92
},
{
"rank": 24,
"name": "Raptor",
"url": "https://github.com/parthsarthi03/raptor",
"org": "Stanford University",
"modality": "text",
"date": "2024-1-31",
"status": "published",
"conciseness": 36.47,
"correctness": 25.28,
"completeness": 20.82,
"f1_like": 22.84,
"info": 8.33,
"overall": 8.33
},
{
"rank": 25,
"name": "Monkey",
"url": "https://github.com/Yuliang-Liu/Monkey",
"org": "Huazhong University of Science and Technology",
"modality": "visual",
"date": "2024-8-26",
"status": "published",
"conciseness": 54.61,
"correctness": 17.08,
"completeness": 11.27,
"f1_like": 13.58,
"info": 7.41,
"overall": 7.41
},
{
"rank": 26,
"name": "Docopilot",
"url": "https://github.com/OpenGVLab/Docopilot",
"org": "Shanghai AI Laboratory",
"modality": "visual",
"date": "2025-7-19",
"status": "published",
"conciseness": 39.31,
"correctness": 18.31,
"completeness": 17.12,
"f1_like": 17.69,
"info": 6.96,
"overall": 6.96
},
{
"rank": 27,
"name": "Qwen3",
"url": "https://github.com/QwenLM/Qwen3",
"org": "Alibaba",
"modality": "visual",
"date": "2025-7-21",
"status": "published",
"conciseness": 22.64,
"correctness": 20.17,
"completeness": 20.14,
"f1_like": 20.16,
"info": 4.56,
"overall": 4.56
},
{
"rank": 28,
"name": "DocOwl2",
"url": "https://github.com/X-PLUG/mPLUG-DocOwl",
"org": "Alibaba",
"modality": "visual",
"date": "2024-9-9",
"status": "published",
"conciseness": 50.19,
"correctness": 11.75,
"completeness": 6.66,
"f1_like": 8.5,
"info": 4.27,
"overall": 4.27
}
]
}
}
}