| { |
| "benchmark": "rpc-bench", |
| "schema_version": "1.0", |
| "last_updated": "2026-06-17", |
| "source": "data/leaderboard_seed.csv", |
| "seasons": { |
| "default": { |
| "period": "rolling", |
| "dataset_version": "rpc-bench-test-v1", |
| "eval_version": "eval.py@rpc-leaderboard-v1", |
| "judges": { |
| "open_qa": [ |
| "gpt-5-2025-08-07", |
| "gemini-2.5-pro" |
| ], |
| "claim_verification": "exact-match" |
| }, |
| "models": [ |
| { |
| "rank": 1, |
| "name": "GPT-5", |
| "url": "https://openai.com/index/introducing-gpt-5/", |
| "org": "OpenAI", |
| "modality": "text", |
| "date": "2025-08-07", |
| "status": "published", |
| "conciseness": 54.93, |
| "correctness": 69.1, |
| "completeness": 67.33, |
| "f1_like": 68.2, |
| "info": 37.46, |
| "overall": 37.46 |
| }, |
| { |
| "rank": 2, |
| "name": "GPT-5.2", |
| "url": "https://openai.com/index/introducing-gpt-5-2/", |
| "org": "OpenAI", |
| "modality": "text", |
| "date": "2025-12-11", |
| "status": "published", |
| "conciseness": 53.81, |
| "correctness": 66.84, |
| "completeness": 64.03, |
| "f1_like": 65.4, |
| "info": 35.19, |
| "overall": 35.19 |
| }, |
| { |
| "rank": 3, |
| "name": "GPT-5", |
| "url": "https://openai.com/index/introducing-gpt-5/", |
| "org": "OpenAI", |
| "modality": "visual", |
| "date": "2025-08-07", |
| "status": "published", |
| "conciseness": 61.47, |
| "correctness": 58.9, |
| "completeness": 55.34, |
| "f1_like": 57.07, |
| "info": 35.08, |
| "overall": 35.08 |
| }, |
| { |
| "rank": 4, |
| "name": "Gemini-2.5-Pro", |
| "url": "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/", |
| "org": "Google", |
| "modality": "text", |
| "date": "2025-03-25", |
| "status": "published", |
| "conciseness": 54.87, |
| "correctness": 62.65, |
| "completeness": 59.03, |
| "f1_like": 60.79, |
| "info": 33.35, |
| "overall": 33.35 |
| }, |
| { |
| "rank": 5, |
| "name": "Gemini-3-Pro", |
| "url": "https://blog.google/products-and-platforms/products/gemini/gemini-3/", |
| "org": "Google", |
| "modality": "text", |
| "date": "2025-11-18", |
| "status": "published", |
| "conciseness": 52.81, |
| "correctness": 62.69, |
| "completeness": 60.28, |
| "f1_like": 61.46, |
| "info": 32.46, |
| "overall": 32.46 |
| }, |
| { |
| "rank": 6, |
| "name": "DeepSeek-V3.2", |
| "url": "https://api-docs.deepseek.com/news/news251201", |
| "org": "DeepSeek-AI", |
| "modality": "text", |
| "date": "2025-12-01", |
| "status": "published", |
| "conciseness": 56.31, |
| "correctness": 58.73, |
| "completeness": 55.19, |
| "f1_like": 56.91, |
| "info": 32.04, |
| "overall": 32.04 |
| }, |
| { |
| "rank": 7, |
| "name": "GPT-5.2", |
| "url": "https://openai.com/index/introducing-gpt-5-2/", |
| "org": "OpenAI", |
| "modality": "visual", |
| "date": "2025-12-11", |
| "status": "published", |
| "conciseness": 56.43, |
| "correctness": 56.75, |
| "completeness": 52.82, |
| "f1_like": 54.72, |
| "info": 30.88, |
| "overall": 30.88 |
| }, |
| { |
| "rank": 8, |
| "name": "DeepSeek-V3.1", |
| "url": "https://api-docs.deepseek.com/news/news250821", |
| "org": "DeepSeek-AI", |
| "modality": "text", |
| "date": "2025-08-21", |
| "status": "published", |
| "conciseness": 54.76, |
| "correctness": 57.85, |
| "completeness": 54.85, |
| "f1_like": 56.31, |
| "info": 30.84, |
| "overall": 30.84 |
| }, |
| { |
| "rank": 9, |
| "name": "GLM-4.6V", |
| "url": "https://github.com/zai-org/GLM-V", |
| "org": "Z.ai", |
| "modality": "visual", |
| "date": "2025-12-08", |
| "status": "published", |
| "conciseness": 64.55, |
| "correctness": 47.32, |
| "completeness": 43.43, |
| "f1_like": 45.29, |
| "info": 29.23, |
| "overall": 29.23 |
| }, |
| { |
| "rank": 10, |
| "name": "GLM-4.7", |
| "url": "https://z.ai/blog/glm-4.7", |
| "org": "Z.ai", |
| "modality": "text", |
| "date": "2025-12-22", |
| "status": "published", |
| "conciseness": 54.34, |
| "correctness": 54.36, |
| "completeness": 51.75, |
| "f1_like": 53.02, |
| "info": 28.81, |
| "overall": 28.81 |
| }, |
| { |
| "rank": 11, |
| "name": "GLM-4.5V", |
| "url": "https://github.com/zai-org/GLM-V", |
| "org": "Z.ai", |
| "modality": "visual", |
| "date": "2025-08-11", |
| "status": "published", |
| "conciseness": 59.44, |
| "correctness": 48.79, |
| "completeness": 43.62, |
| "f1_like": 46.06, |
| "info": 27.38, |
| "overall": 27.38 |
| }, |
| { |
| "rank": 12, |
| "name": "Gemini-3-Pro", |
| "url": "https://blog.google/products-and-platforms/products/gemini/gemini-3/", |
| "org": "Google", |
| "modality": "visual", |
| "date": "2025-11-18", |
| "status": "published", |
| "conciseness": 50.22, |
| "correctness": 56.06, |
| "completeness": 52.69, |
| "f1_like": 54.32, |
| "info": 27.28, |
| "overall": 27.28 |
| }, |
| { |
| "rank": 13, |
| "name": "GLM-4.5", |
| "url": "https://z.ai/blog/glm-4.5", |
| "org": "Z.ai", |
| "modality": "text", |
| "date": "2025-07-28", |
| "status": "published", |
| "conciseness": 43.41, |
| "correctness": 58.95, |
| "completeness": 59.54, |
| "f1_like": 59.24, |
| "info": 25.72, |
| "overall": 25.72 |
| }, |
| { |
| "rank": 14, |
| "name": "Gemini-2.5-Pro", |
| "url": "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/", |
| "org": "Google", |
| "modality": "visual", |
| "date": "2025-03-25", |
| "status": "published", |
| "conciseness": 51.71, |
| "correctness": 48.39, |
| "completeness": 45.59, |
| "f1_like": 46.95, |
| "info": 24.28, |
| "overall": 24.28 |
| }, |
| { |
| "rank": 15, |
| "name": "Claude-Sonnet-4", |
| "url": "https://www.anthropic.com/news/claude-4", |
| "org": "Anthropic", |
| "modality": "text", |
| "date": "2025-05-23", |
| "status": "published", |
| "conciseness": 41.37, |
| "correctness": 58.53, |
| "completeness": 58.44, |
| "f1_like": 58.48, |
| "info": 24.19, |
| "overall": 24.19 |
| }, |
| { |
| "rank": 16, |
| "name": "Qwen3", |
| "url": "https://github.com/QwenLM/Qwen3", |
| "org": "Alibaba", |
| "modality": "text", |
| "date": "2025-07-21", |
| "status": "published", |
| "conciseness": 41.44, |
| "correctness": 55.88, |
| "completeness": 56.64, |
| "f1_like": 56.26, |
| "info": 23.31, |
| "overall": 23.31 |
| }, |
| { |
| "rank": 17, |
| "name": "Claude-Sonnet-4.5", |
| "url": "https://www.anthropic.com/news/claude-sonnet-4-5", |
| "org": "Anthropic", |
| "modality": "text", |
| "date": "2025-09-30", |
| "status": "published", |
| "conciseness": 31.02, |
| "correctness": 64.31, |
| "completeness": 64.97, |
| "f1_like": 64.64, |
| "info": 20.05, |
| "overall": 20.05 |
| }, |
| { |
| "rank": 18, |
| "name": "Claude-Sonnet-4.5", |
| "url": "https://www.anthropic.com/news/claude-sonnet-4-5", |
| "org": "Anthropic", |
| "modality": "visual", |
| "date": "2025-09-30", |
| "status": "published", |
| "conciseness": 31.95, |
| "correctness": 55.35, |
| "completeness": 54.45, |
| "f1_like": 54.89, |
| "info": 17.54, |
| "overall": 17.54 |
| }, |
| { |
| "rank": 19, |
| "name": "Claude-Sonnet-4", |
| "url": "https://www.anthropic.com/news/claude-4", |
| "org": "Anthropic", |
| "modality": "visual", |
| "date": "2025-05-23", |
| "status": "published", |
| "conciseness": 31.63, |
| "correctness": 54.16, |
| "completeness": 53.32, |
| "f1_like": 53.74, |
| "info": 16.99, |
| "overall": 16.99 |
| }, |
| { |
| "rank": 20, |
| "name": "HippoRAG2", |
| "url": "https://github.com/ianliuwd/HippoRAG2", |
| "org": "The Ohio State University", |
| "modality": "text", |
| "date": "2025-06-19", |
| "status": "published", |
| "conciseness": 45.77, |
| "correctness": 33.13, |
| "completeness": 27.88, |
| "f1_like": 30.28, |
| "info": 13.86, |
| "overall": 13.86 |
| }, |
| { |
| "rank": 21, |
| "name": "MemoRAG", |
| "url": "https://github.com/qhjqhj00/MemoRAG", |
| "org": "Peking University & Hong Kong Polytechnic University", |
| "modality": "text", |
| "date": "2025-04-09", |
| "status": "published", |
| "conciseness": 51.31, |
| "correctness": 24.19, |
| "completeness": 19.1, |
| "f1_like": 21.35, |
| "info": 10.96, |
| "overall": 10.96 |
| }, |
| { |
| "rank": 22, |
| "name": "VdocRAG", |
| "url": "https://vdocrag.github.io/", |
| "org": "NTT Corporation & Tohoku University", |
| "modality": "visual", |
| "date": "2025-04-14", |
| "status": "published", |
| "conciseness": 61.54, |
| "correctness": 21.17, |
| "completeness": 13.88, |
| "f1_like": 16.77, |
| "info": 10.32, |
| "overall": 10.32 |
| }, |
| { |
| "rank": 23, |
| "name": "VisRAG", |
| "url": "https://github.com/OpenBMB/VisRAG", |
| "org": "Tsinghua University & ModelBest Inc.", |
| "modality": "visual", |
| "date": "2025-03-02", |
| "status": "published", |
| "conciseness": 39.9, |
| "correctness": 26.24, |
| "completeness": 23.63, |
| "f1_like": 24.87, |
| "info": 9.92, |
| "overall": 9.92 |
| }, |
| { |
| "rank": 24, |
| "name": "Raptor", |
| "url": "https://github.com/parthsarthi03/raptor", |
| "org": "Stanford University", |
| "modality": "text", |
| "date": "2024-01-31", |
| "status": "published", |
| "conciseness": 36.47, |
| "correctness": 25.28, |
| "completeness": 20.82, |
| "f1_like": 22.84, |
| "info": 8.33, |
| "overall": 8.33 |
| }, |
| { |
| "rank": 25, |
| "name": "Monkey", |
| "url": "https://github.com/Yuliang-Liu/Monkey", |
| "org": "Huazhong University of Science and Technology", |
| "modality": "visual", |
| "date": "2024-08-26", |
| "status": "published", |
| "conciseness": 54.61, |
| "correctness": 17.08, |
| "completeness": 11.27, |
| "f1_like": 13.58, |
| "info": 7.41, |
| "overall": 7.41 |
| }, |
| { |
| "rank": 26, |
| "name": "Docopilot", |
| "url": "https://github.com/OpenGVLab/Docopilot", |
| "org": "Shanghai AI Laboratory", |
| "modality": "visual", |
| "date": "2025-07-19", |
| "status": "published", |
| "conciseness": 39.31, |
| "correctness": 18.31, |
| "completeness": 17.12, |
| "f1_like": 17.69, |
| "info": 6.96, |
| "overall": 6.96 |
| }, |
| { |
| "rank": 27, |
| "name": "Qwen3", |
| "url": "https://github.com/QwenLM/Qwen3", |
| "org": "Alibaba", |
| "modality": "visual", |
| "date": "2025-07-21", |
| "status": "published", |
| "conciseness": 22.64, |
| "correctness": 20.17, |
| "completeness": 20.14, |
| "f1_like": 20.16, |
| "info": 4.56, |
| "overall": 4.56 |
| }, |
| { |
| "rank": 28, |
| "name": "DocOwl2", |
| "url": "https://github.com/X-PLUG/mPLUG-DocOwl", |
| "org": "Alibaba", |
| "modality": "visual", |
| "date": "2024-09-09", |
| "status": "published", |
| "conciseness": 50.19, |
| "correctness": 11.75, |
| "completeness": 6.66, |
| "f1_like": 8.5, |
| "info": 4.27, |
| "overall": 4.27 |
| } |
| ] |
| } |
| } |
| } |
|
|