Spaces:
Running
Running
| { | |
| "AA-Index": [ | |
| "knowledge", | |
| "general" | |
| ], | |
| "AA-LCR": [ | |
| "general", | |
| "applied_reasoning" | |
| ], | |
| "ACE": [ | |
| "multimodal", | |
| "agentic" | |
| ], | |
| "ACEBench": [ | |
| "multimodal", | |
| "agentic" | |
| ], | |
| "AGIEval": [ | |
| "general", | |
| "applied_reasoning" | |
| ], | |
| "AI2 Reasoning Challenge (ARC)": [ | |
| "logical_reasoning", | |
| "knowledge", | |
| "commonsense_reasoning" | |
| ], | |
| "AI2D": [ | |
| "multimodal", | |
| "knowledge" | |
| ], | |
| "AIME": [ | |
| "mathematics" | |
| ], | |
| "AIME 2024": [ | |
| "mathematics" | |
| ], | |
| "AIME 2025": [ | |
| "mathematics" | |
| ], | |
| "AITZ_EM": [ | |
| "agentic" | |
| ], | |
| "AMC_2022_23": [ | |
| "mathematics" | |
| ], | |
| "APEX Agents": [ | |
| "agentic", | |
| "applied_reasoning" | |
| ], | |
| "APEX v1": [ | |
| "agentic", | |
| "applied_reasoning" | |
| ], | |
| "API-Bank": [ | |
| "agentic" | |
| ], | |
| "Air Bench 2024": [ | |
| "safety" | |
| ], | |
| "ARC-AGI": [ | |
| "logical_reasoning", | |
| "commonsense_reasoning", | |
| "general" | |
| ], | |
| "ARC-C": [ | |
| "logical_reasoning", | |
| "knowledge", | |
| "commonsense_reasoning" | |
| ], | |
| "ARC-E": [ | |
| "logical_reasoning", | |
| "knowledge", | |
| "commonsense_reasoning" | |
| ], | |
| "ARKitScenes": [ | |
| "multimodal" | |
| ], | |
| "ActivityNet": [ | |
| "multimodal" | |
| ], | |
| "Agentharm": [ | |
| "safety", | |
| "agentic" | |
| ], | |
| "Aider": [ | |
| "software_engineering", | |
| "agentic" | |
| ], | |
| "Aider-Polyglot": [ | |
| "software_engineering", | |
| "linguistic_core" | |
| ], | |
| "Aider-Polyglot Edit": [ | |
| "software_engineering", | |
| "linguistic_core" | |
| ], | |
| "AlignBench": [ | |
| "applied_reasoning", | |
| "general" | |
| ], | |
| "AlpacaEval 2.0": [ | |
| "applied_reasoning", | |
| "general" | |
| ], | |
| "Android Control High_EM": [ | |
| "agentic", | |
| "multimodal" | |
| ], | |
| "Android Control Low_EM": [ | |
| "agentic", | |
| "multimodal" | |
| ], | |
| "AndroidWorld": [ | |
| "agentic", | |
| "multimodal" | |
| ], | |
| "AndroidWorld_SR": [ | |
| "agentic", | |
| "multimodal" | |
| ], | |
| "Anthropic Red Team": [ | |
| "safety" | |
| ], | |
| "AppWorld": [ | |
| "agentic" | |
| ], | |
| "AppWorld Benchmark": [ | |
| "agentic", | |
| "applied_reasoning" | |
| ], | |
| "Arc": [ | |
| "logical_reasoning", | |
| "commonsense_reasoning" | |
| ], | |
| "Arena Hard": [ | |
| "general", | |
| "applied_reasoning" | |
| ], | |
| "Arena-Hard v2": [ | |
| "general", | |
| "applied_reasoning" | |
| ], | |
| "Artificial Analysis": [ | |
| "general" | |
| ], | |
| "Artificial Analysis LLM API": [ | |
| "general" | |
| ], | |
| "Artificial-Analysis": [ | |
| "general" | |
| ], | |
| "AssistantBench": [ | |
| "agentic", | |
| "applied_reasoning" | |
| ], | |
| "AttaQ": [ | |
| "robustness" | |
| ], | |
| "AutoLogi": [ | |
| "logical_reasoning", | |
| "mathematics" | |
| ], | |
| "BBH": [ | |
| "logical_reasoning", | |
| "applied_reasoning" | |
| ], | |
| "BBQ": [ | |
| "safety", | |
| "commonsense_reasoning" | |
| ], | |
| "BFCL": [ | |
| "agentic", | |
| "software_engineering" | |
| ], | |
| "BFCL v2": [ | |
| "agentic", | |
| "software_engineering" | |
| ], | |
| "BFCL-V4": [ | |
| "agentic", | |
| "software_engineering" | |
| ], | |
| "BFCL-v3": [ | |
| "agentic", | |
| "software_engineering" | |
| ], | |
| "BFCL_v3_MultiTurn": [ | |
| "agentic", | |
| "software_engineering" | |
| ], | |
| "BIG-Bench": [ | |
| "general", | |
| "applied_reasoning" | |
| ], | |
| "BIG-Bench Extra Hard": [ | |
| "general", | |
| "applied_reasoning" | |
| ], | |
| "BIG-Bench Hard": [ | |
| "general", | |
| "applied_reasoning" | |
| ], | |
| "BLINK": [ | |
| "multimodal" | |
| ], | |
| "BabyVision": [ | |
| "multimodal" | |
| ], | |
| "Beyond AIME": [ | |
| "mathematics" | |
| ], | |
| "BigCodeBench": [ | |
| "software_engineering" | |
| ], | |
| "BigCodeBench-Full": [ | |
| "software_engineering" | |
| ], | |
| "BigCodeBench-Hard": [ | |
| "software_engineering" | |
| ], | |
| "BioLP-Bench": [ | |
| "natural_sciences" | |
| ], | |
| "Bird-SQL (dev)": [ | |
| "software_engineering" | |
| ], | |
| "BixBench": [ | |
| "applied_reasoning" | |
| ], | |
| "BoolQ": [ | |
| "commonsense_reasoning" | |
| ], | |
| "BrowseComp": [ | |
| "agentic" | |
| ], | |
| "BrowseComp Long Context 128k": [ | |
| "agentic" | |
| ], | |
| "BrowseComp Long Context 256k": [ | |
| "agentic" | |
| ], | |
| "BrowseComp-Plus": [ | |
| "agentic" | |
| ], | |
| "BrowseComp-VL": [ | |
| "agentic", | |
| "multimodal" | |
| ], | |
| "BrowseComp-zh": [ | |
| "agentic" | |
| ], | |
| "BrowseCompPlus": [ | |
| "agentic" | |
| ], | |
| "C-Eval": [ | |
| "knowledge", | |
| "mathematics", | |
| "natural_sciences", | |
| "humanities_and_social_sciences" | |
| ], | |
| "CBNSL": [ | |
| "linguistic_core" | |
| ], | |
| "CC-Bench-V2 Backend": [ | |
| "software_engineering" | |
| ], | |
| "CC-Bench-V2 Frontend": [ | |
| "software_engineering" | |
| ], | |
| "CC-Bench-V2 Repo Exploration": [ | |
| "software_engineering" | |
| ], | |
| "CC-OCR": [ | |
| "multimodal" | |
| ], | |
| "CFEval": [ | |
| "software_engineering" | |
| ], | |
| "CLUEWSC": [ | |
| "linguistic_core" | |
| ], | |
| "CMMLU": [ | |
| "knowledge", | |
| "mathematics", | |
| "natural_sciences", | |
| "humanities_and_social_sciences" | |
| ], | |
| "CNMO 2024": [ | |
| "mathematics" | |
| ], | |
| "CNN/DailyMail": [ | |
| "linguistic_core" | |
| ], | |
| "COLLIE": [ | |
| "applied_reasoning" | |
| ], | |
| "CORE-Bench Hard": [ | |
| "applied_reasoning" | |
| ], | |
| "CRAG": [ | |
| "applied_reasoning" | |
| ], | |
| "CRPErelation": [ | |
| "linguistic_core" | |
| ], | |
| "CRUX-O": [ | |
| "software_engineering" | |
| ], | |
| "CRUXEval-Input-CoT": [ | |
| "software_engineering" | |
| ], | |
| "CRUXEval-Output-CoT": [ | |
| "software_engineering" | |
| ], | |
| "CSimpleQA": [ | |
| "knowledge" | |
| ], | |
| "Caparena": [ | |
| "agentic" | |
| ], | |
| "Caparena Auto": [ | |
| "agentic" | |
| ], | |
| "CaseLaw (v2) - Overall": [ | |
| "law" | |
| ], | |
| "CharXiv-D": [ | |
| "multimodal" | |
| ], | |
| "CharXiv-R": [ | |
| "multimodal" | |
| ], | |
| "CharadesSTA": [ | |
| "multimodal" | |
| ], | |
| "ChartQA": [ | |
| "multimodal" | |
| ], | |
| "CheXpert CXR": [ | |
| "multimodal", | |
| "natural_sciences" | |
| ], | |
| "CivilComments": [ | |
| "safety" | |
| ], | |
| "Claw-Eval": [ | |
| "agentic" | |
| ], | |
| "CloningScenarios": [ | |
| "robustness" | |
| ], | |
| "CoVoST2": [ | |
| "linguistic_core" | |
| ], | |
| "CoVoST2 en-zh": [ | |
| "linguistic_core" | |
| ], | |
| "CocoaBench": [ | |
| "agentic" | |
| ], | |
| "CocoaBench v1.0": [ | |
| "agentic" | |
| ], | |
| "CodeForces": [ | |
| "mathematics", | |
| "software_engineering" | |
| ], | |
| "Codegolf v2.2": [ | |
| "software_engineering" | |
| ], | |
| "Common Voice 15": [ | |
| "linguistic_core" | |
| ], | |
| "CommonSenseQA": [ | |
| "commonsense_reasoning" | |
| ], | |
| "Commonsense Qa": [ | |
| "commonsense_reasoning" | |
| ], | |
| "ComplexFuncBench": [ | |
| "software_engineering" | |
| ], | |
| "Corp Fin v2": [ | |
| "finance" | |
| ], | |
| "CorpusQA 1M": [ | |
| "knowledge" | |
| ], | |
| "CountBench": [ | |
| "multimodal" | |
| ], | |
| "Creative Writing v3": [ | |
| "linguistic_core" | |
| ], | |
| "CruxEval-O": [ | |
| "software_engineering" | |
| ], | |
| "Cvebench": [ | |
| "software_engineering", | |
| "safety" | |
| ], | |
| "CyBench": [ | |
| "software_engineering", | |
| "safety" | |
| ], | |
| "Cybench": [ | |
| "software_engineering", | |
| "safety" | |
| ], | |
| "CyberGym": [ | |
| "agentic", | |
| "safety" | |
| ], | |
| "Cybersecurity CTFs": [ | |
| "agentic", | |
| "safety" | |
| ], | |
| "Cyse2": [ | |
| "software_engineering", | |
| "safety" | |
| ], | |
| "DROP": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "DS-Arena-Code": [ | |
| "software_engineering" | |
| ], | |
| "DS-FIM-Eval": [ | |
| "software_engineering" | |
| ], | |
| "DeepPlanning": [ | |
| "agentic" | |
| ], | |
| "DeepSearchQA": [ | |
| "applied_reasoning" | |
| ], | |
| "DermMCQA": [ | |
| "natural_sciences" | |
| ], | |
| "Design2Code": [ | |
| "software_engineering" | |
| ], | |
| "DocVQA": [ | |
| "multimodal" | |
| ], | |
| "DocVQAtest": [ | |
| "multimodal" | |
| ], | |
| "DynaMath": [ | |
| "mathematics" | |
| ], | |
| "ECLeKTic": [ | |
| "knowledge" | |
| ], | |
| "EQ-Bench": [ | |
| "commonsense_reasoning" | |
| ], | |
| "ERQA": [ | |
| "knowledge" | |
| ], | |
| "EgoSchema": [ | |
| "multimodal" | |
| ], | |
| "EmbSpatialBench": [ | |
| "commonsense_reasoning" | |
| ], | |
| "EvalPlus": [ | |
| "software_engineering" | |
| ], | |
| "FActScore": [ | |
| "hallucination" | |
| ], | |
| "FLEURS": [ | |
| "linguistic_core" | |
| ], | |
| "FRAMES": [ | |
| "linguistic_core" | |
| ], | |
| "Facts Grounding": [ | |
| "hallucination" | |
| ], | |
| "Fibble Arena": [ | |
| "logical_reasoning" | |
| ], | |
| "Fibble arena": [ | |
| "logical_reasoning" | |
| ], | |
| "FigQA": [ | |
| "multimodal" | |
| ], | |
| "FinQA": [ | |
| "finance" | |
| ], | |
| "FinSearchComp T2&T3": [ | |
| "finance" | |
| ], | |
| "FinSearchComp-T3": [ | |
| "finance" | |
| ], | |
| "Finance Agent": [ | |
| "agentic" | |
| ], | |
| "Flame-VLM-Code": [ | |
| "software_engineering" | |
| ], | |
| "FlenQA": [ | |
| "knowledge" | |
| ], | |
| "French MMLU": [ | |
| "general", | |
| "knowledge" | |
| ], | |
| "FrontierMath": [ | |
| "mathematics" | |
| ], | |
| "FrontierScience Research": [ | |
| "natural_sciences" | |
| ], | |
| "FullStackBench en": [ | |
| "software_engineering" | |
| ], | |
| "FullStackBench zh": [ | |
| "software_engineering" | |
| ], | |
| "FunctionalMATH": [ | |
| "mathematics" | |
| ], | |
| "GAIA": [ | |
| "applied_reasoning", | |
| "agentic" | |
| ], | |
| "GDPval-AA": [ | |
| "general" | |
| ], | |
| "GDPval-MM": [ | |
| "multimodal" | |
| ], | |
| "GPQA": [ | |
| "natural_sciences" | |
| ], | |
| "GPQA - Overall": [ | |
| "natural_sciences" | |
| ], | |
| "GPQA Biology": [ | |
| "natural_sciences" | |
| ], | |
| "GPQA Chemistry": [ | |
| "natural_sciences" | |
| ], | |
| "GPQA Diamond": [ | |
| "natural_sciences" | |
| ], | |
| "GPQA Physics": [ | |
| "natural_sciences" | |
| ], | |
| "GSM-8K (CoT)": [ | |
| "mathematics" | |
| ], | |
| "GSM-MC": [ | |
| "mathematics" | |
| ], | |
| "GSM8K": [ | |
| "mathematics" | |
| ], | |
| "GSM8K Chat": [ | |
| "mathematics" | |
| ], | |
| "Gdm Intercode CTF": [ | |
| "software_engineering", | |
| "agentic", | |
| "safety" | |
| ], | |
| "GeneBench": [ | |
| "natural_sciences" | |
| ], | |
| "GiantSteps Tempo": [ | |
| "multimodal" | |
| ], | |
| "Global MMLU Lite": [ | |
| "general" | |
| ], | |
| "Global PIQA": [ | |
| "commonsense_reasoning" | |
| ], | |
| "Global-MMLU": [ | |
| "general" | |
| ], | |
| "Gorilla Benchmark API Bench": [ | |
| "software_engineering", | |
| "agentic" | |
| ], | |
| "GovReport": [ | |
| "linguistic_core" | |
| ], | |
| "Graphwalks BFS >128k": [ | |
| "logical_reasoning" | |
| ], | |
| "Graphwalks parents >128k": [ | |
| "logical_reasoning" | |
| ], | |
| "GroundUI-1K": [ | |
| "multimodal", | |
| "agentic" | |
| ], | |
| "HAL": [ | |
| "hallucination" | |
| ], | |
| "HELM": [ | |
| "general" | |
| ], | |
| "HELM Instruct": [ | |
| "general" | |
| ], | |
| "HF Open LLM Leaderboard v2": [ | |
| "general" | |
| ], | |
| "HMMT 2025": [ | |
| "mathematics" | |
| ], | |
| "HMMT Feb 26": [ | |
| "mathematics" | |
| ], | |
| "HMMT25": [ | |
| "mathematics" | |
| ], | |
| "Hallusion Bench": [ | |
| "hallucination" | |
| ], | |
| "HarmBench": [ | |
| "safety" | |
| ], | |
| "HealthBench": [ | |
| "natural_sciences", | |
| "knowledge" | |
| ], | |
| "HealthBench Hard": [ | |
| "natural_sciences", | |
| "knowledge" | |
| ], | |
| "HellaSwag": [ | |
| "commonsense_reasoning" | |
| ], | |
| "Helm air bench": [ | |
| "general" | |
| ], | |
| "Helm classic": [ | |
| "general" | |
| ], | |
| "Helm lite": [ | |
| "general" | |
| ], | |
| "HiddenMath": [ | |
| "mathematics" | |
| ], | |
| "Holistic Evaluation of Language Models (HELM)": [ | |
| "general" | |
| ], | |
| "HumanEval": [ | |
| "software_engineering" | |
| ], | |
| "HumanEval-Average": [ | |
| "software_engineering" | |
| ], | |
| "HumanEval-ER": [ | |
| "software_engineering" | |
| ], | |
| "HumanEval-Mul": [ | |
| "software_engineering" | |
| ], | |
| "HumanEvalFIM-Average": [ | |
| "software_engineering" | |
| ], | |
| "Humanity's Last Exam": [ | |
| "general" | |
| ], | |
| "Hypersim": [ | |
| "multimodal" | |
| ], | |
| "IF": [ | |
| "linguistic_core" | |
| ], | |
| "IFBench": [ | |
| "linguistic_core" | |
| ], | |
| "IFEval": [ | |
| "linguistic_core" | |
| ], | |
| "IMDb": [ | |
| "linguistic_core" | |
| ], | |
| "IMO-AnswerBench": [ | |
| "mathematics" | |
| ], | |
| "IOI": [ | |
| "software_engineering" | |
| ], | |
| "IPhO 2025": [ | |
| "natural_sciences", | |
| "mathematics" | |
| ], | |
| "ImageMining": [ | |
| "multimodal" | |
| ], | |
| "Include": [ | |
| "knowledge", | |
| "linguistic_core" | |
| ], | |
| "InfiniteBench/En.MC": [ | |
| "linguistic_core" | |
| ], | |
| "InfiniteBench/En.QA": [ | |
| "linguistic_core", | |
| "knowledge" | |
| ], | |
| "InfoVQA": [ | |
| "multimodal", | |
| "knowledge" | |
| ], | |
| "InfoVQAtest": [ | |
| "multimodal", | |
| "knowledge" | |
| ], | |
| "InfographicsQA": [ | |
| "multimodal", | |
| "knowledge" | |
| ], | |
| "Instruct HumanEval": [ | |
| "software_engineering" | |
| ], | |
| "InterGPS": [ | |
| "multimodal" | |
| ], | |
| "Internal API instruction following (hard)": [ | |
| "linguistic_core" | |
| ], | |
| "JudgeBench": [ | |
| "applied_reasoning", | |
| "general" | |
| ], | |
| "Judgebench": [ | |
| "applied_reasoning", | |
| "general" | |
| ], | |
| "LA Leaderboard": [ | |
| "general" | |
| ], | |
| "LBPP (v2)": [ | |
| "software_engineering" | |
| ], | |
| "LLM-Stats": [ | |
| "general" | |
| ], | |
| "LMArena Text Leaderboard": [ | |
| "general" | |
| ], | |
| "LSAT": [ | |
| "law", | |
| "logical_reasoning" | |
| ], | |
| "LVBench": [ | |
| "multimodal" | |
| ], | |
| "La Leaderboard composite dataset": [ | |
| "general" | |
| ], | |
| "LegalBench": [ | |
| "law" | |
| ], | |
| "LingoQA": [ | |
| "linguistic_core", | |
| "knowledge" | |
| ], | |
| "Live-Bench": [ | |
| "general" | |
| ], | |
| "LiveBench": [ | |
| "general" | |
| ], | |
| "LiveBench 20241125": [ | |
| "general" | |
| ], | |
| "LiveCodeBench": [ | |
| "software_engineering" | |
| ], | |
| "LiveCodeBench - Easy": [ | |
| "software_engineering" | |
| ], | |
| "LiveCodeBench - Hard": [ | |
| "software_engineering" | |
| ], | |
| "LiveCodeBench - Medium": [ | |
| "software_engineering" | |
| ], | |
| "LiveCodeBench - Overall": [ | |
| "software_engineering" | |
| ], | |
| "LiveCodeBench Pro": [ | |
| "software_engineering" | |
| ], | |
| "LiveCodeBench v5": [ | |
| "software_engineering" | |
| ], | |
| "LiveCodeBench v5 24.12-25.2": [ | |
| "software_engineering" | |
| ], | |
| "LiveCodeBench v6": [ | |
| "software_engineering" | |
| ], | |
| "LiveCodeBench(01-09)": [ | |
| "software_engineering" | |
| ], | |
| "Llm Stats": [ | |
| "general" | |
| ], | |
| "LongBench v2": [ | |
| "linguistic_core" | |
| ], | |
| "LongFact Concepts": [ | |
| "hallucination" | |
| ], | |
| "LongFact Objects": [ | |
| "hallucination" | |
| ], | |
| "LongVideoBench": [ | |
| "multimodal" | |
| ], | |
| "MASK": [ | |
| "linguistic_core" | |
| ], | |
| "MATH": [ | |
| "mathematics" | |
| ], | |
| "MATH (CoT)": [ | |
| "mathematics" | |
| ], | |
| "MATH-500": [ | |
| "mathematics" | |
| ], | |
| "MATH-Mc": [ | |
| "mathematics" | |
| ], | |
| "MAXIFE": [ | |
| "multimodal" | |
| ], | |
| "MBPP ++ base version": [ | |
| "software_engineering" | |
| ], | |
| "MBPP EvalPlus (base)": [ | |
| "software_engineering" | |
| ], | |
| "MBPP pass@1": [ | |
| "software_engineering" | |
| ], | |
| "MBPP+": [ | |
| "software_engineering" | |
| ], | |
| "MCP Atlas": [ | |
| "agentic" | |
| ], | |
| "MCP-Mark": [ | |
| "agentic" | |
| ], | |
| "MCP-Universe": [ | |
| "agentic" | |
| ], | |
| "MEGA MLQA": [ | |
| "knowledge" | |
| ], | |
| "MEGA TyDi QA": [ | |
| "knowledge" | |
| ], | |
| "MEGA UDPOS": [ | |
| "linguistic_core" | |
| ], | |
| "MEGA XCOPA": [ | |
| "commonsense_reasoning" | |
| ], | |
| "MEGA XStoryCloze": [ | |
| "commonsense_reasoning" | |
| ], | |
| "MEWC": [ | |
| "knowledge" | |
| ], | |
| "MGSM": [ | |
| "mathematics" | |
| ], | |
| "MIABench": [ | |
| "applied_reasoning" | |
| ], | |
| "MIMIC CXR": [ | |
| "multimodal", | |
| "natural_sciences" | |
| ], | |
| "MLE-Bench Lite": [ | |
| "software_engineering" | |
| ], | |
| "MLVU": [ | |
| "multimodal" | |
| ], | |
| "MLVU-M": [ | |
| "multimodal" | |
| ], | |
| "MM IF-Eval": [ | |
| "multimodal" | |
| ], | |
| "MM-BrowserComp": [ | |
| "agentic", | |
| "multimodal" | |
| ], | |
| "MM-ClawBench": [ | |
| "agentic", | |
| "multimodal" | |
| ], | |
| "MM-MT-Bench": [ | |
| "agentic", | |
| "multimodal" | |
| ], | |
| "MM-Mind2Web": [ | |
| "agentic", | |
| "multimodal" | |
| ], | |
| "MMAU": [ | |
| "multimodal" | |
| ], | |
| "MMAU Music": [ | |
| "multimodal" | |
| ], | |
| "MMAU Sound": [ | |
| "multimodal" | |
| ], | |
| "MMAU Speech": [ | |
| "multimodal" | |
| ], | |
| "MMBench": [ | |
| "multimodal" | |
| ], | |
| "MMBench-V1.1": [ | |
| "multimodal" | |
| ], | |
| "MMBench-Video": [ | |
| "multimodal" | |
| ], | |
| "MMBench_test": [ | |
| "multimodal" | |
| ], | |
| "MME": [ | |
| "multimodal" | |
| ], | |
| "MME-RealWorld": [ | |
| "multimodal" | |
| ], | |
| "MMLU": [ | |
| "general", | |
| "knowledge" | |
| ], | |
| "MMLU (CoT)": [ | |
| "general", | |
| "knowledge" | |
| ], | |
| "MMLU Chat": [ | |
| "general", | |
| "knowledge" | |
| ], | |
| "MMLU French": [ | |
| "general", | |
| "knowledge" | |
| ], | |
| "MMLU-Base": [ | |
| "general", | |
| "knowledge" | |
| ], | |
| "MMLU-Pro": [ | |
| "general", | |
| "knowledge" | |
| ], | |
| "MMLU-Pro leaderboard submissions (TIGER-Lab)": [ | |
| "general", | |
| "knowledge" | |
| ], | |
| "MMLU-ProX": [ | |
| "general", | |
| "knowledge" | |
| ], | |
| "MMLU-Redux": [ | |
| "general", | |
| "knowledge" | |
| ], | |
| "MMLU-STEM": [ | |
| "general", | |
| "knowledge", | |
| "natural_sciences", | |
| "mathematics" | |
| ], | |
| "MMLU-redux-2.0": [ | |
| "general", | |
| "knowledge" | |
| ], | |
| "MMLongBench-Doc": [ | |
| "multimodal", | |
| "linguistic_core" | |
| ], | |
| "MMMLU": [ | |
| "knowledge", | |
| "linguistic_core" | |
| ], | |
| "MMMU": [ | |
| "multimodal", | |
| "knowledge" | |
| ], | |
| "MMMU (val)": [ | |
| "multimodal", | |
| "knowledge" | |
| ], | |
| "MMMU (validation)": [ | |
| "multimodal", | |
| "knowledge" | |
| ], | |
| "MMMUval": [ | |
| "multimodal", | |
| "knowledge" | |
| ], | |
| "MMSearch": [ | |
| "multimodal", | |
| "applied_reasoning" | |
| ], | |
| "MMSearch-Plus": [ | |
| "multimodal", | |
| "applied_reasoning" | |
| ], | |
| "MMStar": [ | |
| "multimodal", | |
| "applied_reasoning" | |
| ], | |
| "MMT-Bench": [ | |
| "multimodal", | |
| "applied_reasoning" | |
| ], | |
| "MMVU": [ | |
| "multimodal", | |
| "knowledge" | |
| ], | |
| "MMVet": [ | |
| "multimodal", | |
| "applied_reasoning" | |
| ], | |
| "MMVetGPT4Turbo": [ | |
| "multimodal", | |
| "applied_reasoning" | |
| ], | |
| "MRCR": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "MRCR 128K (2-needle)": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "MRCR 128K (4-needle)": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "MRCR 128K (8-needle)": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "MRCR 1M": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "MRCR 1M (pointwise)": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "MRCR 64K (2-needle)": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "MRCR 64K (4-needle)": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "MRCR 64K (8-needle)": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "MRCR v2": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "MRCR v2 (8-needle)": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "MS MARCO (TREC)": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "MT-Bench": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "MTVQA": [ | |
| "multimodal", | |
| "applied_reasoning" | |
| ], | |
| "MVBench": [ | |
| "multimodal", | |
| "applied_reasoning" | |
| ], | |
| "MathArena Apex": [ | |
| "mathematics" | |
| ], | |
| "MathVerse-Mini": [ | |
| "mathematics", | |
| "multimodal" | |
| ], | |
| "MathVision": [ | |
| "mathematics", | |
| "multimodal" | |
| ], | |
| "MathVista": [ | |
| "mathematics", | |
| "multimodal" | |
| ], | |
| "MathVista-Mini": [ | |
| "mathematics", | |
| "multimodal" | |
| ], | |
| "MedCode - Overall": [ | |
| "software_engineering", | |
| "natural_sciences" | |
| ], | |
| "MedQA": [ | |
| "knowledge", | |
| "natural_sciences" | |
| ], | |
| "MedScribe - Overall": [ | |
| "linguistic_core", | |
| "natural_sciences" | |
| ], | |
| "MedXpertQA": [ | |
| "knowledge", | |
| "natural_sciences" | |
| ], | |
| "Meld": [ | |
| "linguistic_core", | |
| "commonsense_reasoning" | |
| ], | |
| "MobileMiniWob++_SR": [ | |
| "agentic" | |
| ], | |
| "Mortgage Tax": [ | |
| "finance", | |
| "applied_reasoning" | |
| ], | |
| "MotionBench": [ | |
| "multimodal", | |
| "applied_reasoning" | |
| ], | |
| "Mt Bench": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "MuSR": [ | |
| "commonsense_reasoning", | |
| "applied_reasoning" | |
| ], | |
| "MuirBench": [ | |
| "multimodal", | |
| "applied_reasoning" | |
| ], | |
| "Multi-Challenge": [ | |
| "general" | |
| ], | |
| "Multi-IF": [ | |
| "applied_reasoning" | |
| ], | |
| "Multi-SWE-Bench Leaderboard": [ | |
| "software_engineering" | |
| ], | |
| "MultiLF": [ | |
| "logical_reasoning", | |
| "applied_reasoning" | |
| ], | |
| "Multilingual MGSM (CoT)": [ | |
| "mathematics", | |
| "linguistic_core" | |
| ], | |
| "Multilingual MMLU": [ | |
| "knowledge", | |
| "linguistic_core" | |
| ], | |
| "Multipl E": [ | |
| "software_engineering" | |
| ], | |
| "MusicCaps": [ | |
| "multimodal" | |
| ], | |
| "NIH/Multi-needle": [ | |
| "hallucination" | |
| ], | |
| "NL2Repo": [ | |
| "software_engineering" | |
| ], | |
| "NMOS": [ | |
| "multimodal" | |
| ], | |
| "NOVA-63": [ | |
| "knowledge" | |
| ], | |
| "NQ": [ | |
| "knowledge" | |
| ], | |
| "NarrativeQA": [ | |
| "linguistic_core" | |
| ], | |
| "Natural Questions": [ | |
| "knowledge" | |
| ], | |
| "Natural2Code": [ | |
| "software_engineering" | |
| ], | |
| "NaturalQuestions": [ | |
| "knowledge" | |
| ], | |
| "Nexus": [ | |
| "agentic" | |
| ], | |
| "NoLiMa 128K": [ | |
| "robustness" | |
| ], | |
| "NoLiMa 32K": [ | |
| "robustness" | |
| ], | |
| "NoLiMa 64K": [ | |
| "robustness" | |
| ], | |
| "Nuscene": [ | |
| "multimodal" | |
| ], | |
| "OCRBench v2": [ | |
| "multimodal" | |
| ], | |
| "OCRBench-V2 (en)": [ | |
| "multimodal" | |
| ], | |
| "OCRBench-V2 (zh)": [ | |
| "multimodal" | |
| ], | |
| "ODinW": [ | |
| "multimodal" | |
| ], | |
| "OJBench": [ | |
| "software_engineering" | |
| ], | |
| "OJBench (C++)": [ | |
| "software_engineering" | |
| ], | |
| "OSWorld": [ | |
| "agentic" | |
| ], | |
| "OSWorld Extended": [ | |
| "agentic" | |
| ], | |
| "OSWorld Screenshot-only": [ | |
| "agentic" | |
| ], | |
| "OSWorld-G": [ | |
| "agentic" | |
| ], | |
| "OSWorld-Verified": [ | |
| "agentic" | |
| ], | |
| "Objectron": [ | |
| "multimodal" | |
| ], | |
| "OctoCodingBench": [ | |
| "software_engineering" | |
| ], | |
| "OfficeQA Pro": [ | |
| "agentic" | |
| ], | |
| "OlympiadBench": [ | |
| "mathematics" | |
| ], | |
| "Omni-MATH": [ | |
| "mathematics" | |
| ], | |
| "OmniBench": [ | |
| "general" | |
| ], | |
| "OmniBench Music": [ | |
| "multimodal" | |
| ], | |
| "OmniDocBench 1.5": [ | |
| "multimodal" | |
| ], | |
| "OmniGAIA": [ | |
| "applied_reasoning" | |
| ], | |
| "OmniMath": [ | |
| "mathematics" | |
| ], | |
| "Online Mind2Web": [ | |
| "agentic" | |
| ], | |
| "Open-rewrite": [ | |
| "software_engineering" | |
| ], | |
| "OpenAI MMLU": [ | |
| "knowledge" | |
| ], | |
| "OpenAI-MRCR: 2 needle 128k": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "OpenAI-MRCR: 2 needle 1M": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "OpenAI-MRCR: 2 needle 256k": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "OpenEval": [ | |
| "general" | |
| ], | |
| "OpenRCA": [ | |
| "applied_reasoning" | |
| ], | |
| "OpenbookQA": [ | |
| "commonsense_reasoning" | |
| ], | |
| "Openeval": [ | |
| "general" | |
| ], | |
| "PIQA": [ | |
| "commonsense_reasoning" | |
| ], | |
| "PMC-VQA": [ | |
| "multimodal" | |
| ], | |
| "POPE": [ | |
| "hallucination" | |
| ], | |
| "PaperBench": [ | |
| "knowledge", | |
| "applied_reasoning", | |
| "agentic" | |
| ], | |
| "PathMCQA": [ | |
| "natural_sciences", | |
| "knowledge" | |
| ], | |
| "PerceptionTest": [ | |
| "multimodal", | |
| "applied_reasoning" | |
| ], | |
| "PhiBench": [ | |
| "knowledge", | |
| "applied_reasoning" | |
| ], | |
| "PhysicsFinals": [ | |
| "natural_sciences", | |
| "mathematics" | |
| ], | |
| "PinchBench": [ | |
| "multimodal", | |
| "applied_reasoning" | |
| ], | |
| "Piqa": [ | |
| "commonsense_reasoning" | |
| ], | |
| "PointGrounding": [ | |
| "multimodal" | |
| ], | |
| "PolyMATH": [ | |
| "mathematics", | |
| "applied_reasoning" | |
| ], | |
| "PolyMath-en": [ | |
| "mathematics", | |
| "applied_reasoning" | |
| ], | |
| "PopQA": [ | |
| "knowledge" | |
| ], | |
| "ProofBench - Overall": [ | |
| "logical_reasoning", | |
| "mathematics" | |
| ], | |
| "ProtocolQA": [ | |
| "knowledge", | |
| "applied_reasoning" | |
| ], | |
| "QMSum": [ | |
| "linguistic_core" | |
| ], | |
| "Qasper": [ | |
| "linguistic_core" | |
| ], | |
| "QuAC": [ | |
| "linguistic_core" | |
| ], | |
| "QwenWebBench": [ | |
| "knowledge", | |
| "applied_reasoning" | |
| ], | |
| "RAFT": [ | |
| "linguistic_core", | |
| "applied_reasoning" | |
| ], | |
| "RULER": [ | |
| "linguistic_core", | |
| "robustness" | |
| ], | |
| "RULER 1000K": [ | |
| "linguistic_core", | |
| "robustness" | |
| ], | |
| "RULER 128k": [ | |
| "linguistic_core", | |
| "robustness" | |
| ], | |
| "RULER 2048K": [ | |
| "linguistic_core", | |
| "robustness" | |
| ], | |
| "RULER 512K": [ | |
| "linguistic_core", | |
| "robustness" | |
| ], | |
| "RULER 64k": [ | |
| "linguistic_core", | |
| "robustness" | |
| ], | |
| "RealWorldQA": [ | |
| "knowledge", | |
| "applied_reasoning" | |
| ], | |
| "RefCOCO-avg": [ | |
| "multimodal" | |
| ], | |
| "RefSpatialBench": [ | |
| "multimodal", | |
| "commonsense_reasoning" | |
| ], | |
| "RepoBench": [ | |
| "software_engineering" | |
| ], | |
| "RepoQA": [ | |
| "software_engineering" | |
| ], | |
| "Reward-Bench": [ | |
| "applied_reasoning", | |
| "general" | |
| ], | |
| "RewardBench": [ | |
| "applied_reasoning", | |
| "general" | |
| ], | |
| "RoboSpatialHome": [ | |
| "agentic", | |
| "multimodal" | |
| ], | |
| "SAGE": [ | |
| "knowledge", | |
| "applied_reasoning" | |
| ], | |
| "SAT Math": [ | |
| "mathematics" | |
| ], | |
| "SIFO": [ | |
| "knowledge", | |
| "applied_reasoning" | |
| ], | |
| "SIFO-Multiturn": [ | |
| "knowledge", | |
| "applied_reasoning" | |
| ], | |
| "SQuALITY": [ | |
| "linguistic_core" | |
| ], | |
| "STEM": [ | |
| "natural_sciences", | |
| "mathematics" | |
| ], | |
| "SUNRGBD": [ | |
| "multimodal" | |
| ], | |
| "SWE-Bench Multimodal": [ | |
| "software_engineering", | |
| "multimodal" | |
| ], | |
| "SWE-Bench Pro": [ | |
| "software_engineering", | |
| "agentic" | |
| ], | |
| "SWE-bench Verified Mini": [ | |
| "software_engineering" | |
| ], | |
| "SWE-Lancer": [ | |
| "software_engineering" | |
| ], | |
| "SWE-Lancer (IC-Diamond subset)": [ | |
| "software_engineering" | |
| ], | |
| "SWE-Perf": [ | |
| "software_engineering" | |
| ], | |
| "SWE-PolyBench": [ | |
| "software_engineering", | |
| "applied_reasoning" | |
| ], | |
| "SWE-Review": [ | |
| "software_engineering" | |
| ], | |
| "SWE-bench": [ | |
| "software_engineering" | |
| ], | |
| "SWE-bench Multilingual": [ | |
| "software_engineering" | |
| ], | |
| "SWE-bench Verified": [ | |
| "software_engineering" | |
| ], | |
| "SWE-bench Verified (Agentic Coding)": [ | |
| "software_engineering", | |
| "agentic" | |
| ], | |
| "SWE-bench Verified (Agentless)": [ | |
| "software_engineering" | |
| ], | |
| "SWE-bench Verified (Multiple Attempts)": [ | |
| "software_engineering" | |
| ], | |
| "SWT-Bench": [ | |
| "software_engineering" | |
| ], | |
| "SciArena": [ | |
| "natural_sciences", | |
| "applied_reasoning" | |
| ], | |
| "SciArena leaderboard API": [ | |
| "natural_sciences", | |
| "applied_reasoning" | |
| ], | |
| "SciCode": [ | |
| "natural_sciences", | |
| "software_engineering" | |
| ], | |
| "ScienceAgentBench": [ | |
| "natural_sciences", | |
| "agentic" | |
| ], | |
| "ScienceQA": [ | |
| "natural_sciences", | |
| "knowledge" | |
| ], | |
| "ScienceQA Visual": [ | |
| "natural_sciences", | |
| "multimodal" | |
| ], | |
| "ScreenSpot": [ | |
| "agentic", | |
| "multimodal" | |
| ], | |
| "ScreenSpot Pro": [ | |
| "agentic", | |
| "multimodal" | |
| ], | |
| "Seal-0": [ | |
| "safety" | |
| ], | |
| "SecCodeBench": [ | |
| "software_engineering", | |
| "safety" | |
| ], | |
| "SimpleQA": [ | |
| "knowledge", | |
| "hallucination" | |
| ], | |
| "SimpleSafetyTests": [ | |
| "safety" | |
| ], | |
| "SimpleVQA": [ | |
| "multimodal" | |
| ], | |
| "SkillsBench": [ | |
| "applied_reasoning" | |
| ], | |
| "SlakeVQA": [ | |
| "multimodal" | |
| ], | |
| "Social IQa": [ | |
| "commonsense_reasoning" | |
| ], | |
| "Spider": [ | |
| "software_engineering", | |
| "logical_reasoning" | |
| ], | |
| "SummScreenFD": [ | |
| "linguistic_core" | |
| ], | |
| "SuperGLUE": [ | |
| "linguistic_core", | |
| "general" | |
| ], | |
| "SuperGPQA": [ | |
| "knowledge", | |
| "applied_reasoning" | |
| ], | |
| "TAU3-Bench": [ | |
| "agentic" | |
| ], | |
| "TIR-Bench": [ | |
| "agentic" | |
| ], | |
| "TLDR9+ (test)": [ | |
| "linguistic_core" | |
| ], | |
| "Tau Bench": [ | |
| "agentic" | |
| ], | |
| "Tau Bench Airline": [ | |
| "agentic" | |
| ], | |
| "Tau2-Bench": [ | |
| "agentic" | |
| ], | |
| "Tau2-Bench Airline": [ | |
| "agentic" | |
| ], | |
| "Tau2-Bench Retail": [ | |
| "agentic" | |
| ], | |
| "Tau2-Bench Telecom": [ | |
| "agentic" | |
| ], | |
| "Tax Eval v2": [ | |
| "finance", | |
| "applied_reasoning" | |
| ], | |
| "TempCompass": [ | |
| "knowledge", | |
| "robustness" | |
| ], | |
| "Terminal Bench": [ | |
| "agentic" | |
| ], | |
| "Terminal-Bench 2.0": [ | |
| "agentic" | |
| ], | |
| "TerminalBench Hard": [ | |
| "agentic" | |
| ], | |
| "Terminus": [ | |
| "agentic" | |
| ], | |
| "TextVQA": [ | |
| "multimodal" | |
| ], | |
| "TheoremQA": [ | |
| "mathematics", | |
| "logical_reasoning" | |
| ], | |
| "Theory of Mind": [ | |
| "commonsense_reasoning" | |
| ], | |
| "Toolathlon": [ | |
| "agentic" | |
| ], | |
| "Translation Set1\u2192en COMET22": [ | |
| "linguistic_core" | |
| ], | |
| "Translation Set1\u2192en spBleu": [ | |
| "linguistic_core" | |
| ], | |
| "Translation en\u2192Set1 COMET22": [ | |
| "linguistic_core" | |
| ], | |
| "Translation en\u2192Set1 spBleu": [ | |
| "linguistic_core" | |
| ], | |
| "TriviaQA": [ | |
| "knowledge" | |
| ], | |
| "TruthfulQA": [ | |
| "hallucination" | |
| ], | |
| "TydiQA": [ | |
| "linguistic_core" | |
| ], | |
| "USACO": [ | |
| "mathematics", | |
| "logical_reasoning" | |
| ], | |
| "USAMO25": [ | |
| "mathematics" | |
| ], | |
| "Uniform Bar Exam": [ | |
| "law" | |
| ], | |
| "V*": [ | |
| "general" | |
| ], | |
| "VATEX": [ | |
| "multimodal" | |
| ], | |
| "VCR_en_easy": [ | |
| "multimodal" | |
| ], | |
| "VIBE": [ | |
| "agentic" | |
| ], | |
| "VIBE Android": [ | |
| "agentic" | |
| ], | |
| "VIBE Backend": [ | |
| "agentic" | |
| ], | |
| "VIBE Simulation": [ | |
| "agentic" | |
| ], | |
| "VIBE Web": [ | |
| "agentic" | |
| ], | |
| "VIBE iOS": [ | |
| "agentic" | |
| ], | |
| "VIBE-Pro": [ | |
| "agentic" | |
| ], | |
| "VITA-Bench": [ | |
| "multimodal" | |
| ], | |
| "VLMsAreBlind": [ | |
| "multimodal" | |
| ], | |
| "VQA-Rad": [ | |
| "multimodal" | |
| ], | |
| "VQAv2": [ | |
| "multimodal" | |
| ], | |
| "VQAv2 (test)": [ | |
| "multimodal" | |
| ], | |
| "VQAv2 (val)": [ | |
| "multimodal" | |
| ], | |
| "Vals AI": [ | |
| "general" | |
| ], | |
| "Vals Index": [ | |
| "general" | |
| ], | |
| "Vals Multimodal Index": [ | |
| "multimodal" | |
| ], | |
| "Vending-Bench 2": [ | |
| "agentic" | |
| ], | |
| "Vibe Code Bench - Overall": [ | |
| "software_engineering" | |
| ], | |
| "Vibe-Eval": [ | |
| "applied_reasoning", | |
| "multimodal" | |
| ], | |
| "Video-MME": [ | |
| "multimodal" | |
| ], | |
| "Video-MME (long, no subtitles)": [ | |
| "multimodal" | |
| ], | |
| "VideoMME w sub.": [ | |
| "multimodal" | |
| ], | |
| "VideoMME w/o sub.": [ | |
| "multimodal" | |
| ], | |
| "VideoMMMU": [ | |
| "multimodal" | |
| ], | |
| "Virology Capabilities Test": [ | |
| "natural_sciences" | |
| ], | |
| "Vision2Web": [ | |
| "agentic" | |
| ], | |
| "VisuLogic": [ | |
| "multimodal" | |
| ], | |
| "VisualWebBench": [ | |
| "agentic" | |
| ], | |
| "VocalSound": [ | |
| "multimodal" | |
| ], | |
| "VoiceBench Avg": [ | |
| "linguistic_core" | |
| ], | |
| "WMDP": [ | |
| "safety" | |
| ], | |
| "WMT 2014": [ | |
| "linguistic_core" | |
| ], | |
| "WMT23": [ | |
| "linguistic_core" | |
| ], | |
| "WMT24++": [ | |
| "linguistic_core" | |
| ], | |
| "We-Math": [ | |
| "mathematics" | |
| ], | |
| "WebVoyager": [ | |
| "agentic" | |
| ], | |
| "WideSearch": [ | |
| "agentic" | |
| ], | |
| "Wild Bench": [ | |
| "general" | |
| ], | |
| "WildBench": [ | |
| "general" | |
| ], | |
| "Winogrande": [ | |
| "commonsense_reasoning" | |
| ], | |
| "Wordle Arena": [ | |
| "logical_reasoning" | |
| ], | |
| "WorldVQA": [ | |
| "multimodal" | |
| ], | |
| "WritingBench": [ | |
| "linguistic_core" | |
| ], | |
| "XLSum English": [ | |
| "linguistic_core" | |
| ], | |
| "XSum": [ | |
| "linguistic_core" | |
| ], | |
| "Xstest": [ | |
| "safety" | |
| ], | |
| "ZClawBench": [ | |
| "agentic" | |
| ], | |
| "ZEROBench": [ | |
| "general" | |
| ], | |
| "ZEROBench-Sub": [ | |
| "general" | |
| ], | |
| "ZebraLogic": [ | |
| "logical_reasoning" | |
| ] | |
| } |