Spaces:

evaleval
/

general-eval-card

Running

App Files Files Community

general-eval-card / data /benchmarks /categories.json

evijit HF Staff

Restructure model details + extend cleanHierarchy for split families and aggregator dedup

06313c1 25 days ago

raw

history blame contribute delete

28.5 kB

	{
	"AA-Index": [
	"knowledge",
	"general"
	],
	"AA-LCR": [
	"general",
	"applied_reasoning"
	],
	"ACE": [
	"multimodal",
	"agentic"
	],
	"ACEBench": [
	"multimodal",
	"agentic"
	],
	"AGIEval": [
	"general",
	"applied_reasoning"
	],
	"AI2 Reasoning Challenge (ARC)": [
	"logical_reasoning",
	"knowledge",
	"commonsense_reasoning"
	],
	"AI2D": [
	"multimodal",
	"knowledge"
	],
	"AIME": [
	"mathematics"
	],
	"AIME 2024": [
	"mathematics"
	],
	"AIME 2025": [
	"mathematics"
	],
	"AITZ_EM": [
	"agentic"
	],
	"AMC_2022_23": [
	"mathematics"
	],
	"APEX Agents": [
	"agentic",
	"applied_reasoning"
	],
	"APEX v1": [
	"agentic",
	"applied_reasoning"
	],
	"API-Bank": [
	"agentic"
	],
	"Air Bench 2024": [
	"safety"
	],
	"ARC-AGI": [
	"logical_reasoning",
	"commonsense_reasoning",
	"general"
	],
	"ARC-C": [
	"logical_reasoning",
	"knowledge",
	"commonsense_reasoning"
	],
	"ARC-E": [
	"logical_reasoning",
	"knowledge",
	"commonsense_reasoning"
	],
	"ARKitScenes": [
	"multimodal"
	],
	"ActivityNet": [
	"multimodal"
	],
	"Agentharm": [
	"safety",
	"agentic"
	],
	"Aider": [
	"software_engineering",
	"agentic"
	],
	"Aider-Polyglot": [
	"software_engineering",
	"linguistic_core"
	],
	"Aider-Polyglot Edit": [
	"software_engineering",
	"linguistic_core"
	],
	"AlignBench": [
	"applied_reasoning",
	"general"
	],
	"AlpacaEval 2.0": [
	"applied_reasoning",
	"general"
	],
	"Android Control High_EM": [
	"agentic",
	"multimodal"
	],
	"Android Control Low_EM": [
	"agentic",
	"multimodal"
	],
	"AndroidWorld": [
	"agentic",
	"multimodal"
	],
	"AndroidWorld_SR": [
	"agentic",
	"multimodal"
	],
	"Anthropic Red Team": [
	"safety"
	],
	"AppWorld": [
	"agentic"
	],
	"AppWorld Benchmark": [
	"agentic",
	"applied_reasoning"
	],
	"Arc": [
	"logical_reasoning",
	"commonsense_reasoning"
	],
	"Arena Hard": [
	"general",
	"applied_reasoning"
	],
	"Arena-Hard v2": [
	"general",
	"applied_reasoning"
	],
	"Artificial Analysis": [
	"general"
	],
	"Artificial Analysis LLM API": [
	"general"
	],
	"Artificial-Analysis": [
	"general"
	],
	"AssistantBench": [
	"agentic",
	"applied_reasoning"
	],
	"AttaQ": [
	"robustness"
	],
	"AutoLogi": [
	"logical_reasoning",
	"mathematics"
	],
	"BBH": [
	"logical_reasoning",
	"applied_reasoning"
	],
	"BBQ": [
	"safety",
	"commonsense_reasoning"
	],
	"BFCL": [
	"agentic",
	"software_engineering"
	],
	"BFCL v2": [
	"agentic",
	"software_engineering"
	],
	"BFCL-V4": [
	"agentic",
	"software_engineering"
	],
	"BFCL-v3": [
	"agentic",
	"software_engineering"
	],
	"BFCL_v3_MultiTurn": [
	"agentic",
	"software_engineering"
	],
	"BIG-Bench": [
	"general",
	"applied_reasoning"
	],
	"BIG-Bench Extra Hard": [
	"general",
	"applied_reasoning"
	],
	"BIG-Bench Hard": [
	"general",
	"applied_reasoning"
	],
	"BLINK": [
	"knowledge",
	"linguistic_core"
	],
	"BabyVision": [
	"multimodal"
	],
	"Beyond AIME": [
	"mathematics"
	],
	"BigCodeBench": [
	"software_engineering"
	],
	"BigCodeBench-Full": [
	"software_engineering"
	],
	"BigCodeBench-Hard": [
	"software_engineering"
	],
	"BioLP-Bench": [
	"natural_sciences"
	],
	"Bird-SQL (dev)": [
	"software_engineering"
	],
	"BixBench": [
	"applied_reasoning"
	],
	"BoolQ": [
	"commonsense_reasoning"
	],
	"BrowseComp": [
	"agentic"
	],
	"BrowseComp Long Context 128k": [
	"agentic"
	],
	"BrowseComp Long Context 256k": [
	"agentic"
	],
	"BrowseComp-Plus": [
	"agentic"
	],
	"BrowseComp-VL": [
	"agentic",
	"multimodal"
	],
	"BrowseComp-zh": [
	"agentic"
	],
	"BrowseCompPlus": [
	"agentic"
	],
	"C-Eval": [
	"knowledge",
	"mathematics",
	"natural_sciences",
	"humanities_and_social_sciences"
	],
	"CBNSL": [
	"linguistic_core"
	],
	"CC-Bench-V2 Backend": [
	"software_engineering"
	],
	"CC-Bench-V2 Frontend": [
	"software_engineering"
	],
	"CC-Bench-V2 Repo Exploration": [
	"software_engineering"
	],
	"CC-OCR": [
	"multimodal"
	],
	"CFEval": [
	"software_engineering"
	],
	"CLUEWSC": [
	"linguistic_core"
	],
	"CMMLU": [
	"knowledge",
	"mathematics",
	"natural_sciences",
	"humanities_and_social_sciences"
	],
	"CNMO 2024": [
	"mathematics"
	],
	"CNN/DailyMail": [
	"linguistic_core"
	],
	"COLLIE": [
	"applied_reasoning"
	],
	"CORE-Bench Hard": [
	"applied_reasoning"
	],
	"CRAG": [
	"applied_reasoning"
	],
	"CRPErelation": [
	"linguistic_core"
	],
	"CRUX-O": [
	"software_engineering"
	],
	"CRUXEval-Input-CoT": [
	"software_engineering"
	],
	"CRUXEval-Output-CoT": [
	"software_engineering"
	],
	"CSimpleQA": [
	"knowledge"
	],
	"Caparena": [
	"agentic"
	],
	"Caparena Auto": [
	"agentic"
	],
	"CaseLaw (v2) - Overall": [
	"law"
	],
	"CharXiv-D": [
	"multimodal"
	],
	"CharXiv-R": [
	"multimodal"
	],
	"CharadesSTA": [
	"multimodal"
	],
	"ChartQA": [
	"multimodal"
	],
	"CheXpert CXR": [
	"multimodal",
	"natural_sciences"
	],
	"CivilComments": [
	"safety"
	],
	"Claw-Eval": [
	"agentic"
	],
	"CloningScenarios": [
	"robustness"
	],
	"CoVoST2": [
	"linguistic_core"
	],
	"CoVoST2 en-zh": [
	"linguistic_core"
	],
	"CocoaBench": [
	"agentic"
	],
	"CocoaBench v1.0": [
	"agentic"
	],
	"CodeForces": [
	"mathematics",
	"software_engineering"
	],
	"Codegolf v2.2": [
	"software_engineering"
	],
	"Common Voice 15": [
	"linguistic_core"
	],
	"CommonSenseQA": [
	"commonsense_reasoning"
	],
	"Commonsense Qa": [
	"commonsense_reasoning"
	],
	"ComplexFuncBench": [
	"software_engineering"
	],
	"Corp Fin v2": [
	"finance"
	],
	"CorpusQA 1M": [
	"knowledge"
	],
	"CountBench": [
	"mathematics"
	],
	"Creative Writing v3": [
	"linguistic_core"
	],
	"CruxEval-O": [
	"software_engineering"
	],
	"Cvebench": [
	"software_engineering",
	"safety"
	],
	"CyBench": [
	"software_engineering",
	"safety"
	],
	"Cybench": [
	"software_engineering",
	"safety"
	],
	"CyberGym": [
	"agentic",
	"safety"
	],
	"Cybersecurity CTFs": [
	"agentic",
	"safety"
	],
	"Cyse2": [
	"software_engineering",
	"safety"
	],
	"DROP": [
	"linguistic_core",
	"applied_reasoning"
	],
	"DS-Arena-Code": [
	"software_engineering"
	],
	"DS-FIM-Eval": [
	"software_engineering"
	],
	"DeepPlanning": [
	"agentic"
	],
	"DeepSearchQA": [
	"applied_reasoning"
	],
	"DermMCQA": [
	"natural_sciences"
	],
	"Design2Code": [
	"software_engineering"
	],
	"DocVQA": [
	"multimodal"
	],
	"DocVQAtest": [
	"multimodal"
	],
	"DynaMath": [
	"mathematics"
	],
	"ECLeKTic": [
	"knowledge"
	],
	"EQ-Bench": [
	"commonsense_reasoning"
	],
	"ERQA": [
	"knowledge"
	],
	"EgoSchema": [
	"multimodal"
	],
	"EmbSpatialBench": [
	"commonsense_reasoning"
	],
	"EvalPlus": [
	"software_engineering"
	],
	"FActScore": [
	"hallucination"
	],
	"FLEURS": [
	"linguistic_core"
	],
	"FRAMES": [
	"linguistic_core"
	],
	"Facts Grounding": [
	"hallucination"
	],
	"Fibble Arena": [
	"logical_reasoning"
	],
	"Fibble arena": [
	"logical_reasoning"
	],
	"FigQA": [
	"multimodal"
	],
	"FinQA": [
	"finance"
	],
	"FinSearchComp T2&T3": [
	"finance"
	],
	"FinSearchComp-T3": [
	"finance"
	],
	"Finance Agent": [
	"agentic"
	],
	"Flame-VLM-Code": [
	"software_engineering"
	],
	"FlenQA": [
	"knowledge"
	],
	"French MMLU": [
	"general"
	],
	"FrontierMath": [
	"mathematics"
	],
	"FrontierScience Research": [
	"natural_sciences"
	],
	"FullStackBench en": [
	"software_engineering"
	],
	"FullStackBench zh": [
	"software_engineering"
	],
	"FunctionalMATH": [
	"mathematics"
	],
	"GAIA": [
	"applied_reasoning",
	"agentic"
	],
	"GDPval-AA": [
	"general"
	],
	"GDPval-MM": [
	"multimodal"
	],
	"GPQA": [
	"natural_sciences"
	],
	"GPQA - Overall": [
	"natural_sciences"
	],
	"GPQA Biology": [
	"natural_sciences"
	],
	"GPQA Chemistry": [
	"natural_sciences"
	],
	"GPQA Diamond": [
	"natural_sciences"
	],
	"GPQA Physics": [
	"natural_sciences"
	],
	"GSM-8K (CoT)": [
	"mathematics"
	],
	"GSM-MC": [
	"mathematics"
	],
	"GSM8K": [
	"mathematics"
	],
	"GSM8K Chat": [
	"mathematics"
	],
	"Gdm Intercode CTF": [
	"software_engineering",
	"agentic",
	"safety"
	],
	"GeneBench": [
	"natural_sciences"
	],
	"GiantSteps Tempo": [
	"multimodal"
	],
	"Global MMLU Lite": [
	"general"
	],
	"Global PIQA": [
	"commonsense_reasoning"
	],
	"Global-MMLU": [
	"general"
	],
	"Gorilla Benchmark API Bench": [
	"software_engineering",
	"agentic"
	],
	"GovReport": [
	"linguistic_core"
	],
	"Graphwalks BFS >128k": [
	"logical_reasoning"
	],
	"Graphwalks parents >128k": [
	"logical_reasoning"
	],
	"GroundUI-1K": [
	"multimodal",
	"agentic"
	],
	"HAL": [
	"agentic"
	],
	"HELM": [
	"general"
	],
	"HELM Instruct": [
	"general"
	],
	"HF Open LLM Leaderboard v2": [
	"general"
	],
	"HMMT 2025": [
	"mathematics"
	],
	"HMMT Feb 26": [
	"mathematics"
	],
	"HMMT25": [
	"mathematics"
	],
	"Hallusion Bench": [
	"hallucination"
	],
	"HarmBench": [
	"safety"
	],
	"HealthBench": [
	"natural_sciences",
	"knowledge"
	],
	"HealthBench Hard": [
	"natural_sciences",
	"knowledge"
	],
	"HellaSwag": [
	"commonsense_reasoning"
	],
	"Helm air bench": [
	"general"
	],
	"Helm classic": [
	"general"
	],
	"Helm lite": [
	"general"
	],
	"HiddenMath": [
	"mathematics"
	],
	"Holistic Evaluation of Language Models (HELM)": [
	"general"
	],
	"HumanEval": [
	"software_engineering"
	],
	"HumanEval-Average": [
	"software_engineering"
	],
	"HumanEval-ER": [
	"software_engineering"
	],
	"HumanEval-Mul": [
	"software_engineering"
	],
	"HumanEvalFIM-Average": [
	"software_engineering"
	],
	"Humanity's Last Exam": [
	"general"
	],
	"Hypersim": [
	"multimodal"
	],
	"IF": [
	"linguistic_core"
	],
	"IFBench": [
	"linguistic_core"
	],
	"IFEval": [
	"linguistic_core"
	],
	"IMDb": [
	"linguistic_core"
	],
	"IMO-AnswerBench": [
	"mathematics"
	],
	"IOI": [
	"software_engineering"
	],
	"IPhO 2025": [
	"natural_sciences",
	"mathematics"
	],
	"ImageMining": [
	"multimodal"
	],
	"Include": [
	"knowledge",
	"linguistic_core"
	],
	"InfiniteBench/En.MC": [
	"linguistic_core"
	],
	"InfiniteBench/En.QA": [
	"linguistic_core",
	"knowledge"
	],
	"InfoVQA": [
	"multimodal",
	"knowledge"
	],
	"InfoVQAtest": [
	"multimodal",
	"knowledge"
	],
	"InfographicsQA": [
	"multimodal",
	"knowledge"
	],
	"Instruct HumanEval": [
	"software_engineering"
	],
	"InterGPS": [
	"multimodal"
	],
	"Internal API instruction following (hard)": [
	"linguistic_core"
	],
	"JudgeBench": [
	"applied_reasoning",
	"general"
	],
	"Judgebench": [
	"applied_reasoning",
	"general"
	],
	"LA Leaderboard": [
	"general"
	],
	"LBPP (v2)": [
	"software_engineering"
	],
	"LLM-Stats": [
	"general"
	],
	"LMArena Text Leaderboard": [
	"general"
	],
	"LSAT": [
	"law",
	"logical_reasoning"
	],
	"LVBench": [
	"multimodal"
	],
	"La Leaderboard composite dataset": [
	"general"
	],
	"LegalBench": [
	"law"
	],
	"LingoQA": [
	"linguistic_core",
	"knowledge"
	],
	"Live-Bench": [
	"general"
	],
	"LiveBench": [
	"general"
	],
	"LiveBench 20241125": [
	"general"
	],
	"LiveCodeBench": [
	"software_engineering"
	],
	"LiveCodeBench - Easy": [
	"software_engineering"
	],
	"LiveCodeBench - Hard": [
	"software_engineering"
	],
	"LiveCodeBench - Medium": [
	"software_engineering"
	],
	"LiveCodeBench - Overall": [
	"software_engineering"
	],
	"LiveCodeBench Pro": [
	"software_engineering"
	],
	"LiveCodeBench v5": [
	"software_engineering"
	],
	"LiveCodeBench v5 24.12-25.2": [
	"software_engineering"
	],
	"LiveCodeBench v6": [
	"software_engineering"
	],
	"LiveCodeBench(01-09)": [
	"software_engineering"
	],
	"Llm Stats": [
	"general"
	],
	"LongBench v2": [
	"linguistic_core"
	],
	"LongFact Concepts": [
	"hallucination"
	],
	"LongFact Objects": [
	"hallucination"
	],
	"LongVideoBench": [
	"multimodal"
	],
	"MASK": [
	"linguistic_core"
	],
	"MATH": [
	"mathematics"
	],
	"MATH (CoT)": [
	"mathematics"
	],
	"MATH-500": [
	"mathematics"
	],
	"MATH-Mc": [
	"mathematics"
	],
	"MAXIFE": [
	"multimodal"
	],
	"MBPP ++ base version": [
	"software_engineering"
	],
	"MBPP EvalPlus (base)": [
	"software_engineering"
	],
	"MBPP pass@1": [
	"software_engineering"
	],
	"MBPP+": [
	"software_engineering"
	],
	"MCP Atlas": [
	"agentic"
	],
	"MCP-Mark": [
	"agentic"
	],
	"MCP-Universe": [
	"agentic"
	],
	"MEGA MLQA": [
	"knowledge"
	],
	"MEGA TyDi QA": [
	"knowledge"
	],
	"MEGA UDPOS": [
	"linguistic_core"
	],
	"MEGA XCOPA": [
	"commonsense_reasoning"
	],
	"MEGA XStoryCloze": [
	"commonsense_reasoning"
	],
	"MEWC": [
	"knowledge"
	],
	"MGSM": [
	"mathematics"
	],
	"MIABench": [
	"applied_reasoning"
	],
	"MIMIC CXR": [
	"multimodal",
	"natural_sciences"
	],
	"MLE-Bench Lite": [
	"software_engineering"
	],
	"MLVU": [
	"multimodal"
	],
	"MLVU-M": [
	"multimodal"
	],
	"MM IF-Eval": [
	"multimodal"
	],
	"MM-BrowserComp": [
	"agentic",
	"multimodal"
	],
	"MM-ClawBench": [
	"agentic",
	"multimodal"
	],
	"MM-MT-Bench": [
	"agentic",
	"multimodal"
	],
	"MM-Mind2Web": [
	"agentic",
	"multimodal"
	],
	"MMAU": [
	"multimodal"
	],
	"MMAU Music": [
	"multimodal"
	],
	"MMAU Sound": [
	"multimodal"
	],
	"MMAU Speech": [
	"multimodal"
	],
	"MMBench": [
	"multimodal"
	],
	"MMBench-V1.1": [
	"multimodal"
	],
	"MMBench-Video": [
	"multimodal"
	],
	"MMBench_test": [
	"multimodal"
	],
	"MME": [
	"multimodal"
	],
	"MME-RealWorld": [
	"multimodal"
	],
	"MMLU": [
	"general",
	"knowledge"
	],
	"MMLU (CoT)": [
	"general",
	"knowledge"
	],
	"MMLU Chat": [
	"general",
	"knowledge"
	],
	"MMLU French": [
	"general",
	"knowledge"
	],
	"MMLU-Base": [
	"general",
	"knowledge"
	],
	"MMLU-Pro": [
	"general",
	"knowledge"
	],
	"MMLU-Pro leaderboard submissions (TIGER-Lab)": [
	"general",
	"knowledge"
	],
	"MMLU-ProX": [
	"general",
	"knowledge"
	],
	"MMLU-Redux": [
	"general",
	"knowledge"
	],
	"MMLU-STEM": [
	"general",
	"knowledge",
	"natural_sciences",
	"mathematics"
	],
	"MMLU-redux-2.0": [
	"general",
	"knowledge"
	],
	"MMLongBench-Doc": [
	"multimodal",
	"linguistic_core"
	],
	"MMMLU": [
	"multimodal",
	"knowledge"
	],
	"MMMU": [
	"multimodal",
	"knowledge"
	],
	"MMMU (val)": [
	"multimodal",
	"knowledge"
	],
	"MMMU (validation)": [
	"multimodal",
	"knowledge"
	],
	"MMMUval": [
	"multimodal",
	"knowledge"
	],
	"MMSearch": [
	"multimodal",
	"applied_reasoning"
	],
	"MMSearch-Plus": [
	"multimodal",
	"applied_reasoning"
	],
	"MMStar": [
	"multimodal",
	"applied_reasoning"
	],
	"MMT-Bench": [
	"multimodal",
	"applied_reasoning"
	],
	"MMVU": [
	"multimodal",
	"knowledge"
	],
	"MMVet": [
	"multimodal",
	"applied_reasoning"
	],
	"MMVetGPT4Turbo": [
	"multimodal",
	"applied_reasoning"
	],
	"MRCR": [
	"linguistic_core",
	"applied_reasoning"
	],
	"MRCR 128K (2-needle)": [
	"linguistic_core",
	"applied_reasoning"
	],
	"MRCR 128K (4-needle)": [
	"linguistic_core",
	"applied_reasoning"
	],
	"MRCR 128K (8-needle)": [
	"linguistic_core",
	"applied_reasoning"
	],
	"MRCR 1M": [
	"linguistic_core",
	"applied_reasoning"
	],
	"MRCR 1M (pointwise)": [
	"linguistic_core",
	"applied_reasoning"
	],
	"MRCR 64K (2-needle)": [
	"linguistic_core",
	"applied_reasoning"
	],
	"MRCR 64K (4-needle)": [
	"linguistic_core",
	"applied_reasoning"
	],
	"MRCR 64K (8-needle)": [
	"linguistic_core",
	"applied_reasoning"
	],
	"MRCR v2": [
	"linguistic_core",
	"applied_reasoning"
	],
	"MRCR v2 (8-needle)": [
	"linguistic_core",
	"applied_reasoning"
	],
	"MS MARCO (TREC)": [
	"linguistic_core",
	"applied_reasoning"
	],
	"MT-Bench": [
	"linguistic_core",
	"applied_reasoning"
	],
	"MTVQA": [
	"multimodal",
	"applied_reasoning"
	],
	"MVBench": [
	"multimodal",
	"applied_reasoning"
	],
	"MathArena Apex": [
	"mathematics"
	],
	"MathVerse-Mini": [
	"mathematics",
	"multimodal"
	],
	"MathVision": [
	"mathematics",
	"multimodal"
	],
	"MathVista": [
	"mathematics",
	"multimodal"
	],
	"MathVista-Mini": [
	"mathematics",
	"multimodal"
	],
	"MedCode - Overall": [
	"software_engineering",
	"natural_sciences"
	],
	"MedQA": [
	"knowledge",
	"natural_sciences"
	],
	"MedScribe - Overall": [
	"linguistic_core",
	"natural_sciences"
	],
	"MedXpertQA": [
	"knowledge",
	"natural_sciences"
	],
	"Meld": [
	"linguistic_core",
	"commonsense_reasoning"
	],
	"MobileMiniWob++_SR": [
	"agentic"
	],
	"Mortgage Tax": [
	"finance",
	"applied_reasoning"
	],
	"MotionBench": [
	"multimodal",
	"applied_reasoning"
	],
	"Mt Bench": [
	"linguistic_core",
	"applied_reasoning"
	],
	"MuSR": [
	"commonsense_reasoning",
	"applied_reasoning"
	],
	"MuirBench": [
	"multimodal",
	"applied_reasoning"
	],
	"Multi-Challenge": [
	"general"
	],
	"Multi-IF": [
	"applied_reasoning"
	],
	"Multi-SWE-Bench Leaderboard": [
	"software_engineering"
	],
	"MultiLF": [
	"logical_reasoning",
	"applied_reasoning"
	],
	"Multilingual MGSM (CoT)": [
	"mathematics",
	"linguistic_core"
	],
	"Multilingual MMLU": [
	"knowledge",
	"linguistic_core"
	],
	"Multipl E": [
	"software_engineering"
	],
	"MusicCaps": [
	"multimodal"
	],
	"NIH/Multi-needle": [
	"hallucination"
	],
	"NL2Repo": [
	"software_engineering"
	],
	"NMOS": [
	"multimodal"
	],
	"NOVA-63": [
	"knowledge"
	],
	"NQ": [
	"knowledge"
	],
	"NarrativeQA": [
	"linguistic_core"
	],
	"Natural Questions": [
	"knowledge"
	],
	"Natural2Code": [
	"software_engineering"
	],
	"NaturalQuestions": [
	"knowledge"
	],
	"Nexus": [
	"agentic"
	],
	"NoLiMa 128K": [
	"robustness"
	],
	"NoLiMa 32K": [
	"robustness"
	],
	"NoLiMa 64K": [
	"robustness"
	],
	"Nuscene": [
	"multimodal"
	],
	"OCRBench v2": [
	"multimodal"
	],
	"OCRBench-V2 (en)": [
	"multimodal"
	],
	"OCRBench-V2 (zh)": [
	"multimodal"
	],
	"ODinW": [
	"multimodal"
	],
	"OJBench": [
	"software_engineering"
	],
	"OJBench (C++)": [
	"software_engineering"
	],
	"OSWorld": [
	"agentic"
	],
	"OSWorld Extended": [
	"agentic"
	],
	"OSWorld Screenshot-only": [
	"agentic"
	],
	"OSWorld-G": [
	"agentic"
	],
	"OSWorld-Verified": [
	"agentic"
	],
	"Objectron": [
	"multimodal"
	],
	"OctoCodingBench": [
	"software_engineering"
	],
	"OfficeQA Pro": [
	"agentic"
	],
	"OlympiadBench": [
	"mathematics"
	],
	"Omni-MATH": [
	"mathematics"
	],
	"OmniBench": [
	"general"
	],
	"OmniBench Music": [
	"multimodal"
	],
	"OmniDocBench 1.5": [
	"multimodal"
	],
	"OmniGAIA": [
	"applied_reasoning"
	],
	"OmniMath": [
	"mathematics"
	],
	"Online Mind2Web": [
	"agentic"
	],
	"Open-rewrite": [
	"software_engineering"
	],
	"OpenAI MMLU": [
	"knowledge"
	],
	"OpenAI-MRCR: 2 needle 128k": [
	"linguistic_core",
	"applied_reasoning"
	],
	"OpenAI-MRCR: 2 needle 1M": [
	"linguistic_core",
	"applied_reasoning"
	],
	"OpenAI-MRCR: 2 needle 256k": [
	"linguistic_core",
	"applied_reasoning"
	],
	"OpenEval": [
	"general"
	],
	"OpenRCA": [
	"applied_reasoning"
	],
	"OpenbookQA": [
	"commonsense_reasoning"
	],
	"Openeval": [
	"general"
	],
	"PIQA": [
	"commonsense_reasoning"
	],
	"PMC-VQA": [
	"multimodal"
	],
	"POPE": [
	"hallucination"
	],
	"PaperBench": [
	"knowledge",
	"applied_reasoning",
	"agentic"
	],
	"PathMCQA": [
	"natural_sciences",
	"knowledge"
	],
	"PerceptionTest": [
	"multimodal",
	"applied_reasoning"
	],
	"PhiBench": [
	"knowledge",
	"applied_reasoning"
	],
	"PhysicsFinals": [
	"natural_sciences",
	"mathematics"
	],
	"PinchBench": [
	"multimodal",
	"applied_reasoning"
	],
	"Piqa": [
	"commonsense_reasoning"
	],
	"PointGrounding": [
	"multimodal"
	],
	"PolyMATH": [
	"mathematics",
	"applied_reasoning"
	],
	"PolyMath-en": [
	"mathematics",
	"applied_reasoning"
	],
	"PopQA": [
	"knowledge"
	],
	"ProofBench - Overall": [
	"logical_reasoning",
	"mathematics"
	],
	"ProtocolQA": [
	"knowledge",
	"applied_reasoning"
	],
	"QMSum": [
	"linguistic_core"
	],
	"Qasper": [
	"linguistic_core"
	],
	"QuAC": [
	"linguistic_core"
	],
	"QwenWebBench": [
	"knowledge",
	"applied_reasoning"
	],
	"RAFT": [
	"linguistic_core",
	"applied_reasoning"
	],
	"RULER": [
	"linguistic_core",
	"robustness"
	],
	"RULER 1000K": [
	"linguistic_core",
	"robustness"
	],
	"RULER 128k": [
	"linguistic_core",
	"robustness"
	],
	"RULER 2048K": [
	"linguistic_core",
	"robustness"
	],
	"RULER 512K": [
	"linguistic_core",
	"robustness"
	],
	"RULER 64k": [
	"linguistic_core",
	"robustness"
	],
	"RealWorldQA": [
	"knowledge",
	"applied_reasoning"
	],
	"RefCOCO-avg": [
	"multimodal"
	],
	"RefSpatialBench": [
	"multimodal",
	"commonsense_reasoning"
	],
	"RepoBench": [
	"software_engineering"
	],
	"RepoQA": [
	"software_engineering"
	],
	"Reward-Bench": [
	"applied_reasoning",
	"general"
	],
	"RewardBench": [
	"applied_reasoning",
	"general"
	],
	"RoboSpatialHome": [
	"agentic",
	"multimodal"
	],
	"SAGE": [
	"knowledge",
	"applied_reasoning"
	],
	"SAT Math": [
	"mathematics"
	],
	"SIFO": [
	"knowledge",
	"applied_reasoning"
	],
	"SIFO-Multiturn": [
	"knowledge",
	"applied_reasoning"
	],
	"SQuALITY": [
	"linguistic_core"
	],
	"STEM": [
	"natural_sciences",
	"mathematics"
	],
	"SUNRGBD": [
	"multimodal"
	],
	"SWE-Bench Multimodal": [
	"software_engineering",
	"multimodal"
	],
	"SWE-Bench Pro": [
	"software_engineering",
	"agentic"
	],
	"SWE-bench Verified Mini": [
	"software_engineering"
	],
	"SWE-Lancer": [
	"software_engineering"
	],
	"SWE-Lancer (IC-Diamond subset)": [
	"software_engineering"
	],
	"SWE-Perf": [
	"software_engineering"
	],
	"SWE-PolyBench": [
	"software_engineering",
	"applied_reasoning"
	],
	"SWE-Review": [
	"software_engineering"
	],
	"SWE-bench": [
	"software_engineering"
	],
	"SWE-bench Multilingual": [
	"software_engineering"
	],
	"SWE-bench Verified": [
	"software_engineering"
	],
	"SWE-bench Verified (Agentic Coding)": [
	"software_engineering",
	"agentic"
	],
	"SWE-bench Verified (Agentless)": [
	"software_engineering"
	],
	"SWE-bench Verified (Multiple Attempts)": [
	"software_engineering"
	],
	"SWT-Bench": [
	"software_engineering"
	],
	"SciArena": [
	"natural_sciences",
	"applied_reasoning"
	],
	"SciArena leaderboard API": [
	"natural_sciences",
	"applied_reasoning"
	],
	"SciCode": [
	"natural_sciences",
	"software_engineering"
	],
	"ScienceAgentBench": [
	"natural_sciences",
	"agentic"
	],
	"ScienceQA": [
	"natural_sciences",
	"knowledge"
	],
	"ScienceQA Visual": [
	"natural_sciences",
	"multimodal"
	],
	"ScreenSpot": [
	"agentic",
	"multimodal"
	],
	"ScreenSpot Pro": [
	"agentic",
	"multimodal"
	],
	"Seal-0": [
	"safety"
	],
	"SecCodeBench": [
	"software_engineering",
	"safety"
	],
	"SimpleQA": [
	"knowledge",
	"hallucination"
	],
	"SimpleSafetyTests": [
	"safety"
	],
	"SimpleVQA": [
	"multimodal"
	],
	"SkillsBench": [
	"applied_reasoning"
	],
	"SlakeVQA": [
	"multimodal"
	],
	"Social IQa": [
	"commonsense_reasoning"
	],
	"Spider": [
	"software_engineering",
	"logical_reasoning"
	],
	"SummScreenFD": [
	"linguistic_core"
	],
	"SuperGLUE": [
	"linguistic_core",
	"general"
	],
	"SuperGPQA": [
	"knowledge",
	"applied_reasoning"
	],
	"TAU3-Bench": [
	"agentic"
	],
	"TIR-Bench": [
	"agentic"
	],
	"TLDR9+ (test)": [
	"linguistic_core"
	],
	"Tau Bench": [
	"agentic"
	],
	"Tau Bench Airline": [
	"agentic"
	],
	"Tau2-Bench": [
	"agentic"
	],
	"Tau2-Bench Airline": [
	"agentic"
	],
	"Tau2-Bench Retail": [
	"agentic"
	],
	"Tau2-Bench Telecom": [
	"agentic"
	],
	"Tax Eval v2": [
	"finance",
	"applied_reasoning"
	],
	"TempCompass": [
	"knowledge",
	"robustness"
	],
	"Terminal Bench": [
	"agentic"
	],
	"Terminal-Bench 2.0": [
	"agentic"
	],
	"TerminalBench Hard": [
	"agentic"
	],
	"Terminus": [
	"agentic"
	],
	"TextVQA": [
	"multimodal"
	],
	"TheoremQA": [
	"mathematics",
	"logical_reasoning"
	],
	"Theory of Mind": [
	"commonsense_reasoning"
	],
	"Toolathlon": [
	"agentic"
	],
	"Translation Set1\u2192en COMET22": [
	"linguistic_core"
	],
	"Translation Set1\u2192en spBleu": [
	"linguistic_core"
	],
	"Translation en\u2192Set1 COMET22": [
	"linguistic_core"
	],
	"Translation en\u2192Set1 spBleu": [
	"linguistic_core"
	],
	"TriviaQA": [
	"knowledge"
	],
	"TruthfulQA": [
	"hallucination"
	],
	"TydiQA": [
	"linguistic_core"
	],
	"USACO": [
	"mathematics",
	"logical_reasoning"
	],
	"USAMO25": [
	"mathematics"
	],
	"Uniform Bar Exam": [
	"law"
	],
	"V*": [
	"general"
	],
	"VATEX": [
	"multimodal"
	],
	"VCR_en_easy": [
	"multimodal"
	],
	"VIBE": [
	"agentic"
	],
	"VIBE Android": [
	"agentic"
	],
	"VIBE Backend": [
	"agentic"
	],
	"VIBE Simulation": [
	"agentic"
	],
	"VIBE Web": [
	"agentic"
	],
	"VIBE iOS": [
	"agentic"
	],
	"VIBE-Pro": [
	"agentic"
	],
	"VITA-Bench": [
	"multimodal"
	],
	"VLMsAreBlind": [
	"multimodal"
	],
	"VQA-Rad": [
	"multimodal"
	],
	"VQAv2": [
	"multimodal"
	],
	"VQAv2 (test)": [
	"multimodal"
	],
	"VQAv2 (val)": [
	"multimodal"
	],
	"Vals AI": [
	"general"
	],
	"Vals Index": [
	"general"
	],
	"Vals Multimodal Index": [
	"multimodal"
	],
	"Vending-Bench 2": [
	"agentic"
	],
	"Vibe Code Bench - Overall": [
	"software_engineering"
	],
	"Vibe-Eval": [
	"applied_reasoning",
	"multimodal"
	],
	"Video-MME": [
	"multimodal"
	],
	"Video-MME (long, no subtitles)": [
	"multimodal"
	],
	"VideoMME w sub.": [
	"multimodal"
	],
	"VideoMME w/o sub.": [
	"multimodal"
	],
	"VideoMMMU": [
	"multimodal"
	],
	"Virology Capabilities Test": [
	"natural_sciences"
	],
	"Vision2Web": [
	"agentic"
	],
	"VisuLogic": [
	"multimodal"
	],
	"VisualWebBench": [
	"agentic"
	],
	"VocalSound": [
	"multimodal"
	],
	"VoiceBench Avg": [
	"linguistic_core"
	],
	"WMDP": [
	"safety"
	],
	"WMT 2014": [
	"linguistic_core"
	],
	"WMT23": [
	"linguistic_core"
	],
	"WMT24++": [
	"linguistic_core"
	],
	"We-Math": [
	"mathematics"
	],
	"WebVoyager": [
	"agentic"
	],
	"WideSearch": [
	"agentic"
	],
	"Wild Bench": [
	"general"
	],
	"WildBench": [
	"general"
	],
	"Winogrande": [
	"commonsense_reasoning"
	],
	"Wordle Arena": [
	"logical_reasoning"
	],
	"WorldVQA": [
	"multimodal"
	],
	"WritingBench": [
	"linguistic_core"
	],
	"XLSum English": [
	"linguistic_core"
	],
	"XSum": [
	"linguistic_core"
	],
	"Xstest": [
	"safety"
	],
	"ZClawBench": [
	"agentic"
	],
	"ZEROBench": [
	"general"
	],
	"ZEROBench-Sub": [
	"general"
	],
	"ZebraLogic": [
	"logical_reasoning"
	]
	}