[ { "benchmark": "ace", "model_count": 12 }, { "benchmark": "apex-agents", "model_count": 20 }, { "benchmark": "apex-v1", "model_count": 10 }, { "benchmark": "appworld_test_normal", "model_count": 3 }, { "benchmark": "bfcl", "model_count": 109 }, { "benchmark": "browsecompplus", "model_count": 3 }, { "benchmark": "global-mmlu-lite", "model_count": 27 }, { "benchmark": "helm_capabilities", "model_count": 61 }, { "benchmark": "helm_classic", "model_count": 67 }, { "benchmark": "helm_instruct", "model_count": 4 }, { "benchmark": "helm_lite", "model_count": 91 }, { "benchmark": "helm_mmlu", "model_count": 79 }, { "benchmark": "hfopenllm_v2", "model_count": 4493 }, { "benchmark": "la_leaderboard", "model_count": 5 }, { "benchmark": "livecodebenchpro", "model_count": 27 }, { "benchmark": "reward-bench", "model_count": 328 }, { "benchmark": "swe-bench", "model_count": 3 }, { "benchmark": "tau-bench-2_airline", "model_count": 3 }, { "benchmark": "tau-bench-2_retail", "model_count": 3 }, { "benchmark": "tau-bench-2_telecom", "model_count": 3 }, { "benchmark": "terminal-bench-2.0", "model_count": 37 }, { "benchmark": "theory_of_mind", "model_count": 1 } ]