Spaces:

evaleval
/

general-eval-card

Running

GitHub Actions

chore: sync EEE pipeline output [2026-04-07 05:13 UTC]

ddebd57 about 2 months ago

1.43 kB

	[
	{
	"benchmark": "ace",
	"model_count": 12
	},
	{
	"benchmark": "apex-agents",
	"model_count": 20
	},
	{
	"benchmark": "apex-v1",
	"model_count": 10
	},
	{
	"benchmark": "appworld_test_normal",
	"model_count": 3
	},
	{
	"benchmark": "bfcl",
	"model_count": 109
	},
	{
	"benchmark": "browsecompplus",
	"model_count": 3
	},
	{
	"benchmark": "global-mmlu-lite",
	"model_count": 27
	},
	{
	"benchmark": "helm_capabilities",
	"model_count": 61
	},
	{
	"benchmark": "helm_classic",
	"model_count": 67
	},
	{
	"benchmark": "helm_instruct",
	"model_count": 4
	},
	{
	"benchmark": "helm_lite",
	"model_count": 91
	},
	{
	"benchmark": "helm_mmlu",
	"model_count": 79
	},
	{
	"benchmark": "hfopenllm_v2",
	"model_count": 4493
	},
	{
	"benchmark": "la_leaderboard",
	"model_count": 5
	},
	{
	"benchmark": "livecodebenchpro",
	"model_count": 27
	},
	{
	"benchmark": "reward-bench",
	"model_count": 328
	},
	{
	"benchmark": "swe-bench",
	"model_count": 3
	},
	{
	"benchmark": "tau-bench-2_airline",
	"model_count": 3
	},
	{
	"benchmark": "tau-bench-2_retail",
	"model_count": 3
	},
	{
	"benchmark": "tau-bench-2_telecom",
	"model_count": 3
	},
	{
	"benchmark": "terminal-bench-2.0",
	"model_count": 37
	},
	{
	"benchmark": "theory_of_mind",
	"model_count": 1
	}
	]