Spaces:

evaleval
/

general-eval-card

Running

App Files Files Community

GitHub Actions committed on Mar 28

Commit

2edd871

1 Parent(s): f4514c9

chore: sync EEE pipeline output [2026-03-28 04:56 UTC]

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

data/benchmarks/ace.json +120 -0
data/benchmarks/apex-agents.json +218 -0
data/benchmarks/apex-v1.json +93 -0
data/benchmarks/appworld_test_normal.json +28 -0
data/benchmarks/browsecompplus.json +28 -0
data/benchmarks/global-mmlu-lite.json +706 -0
data/benchmarks/helm_capabilities.json +797 -0
data/benchmarks/helm_classic.json +1478 -0
data/benchmarks/helm_instruct.json +60 -0
data/benchmarks/helm_lite.json +1551 -0
data/benchmarks/helm_mmlu.json +0 -0
data/benchmarks/hfopenllm_v2.json +0 -0
data/benchmarks/livecodebenchpro.json +274 -0
data/benchmarks/reward-bench.json +0 -0
data/benchmarks/swe-bench.json +28 -0
data/benchmarks/tau-bench-2_airline.json +28 -0
data/benchmarks/tau-bench-2_retail.json +28 -0
data/benchmarks/tau-bench-2_telecom.json +28 -0
data/benchmarks/terminal-bench-2.0.json +300 -0
data/developers/0-hero.json +47 -0
data/developers/01-ai.json +433 -0
data/developers/1-800-LLMs.json +33 -0
data/developers/1024m.json +33 -0
data/developers/152334H.json +19 -0
data/developers/1TuanPham.json +33 -0
data/developers/3rd-Degree-Burn.json +61 -0
data/developers/4season.json +19 -0
data/developers/AALF.json +61 -0
data/developers/AELLM.json +33 -0
data/developers/AGI-0.json +47 -0
data/developers/AI-MO.json +33 -0
data/developers/AI-Sweden-Models.json +33 -0
data/developers/AI4free.json +33 -0
data/developers/AIDC-AI.json +19 -0
data/developers/Aashraf995.json +61 -0
data/developers/AbacusResearch.json +19 -0
data/developers/Ahdoot.json +33 -0
data/developers/Ahjeong.json +33 -0
data/developers/AicoresSecurity.json +61 -0
data/developers/Alepach.json +47 -0
data/developers/AlephAlpha.json +59 -0
data/developers/Alibaba-NLP.json +19 -0
data/developers/Alibaba.json +58 -0
data/developers/Alsebay.json +19 -0
data/developers/Amaorynho.json +61 -0
data/developers/Amu.json +33 -0
data/developers/Anthropic.json +129 -0
data/developers/ArliAI.json +33 -0
data/developers/Arthur-LAGACHERIE.json +19 -0
data/developers/Artples.json +33 -0

data/benchmarks/ace.json ADDED Viewed

	@@ -0,0 +1,120 @@

+{
+  "models": [
+    {
+      "model_id": "anthropic/Opus 4.1",
+      "name": "Opus 4.1",
+      "developer": "anthropic",
+      "scores": {
+        "Overall Score": 0.4,
+        "Gaming Score": 0.318
+      }
+    },
+    {
+      "model_id": "anthropic/Opus 4.5",
+      "name": "Opus 4.5",
+      "developer": "anthropic",
+      "scores": {
+        "Overall Score": 0.478,
+        "Gaming Score": 0.391
+      }
+    },
+    {
+      "model_id": "anthropic/Sonnet 4.5",
+      "name": "Sonnet 4.5",
+      "developer": "anthropic",
+      "scores": {
+        "Overall Score": 0.44,
+        "Gaming Score": 0.373
+      }
+    },
+    {
+      "model_id": "google/Gemini 2.5 Flash",
+      "name": "Gemini 2.5 Flash",
+      "developer": "google",
+      "scores": {
+        "Overall Score": 0.38,
+        "Gaming Score": 0.284
+      }
+    },
+    {
+      "model_id": "google/Gemini 2.5 Pro",
+      "name": "Gemini 2.5 Pro",
+      "developer": "google",
+      "scores": {
+        "Overall Score": 0.4,
+        "Gaming Score": 0.285
+      }
+    },
+    {
+      "model_id": "google/Gemini 3 Flash",
+      "name": "Gemini 3 Flash",
+      "developer": "google",
+      "scores": {
+        "Gaming Score": 0.415
+      }
+    },
+    {
+      "model_id": "google/Gemini 3 Pro",
+      "name": "Gemini 3 Pro",
+      "developer": "google",
+      "scores": {
+        "Overall Score": 0.47,
+        "Gaming Score": 0.509
+      }
+    },
+    {
+      "model_id": "openai/GPT 5",
+      "name": "GPT 5",
+      "developer": "openai",
+      "scores": {
+        "Overall Score": 0.561,
+        "DIY Score": 0.55,
+        "Food Score": 0.7,
+        "Gaming Score": 0.575
+      }
+    },
+    {
+      "model_id": "openai/GPT 5.1",
+      "name": "GPT 5.1",
+      "developer": "openai",
+      "scores": {
+        "Overall Score": 0.551,
+        "DIY Score": 0.56,
+        "Gaming Score": 0.61,
+        "Shopping Score": 0.45
+      }
+    },
+    {
+      "model_id": "openai/GPT 5.2",
+      "name": "GPT 5.2",
+      "developer": "openai",
+      "scores": {
+        "Overall Score": 0.515,
+        "Food Score": 0.65,
+        "Gaming Score": 0.578
+      }
+    },
+    {
+      "model_id": "openai/o3",
+      "name": "o3",
+      "developer": "openai",
+      "scores": {
+        "Overall Score": 0.529,
+        "Gaming Score": 0.585,
+        "Shopping Score": 0.45
+      }
+    },
+    {
+      "model_id": "openai/o3 Pro",
+      "name": "o3 Pro",
+      "developer": "openai",
+      "scores": {
+        "Overall Score": 0.552,
+        "DIY Score": 0.54,
+        "Food Score": 0.6,
+        "Gaming Score": 0.613,
+        "Shopping Score": 0.45
+      }
+    }
+  ]
+}

data/benchmarks/apex-agents.json ADDED Viewed

	@@ -0,0 +1,218 @@

+{
+  "models": [
+    {
+      "model_id": "anthropic/Opus 4.5",
+      "name": "Opus 4.5",
+      "developer": "anthropic",
+      "scores": {
+        "Overall Pass@1": 0.184,
+        "Overall Pass@8": 0.34,
+        "Overall Mean Score": 0.348,
+        "Investment Banking Pass@1": 0.216,
+        "Management Consulting Pass@1": 0.132,
+        "Corporate Law Pass@1": 0.202,
+        "Corporate Lawyer Mean Score": 0.471
+      }
+    },
+    {
+      "model_id": "anthropic/Opus 4.6",
+      "name": "Opus 4.6",
+      "developer": "anthropic",
+      "scores": {
+        "Overall Pass@1": 0.298,
+        "Corporate Lawyer Mean Score": 0.502
+      }
+    },
+    {
+      "model_id": "applied-compute/Applied Compute: Small",
+      "name": "Applied Compute: Small",
+      "developer": "applied-compute",
+      "scores": {
+        "Overall Pass@1": 0.23,
+        "Overall Mean Score": 0.401,
+        "Corporate Law Pass@1": 0.266,
+        "Corporate Lawyer Mean Score": 0.548
+      }
+    },
+    {
+      "model_id": "google/Gemini 3 Flash",
+      "name": "Gemini 3 Flash",
+      "developer": "google",
+      "scores": {
+        "Overall Pass@1": 0.24,
+        "Overall Pass@8": 0.367,
+        "Overall Mean Score": 0.395,
+        "Investment Banking Pass@1": 0.267,
+        "Management Consulting Pass@1": 0.193,
+        "Corporate Law Pass@1": 0.259,
+        "Corporate Lawyer Mean Score": 0.524
+      }
+    },
+    {
+      "model_id": "google/Gemini 3 Pro",
+      "name": "Gemini 3 Pro",
+      "developer": "google",
+      "scores": {
+        "Overall Pass@1": 0.184,
+        "Overall Pass@8": 0.373,
+        "Overall Mean Score": 0.341,
+        "Investment Banking Pass@1": 0.188,
+        "Management Consulting Pass@1": 0.124,
+        "Corporate Law Pass@1": 0.239,
+        "Corporate Lawyer Mean Score": 0.487
+      }
+    },
+    {
+      "model_id": "google/Gemini 3.1 Pro",
+      "name": "Gemini 3.1 Pro",
+      "developer": "google",
+      "scores": {
+        "Overall Pass@1": 0.335,
+        "Corporate Lawyer Mean Score": 0.494
+      }
+    },
+    {
+      "model_id": "minimax/Minimax-2.5",
+      "name": "Minimax-2.5",
+      "developer": "minimax",
+      "scores": {
+        "Corporate Lawyer Mean Score": 0.339
+      }
+    },
+    {
+      "model_id": "moonshot/Kimi K2 Thinking",
+      "name": "Kimi K2 Thinking",
+      "developer": "moonshot",
+      "scores": {
+        "Overall Pass@1": 0.04,
+        "Overall Pass@8": 0.144,
+        "Overall Mean Score": 0.115,
+        "Investment Banking Pass@1": 0.012,
+        "Management Consulting Pass@1": 0.029,
+        "Corporate Law Pass@1": 0.08,
+        "Corporate Lawyer Mean Score": 0.223
+      }
+    },
+    {
+      "model_id": "moonshot/Kimi K2.5",
+      "name": "Kimi K2.5",
+      "developer": "moonshot",
+      "scores": {
+        "Corporate Lawyer Mean Score": 0.402
+      }
+    },
+    {
+      "model_id": "openai/GPT 5",
+      "name": "GPT 5",
+      "developer": "openai",
+      "scores": {
+        "Overall Pass@1": 0.183,
+        "Overall Pass@8": 0.31,
+        "Overall Mean Score": 0.329,
+        "Investment Banking Pass@1": 0.273,
+        "Management Consulting Pass@1": 0.123,
+        "Corporate Law Pass@1": 0.153,
+        "Corporate Lawyer Mean Score": 0.382
+      }
+    },
+    {
+      "model_id": "openai/GPT 5 Codex",
+      "name": "GPT 5 Codex",
+      "developer": "openai",
+      "scores": {
+        "Corporate Lawyer Mean Score": 0.362
+      }
+    },
+    {
+      "model_id": "openai/GPT 5.1",
+      "name": "GPT 5.1",
+      "developer": "openai",
+      "scores": {
+        "Corporate Lawyer Mean Score": 0.376
+      }
+    },
+    {
+      "model_id": "openai/GPT 5.1 Codex",
+      "name": "GPT 5.1 Codex",
+      "developer": "openai",
+      "scores": {
+        "Corporate Lawyer Mean Score": 0.366
+      }
+    },
+    {
+      "model_id": "openai/GPT 5.2",
+      "name": "GPT 5.2",
+      "developer": "openai",
+      "scores": {
+        "Overall Pass@1": 0.23,
+        "Overall Pass@8": 0.4,
+        "Overall Mean Score": 0.387,
+        "Investment Banking Pass@1": 0.273,
+        "Management Consulting Pass@1": 0.227,
+        "Corporate Law Pass@1": 0.189,
+        "Corporate Lawyer Mean Score": 0.443
+      }
+    },
+    {
+      "model_id": "openai/GPT 5.2 Codex",
+      "name": "GPT 5.2 Codex",
+      "developer": "openai",
+      "scores": {
+        "Overall Pass@1": 0.276,
+        "Corporate Lawyer Mean Score": 0.394
+      }
+    },
+    {
+      "model_id": "openai/GPT 5.3 Codex",
+      "name": "GPT 5.3 Codex",
+      "developer": "openai",
+      "scores": {
+        "Overall Pass@1": 0.317
+      }
+    },
+    {
+      "model_id": "openai/GPT OSS 120B",
+      "name": "GPT OSS 120B",
+      "developer": "openai",
+      "scores": {
+        "Overall Pass@1": 0.047,
+        "Overall Pass@8": 0.115,
+        "Overall Mean Score": 0.145,
+        "Investment Banking Pass@1": 0.027,
+        "Management Consulting Pass@1": 0.035,
+        "Corporate Law Pass@1": 0.078,
+        "Corporate Lawyer Mean Score": 0.269
+      }
+    },
+    {
+      "model_id": "xai/Grok 4",
+      "name": "Grok 4",
+      "developer": "xai",
+      "scores": {
+        "Overall Pass@1": 0.152,
+        "Overall Pass@8": 0.329,
+        "Overall Mean Score": 0.303,
+        "Investment Banking Pass@1": 0.17,
+        "Management Consulting Pass@1": 0.12,
+        "Corporate Law Pass@1": 0.165,
+        "Corporate Lawyer Mean Score": 0.41
+      }
+    },
+    {
+      "model_id": "zhipu/GLM 4.6",
+      "name": "GLM 4.6",
+      "developer": "zhipu",
+      "scores": {
+        "Corporate Lawyer Mean Score": 0.196
+      }
+    },
+    {
+      "model_id": "zhipu/GLM 4.7",
+      "name": "GLM 4.7",
+      "developer": "zhipu",
+      "scores": {
+        "Corporate Lawyer Mean Score": 0.147
+      }
+    }
+  ]
+}

data/benchmarks/apex-v1.json ADDED Viewed

	@@ -0,0 +1,93 @@

+{
+  "models": [
+    {
+      "model_id": "anthropic/Opus 4.5",
+      "name": "Opus 4.5",
+      "developer": "anthropic",
+      "scores": {
+        "Medicine (MD) Score": 0.65
+      }
+    },
+    {
+      "model_id": "google/Gemini 2.5 Flash",
+      "name": "Gemini 2.5 Flash",
+      "developer": "google",
+      "scores": {
+        "Overall Score": 0.604
+      }
+    },
+    {
+      "model_id": "google/Gemini 3 Flash",
+      "name": "Gemini 3 Flash",
+      "developer": "google",
+      "scores": {
+        "Overall Score": 0.64,
+        "Consulting Score": 0.64
+      }
+    },
+    {
+      "model_id": "google/Gemini 3 Pro",
+      "name": "Gemini 3 Pro",
+      "developer": "google",
+      "scores": {
+        "Overall Score": 0.643,
+        "Consulting Score": 0.64,
+        "Investment Banking Score": 0.63
+      }
+    },
+    {
+      "model_id": "openai/GPT 4o",
+      "name": "GPT 4o",
+      "developer": "openai",
+      "scores": {
+        "Overall Score": 0.359
+      }
+    },
+    {
+      "model_id": "openai/GPT 5",
+      "name": "GPT 5",
+      "developer": "openai",
+      "scores": {
+        "Overall Score": 0.67,
+        "Big Law Score": 0.78,
+        "Medicine (MD) Score": 0.66,
+        "Investment Banking Score": 0.61
+      }
+    },
+    {
+      "model_id": "openai/GPT 5.1",
+      "name": "GPT 5.1",
+      "developer": "openai",
+      "scores": {
+        "Big Law Score": 0.77
+      }
+    },
+    {
+      "model_id": "openai/GPT 5.2 Pro",
+      "name": "GPT 5.2 Pro",
+      "developer": "openai",
+      "scores": {
+        "Overall Score": 0.668,
+        "Consulting Score": 0.64,
+        "Medicine (MD) Score": 0.65,
+        "Investment Banking Score": 0.64
+      }
+    },
+    {
+      "model_id": "openai/o3",
+      "name": "o3",
+      "developer": "openai",
+      "scores": {
+        "Big Law Score": 0.76
+      }
+    },
+    {
+      "model_id": "xai/Grok 4",
+      "name": "Grok 4",
+      "developer": "xai",
+      "scores": {
+        "Overall Score": 0.635
+      }
+    }
+  ]
+}

data/benchmarks/appworld_test_normal.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "models": [
+    {
+      "model_id": "anthropic/claude-opus-4-5",
+      "name": "claude-opus-4-5",
+      "developer": "Anthropic",
+      "scores": {
+        "appworld/test_normal": 0.7
+      }
+    },
+    {
+      "model_id": "google/gemini-3-pro-preview",
+      "name": "gemini-3-pro-preview",
+      "developer": "Google",
+      "scores": {
+        "appworld/test_normal": 0.36
+      }
+    },
+    {
+      "model_id": "openai/gpt-5.2-2025-12-11",
+      "name": "gpt-5.2-2025-12-11",
+      "developer": "OpenAI",
+      "scores": {
+        "appworld/test_normal": 0.0
+      }
+    }
+  ]
+}

data/benchmarks/browsecompplus.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "models": [
+    {
+      "model_id": "anthropic/claude-opus-4-5",
+      "name": "claude-opus-4-5",
+      "developer": "Anthropic",
+      "scores": {
+        "browsecompplus": 0.61
+      }
+    },
+    {
+      "model_id": "google/gemini-3-pro-preview",
+      "name": "gemini-3-pro-preview",
+      "developer": "Google",
+      "scores": {
+        "browsecompplus": 0.57
+      }
+    },
+    {
+      "model_id": "openai/gpt-5.2-2025-12-11",
+      "name": "gpt-5.2-2025-12-11",
+      "developer": "OpenAI",
+      "scores": {
+        "browsecompplus": 0.46
+      }
+    }
+  ]
+}

data/benchmarks/global-mmlu-lite.json ADDED Viewed

	@@ -0,0 +1,706 @@

+{
+  "models": [
+    {
+      "model_id": "alibaba/qwen3-235b-a22b-instruct-2507",
+      "name": "qwen3-235b-a22b-instruct-2507",
+      "developer": "alibaba",
+      "scores": {
+        "Global MMLU Lite": 0.8798,
+        "Culturally Sensitive": 0.8522,
+        "Culturally Agnostic": 0.9075,
+        "Arabic": 0.88,
+        "English": 0.89,
+        "Bengali": 0.8875,
+        "German": 0.885,
+        "French": 0.88,
+        "Hindi": 0.8775,
+        "Indonesian": 0.88,
+        "Italian": 0.88,
+        "Japanese": 0.88,
+        "Korean": 0.875,
+        "Portuguese": 0.8875,
+        "Spanish": 0.875,
+        "Swahili": 0.87,
+        "Yoruba": 0.8725,
+        "Chinese": 0.8775,
+        "Burmese": 0.88
+      }
+    },
+    {
+      "model_id": "anthropic/claude-3-5-haiku-20241022",
+      "name": "claude-3-5-haiku-20241022",
+      "developer": "anthropic",
+      "scores": {
+        "Global MMLU Lite": 0.6114,
+        "Culturally Sensitive": 0.5834,
+        "Culturally Agnostic": 0.6394,
+        "Arabic": 0.695,
+        "English": 0.485,
+        "Bengali": 0.675,
+        "German": 0.565,
+        "French": 0.61,
+        "Hindi": 0.6575,
+        "Indonesian": 0.5475,
+        "Italian": 0.48,
+        "Japanese": 0.655,
+        "Korean": 0.6575,
+        "Portuguese": 0.5225,
+        "Spanish": 0.485,
+        "Swahili": 0.69,
+        "Yoruba": 0.6675,
+        "Chinese": 0.69,
+        "Burmese": 0.7
+      }
+    },
+    {
+      "model_id": "anthropic/claude-3-7-sonnet-20250219",
+      "name": "claude-3-7-sonnet-20250219",
+      "developer": "anthropic",
+      "scores": {
+        "Global MMLU Lite": 0.8078,
+        "Culturally Sensitive": 0.7794,
+        "Culturally Agnostic": 0.8362,
+        "Arabic": 0.7925,
+        "English": 0.7625,
+        "Bengali": 0.825,
+        "German": 0.8125,
+        "French": 0.7675,
+        "Hindi": 0.805,
+        "Indonesian": 0.8175,
+        "Italian": 0.8225,
+        "Japanese": 0.8425,
+        "Korean": 0.83,
+        "Portuguese": 0.77,
+        "Spanish": 0.8075,
+        "Swahili": 0.8125,
+        "Yoruba": 0.81,
+        "Chinese": 0.835,
+        "Burmese": 0.8125
+      }
+    },
+    {
+      "model_id": "anthropic/claude-opus-4-1-20250805",
+      "name": "claude-opus-4-1-20250805",
+      "developer": "anthropic",
+      "scores": {
+        "Global MMLU Lite": 0.943,
+        "Culturally Sensitive": 0.9331,
+        "Culturally Agnostic": 0.9528,
+        "Arabic": 0.945,
+        "English": 0.9475,
+        "Bengali": 0.9425,
+        "German": 0.94,
+        "French": 0.945,
+        "Hindi": 0.9475,
+        "Indonesian": 0.9425,
+        "Italian": 0.94,
+        "Japanese": 0.94,
+        "Korean": 0.95,
+        "Portuguese": 0.945,
+        "Spanish": 0.945,
+        "Swahili": 0.93,
+        "Yoruba": 0.9375,
+        "Chinese": 0.945,
+        "Burmese": 0.945
+      }
+    },
+    {
+      "model_id": "anthropic/claude-sonnet-4-20250514",
+      "name": "claude-sonnet-4-20250514",
+      "developer": "anthropic",
+      "scores": {
+        "Global MMLU Lite": 0.9058,
+        "Culturally Sensitive": 0.8913,
+        "Culturally Agnostic": 0.9203,
+        "Arabic": 0.9125,
+        "English": 0.905,
+        "Bengali": 0.9075,
+        "German": 0.9125,
+        "French": 0.91,
+        "Hindi": 0.9,
+        "Indonesian": 0.9025,
+        "Italian": 0.9075,
+        "Japanese": 0.9,
+        "Korean": 0.9125,
+        "Portuguese": 0.91,
+        "Spanish": 0.9075,
+        "Swahili": 0.8975,
+        "Yoruba": 0.8975,
+        "Chinese": 0.9175,
+        "Burmese": 0.8925
+      }
+    },
+    {
+      "model_id": "cohere/aya-expanse-32b",
+      "name": "aya-expanse-32b",
+      "developer": "cohere",
+      "scores": {
+        "Global MMLU Lite": 0.7353,
+        "Culturally Sensitive": 0.6891,
+        "Culturally Agnostic": 0.7815,
+        "Arabic": 0.7425,
+        "English": 0.7544,
+        "Bengali": 0.7343,
+        "German": 0.7425,
+        "French": 0.7325,
+        "Hindi": 0.7375,
+        "Indonesian": 0.7594,
+        "Italian": 0.7305,
+        "Japanese": 0.7419,
+        "Korean": 0.7525,
+        "Portuguese": 0.7544,
+        "Spanish": 0.7362,
+        "Swahili": 0.7071,
+        "Yoruba": 0.6942,
+        "Chinese": 0.743,
+        "Burmese": 0.7025
+      }
+    },
+    {
+      "model_id": "cohere/command-a-03-2025",
+      "name": "command-a-03-2025",
+      "developer": "cohere",
+      "scores": {
+        "Global MMLU Lite": 0.8385,
+        "Culturally Sensitive": 0.7993,
+        "Culturally Agnostic": 0.8778,
+        "Arabic": 0.8425,
+        "English": 0.855,
+        "Bengali": 0.8225,
+        "German": 0.8425,
+        "French": 0.8375,
+        "Hindi": 0.8421,
+        "Indonesian": 0.8546,
+        "Italian": 0.8375,
+        "Japanese": 0.845,
+        "Korean": 0.85,
+        "Portuguese": 0.84,
+        "Spanish": 0.8525,
+        "Swahili": 0.8275,
+        "Yoruba": 0.815,
+        "Chinese": 0.835,
+        "Burmese": 0.8175
+      }
+    },
+    {
+      "model_id": "deepseek/deepseek-r1-0528",
+      "name": "deepseek-r1-0528",
+      "developer": "deepseek",
+      "scores": {
+        "Global MMLU Lite": 0.6744,
+        "Culturally Sensitive": 0.6672,
+        "Culturally Agnostic": 0.6816,
+        "Arabic": 0.6825,
+        "English": 0.715,
+        "Bengali": 0.655,
+        "German": 0.6375,
+        "French": 0.6925,
+        "Hindi": 0.6475,
+        "Indonesian": 0.655,
+        "Italian": 0.6775,
+        "Japanese": 0.7725,
+        "Korean": 0.6575,
+        "Portuguese": 0.635,
+        "Spanish": 0.7175,
+        "Swahili": 0.6775,
+        "Yoruba": 0.77,
+        "Chinese": 0.5075,
+        "Burmese": 0.69
+      }
+    },
+    {
+      "model_id": "deepseek/deepseek-v3.1",
+      "name": "deepseek-v3.1",
+      "developer": "deepseek",
+      "scores": {
+        "Global MMLU Lite": 0.8044,
+        "Culturally Sensitive": 0.7793,
+        "Culturally Agnostic": 0.8295,
+        "Arabic": 0.805,
+        "English": 0.825,
+        "Bengali": 0.8157,
+        "German": 0.7925,
+        "French": 0.8175,
+        "Hindi": 0.7569,
+        "Indonesian": 0.7764,
+        "Italian": 0.8075,
+        "Japanese": 0.8312,
+        "Korean": 0.8125,
+        "Portuguese": 0.8246,
+        "Spanish": 0.8125,
+        "Swahili": 0.801,
+        "Yoruba": 0.7831,
+        "Chinese": 0.8161,
+        "Burmese": 0.7925
+      }
+    },
+    {
+      "model_id": "google/gemini-2.5-flash",
+      "name": "gemini-2.5-flash",
+      "developer": "google",
+      "scores": {
+        "Global MMLU Lite": 0.9145,
+        "Culturally Sensitive": 0.9,
+        "Culturally Agnostic": 0.9291,
+        "Arabic": 0.9125,
+        "English": 0.9325,
+        "Bengali": 0.91,
+        "German": 0.9025,
+        "French": 0.91,
+        "Hindi": 0.925,
+        "Indonesian": 0.9075,
+        "Italian": 0.9225,
+        "Japanese": 0.9125,
+        "Korean": 0.915,
+        "Portuguese": 0.9125,
+        "Spanish": 0.9175,
+        "Swahili": 0.915,
+        "Yoruba": 0.9075,
+        "Chinese": 0.915,
+        "Burmese": 0.915
+      }
+    },
+    {
+      "model_id": "google/gemini-2.5-flash-preview-05-20",
+      "name": "gemini-2.5-flash-preview-05-20",
+      "developer": "google",
+      "scores": {
+        "Global MMLU Lite": 0.9092,
+        "Culturally Sensitive": 0.8925,
+        "Culturally Agnostic": 0.9259,
+        "Arabic": 0.905,
+        "English": 0.9225,
+        "Bengali": 0.91,
+        "German": 0.905,
+        "French": 0.925,
+        "Hindi": 0.9125,
+        "Indonesian": 0.9075,
+        "Italian": 0.89,
+        "Japanese": 0.9125,
+        "Korean": 0.9075,
+        "Portuguese": 0.915,
+        "Spanish": 0.915,
+        "Swahili": 0.905,
+        "Yoruba": 0.8825,
+        "Chinese": 0.93,
+        "Burmese": 0.9025
+      }
+    },
+    {
+      "model_id": "google/gemini-2.5-pro",
+      "name": "gemini-2.5-pro",
+      "developer": "google",
+      "scores": {
+        "Global MMLU Lite": 0.9323,
+        "Culturally Sensitive": 0.9241,
+        "Culturally Agnostic": 0.9406,
+        "Arabic": 0.9475,
+        "English": 0.9275,
+        "Bengali": 0.9275,
+        "German": 0.93,
+        "French": 0.9425,
+        "Hindi": 0.9275,
+        "Indonesian": 0.925,
+        "Italian": 0.935,
+        "Japanese": 0.9375,
+        "Korean": 0.9275,
+        "Portuguese": 0.93,
+        "Spanish": 0.94,
+        "Swahili": 0.9375,
+        "Yoruba": 0.925,
+        "Chinese": 0.9275,
+        "Burmese": 0.93
+      }
+    },
+    {
+      "model_id": "google/gemini-3-pro-preview",
+      "name": "gemini-3-pro-preview",
+      "developer": "Google",
+      "scores": {
+        "Global MMLU Lite": 0.9453,
+        "Culturally Sensitive": 0.9397,
+        "Culturally Agnostic": 0.9509,
+        "Arabic": 0.9475,
+        "English": 0.9425,
+        "Bengali": 0.9425,
+        "German": 0.94,
+        "French": 0.9575,
+        "Hindi": 0.9425,
+        "Indonesian": 0.955,
+        "Italian": 0.955,
+        "Japanese": 0.94,
+        "Korean": 0.94,
+        "Portuguese": 0.9425,
+        "Spanish": 0.9475,
+        "Swahili": 0.94,
+        "Yoruba": 0.9425,
+        "Chinese": 0.9475,
+        "Burmese": 0.9425
+      }
+    },
+    {
+      "model_id": "google/gemma-3-27b-it",
+      "name": "gemma-3-27b-it",
+      "developer": "google",
+      "scores": {
+        "Global MMLU Lite": 0.763,
+        "Culturally Sensitive": 0.7528,
+        "Culturally Agnostic": 0.7733,
+        "Arabic": 0.78,
+        "English": 0.7337,
+        "Bengali": 0.75,
+        "German": 0.775,
+        "French": 0.7481,
+        "Hindi": 0.7335,
+        "Indonesian": 0.7563,
+        "Italian": 0.75,
+        "Japanese": 0.7925,
+        "Korean": 0.798,
+        "Portuguese": 0.7481,
+        "Spanish": 0.7494,
+        "Swahili": 0.785,
+        "Yoruba": 0.7444,
+        "Chinese": 0.7925,
+        "Burmese": 0.7719
+      }
+    },
+    {
+      "model_id": "google/gemma-3-4b-it",
+      "name": "gemma-3-4b-it",
+      "developer": "google",
+      "scores": {
+        "Global MMLU Lite": 0.6511,
+        "Culturally Sensitive": 0.6116,
+        "Culturally Agnostic": 0.6906,
+        "Arabic": 0.6525,
+        "English": 0.67,
+        "Bengali": 0.68,
+        "German": 0.6525,
+        "French": 0.6575,
+        "Hindi": 0.6475,
+        "Indonesian": 0.6775,
+        "Italian": 0.6675,
+        "Japanese": 0.6325,
+        "Korean": 0.66,
+        "Portuguese": 0.68,
+        "Spanish": 0.6725,
+        "Swahili": 0.6075,
+        "Yoruba": 0.5825,
+        "Chinese": 0.6475,
+        "Burmese": 0.63
+      }
+    },
+    {
+      "model_id": "ibm/granite-4.0-h-small",
+      "name": "granite-4.0-h-small",
+      "developer": "ibm",
+      "scores": {
+        "Global MMLU Lite": 0.7503,
+        "Culturally Sensitive": 0.7182,
+        "Culturally Agnostic": 0.7826,
+        "Arabic": 0.7613,
+        "English": 0.77,
+        "Bengali": 0.7613,
+        "German": 0.755,
+        "French": 0.7594,
+        "Hindi": 0.7575,
+        "Indonesian": 0.7614,
+        "Italian": 0.7525,
+        "Japanese": 0.7406,
+        "Korean": 0.7525,
+        "Portuguese": 0.757,
+        "Spanish": 0.7638,
+        "Swahili": 0.7318,
+        "Yoruba": 0.6921,
+        "Chinese": 0.7475,
+        "Burmese": 0.7419
+      }
+    },
+    {
+      "model_id": "mistralai/mistral-medium-3",
+      "name": "mistral-medium-3",
+      "developer": "mistralai",
+      "scores": {
+        "Global MMLU Lite": 0.5511,
+        "Culturally Sensitive": 0.5391,
+        "Culturally Agnostic": 0.5631,
+        "Arabic": 0.455,
+        "English": 0.38,
+        "Bengali": 0.5175,
+        "German": 0.4775,
+        "French": 0.41,
+        "Hindi": 0.555,
+        "Indonesian": 0.515,
+        "Italian": 0.535,
+        "Japanese": 0.58,
+        "Korean": 0.595,
+        "Portuguese": 0.5175,
+        "Spanish": 0.5375,
+        "Swahili": 0.7075,
+        "Yoruba": 0.7675,
+        "Chinese": 0.535,
+        "Burmese": 0.7325
+      }
+    },
+    {
+      "model_id": "mistralai/mistral-small-2503",
+      "name": "mistral-small-2503",
+      "developer": "mistralai",
+      "scores": {
+        "Global MMLU Lite": 0.7852,
+        "Culturally Sensitive": 0.7537,
+        "Culturally Agnostic": 0.8166,
+        "Arabic": 0.7875,
+        "English": 0.8,
+        "Bengali": 0.7725,
+        "German": 0.7975,
+        "French": 0.8,
+        "Hindi": 0.795,
+        "Indonesian": 0.785,
+        "Italian": 0.805,
+        "Japanese": 0.77,
+        "Korean": 0.79,
+        "Portuguese": 0.7925,
+        "Spanish": 0.7825,
+        "Swahili": 0.775,
+        "Yoruba": 0.735,
+        "Chinese": 0.7925,
+        "Burmese": 0.7825
+      }
+    },
+    {
+      "model_id": "openai/gpt-4.1-2025-04-14",
+      "name": "gpt-4.1-2025-04-14",
+      "developer": "openai",
+      "scores": {
+        "Global MMLU Lite": 0.8755,
+        "Culturally Sensitive": 0.8541,
+        "Culturally Agnostic": 0.8969,
+        "Arabic": 0.88,
+        "English": 0.8825,
+        "Bengali": 0.8625,
+        "German": 0.875,
+        "French": 0.8875,
+        "Hindi": 0.8775,
+        "Indonesian": 0.885,
+        "Italian": 0.88,
+        "Japanese": 0.8725,
+        "Korean": 0.87,
+        "Portuguese": 0.875,
+        "Spanish": 0.885,
+        "Swahili": 0.8725,
+        "Yoruba": 0.875,
+        "Chinese": 0.87,
+        "Burmese": 0.8575
+      }
+    },
+    {
+      "model_id": "openai/gpt-5-2025-08-07",
+      "name": "gpt-5-2025-08-07",
+      "developer": "openai",
+      "scores": {
+        "Global MMLU Lite": 0.8895,
+        "Culturally Sensitive": 0.8913,
+        "Culturally Agnostic": 0.8878,
+        "Arabic": 0.8925,
+        "English": 0.8725,
+        "Bengali": 0.9,
+        "German": 0.91,
+        "French": 0.9075,
+        "Hindi": 0.865,
+        "Indonesian": 0.795,
+        "Italian": 0.9075,
+        "Japanese": 0.8875,
+        "Korean": 0.915,
+        "Portuguese": 0.8875,
+        "Spanish": 0.905,
+        "Swahili": 0.865,
+        "Yoruba": 0.9125,
+        "Chinese": 0.895,
+        "Burmese": 0.915
+      }
+    },
+    {
+      "model_id": "openai/o3-mini-2025-01-31",
+      "name": "o3-mini-2025-01-31",
+      "developer": "openai",
+      "scores": {
+        "Global MMLU Lite": 0.78,
+        "Culturally Sensitive": 0.765,
+        "Culturally Agnostic": 0.795,
+        "Arabic": 0.7725,
+        "English": 0.8025,
+        "Bengali": 0.77,
+        "German": 0.7525,
+        "French": 0.74,
+        "Hindi": 0.7525,
+        "Indonesian": 0.7425,
+        "Italian": 0.8,
+        "Japanese": 0.81,
+        "Korean": 0.8075,
+        "Portuguese": 0.7975,
+        "Spanish": 0.775,
+        "Swahili": 0.765,
+        "Yoruba": 0.7725,
+        "Chinese": 0.8125,
+        "Burmese": 0.8075
+      }
+    },
+    {
+      "model_id": "openai/o4-mini-2025-04-16",
+      "name": "o4-mini-2025-04-16",
+      "developer": "openai",
+      "scores": {
+        "Global MMLU Lite": 0.8705,
+        "Culturally Sensitive": 0.8503,
+        "Culturally Agnostic": 0.8906,
+        "Arabic": 0.865,
+        "English": 0.8675,
+        "Bengali": 0.8875,
+        "German": 0.8775,
+        "French": 0.87,
+        "Hindi": 0.87,
+        "Indonesian": 0.8675,
+        "Italian": 0.855,
+        "Japanese": 0.885,
+        "Korean": 0.88,
+        "Portuguese": 0.88,
+        "Spanish": 0.855,
+        "Swahili": 0.8525,
+        "Yoruba": 0.8525,
+        "Chinese": 0.89,
+        "Burmese": 0.8725
+      }
+    },
+    {
+      "model_id": "unknown/aya-expanse-32b",
+      "name": "aya-expanse-32b",
+      "developer": "unknown",
+      "scores": {
+        "Global MMLU Lite": 0.7353,
+        "Culturally Sensitive": 0.6891,
+        "Culturally Agnostic": 0.7815,
+        "Arabic": 0.7425,
+        "English": 0.7544,
+        "Bengali": 0.7343,
+        "German": 0.7425,
+        "French": 0.7325,
+        "Hindi": 0.7375,
+        "Indonesian": 0.7594,
+        "Italian": 0.7305,
+        "Japanese": 0.7419,
+        "Korean": 0.7525,
+        "Portuguese": 0.7544,
+        "Spanish": 0.7362,
+        "Swahili": 0.7071,
+        "Yoruba": 0.6942,
+        "Chinese": 0.743,
+        "Burmese": 0.7025
+      }
+    },
+    {
+      "model_id": "unknown/granite-4.0-h-small",
+      "name": "granite-4.0-h-small",
+      "developer": "unknown",
+      "scores": {
+        "Global MMLU Lite": 0.7503,
+        "Culturally Sensitive": 0.7182,
+        "Culturally Agnostic": 0.7826,
+        "Arabic": 0.7613,
+        "English": 0.77,
+        "Bengali": 0.7613,
+        "German": 0.755,
+        "French": 0.7594,
+        "Hindi": 0.7575,
+        "Indonesian": 0.7614,
+        "Italian": 0.7525,
+        "Japanese": 0.7406,
+        "Korean": 0.7525,
+        "Portuguese": 0.757,
+        "Spanish": 0.7638,
+        "Swahili": 0.7318,
+        "Yoruba": 0.6921,
+        "Chinese": 0.7475,
+        "Burmese": 0.7419
+      }
+    },
+    {
+      "model_id": "unknown/o4-mini-2025-04-16",
+      "name": "o4-mini-2025-04-16",
+      "developer": "unknown",
+      "scores": {
+        "Global MMLU Lite": 0.8705,
+        "Culturally Sensitive": 0.8503,
+        "Culturally Agnostic": 0.8906,
+        "Arabic": 0.865,
+        "English": 0.8675,
+        "Bengali": 0.8875,
+        "German": 0.8775,
+        "French": 0.87,
+        "Hindi": 0.87,
+        "Indonesian": 0.8675,
+        "Italian": 0.855,
+        "Japanese": 0.885,
+        "Korean": 0.88,
+        "Portuguese": 0.88,
+        "Spanish": 0.855,
+        "Swahili": 0.8525,
+        "Yoruba": 0.8525,
+        "Chinese": 0.89,
+        "Burmese": 0.8725
+      }
+    },
+    {
+      "model_id": "xai/grok-3-mini",
+      "name": "grok-3-mini",
+      "developer": "xai",
+      "scores": {
+        "Global MMLU Lite": 0.673,
+        "Culturally Sensitive": 0.6717,
+        "Culturally Agnostic": 0.6743,
+        "Arabic": 0.755,
+        "English": 0.5075,
+        "Bengali": 0.7355,
+        "German": 0.6591,
+        "French": 0.485,
+        "Hindi": 0.56,
+        "Indonesian": 0.725,
+        "Italian": 0.696,
+        "Japanese": 0.6575,
+        "Korean": 0.7325,
+        "Portuguese": 0.6275,
+        "Spanish": 0.61,
+        "Swahili": 0.7625,
+        "Yoruba": 0.8296,
+        "Chinese": 0.5564,
+        "Burmese": 0.8693
+      }
+    },
+    {
+      "model_id": "xai/grok-4-0709",
+      "name": "grok-4-0709",
+      "developer": "xai",
+      "scores": {
+        "Global MMLU Lite": 0.8881,
+        "Culturally Sensitive": 0.8862,
+        "Culturally Agnostic": 0.89,
+        "Arabic": 0.885,
+        "English": 0.905,
+        "Bengali": 0.8925,
+        "German": 0.8725,
+        "French": 0.875,
+        "Hindi": 0.8675,
+        "Indonesian": 0.89,
+        "Italian": 0.9025,
+        "Japanese": 0.87,
+        "Korean": 0.895,
+        "Portuguese": 0.8725,
+        "Spanish": 0.9075,
+        "Swahili": 0.91,
+        "Yoruba": 0.905,
+        "Chinese": 0.8525,
+        "Burmese": 0.9075
+      }
+    }
+  ]
+}

data/benchmarks/helm_capabilities.json ADDED Viewed

	@@ -0,0 +1,797 @@

+{
+  "models": [
+    {
+      "model_id": "allenai/olmo-2-0325-32b-instruct",
+      "name": "OLMo 2 32B Instruct March 2025",
+      "developer": "allenai",
+      "scores": {
+        "Mean score": 0.475,
+        "MMLU-Pro": 0.414,
+        "GPQA": 0.287,
+        "IFEval": 0.78,
+        "WildBench": 0.734,
+        "Omni-MATH": 0.161
+      }
+    },
+    {
+      "model_id": "allenai/olmo-2-1124-13b-instruct",
+      "name": "OLMo 2 13B Instruct November 2024",
+      "developer": "allenai",
+      "scores": {
+        "Mean score": 0.44,
+        "MMLU-Pro": 0.31,
+        "GPQA": 0.316,
+        "IFEval": 0.73,
+        "WildBench": 0.689,
+        "Omni-MATH": 0.156
+      }
+    },
+    {
+      "model_id": "allenai/olmo-2-1124-7b-instruct",
+      "name": "OLMo 2 7B Instruct November 2024",
+      "developer": "allenai",
+      "scores": {
+        "Mean score": 0.405,
+        "MMLU-Pro": 0.292,
+        "GPQA": 0.296,
+        "IFEval": 0.693,
+        "WildBench": 0.628,
+        "Omni-MATH": 0.116
+      }
+    },
+    {
+      "model_id": "allenai/olmoe-1b-7b-0125-instruct",
+      "name": "OLMoE 1B-7B Instruct January 2025",
+      "developer": "allenai",
+      "scores": {
+        "Mean score": 0.332,
+        "MMLU-Pro": 0.169,
+        "GPQA": 0.22,
+        "IFEval": 0.628,
+        "WildBench": 0.551,
+        "Omni-MATH": 0.093
+      }
+    },
+    {
+      "model_id": "amazon/nova-lite-v1:0",
+      "name": "Amazon Nova Lite",
+      "developer": "amazon",
+      "scores": {
+        "Mean score": 0.551,
+        "MMLU-Pro": 0.6,
+        "GPQA": 0.397,
+        "IFEval": 0.776,
+        "WildBench": 0.75,
+        "Omni-MATH": 0.233
+      }
+    },
+    {
+      "model_id": "amazon/nova-micro-v1:0",
+      "name": "Amazon Nova Micro",
+      "developer": "amazon",
+      "scores": {
+        "Mean score": 0.522,
+        "MMLU-Pro": 0.511,
+        "GPQA": 0.383,
+        "IFEval": 0.76,
+        "WildBench": 0.743,
+        "Omni-MATH": 0.214
+      }
+    },
+    {
+      "model_id": "amazon/nova-premier-v1:0",
+      "name": "Amazon Nova Premier",
+      "developer": "amazon",
+      "scores": {
+        "Mean score": 0.637,
+        "MMLU-Pro": 0.726,
+        "GPQA": 0.518,
+        "IFEval": 0.803,
+        "WildBench": 0.788,
+        "Omni-MATH": 0.35
+      }
+    },
+    {
+      "model_id": "amazon/nova-pro-v1:0",
+      "name": "Amazon Nova Pro",
+      "developer": "amazon",
+      "scores": {
+        "Mean score": 0.591,
+        "MMLU-Pro": 0.673,
+        "GPQA": 0.446,
+        "IFEval": 0.815,
+        "WildBench": 0.777,
+        "Omni-MATH": 0.242
+      }
+    },
+    {
+      "model_id": "anthropic/claude-3-5-haiku-20241022",
+      "name": "claude-3-5-haiku-20241022",
+      "developer": "anthropic",
+      "scores": {
+        "Mean score": 0.549,
+        "MMLU-Pro": 0.605,
+        "GPQA": 0.363,
+        "IFEval": 0.792,
+        "WildBench": 0.76,
+        "Omni-MATH": 0.224
+      }
+    },
+    {
+      "model_id": "anthropic/claude-3-5-sonnet-20241022",
+      "name": "Claude 3.5 Sonnet 20241022",
+      "developer": "anthropic",
+      "scores": {
+        "Mean score": 0.653,
+        "MMLU-Pro": 0.777,
+        "GPQA": 0.565,
+        "IFEval": 0.856,
+        "WildBench": 0.792,
+        "Omni-MATH": 0.276
+      }
+    },
+    {
+      "model_id": "anthropic/claude-3-7-sonnet-20250219",
+      "name": "claude-3-7-sonnet-20250219",
+      "developer": "anthropic",
+      "scores": {
+        "Mean score": 0.674,
+        "MMLU-Pro": 0.784,
+        "GPQA": 0.608,
+        "IFEval": 0.834,
+        "WildBench": 0.814,
+        "Omni-MATH": 0.33
+      }
+    },
+    {
+      "model_id": "anthropic/claude-opus-4-20250514",
+      "name": "Claude 4 Opus 20250514",
+      "developer": "anthropic",
+      "scores": {
+        "Mean score": 0.757,
+        "MMLU-Pro": 0.859,
+        "GPQA": 0.666,
+        "IFEval": 0.918,
+        "WildBench": 0.833,
+        "Omni-MATH": 0.511
+      }
+    },
+    {
+      "model_id": "anthropic/claude-opus-4-20250514-thinking-10k",
+      "name": "Claude 4 Opus 20250514, extended thinking",
+      "developer": "anthropic",
+      "scores": {
+        "Mean score": 0.78,
+        "MMLU-Pro": 0.875,
+        "GPQA": 0.709,
+        "IFEval": 0.849,
+        "WildBench": 0.852,
+        "Omni-MATH": 0.616
+      }
+    },
+    {
+      "model_id": "anthropic/claude-sonnet-4-20250514",
+      "name": "claude-sonnet-4-20250514",
+      "developer": "anthropic",
+      "scores": {
+        "Mean score": 0.733,
+        "MMLU-Pro": 0.843,
+        "GPQA": 0.643,
+        "IFEval": 0.839,
+        "WildBench": 0.825,
+        "Omni-MATH": 0.512
+      }
+    },
+    {
+      "model_id": "anthropic/claude-sonnet-4-20250514-thinking-10k",
+      "name": "Claude 4 Sonnet 20250514, extended thinking",
+      "developer": "anthropic",
+      "scores": {
+        "Mean score": 0.766,
+        "MMLU-Pro": 0.843,
+        "GPQA": 0.706,
+        "IFEval": 0.84,
+        "WildBench": 0.838,
+        "Omni-MATH": 0.602
+      }
+    },
+    {
+      "model_id": "deepseek-ai/deepseek-r1-0528",
+      "name": "DeepSeek-R1-0528",
+      "developer": "deepseek-ai",
+      "scores": {
+        "Mean score": 0.699,
+        "MMLU-Pro": 0.793,
+        "GPQA": 0.666,
+        "IFEval": 0.784,
+        "WildBench": 0.828,
+        "Omni-MATH": 0.424
+      }
+    },
+    {
+      "model_id": "deepseek-ai/deepseek-v3",
+      "name": "DeepSeek v3",
+      "developer": "deepseek-ai",
+      "scores": {
+        "Mean score": 0.665,
+        "MMLU-Pro": 0.723,
+        "GPQA": 0.538,
+        "IFEval": 0.832,
+        "WildBench": 0.831,
+        "Omni-MATH": 0.403
+      }
+    },
+    {
+      "model_id": "google/gemini-1.5-flash-002",
+      "name": "Gemini 1.5 Flash 002",
+      "developer": "google",
+      "scores": {
+        "Mean score": 0.609,
+        "MMLU-Pro": 0.678,
+        "GPQA": 0.437,
+        "IFEval": 0.831,
+        "WildBench": 0.792,
+        "Omni-MATH": 0.305
+      }
+    },
+    {
+      "model_id": "google/gemini-1.5-pro-002",
+      "name": "Gemini 1.5 Pro 002",
+      "developer": "google",
+      "scores": {
+        "Mean score": 0.657,
+        "MMLU-Pro": 0.737,
+        "GPQA": 0.534,
+        "IFEval": 0.837,
+        "WildBench": 0.813,
+        "Omni-MATH": 0.364
+      }
+    },
+    {
+      "model_id": "google/gemini-2.0-flash-001",
+      "name": "Gemini 2.0 Flash",
+      "developer": "google",
+      "scores": {
+        "Mean score": 0.679,
+        "MMLU-Pro": 0.737,
+        "GPQA": 0.556,
+        "IFEval": 0.841,
+        "WildBench": 0.8,
+        "Omni-MATH": 0.459
+      }
+    },
+    {
+      "model_id": "google/gemini-2.0-flash-lite-preview-02-05",
+      "name": "Gemini 2.0 Flash Lite 02-05 preview",
+      "developer": "google",
+      "scores": {
+        "Mean score": 0.642,
+        "MMLU-Pro": 0.72,
+        "GPQA": 0.5,
+        "IFEval": 0.824,
+        "WildBench": 0.79,
+        "Omni-MATH": 0.374
+      }
+    },
+    {
+      "model_id": "google/gemini-2.5-flash-lite",
+      "name": "Gemini 2.5 Flash-Lite",
+      "developer": "google",
+      "scores": {
+        "Mean score": 0.591,
+        "MMLU-Pro": 0.537,
+        "GPQA": 0.309,
+        "IFEval": 0.81,
+        "WildBench": 0.818,
+        "Omni-MATH": 0.48
+      }
+    },
+    {
+      "model_id": "google/gemini-2.5-flash-preview-04-17",
+      "name": "Gemini 2.5 Flash 04-17 preview",
+      "developer": "google",
+      "scores": {
+        "Mean score": 0.626,
+        "MMLU-Pro": 0.639,
+        "GPQA": 0.39,
+        "IFEval": 0.898,
+        "WildBench": 0.817,
+        "Omni-MATH": 0.384
+      }
+    },
+    {
+      "model_id": "google/gemini-2.5-pro-preview-03-25",
+      "name": "Gemini 2.5 Pro 03-25 preview",
+      "developer": "google",
+      "scores": {
+        "Mean score": 0.745,
+        "MMLU-Pro": 0.863,
+        "GPQA": 0.749,
+        "IFEval": 0.84,
+        "WildBench": 0.857,
+        "Omni-MATH": 0.416
+      }
+    },
+    {
+      "model_id": "ibm/granite-3.3-8b-instruct",
+      "name": "IBM Granite 3.3 8B Instruct",
+      "developer": "ibm",
+      "scores": {
+        "Mean score": 0.463,
+        "MMLU-Pro": 0.343,
+        "GPQA": 0.325,
+        "IFEval": 0.729,
+        "WildBench": 0.741,
+        "Omni-MATH": 0.176
+      }
+    },
+    {
+      "model_id": "marin-community/marin-8b-instruct",
+      "name": "Marin 8B Instruct",
+      "developer": "marin-community",
+      "scores": {
+        "Mean score": 0.325,
+        "MMLU-Pro": 0.188,
+        "GPQA": 0.168,
+        "IFEval": 0.632,
+        "WildBench": 0.477,
+        "Omni-MATH": 0.16
+      }
+    },
+    {
+      "model_id": "meta/llama-3.1-405b-instruct-turbo",
+      "name": "Llama 3.1 Instruct Turbo 405B",
+      "developer": "meta",
+      "scores": {
+        "Mean score": 0.618,
+        "MMLU-Pro": 0.723,
+        "GPQA": 0.522,
+        "IFEval": 0.811,
+        "WildBench": 0.783,
+        "Omni-MATH": 0.249
+      }
+    },
+    {
+      "model_id": "meta/llama-3.1-70b-instruct-turbo",
+      "name": "Llama 3.1 Instruct Turbo 70B",
+      "developer": "meta",
+      "scores": {
+        "Mean score": 0.574,
+        "MMLU-Pro": 0.653,
+        "GPQA": 0.426,
+        "IFEval": 0.821,
+        "WildBench": 0.758,
+        "Omni-MATH": 0.21
+      }
+    },
+    {
+      "model_id": "meta/llama-3.1-8b-instruct-turbo",
+      "name": "Llama 3.1 Instruct Turbo 8B",
+      "developer": "meta",
+      "scores": {
+        "Mean score": 0.444,
+        "MMLU-Pro": 0.406,
+        "GPQA": 0.247,
+        "IFEval": 0.743,
+        "WildBench": 0.686,
+        "Omni-MATH": 0.137
+      }
+    },
+    {
+      "model_id": "meta/llama-4-maverick-17b-128e-instruct-fp8",
+      "name": "Llama 4 Maverick 17Bx128E Instruct FP8",
+      "developer": "meta",
+      "scores": {
+        "Mean score": 0.718,
+        "MMLU-Pro": 0.81,
+        "GPQA": 0.65,
+        "IFEval": 0.908,
+        "WildBench": 0.8,
+        "Omni-MATH": 0.422
+      }
+    },
+    {
+      "model_id": "meta/llama-4-scout-17b-16e-instruct",
+      "name": "Llama 4 Scout 17Bx16E Instruct",
+      "developer": "meta",
+      "scores": {
+        "Mean score": 0.644,
+        "MMLU-Pro": 0.742,
+        "GPQA": 0.507,
+        "IFEval": 0.818,
+        "WildBench": 0.779,
+        "Omni-MATH": 0.373
+      }
+    },
+    {
+      "model_id": "mistralai/mistral-7b-instruct-v0.3",
+      "name": "Mistral Instruct v0.3 7B",
+      "developer": "mistralai",
+      "scores": {
+        "Mean score": 0.376,
+        "MMLU-Pro": 0.277,
+        "GPQA": 0.303,
+        "IFEval": 0.567,
+        "WildBench": 0.66,
+        "Omni-MATH": 0.072
+      }
+    },
+    {
+      "model_id": "mistralai/mistral-large-2411",
+      "name": "Mistral Large 2411",
+      "developer": "mistralai",
+      "scores": {
+        "Mean score": 0.598,
+        "MMLU-Pro": 0.599,
+        "GPQA": 0.435,
+        "IFEval": 0.876,
+        "WildBench": 0.801,
+        "Omni-MATH": 0.281
+      }
+    },
+    {
+      "model_id": "mistralai/mistral-small-2503",
+      "name": "mistral-small-2503",
+      "developer": "mistralai",
+      "scores": {
+        "Mean score": 0.558,
+        "MMLU-Pro": 0.61,
+        "GPQA": 0.392,
+        "IFEval": 0.75,
+        "WildBench": 0.788,
+        "Omni-MATH": 0.248
+      }
+    },
+    {
+      "model_id": "mistralai/mixtral-8x22b-instruct-v0.1",
+      "name": "Mixtral Instruct 8x22B",
+      "developer": "mistralai",
+      "scores": {
+        "Mean score": 0.478,
+        "MMLU-Pro": 0.46,
+        "GPQA": 0.334,
+        "IFEval": 0.724,
+        "WildBench": 0.711,
+        "Omni-MATH": 0.163
+      }
+    },
+    {
+      "model_id": "mistralai/mixtral-8x7b-instruct-v0.1",
+      "name": "Mixtral Instruct 8x7B",
+      "developer": "mistralai",
+      "scores": {
+        "Mean score": 0.397,
+        "MMLU-Pro": 0.335,
+        "GPQA": 0.296,
+        "IFEval": 0.575,
+        "WildBench": 0.673,
+        "Omni-MATH": 0.105
+      }
+    },
+    {
+      "model_id": "moonshotai/kimi-k2-instruct",
+      "name": "Kimi K2 Instruct",
+      "developer": "moonshotai",
+      "scores": {
+        "Mean score": 0.768,
+        "MMLU-Pro": 0.819,
+        "GPQA": 0.652,
+        "IFEval": 0.85,
+        "WildBench": 0.862,
+        "Omni-MATH": 0.654
+      }
+    },
+    {
+      "model_id": "openai/gpt-4.1-2025-04-14",
+      "name": "gpt-4.1-2025-04-14",
+      "developer": "openai",
+      "scores": {
+        "Mean score": 0.727,
+        "MMLU-Pro": 0.811,
+        "GPQA": 0.659,
+        "IFEval": 0.838,
+        "WildBench": 0.854,
+        "Omni-MATH": 0.471
+      }
+    },
+    {
+      "model_id": "openai/gpt-4.1-mini-2025-04-14",
+      "name": "GPT-4.1 mini 2025-04-14",
+      "developer": "openai",
+      "scores": {
+        "Mean score": 0.726,
+        "MMLU-Pro": 0.783,
+        "GPQA": 0.614,
+        "IFEval": 0.904,
+        "WildBench": 0.838,
+        "Omni-MATH": 0.491
+      }
+    },
+    {
+      "model_id": "openai/gpt-4.1-nano-2025-04-14",
+      "name": "GPT-4.1 nano 2025-04-14",
+      "developer": "openai",
+      "scores": {
+        "Mean score": 0.616,
+        "MMLU-Pro": 0.55,
+        "GPQA": 0.507,
+        "IFEval": 0.843,
+        "WildBench": 0.811,
+        "Omni-MATH": 0.367
+      }
+    },
+    {
+      "model_id": "openai/gpt-4o-2024-11-20",
+      "name": "GPT-4o 2024-11-20",
+      "developer": "openai",
+      "scores": {
+        "Mean score": 0.634,
+        "MMLU-Pro": 0.713,
+        "GPQA": 0.52,
+        "IFEval": 0.817,
+        "WildBench": 0.828,
+        "Omni-MATH": 0.293
+      }
+    },
+    {
+      "model_id": "openai/gpt-4o-mini-2024-07-18",
+      "name": "GPT-4o mini 2024-07-18",
+      "developer": "openai",
+      "scores": {
+        "Mean score": 0.565,
+        "MMLU-Pro": 0.603,
+        "GPQA": 0.368,
+        "IFEval": 0.782,
+        "WildBench": 0.791,
+        "Omni-MATH": 0.28
+      }
+    },
+    {
+      "model_id": "openai/gpt-5-2025-08-07",
+      "name": "gpt-5-2025-08-07",
+      "developer": "openai",
+      "scores": {
+        "Mean score": 0.807,
+        "MMLU-Pro": 0.863,
+        "GPQA": 0.791,
+        "IFEval": 0.875,
+        "WildBench": 0.857,
+        "Omni-MATH": 0.647
+      }
+    },
+    {
+      "model_id": "openai/gpt-5-mini-2025-08-07",
+      "name": "GPT-5 mini 2025-08-07",
+      "developer": "openai",
+      "scores": {
+        "Mean score": 0.819,
+        "MMLU-Pro": 0.835,
+        "GPQA": 0.756,
+        "IFEval": 0.927,
+        "WildBench": 0.855,
+        "Omni-MATH": 0.722
+      }
+    },
+    {
+      "model_id": "openai/gpt-5-nano-2025-08-07",
+      "name": "GPT-5 nano 2025-08-07",
+      "developer": "openai",
+      "scores": {
+        "Mean score": 0.748,
+        "MMLU-Pro": 0.778,
+        "GPQA": 0.679,
+        "IFEval": 0.932,
+        "WildBench": 0.806,
+        "Omni-MATH": 0.547
+      }
+    },
+    {
+      "model_id": "openai/gpt-oss-120b",
+      "name": "gpt-oss-120b",
+      "developer": "openai",
+      "scores": {
+        "Mean score": 0.77,
+        "MMLU-Pro": 0.795,
+        "GPQA": 0.684,
+        "IFEval": 0.836,
+        "WildBench": 0.845,
+        "Omni-MATH": 0.688
+      }
+    },
+    {
+      "model_id": "openai/gpt-oss-20b",
+      "name": "gpt-oss-20b",
+      "developer": "openai",
+      "scores": {
+        "Mean score": 0.674,
+        "MMLU-Pro": 0.74,
+        "GPQA": 0.594,
+        "IFEval": 0.732,
+        "WildBench": 0.737,
+        "Omni-MATH": 0.565
+      }
+    },
+    {
+      "model_id": "openai/o3-2025-04-16",
+      "name": "o3 2025-04-16",
+      "developer": "openai",
+      "scores": {
+        "Mean score": 0.811,
+        "MMLU-Pro": 0.859,
+        "GPQA": 0.753,
+        "IFEval": 0.869,
+        "WildBench": 0.861,
+        "Omni-MATH": 0.714
+      }
+    },
+    {
+      "model_id": "openai/o4-mini-2025-04-16",
+      "name": "o4-mini-2025-04-16",
+      "developer": "openai",
+      "scores": {
+        "Mean score": 0.812,
+        "MMLU-Pro": 0.82,
+        "GPQA": 0.735,
+        "IFEval": 0.929,
+        "WildBench": 0.854,
+        "Omni-MATH": 0.72
+      }
+    },
+    {
+      "model_id": "qwen/qwen2.5-72b-instruct-turbo",
+      "name": "Qwen2.5 Instruct Turbo 72B",
+      "developer": "qwen",
+      "scores": {
+        "Mean score": 0.599,
+        "MMLU-Pro": 0.631,
+        "GPQA": 0.426,
+        "IFEval": 0.806,
+        "WildBench": 0.802,
+        "Omni-MATH": 0.33
+      }
+    },
+    {
+      "model_id": "qwen/qwen2.5-7b-instruct-turbo",
+      "name": "Qwen2.5 Instruct Turbo 7B",
+      "developer": "qwen",
+      "scores": {
+        "Mean score": 0.529,
+        "MMLU-Pro": 0.539,
+        "GPQA": 0.341,
+        "IFEval": 0.741,
+        "WildBench": 0.731,
+        "Omni-MATH": 0.294
+      }
+    },
+    {
+      "model_id": "qwen/qwen3-235b-a22b-fp8-tput",
+      "name": "Qwen3 235B A22B FP8 Throughput",
+      "developer": "qwen",
+      "scores": {
+        "Mean score": 0.726,
+        "MMLU-Pro": 0.817,
+        "GPQA": 0.623,
+        "IFEval": 0.816,
+        "WildBench": 0.828,
+        "Omni-MATH": 0.548
+      }
+    },
+    {
+      "model_id": "qwen/qwen3-235b-a22b-instruct-2507-fp8",
+      "name": "Qwen3 235B A22B Instruct 2507 FP8",
+      "developer": "qwen",
+      "scores": {
+        "Mean score": 0.798,
+        "MMLU-Pro": 0.844,
+        "GPQA": 0.726,
+        "IFEval": 0.835,
+        "WildBench": 0.866,
+        "Omni-MATH": 0.718
+      }
+    },
+    {
+      "model_id": "writer/palmyra-fin",
+      "name": "Palmyra Fin",
+      "developer": "writer",
+      "scores": {
+        "Mean score": 0.577,
+        "MMLU-Pro": 0.591,
+        "GPQA": 0.422,
+        "IFEval": 0.793,
+        "WildBench": 0.783,
+        "Omni-MATH": 0.295
+      }
+    },
+    {
+      "model_id": "writer/palmyra-med",
+      "name": "Palmyra Med",
+      "developer": "writer",
+      "scores": {
+        "Mean score": 0.476,
+        "MMLU-Pro": 0.411,
+        "GPQA": 0.368,
+        "IFEval": 0.767,
+        "WildBench": 0.676,
+        "Omni-MATH": 0.156
+      }
+    },
+    {
+      "model_id": "writer/palmyra-x-004",
+      "name": "Palmyra-X-004",
+      "developer": "writer",
+      "scores": {
+        "Mean score": 0.609,
+        "MMLU-Pro": 0.657,
+        "GPQA": 0.395,
+        "IFEval": 0.872,
+        "WildBench": 0.802,
+        "Omni-MATH": 0.32
+      }
+    },
+    {
+      "model_id": "writer/palmyra-x5",
+      "name": "Palmyra X5",
+      "developer": "writer",
+      "scores": {
+        "Mean score": 0.696,
+        "MMLU-Pro": 0.804,
+        "GPQA": 0.661,
+        "IFEval": 0.823,
+        "WildBench": 0.78,
+        "Omni-MATH": 0.414
+      }
+    },
+    {
+      "model_id": "xai/grok-3-beta",
+      "name": "Grok 3 Beta",
+      "developer": "xai",
+      "scores": {
+        "Mean score": 0.727,
+        "MMLU-Pro": 0.788,
+        "GPQA": 0.65,
+        "IFEval": 0.884,
+        "WildBench": 0.849,
+        "Omni-MATH": 0.464
+      }
+    },
+    {
+      "model_id": "xai/grok-3-mini-beta",
+      "name": "Grok 3 mini Beta",
+      "developer": "xai",
+      "scores": {
+        "Mean score": 0.679,
+        "MMLU-Pro": 0.799,
+        "GPQA": 0.675,
+        "IFEval": 0.951,
+        "WildBench": 0.651,
+        "Omni-MATH": 0.318
+      }
+    },
+    {
+      "model_id": "xai/grok-4-0709",
+      "name": "grok-4-0709",
+      "developer": "xai",
+      "scores": {
+        "Mean score": 0.785,
+        "MMLU-Pro": 0.851,
+        "GPQA": 0.726,
+        "IFEval": 0.949,
+        "WildBench": 0.797,
+        "Omni-MATH": 0.603
+      }
+    },
+    {
+      "model_id": "zai-org/glm-4.5-air-fp8",
+      "name": "GLM-4.5-Air-FP8",
+      "developer": "zai-org",
+      "scores": {
+        "Mean score": 0.67,
+        "MMLU-Pro": 0.762,
+        "GPQA": 0.594,
+        "IFEval": 0.812,
+        "WildBench": 0.789,
+        "Omni-MATH": 0.391
+      }
+    }
+  ]
+}

data/benchmarks/helm_classic.json ADDED Viewed

	@@ -0,0 +1,1478 @@

+{
+  "models": [
+    {
+      "model_id": "Anthropic-LM-v4-s3-52B",
+      "name": "Anthropic-LM v4-s3 52B",
+      "developer": "unknown",
+      "scores": {
+        "Mean win rate": 0.78,
+        "MMLU": 0.481,
+        "BoolQ": 0.815,
+        "NarrativeQA": 0.728,
+        "NaturalQuestions (open-book)": 0.686,
+        "QuAC": 0.431,
+        "HellaSwag": 0.807,
+        "OpenbookQA": 0.558,
+        "TruthfulQA": 0.368,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": 0.154,
+        "XSUM": 0.134,
+        "IMDB": 0.934,
+        "CivilComments": 0.61,
+        "RAFT": 0.699
+      }
+    },
+    {
+      "model_id": "ai21/J1-Grande-v1-17B",
+      "name": "J1-Grande v1 17B",
+      "developer": "ai21",
+      "scores": {
+        "Mean win rate": 0.433,
+        "MMLU": 0.27,
+        "BoolQ": 0.722,
+        "NarrativeQA": 0.672,
+        "NaturalQuestions (open-book)": 0.578,
+        "QuAC": 0.362,
+        "HellaSwag": 0.739,
+        "OpenbookQA": 0.52,
+        "TruthfulQA": 0.193,
+        "MS MARCO (TREC)": 0.341,
+        "CNN/DailyMail": 0.143,
+        "XSUM": 0.122,
+        "IMDB": 0.953,
+        "CivilComments": 0.529,
+        "RAFT": 0.658
+      }
+    },
+    {
+      "model_id": "ai21/J1-Grande-v2-beta-17B",
+      "name": "J1-Grande v2 beta 17B",
+      "developer": "ai21",
+      "scores": {
+        "Mean win rate": 0.706,
+        "MMLU": 0.445,
+        "BoolQ": 0.812,
+        "NarrativeQA": 0.725,
+        "NaturalQuestions (open-book)": 0.625,
+        "QuAC": 0.392,
+        "HellaSwag": 0.764,
+        "OpenbookQA": 0.56,
+        "TruthfulQA": 0.306,
+        "MS MARCO (TREC)": 0.46,
+        "CNN/DailyMail": 0.146,
+        "XSUM": 0.152,
+        "IMDB": 0.957,
+        "CivilComments": 0.546,
+        "RAFT": 0.679
+      }
+    },
+    {
+      "model_id": "ai21/J1-Jumbo-v1-178B",
+      "name": "J1-Jumbo v1 178B",
+      "developer": "ai21",
+      "scores": {
+        "Mean win rate": 0.517,
+        "MMLU": 0.259,
+        "BoolQ": 0.776,
+        "NarrativeQA": 0.695,
+        "NaturalQuestions (open-book)": 0.595,
+        "QuAC": 0.358,
+        "HellaSwag": 0.765,
+        "OpenbookQA": 0.534,
+        "TruthfulQA": 0.175,
+        "MS MARCO (TREC)": 0.363,
+        "CNN/DailyMail": 0.144,
+        "XSUM": 0.129,
+        "IMDB": 0.943,
+        "CivilComments": 0.553,
+        "RAFT": 0.681
+      }
+    },
+    {
+      "model_id": "ai21/J1-Large-v1-7.5B",
+      "name": "J1-Large v1 7.5B",
+      "developer": "ai21",
+      "scores": {
+        "Mean win rate": 0.285,
+        "MMLU": 0.241,
+        "BoolQ": 0.683,
+        "NarrativeQA": 0.623,
+        "NaturalQuestions (open-book)": 0.532,
+        "QuAC": 0.328,
+        "HellaSwag": 0.7,
+        "OpenbookQA": 0.514,
+        "TruthfulQA": 0.197,
+        "MS MARCO (TREC)": 0.292,
+        "CNN/DailyMail": 0.134,
+        "XSUM": 0.102,
+        "IMDB": 0.956,
+        "CivilComments": 0.532,
+        "RAFT": 0.545
+      }
+    },
+    {
+      "model_id": "ai21/Jurassic-2-Grande-17B",
+      "name": "Jurassic-2 Grande 17B",
+      "developer": "ai21",
+      "scores": {
+        "Mean win rate": 0.743,
+        "MMLU": 0.475,
+        "BoolQ": 0.826,
+        "NarrativeQA": 0.737,
+        "NaturalQuestions (open-book)": 0.639,
+        "QuAC": 0.418,
+        "HellaSwag": 0.781,
+        "OpenbookQA": 0.542,
+        "TruthfulQA": 0.348,
+        "MS MARCO (TREC)": 0.514,
+        "CNN/DailyMail": 0.144,
+        "XSUM": 0.167,
+        "IMDB": 0.938,
+        "CivilComments": 0.547,
+        "RAFT": 0.712
+      }
+    },
+    {
+      "model_id": "ai21/Jurassic-2-Jumbo-178B",
+      "name": "Jurassic-2 Jumbo 178B",
+      "developer": "ai21",
+      "scores": {
+        "Mean win rate": 0.824,
+        "MMLU": 0.48,
+        "BoolQ": 0.829,
+        "NarrativeQA": 0.733,
+        "NaturalQuestions (open-book)": 0.669,
+        "QuAC": 0.435,
+        "HellaSwag": 0.788,
+        "OpenbookQA": 0.558,
+        "TruthfulQA": 0.437,
+        "MS MARCO (TREC)": 0.661,
+        "CNN/DailyMail": 0.149,
+        "XSUM": 0.182,
+        "IMDB": 0.938,
+        "CivilComments": 0.57,
+        "RAFT": 0.746
+      }
+    },
+    {
+      "model_id": "ai21/Jurassic-2-Large-7.5B",
+      "name": "Jurassic-2 Large 7.5B",
+      "developer": "ai21",
+      "scores": {
+        "Mean win rate": 0.553,
+        "MMLU": 0.339,
+        "BoolQ": 0.742,
+        "NarrativeQA": -1.0,
+        "NaturalQuestions (open-book)": 0.589,
+        "QuAC": -1.0,
+        "HellaSwag": 0.729,
+        "OpenbookQA": 0.53,
+        "TruthfulQA": 0.245,
+        "MS MARCO (TREC)": 0.464,
+        "CNN/DailyMail": 0.136,
+        "XSUM": 0.142,
+        "IMDB": 0.956,
+        "CivilComments": 0.57,
+        "RAFT": 0.622
+      }
+    },
+    {
+      "model_id": "aleph-alpha/Luminous-Base-13B",
+      "name": "Luminous Base 13B",
+      "developer": "aleph-alpha",
+      "scores": {
+        "Mean win rate": 0.315,
+        "MMLU": 0.27,
+        "BoolQ": 0.719,
+        "NarrativeQA": 0.605,
+        "NaturalQuestions (open-book)": 0.568,
+        "QuAC": 0.334,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.182,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": 0.11,
+        "XSUM": 0.105,
+        "IMDB": 0.939,
+        "CivilComments": 0.544,
+        "RAFT": 0.473
+      }
+    },
+    {
+      "model_id": "aleph-alpha/Luminous-Extended-30B",
+      "name": "Luminous Extended 30B",
+      "developer": "aleph-alpha",
+      "scores": {
+        "Mean win rate": 0.485,
+        "MMLU": 0.321,
+        "BoolQ": 0.767,
+        "NarrativeQA": 0.665,
+        "NaturalQuestions (open-book)": 0.609,
+        "QuAC": 0.349,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.221,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": 0.139,
+        "XSUM": 0.124,
+        "IMDB": 0.947,
+        "CivilComments": 0.524,
+        "RAFT": 0.523
+      }
+    },
+    {
+      "model_id": "aleph-alpha/Luminous-Supreme-70B",
+      "name": "Luminous Supreme 70B",
+      "developer": "aleph-alpha",
+      "scores": {
+        "Mean win rate": 0.662,
+        "MMLU": 0.38,
+        "BoolQ": 0.775,
+        "NarrativeQA": 0.711,
+        "NaturalQuestions (open-book)": 0.649,
+        "QuAC": 0.37,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.222,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": 0.15,
+        "XSUM": 0.136,
+        "IMDB": 0.959,
+        "CivilComments": 0.562,
+        "RAFT": 0.653
+      }
+    },
+    {
+      "model_id": "bigscience/BLOOM-176B",
+      "name": "BLOOM 176B",
+      "developer": "bigscience",
+      "scores": {
+        "Mean win rate": 0.446,
+        "MMLU": 0.299,
+        "BoolQ": 0.704,
+        "NarrativeQA": 0.662,
+        "NaturalQuestions (open-book)": 0.621,
+        "QuAC": 0.361,
+        "HellaSwag": 0.744,
+        "OpenbookQA": 0.534,
+        "TruthfulQA": 0.205,
+        "MS MARCO (TREC)": 0.386,
+        "CNN/DailyMail": 0.08,
+        "XSUM": 0.03,
+        "IMDB": 0.945,
+        "CivilComments": 0.62,
+        "RAFT": 0.592
+      }
+    },
+    {
+      "model_id": "bigscience/T0pp-11B",
+      "name": "T0pp 11B",
+      "developer": "bigscience",
+      "scores": {
+        "Mean win rate": 0.197,
+        "MMLU": 0.407,
+        "BoolQ": 0.0,
+        "NarrativeQA": 0.151,
+        "NaturalQuestions (open-book)": 0.19,
+        "QuAC": 0.121,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.377,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": 0.122,
+        "XSUM": 0.09,
+        "IMDB": 0.207,
+        "CivilComments": 0.234,
+        "RAFT": 0.118
+      }
+    },
+    {
+      "model_id": "cohere/Cohere-Command-beta-52.4B",
+      "name": "Cohere Command beta 52.4B",
+      "developer": "cohere",
+      "scores": {
+        "Mean win rate": 0.874,
+        "MMLU": 0.452,
+        "BoolQ": 0.856,
+        "NarrativeQA": 0.752,
+        "NaturalQuestions (open-book)": 0.76,
+        "QuAC": 0.432,
+        "HellaSwag": 0.811,
+        "OpenbookQA": 0.582,
+        "TruthfulQA": 0.269,
+        "MS MARCO (TREC)": 0.762,
+        "CNN/DailyMail": 0.161,
+        "XSUM": 0.152,
+        "IMDB": 0.96,
+        "CivilComments": 0.601,
+        "RAFT": 0.667
+      }
+    },
+    {
+      "model_id": "cohere/Cohere-Command-beta-6.1B",
+      "name": "Cohere Command beta 6.1B",
+      "developer": "cohere",
+      "scores": {
+        "Mean win rate": 0.675,
+        "MMLU": 0.406,
+        "BoolQ": 0.798,
+        "NarrativeQA": 0.709,
+        "NaturalQuestions (open-book)": 0.717,
+        "QuAC": 0.375,
+        "HellaSwag": 0.752,
+        "OpenbookQA": 0.55,
+        "TruthfulQA": 0.203,
+        "MS MARCO (TREC)": 0.709,
+        "CNN/DailyMail": 0.153,
+        "XSUM": 0.122,
+        "IMDB": 0.961,
+        "CivilComments": 0.54,
+        "RAFT": 0.634
+      }
+    },
+    {
+      "model_id": "cohere/Cohere-large-v20220720-13.1B",
+      "name": "Cohere large v20220720 13.1B",
+      "developer": "cohere",
+      "scores": {
+        "Mean win rate": 0.372,
+        "MMLU": 0.324,
+        "BoolQ": 0.725,
+        "NarrativeQA": 0.625,
+        "NaturalQuestions (open-book)": 0.573,
+        "QuAC": 0.338,
+        "HellaSwag": 0.736,
+        "OpenbookQA": 0.542,
+        "TruthfulQA": 0.181,
+        "MS MARCO (TREC)": 0.33,
+        "CNN/DailyMail": 0.126,
+        "XSUM": 0.108,
+        "IMDB": 0.933,
+        "CivilComments": 0.507,
+        "RAFT": 0.596
+      }
+    },
+    {
+      "model_id": "cohere/Cohere-medium-v20220720-6.1B",
+      "name": "Cohere medium v20220720 6.1B",
+      "developer": "cohere",
+      "scores": {
+        "Mean win rate": 0.23,
+        "MMLU": 0.279,
+        "BoolQ": 0.659,
+        "NarrativeQA": 0.559,
+        "NaturalQuestions (open-book)": 0.504,
+        "QuAC": 0.279,
+        "HellaSwag": 0.706,
+        "OpenbookQA": 0.496,
+        "TruthfulQA": 0.19,
+        "MS MARCO (TREC)": 0.374,
+        "CNN/DailyMail": 0.077,
+        "XSUM": 0.087,
+        "IMDB": 0.935,
+        "CivilComments": 0.504,
+        "RAFT": 0.52
+      }
+    },
+    {
+      "model_id": "cohere/Cohere-medium-v20221108-6.1B",
+      "name": "Cohere medium v20221108 6.1B",
+      "developer": "cohere",
+      "scores": {
+        "Mean win rate": 0.312,
+        "MMLU": 0.254,
+        "BoolQ": 0.7,
+        "NarrativeQA": 0.61,
+        "NaturalQuestions (open-book)": 0.517,
+        "QuAC": 0.314,
+        "HellaSwag": 0.726,
+        "OpenbookQA": 0.538,
+        "TruthfulQA": 0.215,
+        "MS MARCO (TREC)": 0.373,
+        "CNN/DailyMail": 0.121,
+        "XSUM": 0.099,
+        "IMDB": 0.935,
+        "CivilComments": 0.5,
+        "RAFT": 0.591
+      }
+    },
+    {
+      "model_id": "cohere/Cohere-small-v20220720-410M",
+      "name": "Cohere small v20220720 410M",
+      "developer": "cohere",
+      "scores": {
+        "Mean win rate": 0.109,
+        "MMLU": 0.264,
+        "BoolQ": 0.457,
+        "NarrativeQA": 0.294,
+        "NaturalQuestions (open-book)": 0.309,
+        "QuAC": 0.219,
+        "HellaSwag": 0.483,
+        "OpenbookQA": 0.348,
+        "TruthfulQA": 0.217,
+        "MS MARCO (TREC)": 0.304,
+        "CNN/DailyMail": 0.063,
+        "XSUM": 0.033,
+        "IMDB": 0.578,
+        "CivilComments": 0.501,
+        "RAFT": 0.492
+      }
+    },
+    {
+      "model_id": "cohere/Cohere-xlarge-v20220609-52.4B",
+      "name": "Cohere xlarge v20220609 52.4B",
+      "developer": "cohere",
+      "scores": {
+        "Mean win rate": 0.56,
+        "MMLU": 0.353,
+        "BoolQ": 0.718,
+        "NarrativeQA": 0.65,
+        "NaturalQuestions (open-book)": 0.595,
+        "QuAC": 0.361,
+        "HellaSwag": 0.811,
+        "OpenbookQA": 0.55,
+        "TruthfulQA": 0.198,
+        "MS MARCO (TREC)": 0.459,
+        "CNN/DailyMail": 0.144,
+        "XSUM": 0.129,
+        "IMDB": 0.956,
+        "CivilComments": 0.532,
+        "RAFT": 0.633
+      }
+    },
+    {
+      "model_id": "cohere/Cohere-xlarge-v20221108-52.4B",
+      "name": "Cohere xlarge v20221108 52.4B",
+      "developer": "cohere",
+      "scores": {
+        "Mean win rate": 0.664,
+        "MMLU": 0.382,
+        "BoolQ": 0.762,
+        "NarrativeQA": 0.672,
+        "NaturalQuestions (open-book)": 0.628,
+        "QuAC": 0.374,
+        "HellaSwag": 0.81,
+        "OpenbookQA": 0.588,
+        "TruthfulQA": 0.169,
+        "MS MARCO (TREC)": 0.55,
+        "CNN/DailyMail": 0.153,
+        "XSUM": 0.153,
+        "IMDB": 0.956,
+        "CivilComments": 0.524,
+        "RAFT": 0.624
+      }
+    },
+    {
+      "model_id": "eleutherai/Pythia-12B",
+      "name": "Pythia 12B",
+      "developer": "eleutherai",
+      "scores": {
+        "Mean win rate": 0.257,
+        "MMLU": 0.274,
+        "BoolQ": 0.662,
+        "NarrativeQA": 0.596,
+        "NaturalQuestions (open-book)": 0.581,
+        "QuAC": 0.313,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.177,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.931,
+        "CivilComments": 0.531,
+        "RAFT": 0.514
+      }
+    },
+    {
+      "model_id": "eleutherai/Pythia-6.9B",
+      "name": "Pythia 6.9B",
+      "developer": "eleutherai",
+      "scores": {
+        "Mean win rate": 0.196,
+        "MMLU": 0.236,
+        "BoolQ": 0.631,
+        "NarrativeQA": 0.528,
+        "NaturalQuestions (open-book)": 0.539,
+        "QuAC": 0.296,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.213,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.928,
+        "CivilComments": 0.511,
+        "RAFT": 0.502
+      }
+    },
+    {
+      "model_id": "google/Palmyra-X-43B",
+      "name": "Palmyra X 43B",
+      "developer": "google",
+      "scores": {
+        "Mean win rate": 0.732,
+        "MMLU": 0.609,
+        "BoolQ": 0.896,
+        "NarrativeQA": 0.742,
+        "NaturalQuestions (open-book)": -1.0,
+        "QuAC": 0.473,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.616,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": 0.049,
+        "XSUM": 0.149,
+        "IMDB": 0.935,
+        "CivilComments": 0.008,
+        "RAFT": 0.701
+      }
+    },
+    {
+      "model_id": "google/T5-11B",
+      "name": "T5 11B",
+      "developer": "google",
+      "scores": {
+        "Mean win rate": 0.131,
+        "MMLU": 0.29,
+        "BoolQ": 0.761,
+        "NarrativeQA": 0.086,
+        "NaturalQuestions (open-book)": 0.477,
+        "QuAC": 0.116,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.133,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": 0.043,
+        "XSUM": 0.015,
+        "IMDB": 0.379,
+        "CivilComments": 0.509,
+        "RAFT": 0.37
+      }
+    },
+    {
+      "model_id": "google/UL2-20B",
+      "name": "UL2 20B",
+      "developer": "google",
+      "scores": {
+        "Mean win rate": 0.167,
+        "MMLU": 0.291,
+        "BoolQ": 0.746,
+        "NarrativeQA": 0.083,
+        "NaturalQuestions (open-book)": 0.349,
+        "QuAC": 0.144,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.193,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": 0.03,
+        "XSUM": 0.058,
+        "IMDB": 0.337,
+        "CivilComments": 0.521,
+        "RAFT": 0.404
+      }
+    },
+    {
+      "model_id": "lmsys/Vicuna-v1.3-13B",
+      "name": "Vicuna v1.3 13B",
+      "developer": "lmsys",
+      "scores": {
+        "Mean win rate": 0.706,
+        "MMLU": 0.462,
+        "BoolQ": 0.808,
+        "NarrativeQA": 0.691,
+        "NaturalQuestions (open-book)": 0.686,
+        "QuAC": 0.403,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.385,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.762,
+        "CivilComments": 0.645,
+        "RAFT": 0.657
+      }
+    },
+    {
+      "model_id": "lmsys/Vicuna-v1.3-7B",
+      "name": "Vicuna v1.3 7B",
+      "developer": "lmsys",
+      "scores": {
+        "Mean win rate": 0.625,
+        "MMLU": 0.434,
+        "BoolQ": 0.76,
+        "NarrativeQA": 0.643,
+        "NaturalQuestions (open-book)": 0.634,
+        "QuAC": 0.392,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.292,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.916,
+        "CivilComments": 0.62,
+        "RAFT": 0.693
+      }
+    },
+    {
+      "model_id": "meta/LLaMA-13B",
+      "name": "LLaMA 13B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.595,
+        "MMLU": 0.422,
+        "BoolQ": 0.714,
+        "NarrativeQA": 0.711,
+        "NaturalQuestions (open-book)": 0.614,
+        "QuAC": 0.347,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.324,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.928,
+        "CivilComments": 0.6,
+        "RAFT": 0.643
+      }
+    },
+    {
+      "model_id": "meta/LLaMA-30B",
+      "name": "LLaMA 30B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.781,
+        "MMLU": 0.531,
+        "BoolQ": 0.861,
+        "NarrativeQA": 0.752,
+        "NaturalQuestions (open-book)": 0.666,
+        "QuAC": 0.39,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.344,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.927,
+        "CivilComments": 0.549,
+        "RAFT": 0.752
+      }
+    },
+    {
+      "model_id": "meta/LLaMA-65B",
+      "name": "LLaMA 65B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.908,
+        "MMLU": 0.584,
+        "BoolQ": 0.871,
+        "NarrativeQA": 0.755,
+        "NaturalQuestions (open-book)": 0.672,
+        "QuAC": 0.401,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.508,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.962,
+        "CivilComments": 0.655,
+        "RAFT": 0.702
+      }
+    },
+    {
+      "model_id": "meta/LLaMA-7B",
+      "name": "LLaMA 7B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.533,
+        "MMLU": 0.321,
+        "BoolQ": 0.756,
+        "NarrativeQA": 0.669,
+        "NaturalQuestions (open-book)": 0.589,
+        "QuAC": 0.338,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.28,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.947,
+        "CivilComments": 0.563,
+        "RAFT": 0.573
+      }
+    },
+    {
+      "model_id": "meta/Llama-2-13B",
+      "name": "Llama 2 13B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.823,
+        "MMLU": 0.507,
+        "BoolQ": 0.811,
+        "NarrativeQA": 0.744,
+        "NaturalQuestions (open-book)": 0.637,
+        "QuAC": 0.424,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.33,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.962,
+        "CivilComments": 0.588,
+        "RAFT": 0.707
+      }
+    },
+    {
+      "model_id": "meta/Llama-2-70B",
+      "name": "Llama 2 70B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.944,
+        "MMLU": 0.582,
+        "BoolQ": 0.886,
+        "NarrativeQA": 0.77,
+        "NaturalQuestions (open-book)": 0.674,
+        "QuAC": 0.484,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.554,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.961,
+        "CivilComments": 0.652,
+        "RAFT": 0.727
+      }
+    },
+    {
+      "model_id": "meta/Llama-2-7B",
+      "name": "Llama 2 7B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.607,
+        "MMLU": 0.431,
+        "BoolQ": 0.762,
+        "NarrativeQA": 0.691,
+        "NaturalQuestions (open-book)": 0.611,
+        "QuAC": 0.406,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.272,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.907,
+        "CivilComments": 0.562,
+        "RAFT": 0.643
+      }
+    },
+    {
+      "model_id": "meta/OPT-175B",
+      "name": "OPT 175B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.609,
+        "MMLU": 0.318,
+        "BoolQ": 0.793,
+        "NarrativeQA": 0.671,
+        "NaturalQuestions (open-book)": 0.615,
+        "QuAC": 0.36,
+        "HellaSwag": 0.791,
+        "OpenbookQA": 0.586,
+        "TruthfulQA": 0.25,
+        "MS MARCO (TREC)": 0.448,
+        "CNN/DailyMail": 0.146,
+        "XSUM": 0.155,
+        "IMDB": 0.947,
+        "CivilComments": 0.505,
+        "RAFT": 0.606
+      }
+    },
+    {
+      "model_id": "meta/OPT-66B",
+      "name": "OPT 66B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.448,
+        "MMLU": 0.276,
+        "BoolQ": 0.76,
+        "NarrativeQA": 0.638,
+        "NaturalQuestions (open-book)": 0.596,
+        "QuAC": 0.357,
+        "HellaSwag": 0.745,
+        "OpenbookQA": 0.534,
+        "TruthfulQA": 0.201,
+        "MS MARCO (TREC)": 0.482,
+        "CNN/DailyMail": 0.136,
+        "XSUM": 0.126,
+        "IMDB": 0.917,
+        "CivilComments": 0.506,
+        "RAFT": 0.557
+      }
+    },
+    {
+      "model_id": "microsoft/TNLG-v2-530B",
+      "name": "TNLG v2 530B",
+      "developer": "microsoft",
+      "scores": {
+        "Mean win rate": 0.787,
+        "MMLU": 0.469,
+        "BoolQ": 0.809,
+        "NarrativeQA": 0.722,
+        "NaturalQuestions (open-book)": 0.642,
+        "QuAC": 0.39,
+        "HellaSwag": 0.799,
+        "OpenbookQA": 0.562,
+        "TruthfulQA": 0.251,
+        "MS MARCO (TREC)": 0.643,
+        "CNN/DailyMail": 0.161,
+        "XSUM": 0.169,
+        "IMDB": 0.941,
+        "CivilComments": 0.601,
+        "RAFT": 0.679
+      }
+    },
+    {
+      "model_id": "microsoft/TNLG-v2-6.7B",
+      "name": "TNLG v2 6.7B",
+      "developer": "microsoft",
+      "scores": {
+        "Mean win rate": 0.309,
+        "MMLU": 0.242,
+        "BoolQ": 0.698,
+        "NarrativeQA": 0.631,
+        "NaturalQuestions (open-book)": 0.561,
+        "QuAC": 0.345,
+        "HellaSwag": 0.704,
+        "OpenbookQA": 0.478,
+        "TruthfulQA": 0.167,
+        "MS MARCO (TREC)": 0.332,
+        "CNN/DailyMail": 0.146,
+        "XSUM": 0.11,
+        "IMDB": 0.927,
+        "CivilComments": 0.532,
+        "RAFT": 0.525
+      }
+    },
+    {
+      "model_id": "mistralai/Mistral-v0.1-7B",
+      "name": "Mistral v0.1 7B",
+      "developer": "mistralai",
+      "scores": {
+        "Mean win rate": 0.884,
+        "MMLU": 0.572,
+        "BoolQ": 0.874,
+        "NarrativeQA": 0.716,
+        "NaturalQuestions (open-book)": 0.687,
+        "QuAC": 0.423,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.422,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.962,
+        "CivilComments": 0.624,
+        "RAFT": 0.707
+      }
+    },
+    {
+      "model_id": "mosaicml/MPT-30B",
+      "name": "MPT 30B",
+      "developer": "mosaicml",
+      "scores": {
+        "Mean win rate": 0.714,
+        "MMLU": 0.437,
+        "BoolQ": 0.704,
+        "NarrativeQA": 0.732,
+        "NaturalQuestions (open-book)": 0.673,
+        "QuAC": 0.393,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.231,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.959,
+        "CivilComments": 0.599,
+        "RAFT": 0.723
+      }
+    },
+    {
+      "model_id": "mosaicml/MPT-Instruct-30B",
+      "name": "MPT-Instruct 30B",
+      "developer": "mosaicml",
+      "scores": {
+        "Mean win rate": 0.716,
+        "MMLU": 0.444,
+        "BoolQ": 0.85,
+        "NarrativeQA": 0.733,
+        "NaturalQuestions (open-book)": 0.697,
+        "QuAC": 0.327,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.234,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.956,
+        "CivilComments": 0.573,
+        "RAFT": 0.68
+      }
+    },
+    {
+      "model_id": "openai/GPT-J-6B",
+      "name": "GPT-J 6B",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.273,
+        "MMLU": 0.249,
+        "BoolQ": 0.649,
+        "NarrativeQA": 0.545,
+        "NaturalQuestions (open-book)": 0.559,
+        "QuAC": 0.33,
+        "HellaSwag": 0.663,
+        "OpenbookQA": 0.514,
+        "TruthfulQA": 0.199,
+        "MS MARCO (TREC)": 0.345,
+        "CNN/DailyMail": 0.131,
+        "XSUM": 0.096,
+        "IMDB": 0.939,
+        "CivilComments": 0.52,
+        "RAFT": 0.619
+      }
+    },
+    {
+      "model_id": "openai/GPT-NeoX-20B",
+      "name": "GPT-NeoX 20B",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.351,
+        "MMLU": 0.276,
+        "BoolQ": 0.683,
+        "NarrativeQA": 0.599,
+        "NaturalQuestions (open-book)": 0.596,
+        "QuAC": 0.326,
+        "HellaSwag": 0.718,
+        "OpenbookQA": 0.524,
+        "TruthfulQA": 0.216,
+        "MS MARCO (TREC)": 0.398,
+        "CNN/DailyMail": 0.123,
+        "XSUM": 0.102,
+        "IMDB": 0.948,
+        "CivilComments": 0.516,
+        "RAFT": 0.505
+      }
+    },
+    {
+      "model_id": "openai/ada-350M",
+      "name": "ada 350M",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.108,
+        "MMLU": 0.243,
+        "BoolQ": 0.581,
+        "NarrativeQA": 0.326,
+        "NaturalQuestions (open-book)": 0.365,
+        "QuAC": 0.242,
+        "HellaSwag": 0.435,
+        "OpenbookQA": 0.38,
+        "TruthfulQA": 0.215,
+        "MS MARCO (TREC)": 0.29,
+        "CNN/DailyMail": 0.09,
+        "XSUM": 0.022,
+        "IMDB": 0.849,
+        "CivilComments": 0.517,
+        "RAFT": 0.423
+      }
+    },
+    {
+      "model_id": "openai/babbage-1.3B",
+      "name": "babbage 1.3B",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.114,
+        "MMLU": 0.235,
+        "BoolQ": 0.574,
+        "NarrativeQA": 0.491,
+        "NaturalQuestions (open-book)": 0.451,
+        "QuAC": 0.273,
+        "HellaSwag": 0.555,
+        "OpenbookQA": 0.438,
+        "TruthfulQA": 0.188,
+        "MS MARCO (TREC)": 0.317,
+        "CNN/DailyMail": 0.079,
+        "XSUM": 0.045,
+        "IMDB": 0.597,
+        "CivilComments": 0.519,
+        "RAFT": 0.455
+      }
+    },
+    {
+      "model_id": "openai/curie-6.7B",
+      "name": "curie 6.7B",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.247,
+        "MMLU": 0.243,
+        "BoolQ": 0.656,
+        "NarrativeQA": 0.604,
+        "NaturalQuestions (open-book)": 0.552,
+        "QuAC": 0.321,
+        "HellaSwag": 0.682,
+        "OpenbookQA": 0.502,
+        "TruthfulQA": 0.232,
+        "MS MARCO (TREC)": 0.3,
+        "CNN/DailyMail": 0.113,
+        "XSUM": 0.091,
+        "IMDB": 0.889,
+        "CivilComments": 0.539,
+        "RAFT": 0.49
+      }
+    },
+    {
+      "model_id": "openai/davinci-175B",
+      "name": "davinci 175B",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.538,
+        "MMLU": 0.422,
+        "BoolQ": 0.722,
+        "NarrativeQA": 0.687,
+        "NaturalQuestions (open-book)": 0.625,
+        "QuAC": 0.36,
+        "HellaSwag": 0.775,
+        "OpenbookQA": 0.586,
+        "TruthfulQA": 0.194,
+        "MS MARCO (TREC)": 0.378,
+        "CNN/DailyMail": 0.127,
+        "XSUM": 0.126,
+        "IMDB": 0.933,
+        "CivilComments": 0.532,
+        "RAFT": 0.642
+      }
+    },
+    {
+      "model_id": "openai/gpt-3.5-turbo-0301",
+      "name": "gpt-3.5-turbo-0301",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.76,
+        "MMLU": 0.59,
+        "BoolQ": 0.74,
+        "NarrativeQA": 0.663,
+        "NaturalQuestions (open-book)": 0.624,
+        "QuAC": 0.512,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.609,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.899,
+        "CivilComments": 0.674,
+        "RAFT": 0.768
+      }
+    },
+    {
+      "model_id": "openai/gpt-3.5-turbo-0613",
+      "name": "gpt-3.5-turbo-0613",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.783,
+        "MMLU": 0.391,
+        "BoolQ": 0.87,
+        "NarrativeQA": 0.625,
+        "NaturalQuestions (open-book)": 0.675,
+        "QuAC": 0.485,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.339,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.943,
+        "CivilComments": 0.696,
+        "RAFT": 0.748
+      }
+    },
+    {
+      "model_id": "openai/text-ada-001",
+      "name": "text-ada-001",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.107,
+        "MMLU": 0.238,
+        "BoolQ": 0.464,
+        "NarrativeQA": 0.238,
+        "NaturalQuestions (open-book)": 0.149,
+        "QuAC": 0.176,
+        "HellaSwag": 0.429,
+        "OpenbookQA": 0.346,
+        "TruthfulQA": 0.232,
+        "MS MARCO (TREC)": 0.302,
+        "CNN/DailyMail": 0.136,
+        "XSUM": 0.034,
+        "IMDB": 0.822,
+        "CivilComments": 0.503,
+        "RAFT": 0.406
+      }
+    },
+    {
+      "model_id": "openai/text-babbage-001",
+      "name": "text-babbage-001",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.229,
+        "MMLU": 0.229,
+        "BoolQ": 0.451,
+        "NarrativeQA": 0.429,
+        "NaturalQuestions (open-book)": 0.33,
+        "QuAC": 0.284,
+        "HellaSwag": 0.561,
+        "OpenbookQA": 0.452,
+        "TruthfulQA": 0.233,
+        "MS MARCO (TREC)": 0.449,
+        "CNN/DailyMail": 0.151,
+        "XSUM": 0.046,
+        "IMDB": 0.913,
+        "CivilComments": 0.499,
+        "RAFT": 0.509
+      }
+    },
+    {
+      "model_id": "openai/text-curie-001",
+      "name": "text-curie-001",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.36,
+        "MMLU": 0.237,
+        "BoolQ": 0.62,
+        "NarrativeQA": 0.582,
+        "NaturalQuestions (open-book)": 0.571,
+        "QuAC": 0.358,
+        "HellaSwag": 0.676,
+        "OpenbookQA": 0.514,
+        "TruthfulQA": 0.257,
+        "MS MARCO (TREC)": 0.507,
+        "CNN/DailyMail": 0.152,
+        "XSUM": 0.076,
+        "IMDB": 0.923,
+        "CivilComments": 0.537,
+        "RAFT": 0.489
+      }
+    },
+    {
+      "model_id": "openai/text-davinci-002",
+      "name": "text-davinci-002",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.905,
+        "MMLU": 0.568,
+        "BoolQ": 0.877,
+        "NarrativeQA": 0.727,
+        "NaturalQuestions (open-book)": 0.713,
+        "QuAC": 0.445,
+        "HellaSwag": 0.815,
+        "OpenbookQA": 0.594,
+        "TruthfulQA": 0.61,
+        "MS MARCO (TREC)": 0.664,
+        "CNN/DailyMail": 0.153,
+        "XSUM": 0.144,
+        "IMDB": 0.948,
+        "CivilComments": 0.668,
+        "RAFT": 0.733
+      }
+    },
+    {
+      "model_id": "openai/text-davinci-003",
+      "name": "text-davinci-003",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.872,
+        "MMLU": 0.569,
+        "BoolQ": 0.881,
+        "NarrativeQA": 0.727,
+        "NaturalQuestions (open-book)": 0.77,
+        "QuAC": 0.525,
+        "HellaSwag": 0.822,
+        "OpenbookQA": 0.646,
+        "TruthfulQA": 0.593,
+        "MS MARCO (TREC)": 0.644,
+        "CNN/DailyMail": 0.156,
+        "XSUM": 0.124,
+        "IMDB": 0.848,
+        "CivilComments": 0.684,
+        "RAFT": 0.759
+      }
+    },
+    {
+      "model_id": "stanford/Alpaca-7B",
+      "name": "Alpaca 7B",
+      "developer": "stanford",
+      "scores": {
+        "Mean win rate": 0.381,
+        "MMLU": 0.385,
+        "BoolQ": 0.778,
+        "NarrativeQA": 0.396,
+        "NaturalQuestions (open-book)": 0.592,
+        "QuAC": 0.27,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.243,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.738,
+        "CivilComments": 0.566,
+        "RAFT": 0.486
+      }
+    },
+    {
+      "model_id": "tiiuae/Falcon-40B",
+      "name": "Falcon 40B",
+      "developer": "tiiuae",
+      "scores": {
+        "Mean win rate": 0.729,
+        "MMLU": 0.509,
+        "BoolQ": 0.819,
+        "NarrativeQA": 0.673,
+        "NaturalQuestions (open-book)": 0.675,
+        "QuAC": 0.307,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.353,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.959,
+        "CivilComments": 0.552,
+        "RAFT": 0.661
+      }
+    },
+    {
+      "model_id": "tiiuae/Falcon-7B",
+      "name": "Falcon 7B",
+      "developer": "tiiuae",
+      "scores": {
+        "Mean win rate": 0.378,
+        "MMLU": 0.286,
+        "BoolQ": 0.753,
+        "NarrativeQA": 0.621,
+        "NaturalQuestions (open-book)": 0.579,
+        "QuAC": 0.332,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.234,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.836,
+        "CivilComments": 0.514,
+        "RAFT": 0.602
+      }
+    },
+    {
+      "model_id": "tiiuae/Falcon-Instruct-40B",
+      "name": "Falcon-Instruct 40B",
+      "developer": "tiiuae",
+      "scores": {
+        "Mean win rate": 0.727,
+        "MMLU": 0.497,
+        "BoolQ": 0.829,
+        "NarrativeQA": 0.625,
+        "NaturalQuestions (open-book)": 0.666,
+        "QuAC": 0.371,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.384,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.959,
+        "CivilComments": 0.603,
+        "RAFT": 0.586
+      }
+    },
+    {
+      "model_id": "tiiuae/Falcon-Instruct-7B",
+      "name": "Falcon-Instruct 7B",
+      "developer": "tiiuae",
+      "scores": {
+        "Mean win rate": 0.244,
+        "MMLU": 0.275,
+        "BoolQ": 0.72,
+        "NarrativeQA": 0.476,
+        "NaturalQuestions (open-book)": 0.449,
+        "QuAC": 0.311,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.213,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.852,
+        "CivilComments": 0.511,
+        "RAFT": 0.523
+      }
+    },
+    {
+      "model_id": "together/RedPajama-INCITE-Base-7B",
+      "name": "RedPajama-INCITE-Base 7B",
+      "developer": "together",
+      "scores": {
+        "Mean win rate": 0.378,
+        "MMLU": 0.302,
+        "BoolQ": 0.713,
+        "NarrativeQA": 0.617,
+        "NaturalQuestions (open-book)": 0.586,
+        "QuAC": 0.336,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.205,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.752,
+        "CivilComments": 0.547,
+        "RAFT": 0.648
+      }
+    },
+    {
+      "model_id": "together/RedPajama-INCITE-Base-v1-3B",
+      "name": "RedPajama-INCITE-Base-v1 3B",
+      "developer": "together",
+      "scores": {
+        "Mean win rate": 0.311,
+        "MMLU": 0.263,
+        "BoolQ": 0.685,
+        "NarrativeQA": 0.555,
+        "NaturalQuestions (open-book)": 0.52,
+        "QuAC": 0.309,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.277,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.907,
+        "CivilComments": 0.549,
+        "RAFT": 0.502
+      }
+    },
+    {
+      "model_id": "together/RedPajama-INCITE-Instruct-7B",
+      "name": "RedPajama-INCITE-Instruct 7B",
+      "developer": "together",
+      "scores": {
+        "Mean win rate": 0.524,
+        "MMLU": 0.363,
+        "BoolQ": 0.705,
+        "NarrativeQA": 0.638,
+        "NaturalQuestions (open-book)": 0.659,
+        "QuAC": 0.26,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.243,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.927,
+        "CivilComments": 0.664,
+        "RAFT": 0.695
+      }
+    },
+    {
+      "model_id": "together/RedPajama-INCITE-Instruct-v1-3B",
+      "name": "RedPajama-INCITE-Instruct-v1 3B",
+      "developer": "together",
+      "scores": {
+        "Mean win rate": 0.366,
+        "MMLU": 0.257,
+        "BoolQ": 0.677,
+        "NarrativeQA": 0.638,
+        "NaturalQuestions (open-book)": 0.637,
+        "QuAC": 0.259,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.208,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": -1.0,
+        "XSUM": -1.0,
+        "IMDB": 0.894,
+        "CivilComments": 0.549,
+        "RAFT": 0.661
+      }
+    },
+    {
+      "model_id": "writer/InstructPalmyra-30B",
+      "name": "InstructPalmyra 30B",
+      "developer": "writer",
+      "scores": {
+        "Mean win rate": 0.568,
+        "MMLU": 0.403,
+        "BoolQ": 0.751,
+        "NarrativeQA": 0.496,
+        "NaturalQuestions (open-book)": 0.682,
+        "QuAC": 0.433,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.185,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": 0.152,
+        "XSUM": 0.104,
+        "IMDB": 0.94,
+        "CivilComments": 0.555,
+        "RAFT": 0.652
+      }
+    },
+    {
+      "model_id": "yandex/YaLM-100B",
+      "name": "YaLM 100B",
+      "developer": "yandex",
+      "scores": {
+        "Mean win rate": 0.075,
+        "MMLU": 0.243,
+        "BoolQ": 0.634,
+        "NarrativeQA": 0.252,
+        "NaturalQuestions (open-book)": 0.227,
+        "QuAC": 0.162,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.202,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": 0.017,
+        "XSUM": 0.021,
+        "IMDB": 0.836,
+        "CivilComments": 0.49,
+        "RAFT": 0.395
+      }
+    },
+    {
+      "model_id": "zhipu-ai/GLM-130B",
+      "name": "GLM 130B",
+      "developer": "zhipu-ai",
+      "scores": {
+        "Mean win rate": 0.512,
+        "MMLU": 0.344,
+        "BoolQ": 0.784,
+        "NarrativeQA": 0.706,
+        "NaturalQuestions (open-book)": 0.642,
+        "QuAC": 0.272,
+        "HellaSwag": -1.0,
+        "OpenbookQA": -1.0,
+        "TruthfulQA": 0.218,
+        "MS MARCO (TREC)": -1.0,
+        "CNN/DailyMail": 0.154,
+        "XSUM": 0.132,
+        "IMDB": 0.955,
+        "CivilComments": 0.5,
+        "RAFT": 0.598
+      }
+    }
+  ]
+}

data/benchmarks/helm_instruct.json ADDED Viewed

	@@ -0,0 +1,60 @@

+{
+  "models": [
+    {
+      "model_id": "anthropic/claude-v1.3",
+      "name": "Anthropic Claude v1.3",
+      "developer": "anthropic",
+      "scores": {
+        "Mean win rate": 0.611,
+        "Anthropic RLHF dataset": 4.965,
+        "Best ChatGPT Prompts": 4.995,
+        "Koala test dataset": 4.981,
+        "Open Assistant": 4.975,
+        "Self Instruct": 4.992,
+        "Vicuna": 4.989
+      }
+    },
+    {
+      "model_id": "cohere/command-xlarge-beta",
+      "name": "Cohere Command beta 52.4B",
+      "developer": "cohere",
+      "scores": {
+        "Mean win rate": 0.089,
+        "Anthropic RLHF dataset": 4.214,
+        "Best ChatGPT Prompts": 4.988,
+        "Koala test dataset": 4.969,
+        "Open Assistant": 4.967,
+        "Self Instruct": 4.971,
+        "Vicuna": 4.995
+      }
+    },
+    {
+      "model_id": "openai/gpt-3.5-turbo-0613",
+      "name": "gpt-3.5-turbo-0613",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.689,
+        "Anthropic RLHF dataset": 4.964,
+        "Best ChatGPT Prompts": 4.986,
+        "Koala test dataset": 4.987,
+        "Open Assistant": 4.987,
+        "Self Instruct": 4.99,
+        "Vicuna": 4.992
+      }
+    },
+    {
+      "model_id": "openai/gpt-4-0314",
+      "name": "GPT-4 0314",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.611,
+        "Anthropic RLHF dataset": 4.934,
+        "Best ChatGPT Prompts": 4.973,
+        "Koala test dataset": 4.966,
+        "Open Assistant": 4.986,
+        "Self Instruct": 4.976,
+        "Vicuna": 4.995
+      }
+    }
+  ]
+}

data/benchmarks/helm_lite.json ADDED Viewed

	@@ -0,0 +1,1551 @@

+{
+  "models": [
+    {
+      "model_id": "01-ai/yi-34b",
+      "name": "Yi 34B",
+      "developer": "01-ai",
+      "scores": {
+        "Mean win rate": 0.57,
+        "NarrativeQA": 0.782,
+        "NaturalQuestions (closed-book)": 0.443,
+        "OpenbookQA": 0.92,
+        "MMLU": 0.65,
+        "MATH": 0.375,
+        "GSM8K": 0.648,
+        "LegalBench": 0.618,
+        "MedQA": 0.656,
+        "WMT 2014": 0.172
+      }
+    },
+    {
+      "model_id": "01-ai/yi-6b",
+      "name": "Yi 6B",
+      "developer": "01-ai",
+      "scores": {
+        "Mean win rate": 0.253,
+        "NarrativeQA": 0.702,
+        "NaturalQuestions (closed-book)": 0.31,
+        "OpenbookQA": 0.8,
+        "MMLU": 0.53,
+        "MATH": 0.126,
+        "GSM8K": 0.375,
+        "LegalBench": 0.519,
+        "MedQA": 0.497,
+        "WMT 2014": 0.117
+      }
+    },
+    {
+      "model_id": "01-ai/yi-large-preview",
+      "name": "Yi Large Preview",
+      "developer": "01-ai",
+      "scores": {
+        "Mean win rate": 0.471,
+        "NarrativeQA": 0.373,
+        "NaturalQuestions (closed-book)": 0.428,
+        "OpenbookQA": 0.946,
+        "MMLU": 0.712,
+        "MATH": 0.712,
+        "GSM8K": 0.69,
+        "LegalBench": 0.519,
+        "MedQA": 0.66,
+        "WMT 2014": 0.176
+      }
+    },
+    {
+      "model_id": "AlephAlpha/luminous-base",
+      "name": "Luminous Base 13B",
+      "developer": "AlephAlpha",
+      "scores": {
+        "Mean win rate": 0.041,
+        "NarrativeQA": 0.633,
+        "NaturalQuestions (closed-book)": 0.197,
+        "OpenbookQA": 0.286,
+        "MMLU": 0.243,
+        "MATH": 0.026,
+        "GSM8K": 0.028,
+        "LegalBench": 0.332,
+        "MedQA": 0.26,
+        "WMT 2014": 0.066
+      }
+    },
+    {
+      "model_id": "AlephAlpha/luminous-extended",
+      "name": "Luminous Extended 30B",
+      "developer": "AlephAlpha",
+      "scores": {
+        "Mean win rate": 0.078,
+        "NarrativeQA": 0.684,
+        "NaturalQuestions (closed-book)": 0.253,
+        "OpenbookQA": 0.272,
+        "MMLU": 0.248,
+        "MATH": 0.04,
+        "GSM8K": 0.075,
+        "LegalBench": 0.421,
+        "MedQA": 0.276,
+        "WMT 2014": 0.083
+      }
+    },
+    {
+      "model_id": "AlephAlpha/luminous-supreme",
+      "name": "Luminous Supreme 70B",
+      "developer": "AlephAlpha",
+      "scores": {
+        "Mean win rate": 0.145,
+        "NarrativeQA": 0.743,
+        "NaturalQuestions (closed-book)": 0.299,
+        "OpenbookQA": 0.284,
+        "MMLU": 0.316,
+        "MATH": 0.078,
+        "GSM8K": 0.137,
+        "LegalBench": 0.452,
+        "MedQA": 0.276,
+        "WMT 2014": 0.102
+      }
+    },
+    {
+      "model_id": "ai21/j2-grande",
+      "name": "Jurassic-2 Grande 17B",
+      "developer": "ai21",
+      "scores": {
+        "Mean win rate": 0.172,
+        "NarrativeQA": 0.744,
+        "NaturalQuestions (closed-book)": 0.35,
+        "OpenbookQA": 0.614,
+        "MMLU": 0.471,
+        "MATH": 0.064,
+        "GSM8K": 0.159,
+        "LegalBench": 0.468,
+        "MedQA": 0.39,
+        "WMT 2014": 0.102
+      }
+    },
+    {
+      "model_id": "ai21/j2-jumbo",
+      "name": "Jurassic-2 Jumbo 178B",
+      "developer": "ai21",
+      "scores": {
+        "Mean win rate": 0.215,
+        "NarrativeQA": 0.728,
+        "NaturalQuestions (closed-book)": 0.385,
+        "OpenbookQA": 0.688,
+        "MMLU": 0.483,
+        "MATH": 0.103,
+        "GSM8K": 0.239,
+        "LegalBench": 0.533,
+        "MedQA": 0.431,
+        "WMT 2014": 0.114
+      }
+    },
+    {
+      "model_id": "ai21/jamba-1.5-large",
+      "name": "Jamba 1.5 Large",
+      "developer": "ai21",
+      "scores": {
+        "Mean win rate": 0.637,
+        "NarrativeQA": 0.664,
+        "NaturalQuestions (closed-book)": 0.394,
+        "OpenbookQA": 0.948,
+        "MMLU": 0.683,
+        "MATH": 0.692,
+        "GSM8K": 0.846,
+        "LegalBench": 0.675,
+        "MedQA": 0.698,
+        "WMT 2014": 0.203
+      }
+    },
+    {
+      "model_id": "ai21/jamba-1.5-mini",
+      "name": "Jamba 1.5 Mini",
+      "developer": "ai21",
+      "scores": {
+        "Mean win rate": 0.414,
+        "NarrativeQA": 0.746,
+        "NaturalQuestions (closed-book)": 0.388,
+        "OpenbookQA": 0.89,
+        "MMLU": 0.582,
+        "MATH": 0.318,
+        "GSM8K": 0.691,
+        "LegalBench": 0.503,
+        "MedQA": 0.632,
+        "WMT 2014": 0.179
+      }
+    },
+    {
+      "model_id": "ai21/jamba-instruct",
+      "name": "Jamba Instruct",
+      "developer": "ai21",
+      "scores": {
+        "Mean win rate": 0.287,
+        "NarrativeQA": 0.658,
+        "NaturalQuestions (closed-book)": 0.384,
+        "OpenbookQA": 0.796,
+        "MMLU": 0.582,
+        "MATH": 0.38,
+        "GSM8K": 0.67,
+        "LegalBench": 0.54,
+        "MedQA": 0.519,
+        "WMT 2014": 0.164
+      }
+    },
+    {
+      "model_id": "allenai/olmo-7b",
+      "name": "OLMo 7B",
+      "developer": "allenai",
+      "scores": {
+        "Mean win rate": 0.052,
+        "NarrativeQA": 0.597,
+        "NaturalQuestions (closed-book)": 0.259,
+        "OpenbookQA": 0.222,
+        "MMLU": 0.305,
+        "MATH": 0.029,
+        "GSM8K": 0.044,
+        "LegalBench": 0.341,
+        "MedQA": 0.229,
+        "WMT 2014": 0.097
+      }
+    },
+    {
+      "model_id": "amazon/nova-lite-v1:0",
+      "name": "Amazon Nova Lite",
+      "developer": "amazon",
+      "scores": {
+        "Mean win rate": 0.708,
+        "NarrativeQA": 0.768,
+        "NaturalQuestions (closed-book)": 0.352,
+        "OpenbookQA": 0.928,
+        "MMLU": 0.693,
+        "MATH": 0.779,
+        "GSM8K": 0.829,
+        "LegalBench": 0.659,
+        "MedQA": 0.696,
+        "WMT 2014": 0.204
+      }
+    },
+    {
+      "model_id": "amazon/nova-micro-v1:0",
+      "name": "Amazon Nova Micro",
+      "developer": "amazon",
+      "scores": {
+        "Mean win rate": 0.524,
+        "NarrativeQA": 0.744,
+        "NaturalQuestions (closed-book)": 0.285,
+        "OpenbookQA": 0.888,
+        "MMLU": 0.64,
+        "MATH": 0.76,
+        "GSM8K": 0.794,
+        "LegalBench": 0.615,
+        "MedQA": 0.608,
+        "WMT 2014": 0.192
+      }
+    },
+    {
+      "model_id": "amazon/nova-pro-v1:0",
+      "name": "Amazon Nova Pro",
+      "developer": "amazon",
+      "scores": {
+        "Mean win rate": 0.885,
+        "NarrativeQA": 0.791,
+        "NaturalQuestions (closed-book)": 0.405,
+        "OpenbookQA": 0.96,
+        "MMLU": 0.758,
+        "MATH": 0.821,
+        "GSM8K": 0.87,
+        "LegalBench": 0.736,
+        "MedQA": 0.811,
+        "WMT 2014": 0.229
+      }
+    },
+    {
+      "model_id": "anthropic/claude-2.0",
+      "name": "Claude 2.0",
+      "developer": "anthropic",
+      "scores": {
+        "Mean win rate": 0.489,
+        "NarrativeQA": 0.718,
+        "NaturalQuestions (closed-book)": 0.428,
+        "OpenbookQA": 0.862,
+        "MMLU": 0.639,
+        "MATH": 0.603,
+        "GSM8K": 0.583,
+        "LegalBench": 0.643,
+        "MedQA": 0.652,
+        "WMT 2014": 0.219
+      }
+    },
+    {
+      "model_id": "anthropic/claude-2.1",
+      "name": "Claude 2.1",
+      "developer": "anthropic",
+      "scores": {
+        "Mean win rate": 0.437,
+        "NarrativeQA": 0.677,
+        "NaturalQuestions (closed-book)": 0.375,
+        "OpenbookQA": 0.872,
+        "MMLU": 0.643,
+        "MATH": 0.632,
+        "GSM8K": 0.604,
+        "LegalBench": 0.643,
+        "MedQA": 0.644,
+        "WMT 2014": 0.204
+      }
+    },
+    {
+      "model_id": "anthropic/claude-3-5-haiku-20241022",
+      "name": "claude-3-5-haiku-20241022",
+      "developer": "anthropic",
+      "scores": {
+        "Mean win rate": 0.531,
+        "NarrativeQA": 0.763,
+        "NaturalQuestions (closed-book)": 0.344,
+        "OpenbookQA": 0.854,
+        "MMLU": 0.671,
+        "MATH": 0.872,
+        "GSM8K": 0.815,
+        "LegalBench": 0.631,
+        "MedQA": 0.722,
+        "WMT 2014": 0.135
+      }
+    },
+    {
+      "model_id": "anthropic/claude-3-5-sonnet-20240620",
+      "name": "Claude 3.5 Sonnet 20240620",
+      "developer": "anthropic",
+      "scores": {
+        "Mean win rate": 0.885,
+        "NarrativeQA": 0.746,
+        "NaturalQuestions (closed-book)": 0.502,
+        "OpenbookQA": 0.972,
+        "MMLU": 0.799,
+        "MATH": 0.813,
+        "GSM8K": 0.949,
+        "LegalBench": 0.707,
+        "MedQA": 0.825,
+        "WMT 2014": 0.229
+      }
+    },
+    {
+      "model_id": "anthropic/claude-3-5-sonnet-20241022",
+      "name": "Claude 3.5 Sonnet 20241022",
+      "developer": "anthropic",
+      "scores": {
+        "Mean win rate": 0.846,
+        "NarrativeQA": 0.77,
+        "NaturalQuestions (closed-book)": 0.467,
+        "OpenbookQA": 0.966,
+        "MMLU": 0.809,
+        "MATH": 0.904,
+        "GSM8K": 0.956,
+        "LegalBench": 0.647,
+        "MedQA": 0.859,
+        "WMT 2014": 0.226
+      }
+    },
+    {
+      "model_id": "anthropic/claude-3-haiku-20240307",
+      "name": "Claude 3 Haiku 20240307",
+      "developer": "anthropic",
+      "scores": {
+        "Mean win rate": 0.263,
+        "NarrativeQA": 0.244,
+        "NaturalQuestions (closed-book)": 0.144,
+        "OpenbookQA": 0.838,
+        "MMLU": 0.662,
+        "MATH": 0.131,
+        "GSM8K": 0.699,
+        "LegalBench": 0.46,
+        "MedQA": 0.702,
+        "WMT 2014": 0.148
+      }
+    },
+    {
+      "model_id": "anthropic/claude-3-opus-20240229",
+      "name": "Claude 3 Opus 20240229",
+      "developer": "anthropic",
+      "scores": {
+        "Mean win rate": 0.683,
+        "NarrativeQA": 0.351,
+        "NaturalQuestions (closed-book)": 0.441,
+        "OpenbookQA": 0.956,
+        "MMLU": 0.768,
+        "MATH": 0.76,
+        "GSM8K": 0.924,
+        "LegalBench": 0.662,
+        "MedQA": 0.775,
+        "WMT 2014": 0.24
+      }
+    },
+    {
+      "model_id": "anthropic/claude-3-sonnet-20240229",
+      "name": "Claude 3 Sonnet 20240229",
+      "developer": "anthropic",
+      "scores": {
+        "Mean win rate": 0.377,
+        "NarrativeQA": 0.111,
+        "NaturalQuestions (closed-book)": 0.028,
+        "OpenbookQA": 0.918,
+        "MMLU": 0.652,
+        "MATH": 0.084,
+        "GSM8K": 0.907,
+        "LegalBench": 0.49,
+        "MedQA": 0.684,
+        "WMT 2014": 0.218
+      }
+    },
+    {
+      "model_id": "anthropic/claude-instant-1.2",
+      "name": "Claude Instant 1.2",
+      "developer": "anthropic",
+      "scores": {
+        "Mean win rate": 0.399,
+        "NarrativeQA": 0.616,
+        "NaturalQuestions (closed-book)": 0.343,
+        "OpenbookQA": 0.844,
+        "MMLU": 0.631,
+        "MATH": 0.499,
+        "GSM8K": 0.721,
+        "LegalBench": 0.586,
+        "MedQA": 0.559,
+        "WMT 2014": 0.194
+      }
+    },
+    {
+      "model_id": "anthropic/claude-v1.3",
+      "name": "Anthropic Claude v1.3",
+      "developer": "anthropic",
+      "scores": {
+        "Mean win rate": 0.518,
+        "NarrativeQA": 0.723,
+        "NaturalQuestions (closed-book)": 0.409,
+        "OpenbookQA": 0.908,
+        "MMLU": 0.631,
+        "MATH": 0.54,
+        "GSM8K": 0.784,
+        "LegalBench": 0.629,
+        "MedQA": 0.618,
+        "WMT 2014": 0.219
+      }
+    },
+    {
+      "model_id": "cohere/command",
+      "name": "Command",
+      "developer": "cohere",
+      "scores": {
+        "Mean win rate": 0.327,
+        "NarrativeQA": 0.749,
+        "NaturalQuestions (closed-book)": 0.391,
+        "OpenbookQA": 0.774,
+        "MMLU": 0.525,
+        "MATH": 0.236,
+        "GSM8K": 0.452,
+        "LegalBench": 0.578,
+        "MedQA": 0.445,
+        "WMT 2014": 0.088
+      }
+    },
+    {
+      "model_id": "cohere/command-light",
+      "name": "Command Light",
+      "developer": "cohere",
+      "scores": {
+        "Mean win rate": 0.105,
+        "NarrativeQA": 0.629,
+        "NaturalQuestions (closed-book)": 0.195,
+        "OpenbookQA": 0.398,
+        "MMLU": 0.386,
+        "MATH": 0.098,
+        "GSM8K": 0.149,
+        "LegalBench": 0.397,
+        "MedQA": 0.312,
+        "WMT 2014": 0.023
+      }
+    },
+    {
+      "model_id": "cohere/command-r",
+      "name": "Command R",
+      "developer": "cohere",
+      "scores": {
+        "Mean win rate": 0.299,
+        "NarrativeQA": 0.742,
+        "NaturalQuestions (closed-book)": 0.352,
+        "OpenbookQA": 0.782,
+        "MMLU": 0.567,
+        "MATH": 0.266,
+        "GSM8K": 0.551,
+        "LegalBench": 0.507,
+        "MedQA": 0.555,
+        "WMT 2014": 0.149
+      }
+    },
+    {
+      "model_id": "cohere/command-r-plus",
+      "name": "Command R Plus",
+      "developer": "cohere",
+      "scores": {
+        "Mean win rate": 0.441,
+        "NarrativeQA": 0.735,
+        "NaturalQuestions (closed-book)": 0.343,
+        "OpenbookQA": 0.828,
+        "MMLU": 0.59,
+        "MATH": 0.403,
+        "GSM8K": 0.738,
+        "LegalBench": 0.672,
+        "MedQA": 0.567,
+        "WMT 2014": 0.203
+      }
+    },
+    {
+      "model_id": "databricks/dbrx-instruct",
+      "name": "DBRX Instruct",
+      "developer": "databricks",
+      "scores": {
+        "Mean win rate": 0.289,
+        "NarrativeQA": 0.488,
+        "NaturalQuestions (closed-book)": 0.284,
+        "OpenbookQA": 0.91,
+        "MMLU": 0.643,
+        "MATH": 0.358,
+        "GSM8K": 0.671,
+        "LegalBench": 0.426,
+        "MedQA": 0.694,
+        "WMT 2014": 0.131
+      }
+    },
+    {
+      "model_id": "deepseek-ai/deepseek-llm-67b-chat",
+      "name": "DeepSeek LLM Chat 67B",
+      "developer": "deepseek-ai",
+      "scores": {
+        "Mean win rate": 0.488,
+        "NarrativeQA": 0.581,
+        "NaturalQuestions (closed-book)": 0.412,
+        "OpenbookQA": 0.88,
+        "MMLU": 0.641,
+        "MATH": 0.615,
+        "GSM8K": 0.795,
+        "LegalBench": 0.637,
+        "MedQA": 0.628,
+        "WMT 2014": 0.186
+      }
+    },
+    {
+      "model_id": "deepseek-ai/deepseek-v3",
+      "name": "DeepSeek v3",
+      "developer": "deepseek-ai",
+      "scores": {
+        "Mean win rate": 0.908,
+        "NarrativeQA": 0.796,
+        "NaturalQuestions (closed-book)": 0.467,
+        "OpenbookQA": 0.954,
+        "MMLU": 0.803,
+        "MATH": 0.912,
+        "GSM8K": 0.94,
+        "LegalBench": 0.718,
+        "MedQA": 0.809,
+        "WMT 2014": 0.209
+      }
+    },
+    {
+      "model_id": "google/gemini-1.0-pro-002",
+      "name": "Gemini 1.0 Pro 002",
+      "developer": "google",
+      "scores": {
+        "Mean win rate": 0.422,
+        "NarrativeQA": 0.751,
+        "NaturalQuestions (closed-book)": 0.391,
+        "OpenbookQA": 0.788,
+        "MMLU": 0.534,
+        "MATH": 0.665,
+        "GSM8K": 0.816,
+        "LegalBench": 0.475,
+        "MedQA": 0.483,
+        "WMT 2014": 0.194
+      }
+    },
+    {
+      "model_id": "google/gemini-1.5-flash-001",
+      "name": "Gemini 1.5 Flash 001",
+      "developer": "google",
+      "scores": {
+        "Mean win rate": 0.667,
+        "NarrativeQA": 0.783,
+        "NaturalQuestions (closed-book)": 0.332,
+        "OpenbookQA": 0.928,
+        "MMLU": 0.703,
+        "MATH": 0.753,
+        "GSM8K": 0.785,
+        "LegalBench": 0.661,
+        "MedQA": 0.68,
+        "WMT 2014": 0.225
+      }
+    },
+    {
+      "model_id": "google/gemini-1.5-flash-002",
+      "name": "Gemini 1.5 Flash 002",
+      "developer": "google",
+      "scores": {
+        "Mean win rate": 0.573,
+        "NarrativeQA": 0.746,
+        "NaturalQuestions (closed-book)": 0.323,
+        "OpenbookQA": 0.914,
+        "MMLU": 0.679,
+        "MATH": 0.908,
+        "GSM8K": 0.328,
+        "LegalBench": 0.67,
+        "MedQA": 0.656,
+        "WMT 2014": 0.212
+      }
+    },
+    {
+      "model_id": "google/gemini-1.5-pro-001",
+      "name": "Gemini 1.5 Pro 001",
+      "developer": "google",
+      "scores": {
+        "Mean win rate": 0.739,
+        "NarrativeQA": 0.783,
+        "NaturalQuestions (closed-book)": 0.378,
+        "OpenbookQA": 0.902,
+        "MMLU": 0.772,
+        "MATH": 0.825,
+        "GSM8K": 0.836,
+        "LegalBench": 0.757,
+        "MedQA": 0.692,
+        "WMT 2014": 0.189
+      }
+    },
+    {
+      "model_id": "google/gemini-1.5-pro-002",
+      "name": "Gemini 1.5 Pro 002",
+      "developer": "google",
+      "scores": {
+        "Mean win rate": 0.842,
+        "NarrativeQA": 0.756,
+        "NaturalQuestions (closed-book)": 0.455,
+        "OpenbookQA": 0.952,
+        "MMLU": 0.795,
+        "MATH": 0.92,
+        "GSM8K": 0.817,
+        "LegalBench": 0.747,
+        "MedQA": 0.771,
+        "WMT 2014": 0.231
+      }
+    },
+    {
+      "model_id": "google/gemini-2.0-flash-exp",
+      "name": "Gemini 2.0 Flash Experimental",
+      "developer": "google",
+      "scores": {
+        "Mean win rate": 0.813,
+        "NarrativeQA": 0.783,
+        "NaturalQuestions (closed-book)": 0.443,
+        "OpenbookQA": 0.946,
+        "MMLU": 0.717,
+        "MATH": 0.901,
+        "GSM8K": 0.946,
+        "LegalBench": 0.674,
+        "MedQA": 0.73,
+        "WMT 2014": 0.212
+      }
+    },
+    {
+      "model_id": "google/gemma-2-27b-it",
+      "name": "Gemma 2 Instruct 27B",
+      "developer": "google",
+      "scores": {
+        "Mean win rate": 0.675,
+        "NarrativeQA": 0.79,
+        "NaturalQuestions (closed-book)": 0.353,
+        "OpenbookQA": 0.918,
+        "MMLU": 0.664,
+        "MATH": 0.746,
+        "GSM8K": 0.812,
+        "LegalBench": 0.7,
+        "MedQA": 0.684,
+        "WMT 2014": 0.214
+      }
+    },
+    {
+      "model_id": "google/gemma-2-9b-it",
+      "name": "Gemma 2 Instruct 9B",
+      "developer": "google",
+      "scores": {
+        "Mean win rate": 0.562,
+        "NarrativeQA": 0.768,
+        "NaturalQuestions (closed-book)": 0.328,
+        "OpenbookQA": 0.91,
+        "MMLU": 0.645,
+        "MATH": 0.724,
+        "GSM8K": 0.762,
+        "LegalBench": 0.639,
+        "MedQA": 0.63,
+        "WMT 2014": 0.201
+      }
+    },
+    {
+      "model_id": "google/gemma-7b",
+      "name": "Gemma 7B",
+      "developer": "google",
+      "scores": {
+        "Mean win rate": 0.336,
+        "NarrativeQA": 0.752,
+        "NaturalQuestions (closed-book)": 0.336,
+        "OpenbookQA": 0.808,
+        "MMLU": 0.571,
+        "MATH": 0.5,
+        "GSM8K": 0.559,
+        "LegalBench": 0.581,
+        "MedQA": 0.513,
+        "WMT 2014": 0.187
+      }
+    },
+    {
+      "model_id": "google/text-bison@001",
+      "name": "PaLM-2 Bison",
+      "developer": "google",
+      "scores": {
+        "Mean win rate": 0.526,
+        "NarrativeQA": 0.718,
+        "NaturalQuestions (closed-book)": 0.39,
+        "OpenbookQA": 0.878,
+        "MMLU": 0.608,
+        "MATH": 0.421,
+        "GSM8K": 0.61,
+        "LegalBench": 0.645,
+        "MedQA": 0.547,
+        "WMT 2014": 0.241
+      }
+    },
+    {
+      "model_id": "google/text-unicorn@001",
+      "name": "PaLM-2 Unicorn",
+      "developer": "google",
+      "scores": {
+        "Mean win rate": 0.644,
+        "NarrativeQA": 0.583,
+        "NaturalQuestions (closed-book)": 0.435,
+        "OpenbookQA": 0.938,
+        "MMLU": 0.702,
+        "MATH": 0.674,
+        "GSM8K": 0.831,
+        "LegalBench": 0.677,
+        "MedQA": 0.684,
+        "WMT 2014": 0.26
+      }
+    },
+    {
+      "model_id": "meta/llama-2-13b",
+      "name": "Llama 2 13B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.233,
+        "NarrativeQA": 0.741,
+        "NaturalQuestions (closed-book)": 0.371,
+        "OpenbookQA": 0.634,
+        "MMLU": 0.505,
+        "MATH": 0.102,
+        "GSM8K": 0.266,
+        "LegalBench": 0.591,
+        "MedQA": 0.392,
+        "WMT 2014": 0.167
+      }
+    },
+    {
+      "model_id": "meta/llama-2-70b",
+      "name": "Llama 2 70B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.482,
+        "NarrativeQA": 0.763,
+        "NaturalQuestions (closed-book)": 0.46,
+        "OpenbookQA": 0.838,
+        "MMLU": 0.58,
+        "MATH": 0.323,
+        "GSM8K": 0.567,
+        "LegalBench": 0.673,
+        "MedQA": 0.618,
+        "WMT 2014": 0.196
+      }
+    },
+    {
+      "model_id": "meta/llama-2-7b",
+      "name": "Llama 2 7B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.152,
+        "NarrativeQA": 0.686,
+        "NaturalQuestions (closed-book)": 0.333,
+        "OpenbookQA": 0.544,
+        "MMLU": 0.425,
+        "MATH": 0.097,
+        "GSM8K": 0.154,
+        "LegalBench": 0.502,
+        "MedQA": 0.392,
+        "WMT 2014": 0.144
+      }
+    },
+    {
+      "model_id": "meta/llama-3-70b",
+      "name": "Llama 3 70B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.793,
+        "NarrativeQA": 0.798,
+        "NaturalQuestions (closed-book)": 0.475,
+        "OpenbookQA": 0.934,
+        "MMLU": 0.695,
+        "MATH": 0.663,
+        "GSM8K": 0.805,
+        "LegalBench": 0.733,
+        "MedQA": 0.777,
+        "WMT 2014": 0.225
+      }
+    },
+    {
+      "model_id": "meta/llama-3-8b",
+      "name": "Llama 3 8B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.387,
+        "NarrativeQA": 0.754,
+        "NaturalQuestions (closed-book)": 0.378,
+        "OpenbookQA": 0.766,
+        "MMLU": 0.602,
+        "MATH": 0.391,
+        "GSM8K": 0.499,
+        "LegalBench": 0.637,
+        "MedQA": 0.581,
+        "WMT 2014": 0.183
+      }
+    },
+    {
+      "model_id": "meta/llama-3.1-405b-instruct-turbo",
+      "name": "Llama 3.1 Instruct Turbo 405B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.854,
+        "NarrativeQA": 0.749,
+        "NaturalQuestions (closed-book)": 0.456,
+        "OpenbookQA": 0.94,
+        "MMLU": 0.759,
+        "MATH": 0.827,
+        "GSM8K": 0.949,
+        "LegalBench": 0.707,
+        "MedQA": 0.805,
+        "WMT 2014": 0.238
+      }
+    },
+    {
+      "model_id": "meta/llama-3.1-70b-instruct-turbo",
+      "name": "Llama 3.1 Instruct Turbo 70B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.808,
+        "NarrativeQA": 0.772,
+        "NaturalQuestions (closed-book)": 0.452,
+        "OpenbookQA": 0.938,
+        "MMLU": 0.709,
+        "MATH": 0.783,
+        "GSM8K": 0.938,
+        "LegalBench": 0.687,
+        "MedQA": 0.769,
+        "WMT 2014": 0.223
+      }
+    },
+    {
+      "model_id": "meta/llama-3.1-8b-instruct-turbo",
+      "name": "Llama 3.1 Instruct Turbo 8B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.303,
+        "NarrativeQA": 0.756,
+        "NaturalQuestions (closed-book)": 0.209,
+        "OpenbookQA": 0.74,
+        "MMLU": 0.5,
+        "MATH": 0.703,
+        "GSM8K": 0.798,
+        "LegalBench": 0.342,
+        "MedQA": 0.245,
+        "WMT 2014": 0.181
+      }
+    },
+    {
+      "model_id": "meta/llama-3.2-11b-vision-instruct-turbo",
+      "name": "Llama 3.2 Vision Instruct Turbo 11B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.325,
+        "NarrativeQA": 0.756,
+        "NaturalQuestions (closed-book)": 0.234,
+        "OpenbookQA": 0.724,
+        "MMLU": 0.511,
+        "MATH": 0.739,
+        "GSM8K": 0.823,
+        "LegalBench": 0.435,
+        "MedQA": 0.27,
+        "WMT 2014": 0.179
+      }
+    },
+    {
+      "model_id": "meta/llama-3.2-90b-vision-instruct-turbo",
+      "name": "Llama 3.2 Vision Instruct Turbo 90B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.819,
+        "NarrativeQA": 0.777,
+        "NaturalQuestions (closed-book)": 0.457,
+        "OpenbookQA": 0.942,
+        "MMLU": 0.703,
+        "MATH": 0.791,
+        "GSM8K": 0.936,
+        "LegalBench": 0.68,
+        "MedQA": 0.769,
+        "WMT 2014": 0.224
+      }
+    },
+    {
+      "model_id": "meta/llama-3.3-70b-instruct-turbo",
+      "name": "Llama 3.3 Instruct Turbo 70B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.812,
+        "NarrativeQA": 0.791,
+        "NaturalQuestions (closed-book)": 0.431,
+        "OpenbookQA": 0.928,
+        "MMLU": 0.7,
+        "MATH": 0.808,
+        "GSM8K": 0.942,
+        "LegalBench": 0.725,
+        "MedQA": 0.761,
+        "WMT 2014": 0.219
+      }
+    },
+    {
+      "model_id": "meta/llama-65b",
+      "name": "LLaMA 65B",
+      "developer": "meta",
+      "scores": {
+        "Mean win rate": 0.345,
+        "NarrativeQA": 0.755,
+        "NaturalQuestions (closed-book)": 0.433,
+        "OpenbookQA": 0.754,
+        "MMLU": 0.584,
+        "MATH": 0.257,
+        "GSM8K": 0.489,
+        "LegalBench": 0.48,
+        "MedQA": 0.507,
+        "WMT 2014": 0.189
+      }
+    },
+    {
+      "model_id": "microsoft/phi-2",
+      "name": "Phi-2",
+      "developer": "microsoft",
+      "scores": {
+        "Mean win rate": 0.169,
+        "NarrativeQA": 0.703,
+        "NaturalQuestions (closed-book)": 0.155,
+        "OpenbookQA": 0.798,
+        "MMLU": 0.518,
+        "MATH": 0.255,
+        "GSM8K": 0.581,
+        "LegalBench": 0.334,
+        "MedQA": 0.41,
+        "WMT 2014": 0.038
+      }
+    },
+    {
+      "model_id": "microsoft/phi-3-medium-4k-instruct",
+      "name": "Phi-3 14B",
+      "developer": "microsoft",
+      "scores": {
+        "Mean win rate": 0.509,
+        "NarrativeQA": 0.724,
+        "NaturalQuestions (closed-book)": 0.278,
+        "OpenbookQA": 0.916,
+        "MMLU": 0.675,
+        "MATH": 0.611,
+        "GSM8K": 0.878,
+        "LegalBench": 0.593,
+        "MedQA": 0.696,
+        "WMT 2014": 0.17
+      }
+    },
+    {
+      "model_id": "microsoft/phi-3-small-8k-instruct",
+      "name": "Phi-3 7B",
+      "developer": "microsoft",
+      "scores": {
+        "Mean win rate": 0.473,
+        "NarrativeQA": 0.754,
+        "NaturalQuestions (closed-book)": 0.324,
+        "OpenbookQA": 0.912,
+        "MMLU": 0.659,
+        "MATH": 0.703,
+        "GSM8K": -1.0,
+        "LegalBench": 0.584,
+        "MedQA": 0.672,
+        "WMT 2014": 0.154
+      }
+    },
+    {
+      "model_id": "mistralai/mistral-7b-instruct-v0.3",
+      "name": "Mistral Instruct v0.3 7B",
+      "developer": "mistralai",
+      "scores": {
+        "Mean win rate": 0.196,
+        "NarrativeQA": 0.716,
+        "NaturalQuestions (closed-book)": 0.253,
+        "OpenbookQA": 0.79,
+        "MMLU": 0.51,
+        "MATH": 0.289,
+        "GSM8K": 0.538,
+        "LegalBench": 0.331,
+        "MedQA": 0.517,
+        "WMT 2014": 0.142
+      }
+    },
+    {
+      "model_id": "mistralai/mistral-7b-v0.1",
+      "name": "Mistral v0.1 7B",
+      "developer": "mistralai",
+      "scores": {
+        "Mean win rate": 0.292,
+        "NarrativeQA": 0.716,
+        "NaturalQuestions (closed-book)": 0.367,
+        "OpenbookQA": 0.776,
+        "MMLU": 0.584,
+        "MATH": 0.297,
+        "GSM8K": 0.377,
+        "LegalBench": 0.58,
+        "MedQA": 0.525,
+        "WMT 2014": 0.16
+      }
+    },
+    {
+      "model_id": "mistralai/mistral-large-2402",
+      "name": "Mistral Large 2402",
+      "developer": "mistralai",
+      "scores": {
+        "Mean win rate": 0.328,
+        "NarrativeQA": 0.454,
+        "NaturalQuestions (closed-book)": 0.311,
+        "OpenbookQA": 0.894,
+        "MMLU": 0.638,
+        "MATH": 0.75,
+        "GSM8K": 0.694,
+        "LegalBench": 0.479,
+        "MedQA": 0.499,
+        "WMT 2014": 0.182
+      }
+    },
+    {
+      "model_id": "mistralai/mistral-large-2407",
+      "name": "Mistral Large 2 2407",
+      "developer": "mistralai",
+      "scores": {
+        "Mean win rate": 0.744,
+        "NarrativeQA": 0.779,
+        "NaturalQuestions (closed-book)": 0.453,
+        "OpenbookQA": 0.932,
+        "MMLU": 0.725,
+        "MATH": 0.677,
+        "GSM8K": 0.912,
+        "LegalBench": 0.646,
+        "MedQA": 0.775,
+        "WMT 2014": 0.192
+      }
+    },
+    {
+      "model_id": "mistralai/mistral-medium-2312",
+      "name": "Mistral Medium 2312",
+      "developer": "mistralai",
+      "scores": {
+        "Mean win rate": 0.268,
+        "NarrativeQA": 0.449,
+        "NaturalQuestions (closed-book)": 0.29,
+        "OpenbookQA": 0.83,
+        "MMLU": 0.618,
+        "MATH": 0.565,
+        "GSM8K": 0.706,
+        "LegalBench": 0.452,
+        "MedQA": 0.61,
+        "WMT 2014": 0.169
+      }
+    },
+    {
+      "model_id": "mistralai/mistral-small-2402",
+      "name": "Mistral Small 2402",
+      "developer": "mistralai",
+      "scores": {
+        "Mean win rate": 0.288,
+        "NarrativeQA": 0.519,
+        "NaturalQuestions (closed-book)": 0.304,
+        "OpenbookQA": 0.862,
+        "MMLU": 0.593,
+        "MATH": 0.621,
+        "GSM8K": 0.734,
+        "LegalBench": 0.389,
+        "MedQA": 0.616,
+        "WMT 2014": 0.169
+      }
+    },
+    {
+      "model_id": "mistralai/mixtral-8x22b",
+      "name": "Mixtral 8x22B",
+      "developer": "mistralai",
+      "scores": {
+        "Mean win rate": 0.705,
+        "NarrativeQA": 0.779,
+        "NaturalQuestions (closed-book)": 0.478,
+        "OpenbookQA": 0.882,
+        "MMLU": 0.701,
+        "MATH": 0.656,
+        "GSM8K": 0.8,
+        "LegalBench": 0.708,
+        "MedQA": 0.704,
+        "WMT 2014": 0.209
+      }
+    },
+    {
+      "model_id": "mistralai/mixtral-8x7b-32kseqlen",
+      "name": "Mixtral 8x7B 32K seqlen",
+      "developer": "mistralai",
+      "scores": {
+        "Mean win rate": 0.51,
+        "NarrativeQA": 0.767,
+        "NaturalQuestions (closed-book)": 0.427,
+        "OpenbookQA": 0.868,
+        "MMLU": 0.649,
+        "MATH": 0.494,
+        "GSM8K": 0.622,
+        "LegalBench": 0.63,
+        "MedQA": 0.652,
+        "WMT 2014": 0.19
+      }
+    },
+    {
+      "model_id": "mistralai/open-mistral-nemo-2407",
+      "name": "Mistral NeMo 2402",
+      "developer": "mistralai",
+      "scores": {
+        "Mean win rate": 0.333,
+        "NarrativeQA": 0.731,
+        "NaturalQuestions (closed-book)": 0.265,
+        "OpenbookQA": 0.822,
+        "MMLU": 0.604,
+        "MATH": 0.668,
+        "GSM8K": 0.782,
+        "LegalBench": 0.415,
+        "MedQA": 0.59,
+        "WMT 2014": 0.177
+      }
+    },
+    {
+      "model_id": "openai/gpt-3.5-turbo-0613",
+      "name": "gpt-3.5-turbo-0613",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.358,
+        "NarrativeQA": 0.655,
+        "NaturalQuestions (closed-book)": 0.335,
+        "OpenbookQA": 0.838,
+        "MMLU": 0.614,
+        "MATH": 0.667,
+        "GSM8K": 0.501,
+        "LegalBench": 0.528,
+        "MedQA": 0.622,
+        "WMT 2014": 0.187
+      }
+    },
+    {
+      "model_id": "openai/gpt-4-0613",
+      "name": "GPT-4 0613",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.867,
+        "NarrativeQA": 0.768,
+        "NaturalQuestions (closed-book)": 0.457,
+        "OpenbookQA": 0.96,
+        "MMLU": 0.735,
+        "MATH": 0.802,
+        "GSM8K": 0.932,
+        "LegalBench": 0.713,
+        "MedQA": 0.815,
+        "WMT 2014": 0.211
+      }
+    },
+    {
+      "model_id": "openai/gpt-4-1106-preview",
+      "name": "GPT-4 Turbo 1106 preview",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.698,
+        "NarrativeQA": 0.727,
+        "NaturalQuestions (closed-book)": 0.435,
+        "OpenbookQA": 0.95,
+        "MMLU": 0.699,
+        "MATH": 0.857,
+        "GSM8K": 0.668,
+        "LegalBench": 0.626,
+        "MedQA": 0.817,
+        "WMT 2014": 0.205
+      }
+    },
+    {
+      "model_id": "openai/gpt-4-turbo-2024-04-09",
+      "name": "GPT-4 Turbo 2024-04-09",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.864,
+        "NarrativeQA": 0.761,
+        "NaturalQuestions (closed-book)": 0.482,
+        "OpenbookQA": 0.97,
+        "MMLU": 0.711,
+        "MATH": 0.833,
+        "GSM8K": 0.824,
+        "LegalBench": 0.727,
+        "MedQA": 0.783,
+        "WMT 2014": 0.218
+      }
+    },
+    {
+      "model_id": "openai/gpt-4o-2024-05-13",
+      "name": "GPT-4o 2024-05-13",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.938,
+        "NarrativeQA": 0.804,
+        "NaturalQuestions (closed-book)": 0.501,
+        "OpenbookQA": 0.966,
+        "MMLU": 0.748,
+        "MATH": 0.829,
+        "GSM8K": 0.905,
+        "LegalBench": 0.733,
+        "MedQA": 0.857,
+        "WMT 2014": 0.231
+      }
+    },
+    {
+      "model_id": "openai/gpt-4o-2024-08-06",
+      "name": "GPT-4o 2024-08-06",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.928,
+        "NarrativeQA": 0.795,
+        "NaturalQuestions (closed-book)": 0.496,
+        "OpenbookQA": 0.968,
+        "MMLU": 0.738,
+        "MATH": 0.853,
+        "GSM8K": 0.909,
+        "LegalBench": 0.721,
+        "MedQA": 0.863,
+        "WMT 2014": 0.225
+      }
+    },
+    {
+      "model_id": "openai/gpt-4o-mini-2024-07-18",
+      "name": "GPT-4o mini 2024-07-18",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.701,
+        "NarrativeQA": 0.768,
+        "NaturalQuestions (closed-book)": 0.386,
+        "OpenbookQA": 0.92,
+        "MMLU": 0.668,
+        "MATH": 0.802,
+        "GSM8K": 0.843,
+        "LegalBench": 0.653,
+        "MedQA": 0.748,
+        "WMT 2014": 0.206
+      }
+    },
+    {
+      "model_id": "openai/text-davinci-002",
+      "name": "text-davinci-002",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.336,
+        "NarrativeQA": 0.719,
+        "NaturalQuestions (closed-book)": 0.394,
+        "OpenbookQA": 0.796,
+        "MMLU": 0.568,
+        "MATH": 0.428,
+        "GSM8K": 0.479,
+        "LegalBench": 0.58,
+        "MedQA": 0.525,
+        "WMT 2014": 0.174
+      }
+    },
+    {
+      "model_id": "openai/text-davinci-003",
+      "name": "text-davinci-003",
+      "developer": "openai",
+      "scores": {
+        "Mean win rate": 0.439,
+        "NarrativeQA": 0.731,
+        "NaturalQuestions (closed-book)": 0.413,
+        "OpenbookQA": 0.828,
+        "MMLU": 0.555,
+        "MATH": 0.449,
+        "GSM8K": 0.615,
+        "LegalBench": 0.622,
+        "MedQA": 0.531,
+        "WMT 2014": 0.191
+      }
+    },
+    {
+      "model_id": "qwen/qwen1.5-110b-chat",
+      "name": "Qwen1.5 Chat 110B",
+      "developer": "qwen",
+      "scores": {
+        "Mean win rate": 0.55,
+        "NarrativeQA": 0.721,
+        "NaturalQuestions (closed-book)": 0.35,
+        "OpenbookQA": 0.922,
+        "MMLU": 0.704,
+        "MATH": 0.568,
+        "GSM8K": 0.815,
+        "LegalBench": 0.624,
+        "MedQA": 0.64,
+        "WMT 2014": 0.192
+      }
+    },
+    {
+      "model_id": "qwen/qwen1.5-14b",
+      "name": "Qwen1.5 14B",
+      "developer": "qwen",
+      "scores": {
+        "Mean win rate": 0.425,
+        "NarrativeQA": 0.711,
+        "NaturalQuestions (closed-book)": 0.3,
+        "OpenbookQA": 0.862,
+        "MMLU": 0.626,
+        "MATH": 0.686,
+        "GSM8K": 0.693,
+        "LegalBench": 0.593,
+        "MedQA": 0.515,
+        "WMT 2014": 0.178
+      }
+    },
+    {
+      "model_id": "qwen/qwen1.5-32b",
+      "name": "Qwen1.5 32B",
+      "developer": "qwen",
+      "scores": {
+        "Mean win rate": 0.546,
+        "NarrativeQA": 0.589,
+        "NaturalQuestions (closed-book)": 0.353,
+        "OpenbookQA": 0.932,
+        "MMLU": 0.628,
+        "MATH": 0.733,
+        "GSM8K": 0.773,
+        "LegalBench": 0.636,
+        "MedQA": 0.656,
+        "WMT 2014": 0.193
+      }
+    },
+    {
+      "model_id": "qwen/qwen1.5-72b",
+      "name": "Qwen1.5 72B",
+      "developer": "qwen",
+      "scores": {
+        "Mean win rate": 0.608,
+        "NarrativeQA": 0.601,
+        "NaturalQuestions (closed-book)": 0.417,
+        "OpenbookQA": 0.93,
+        "MMLU": 0.647,
+        "MATH": 0.683,
+        "GSM8K": 0.799,
+        "LegalBench": 0.694,
+        "MedQA": 0.67,
+        "WMT 2014": 0.201
+      }
+    },
+    {
+      "model_id": "qwen/qwen1.5-7b",
+      "name": "Qwen1.5 7B",
+      "developer": "qwen",
+      "scores": {
+        "Mean win rate": 0.275,
+        "NarrativeQA": 0.448,
+        "NaturalQuestions (closed-book)": 0.27,
+        "OpenbookQA": 0.806,
+        "MMLU": 0.569,
+        "MATH": 0.561,
+        "GSM8K": 0.6,
+        "LegalBench": 0.523,
+        "MedQA": 0.479,
+        "WMT 2014": 0.153
+      }
+    },
+    {
+      "model_id": "qwen/qwen2-72b-instruct",
+      "name": "Qwen2 Instruct 72B",
+      "developer": "qwen",
+      "scores": {
+        "Mean win rate": 0.77,
+        "NarrativeQA": 0.727,
+        "NaturalQuestions (closed-book)": 0.39,
+        "OpenbookQA": 0.954,
+        "MMLU": 0.769,
+        "MATH": 0.79,
+        "GSM8K": 0.92,
+        "LegalBench": 0.712,
+        "MedQA": 0.746,
+        "WMT 2014": 0.207
+      }
+    },
+    {
+      "model_id": "qwen/qwen2.5-72b-instruct-turbo",
+      "name": "Qwen2.5 Instruct Turbo 72B",
+      "developer": "qwen",
+      "scores": {
+        "Mean win rate": 0.745,
+        "NarrativeQA": 0.745,
+        "NaturalQuestions (closed-book)": 0.359,
+        "OpenbookQA": 0.962,
+        "MMLU": 0.77,
+        "MATH": 0.884,
+        "GSM8K": 0.9,
+        "LegalBench": 0.74,
+        "MedQA": 0.753,
+        "WMT 2014": 0.207
+      }
+    },
+    {
+      "model_id": "qwen/qwen2.5-7b-instruct-turbo",
+      "name": "Qwen2.5 Instruct Turbo 7B",
+      "developer": "qwen",
+      "scores": {
+        "Mean win rate": 0.488,
+        "NarrativeQA": 0.742,
+        "NaturalQuestions (closed-book)": 0.205,
+        "OpenbookQA": 0.862,
+        "MMLU": 0.658,
+        "MATH": 0.835,
+        "GSM8K": 0.83,
+        "LegalBench": 0.632,
+        "MedQA": 0.6,
+        "WMT 2014": 0.155
+      }
+    },
+    {
+      "model_id": "snowflake/snowflake-arctic-instruct",
+      "name": "Arctic Instruct",
+      "developer": "snowflake",
+      "scores": {
+        "Mean win rate": 0.338,
+        "NarrativeQA": 0.654,
+        "NaturalQuestions (closed-book)": 0.39,
+        "OpenbookQA": 0.828,
+        "MMLU": 0.575,
+        "MATH": 0.519,
+        "GSM8K": 0.768,
+        "LegalBench": 0.588,
+        "MedQA": 0.581,
+        "WMT 2014": 0.172
+      }
+    },
+    {
+      "model_id": "tiiuae/falcon-40b",
+      "name": "Falcon 40B",
+      "developer": "tiiuae",
+      "scores": {
+        "Mean win rate": 0.217,
+        "NarrativeQA": 0.671,
+        "NaturalQuestions (closed-book)": 0.392,
+        "OpenbookQA": 0.662,
+        "MMLU": 0.507,
+        "MATH": 0.128,
+        "GSM8K": 0.267,
+        "LegalBench": 0.442,
+        "MedQA": 0.419,
+        "WMT 2014": 0.162
+      }
+    },
+    {
+      "model_id": "tiiuae/falcon-7b",
+      "name": "Falcon 7B",
+      "developer": "tiiuae",
+      "scores": {
+        "Mean win rate": 0.064,
+        "NarrativeQA": 0.621,
+        "NaturalQuestions (closed-book)": 0.285,
+        "OpenbookQA": 0.26,
+        "MMLU": 0.288,
+        "MATH": 0.044,
+        "GSM8K": 0.055,
+        "LegalBench": 0.346,
+        "MedQA": 0.254,
+        "WMT 2014": 0.094
+      }
+    },
+    {
+      "model_id": "upstage/solar-pro-241126",
+      "name": "Solar Pro",
+      "developer": "upstage",
+      "scores": {
+        "Mean win rate": 0.602,
+        "NarrativeQA": 0.753,
+        "NaturalQuestions (closed-book)": 0.297,
+        "OpenbookQA": 0.922,
+        "MMLU": 0.679,
+        "MATH": 0.567,
+        "GSM8K": 0.871,
+        "LegalBench": 0.67,
+        "MedQA": 0.698,
+        "WMT 2014": 0.169
+      }
+    },
+    {
+      "model_id": "writer/palmyra-x-004",
+      "name": "Palmyra-X-004",
+      "developer": "writer",
+      "scores": {
+        "Mean win rate": 0.808,
+        "NarrativeQA": 0.773,
+        "NaturalQuestions (closed-book)": 0.457,
+        "OpenbookQA": 0.926,
+        "MMLU": 0.739,
+        "MATH": 0.767,
+        "GSM8K": 0.905,
+        "LegalBench": 0.73,
+        "MedQA": 0.775,
+        "WMT 2014": 0.203
+      }
+    },
+    {
+      "model_id": "writer/palmyra-x-v2",
+      "name": "Palmyra X V2 33B",
+      "developer": "writer",
+      "scores": {
+        "Mean win rate": 0.589,
+        "NarrativeQA": 0.752,
+        "NaturalQuestions (closed-book)": 0.428,
+        "OpenbookQA": 0.878,
+        "MMLU": 0.621,
+        "MATH": 0.58,
+        "GSM8K": 0.735,
+        "LegalBench": 0.644,
+        "MedQA": 0.598,
+        "WMT 2014": 0.239
+      }
+    },
+    {
+      "model_id": "writer/palmyra-x-v3",
+      "name": "Palmyra X V3 72B",
+      "developer": "writer",
+      "scores": {
+        "Mean win rate": 0.679,
+        "NarrativeQA": 0.706,
+        "NaturalQuestions (closed-book)": 0.407,
+        "OpenbookQA": 0.938,
+        "MMLU": 0.702,
+        "MATH": 0.723,
+        "GSM8K": 0.831,
+        "LegalBench": 0.709,
+        "MedQA": 0.684,
+        "WMT 2014": 0.262
+      }
+    }
+  ]
+}

data/benchmarks/helm_mmlu.json ADDED Viewed

The diff for this file is too large to render. See raw diff

data/benchmarks/hfopenllm_v2.json ADDED Viewed

The diff for this file is too large to render. See raw diff

data/benchmarks/livecodebenchpro.json ADDED Viewed

	@@ -0,0 +1,274 @@

+{
+  "models": [
+    {
+      "model_id": "alibaba/qwen3-235b-a22b-thinking-2507",
+      "name": "qwen3-235b-a22b-thinking-2507",
+      "developer": "Alibaba",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.1267605633802817,
+        "Easy Problems": 0.7605633802816901
+      }
+    },
+    {
+      "model_id": "alibaba/qwen3-30b-a3b",
+      "name": "qwen3-30b-a3b",
+      "developer": "Alibaba",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.028169014084507043,
+        "Easy Problems": 0.5774647887323944
+      }
+    },
+    {
+      "model_id": "alibaba/qwen3-max",
+      "name": "alibaba/qwen3-max",
+      "developer": "Alibaba",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.04225352112676056,
+        "Easy Problems": 0.36619718309859156
+      }
+    },
+    {
+      "model_id": "alibaba/qwen3-next-80b-a3b-thinking",
+      "name": "qwen3-next-80b-a3b-thinking",
+      "developer": "Alibaba",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.14084507042253522,
+        "Easy Problems": 0.7464788732394366
+      }
+    },
+    {
+      "model_id": "aliyun/qwen3-next-80b-a3b-thinking",
+      "name": "qwen3-next-80b-a3b-thinking",
+      "developer": "aliyun",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.0704,
+        "Easy Problems": 0.6901
+      }
+    },
+    {
+      "model_id": "anthropic/claude-3-7-sonnet-20250219",
+      "name": "claude-3-7-sonnet-20250219",
+      "developer": "anthropic",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.0,
+        "Easy Problems": 0.28169014084507044
+      }
+    },
+    {
+      "model_id": "anthropic/claude-3.7-sonnet",
+      "name": "anthropic/claude-3.7-sonnet",
+      "developer": "Anthropic",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.014084507042253521,
+        "Easy Problems": 0.15492957746478872
+      }
+    },
+    {
+      "model_id": "anthropic/claude-sonnet-4-5-20250929",
+      "name": "claude-sonnet-4-5-20250929",
+      "developer": "anthropic",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.0,
+        "Easy Problems": 0.5352
+      }
+    },
+    {
+      "model_id": "ark/ep-20250603132404-cgpjm",
+      "name": "ep-20250603132404-cgpjm",
+      "developer": "ark",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.0141,
+        "Easy Problems": 0.507
+      }
+    },
+    {
+      "model_id": "bytedance/doubao-seed-1-6-thinking-250615",
+      "name": "doubao-seed-1-6-thinking-250615",
+      "developer": "ByteDance",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.07042253521126761,
+        "Easy Problems": 0.5774647887323944
+      }
+    },
+    {
+      "model_id": "deepseek/chat-v3-0324",
+      "name": "deepseek/chat-v3-0324",
+      "developer": "DeepSeek",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.0,
+        "Easy Problems": 0.19718309859154928
+      }
+    },
+    {
+      "model_id": "deepseek/ep-20250214004308-p7n89",
+      "name": "ep-20250214004308-p7n89",
+      "developer": "DeepSeek",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.014084507042253521,
+        "Easy Problems": 0.4225352112676056
+      }
+    },
+    {
+      "model_id": "deepseek/ep-20250228232227-z44x5",
+      "name": "ep-20250228232227-z44x5",
+      "developer": "DeepSeek",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.0,
+        "Easy Problems": 0.1267605633802817
+      }
+    },
+    {
+      "model_id": "deepseek/ep-20250603132404-cgpjm",
+      "name": "ep-20250603132404-cgpjm",
+      "developer": "DeepSeek",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.08450704225352113,
+        "Easy Problems": 0.5774647887323944
+      }
+    },
+    {
+      "model_id": "google/gemini-2.5-flash",
+      "name": "gemini-2.5-flash",
+      "developer": "google",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.028169014084507043,
+        "Easy Problems": 0.38028169014084506
+      }
+    },
+    {
+      "model_id": "google/gemini-2.5-pro",
+      "name": "gemini-2.5-pro",
+      "developer": "google",
+      "scores": {
+        "Hard Problems": 0.014084507042253521,
+        "Medium Problems": 0.2112676056338028,
+        "Easy Problems": 0.7183098591549296
+      }
+    },
+    {
+      "model_id": "kuaishou/kwaipilot-40b-0604",
+      "name": "kwaipilot-40b-0604",
+      "developer": "Kuaishou",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.07042253521126761,
+        "Easy Problems": 0.056338028169014086
+      }
+    },
+    {
+      "model_id": "meta/llama-4-maverick",
+      "name": "meta/llama-4-maverick",
+      "developer": "Meta",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.0,
+        "Easy Problems": 0.09859154929577464
+      }
+    },
+    {
+      "model_id": "openai/gpt-4.1",
+      "name": "openai/gpt-4.1",
+      "developer": "OpenAI",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.0,
+        "Easy Problems": 0.19718309859154928
+      }
+    },
+    {
+      "model_id": "openai/gpt-4o-2024-11-20",
+      "name": "GPT-4o 2024-11-20",
+      "developer": "openai",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.0,
+        "Easy Problems": 0.07042253521126761
+      }
+    },
+    {
+      "model_id": "openai/gpt-5-2025-08-07",
+      "name": "gpt-5-2025-08-07",
+      "developer": "openai",
+      "scores": {
+        "Hard Problems": 0.0423,
+        "Medium Problems": 0.4085,
+        "Easy Problems": 0.9014
+      }
+    },
+    {
+      "model_id": "openai/gpt-5.2-2025-12-11",
+      "name": "gpt-5.2-2025-12-11",
+      "developer": "OpenAI",
+      "scores": {
+        "Hard Problems": 0.1594,
+        "Medium Problems": 0.5211,
+        "Easy Problems": 0.9014
+      }
+    },
+    {
+      "model_id": "openai/gpt-oss-120b",
+      "name": "gpt-oss-120b",
+      "developer": "openai",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.11267605633802817,
+        "Easy Problems": 0.6619718309859155
+      }
+    },
+    {
+      "model_id": "openai/gpt-oss-20b",
+      "name": "gpt-oss-20b",
+      "developer": "openai",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.056338028169014086,
+        "Easy Problems": 0.5070422535211268
+      }
+    },
+    {
+      "model_id": "openai/o3-2025-04-16",
+      "name": "o3 2025-04-16",
+      "developer": "openai",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.22535211267605634,
+        "Easy Problems": 0.7183098591549296
+      }
+    },
+    {
+      "model_id": "openai/o4-mini-2025-04-16",
+      "name": "o4-mini-2025-04-16",
+      "developer": "openai",
+      "scores": {
+        "Hard Problems": 0.014084507042253521,
+        "Medium Problems": 0.30985915492957744,
+        "Easy Problems": 0.8873239436619719
+      }
+    },
+    {
+      "model_id": "z-ai/glm-4.5",
+      "name": "z-ai/glm-4.5",
+      "developer": "Z.AI",
+      "scores": {
+        "Hard Problems": 0.0,
+        "Medium Problems": 0.028169014084507043,
+        "Easy Problems": 0.1267605633802817
+      }
+    }
+  ]
+}

data/benchmarks/reward-bench.json ADDED Viewed

The diff for this file is too large to render. See raw diff

data/benchmarks/swe-bench.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "models": [
+    {
+      "model_id": "anthropic/claude-opus-4-5",
+      "name": "claude-opus-4-5",
+      "developer": "Anthropic",
+      "scores": {
+        "swe-bench": 0.6061
+      }
+    },
+    {
+      "model_id": "google/gemini-3-pro-preview",
+      "name": "gemini-3-pro-preview",
+      "developer": "Google",
+      "scores": {
+        "swe-bench": 0.7576
+      }
+    },
+    {
+      "model_id": "openai/gpt-5.2-2025-12-11",
+      "name": "gpt-5.2-2025-12-11",
+      "developer": "OpenAI",
+      "scores": {
+        "swe-bench": 0.57
+      }
+    }
+  ]
+}

data/benchmarks/tau-bench-2_airline.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "models": [
+    {
+      "model_id": "anthropic/claude-opus-4-5",
+      "name": "claude-opus-4-5",
+      "developer": "Anthropic",
+      "scores": {
+        "tau-bench-2/airline": 0.66
+      }
+    },
+    {
+      "model_id": "google/gemini-3-pro-preview",
+      "name": "gemini-3-pro-preview",
+      "developer": "Google",
+      "scores": {
+        "tau-bench-2/airline": 0.7
+      }
+    },
+    {
+      "model_id": "openai/gpt-5.2-2025-12-11",
+      "name": "gpt-5.2-2025-12-11",
+      "developer": "OpenAI",
+      "scores": {
+        "tau-bench-2/airline": 0.54
+      }
+    }
+  ]
+}

data/benchmarks/tau-bench-2_retail.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "models": [
+    {
+      "model_id": "anthropic/claude-opus-4-5",
+      "name": "claude-opus-4-5",
+      "developer": "Anthropic",
+      "scores": {
+        "tau-bench-2/retail": 0.83
+      }
+    },
+    {
+      "model_id": "google/gemini-3-pro-preview",
+      "name": "gemini-3-pro-preview",
+      "developer": "Google",
+      "scores": {
+        "tau-bench-2/retail": 0.7576
+      }
+    },
+    {
+      "model_id": "openai/gpt-5.2-2025-12-11",
+      "name": "gpt-5.2-2025-12-11",
+      "developer": "OpenAI",
+      "scores": {
+        "tau-bench-2/retail": 0.68
+      }
+    }
+  ]
+}

data/benchmarks/tau-bench-2_telecom.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "models": [
+    {
+      "model_id": "anthropic/claude-opus-4-5",
+      "name": "claude-opus-4-5",
+      "developer": "Anthropic",
+      "scores": {
+        "tau-bench-2/telecom": 0.76
+      }
+    },
+    {
+      "model_id": "google/gemini-3-pro-preview",
+      "name": "gemini-3-pro-preview",
+      "developer": "Google",
+      "scores": {
+        "tau-bench-2/telecom": 0.73
+      }
+    },
+    {
+      "model_id": "openai/gpt-5.2-2025-12-11",
+      "name": "gpt-5.2-2025-12-11",
+      "developer": "OpenAI",
+      "scores": {
+        "tau-bench-2/telecom": 0.5354
+      }
+    }
+  ]
+}

data/benchmarks/terminal-bench-2.0.json ADDED Viewed

	@@ -0,0 +1,300 @@

+{
+  "models": [
+    {
+      "model_id": "alibaba/qwen-3-coder-480b",
+      "name": "Qwen 3 Coder 480B",
+      "developer": "Alibaba",
+      "scores": {
+        "terminal-bench-2.0": 25.4
+      }
+    },
+    {
+      "model_id": "anthropic/claude-haiku-4.5",
+      "name": "Claude Haiku 4.5",
+      "developer": "Anthropic",
+      "scores": {
+        "terminal-bench-2.0": 29.8
+      }
+    },
+    {
+      "model_id": "anthropic/claude-opus-4.1",
+      "name": "Claude Opus 4.1",
+      "developer": "Anthropic",
+      "scores": {
+        "terminal-bench-2.0": 35.1
+      }
+    },
+    {
+      "model_id": "anthropic/claude-opus-4.5",
+      "name": "Claude Opus 4.5",
+      "developer": "Anthropic",
+      "scores": {
+        "terminal-bench-2.0": 59.1
+      }
+    },
+    {
+      "model_id": "anthropic/claude-opus-4.6",
+      "name": "Claude Opus 4.6",
+      "developer": "Anthropic",
+      "scores": {
+        "terminal-bench-2.0": 58.0
+      }
+    },
+    {
+      "model_id": "anthropic/claude-sonnet-4.5",
+      "name": "Claude Sonnet 4.5",
+      "developer": "Anthropic",
+      "scores": {
+        "terminal-bench-2.0": 46.5
+      }
+    },
+    {
+      "model_id": "deepseek/deepseek-v3.2",
+      "name": "DeepSeek-V3.2",
+      "developer": "DeepSeek",
+      "scores": {
+        "terminal-bench-2.0": 39.6
+      }
+    },
+    {
+      "model_id": "google/gemini-2.5-flash",
+      "name": "gemini-2.5-flash",
+      "developer": "google",
+      "scores": {
+        "terminal-bench-2.0": 17.1
+      }
+    },
+    {
+      "model_id": "google/gemini-2.5-pro",
+      "name": "gemini-2.5-pro",
+      "developer": "google",
+      "scores": {
+        "terminal-bench-2.0": 26.1
+      }
+    },
+    {
+      "model_id": "google/gemini-3-flash",
+      "name": "Gemini 3 Flash",
+      "developer": "Google",
+      "scores": {
+        "terminal-bench-2.0": 64.3
+      }
+    },
+    {
+      "model_id": "google/gemini-3-pro",
+      "name": "Gemini 3 Pro",
+      "developer": "Google",
+      "scores": {
+        "terminal-bench-2.0": 65.2
+      }
+    },
+    {
+      "model_id": "google/gemini-3.1-pro",
+      "name": "Gemini 3.1 Pro",
+      "developer": "Google",
+      "scores": {
+        "terminal-bench-2.0": 74.8
+      }
+    },
+    {
+      "model_id": "minimax/minimax-m2",
+      "name": "MiniMax M2",
+      "developer": "MiniMax",
+      "scores": {
+        "terminal-bench-2.0": 30.0
+      }
+    },
+    {
+      "model_id": "minimax/minimax-m2.1",
+      "name": "MiniMax M2.1",
+      "developer": "MiniMax",
+      "scores": {
+        "terminal-bench-2.0": 36.6
+      }
+    },
+    {
+      "model_id": "minimax/minimax-m2.5",
+      "name": "Minimax m2.5",
+      "developer": "Minimax",
+      "scores": {
+        "terminal-bench-2.0": 42.2
+      }
+    },
+    {
+      "model_id": "moonshot-ai/kimi-k2-instruct",
+      "name": "Kimi K2 Instruct",
+      "developer": "Moonshot AI",
+      "scores": {
+        "terminal-bench-2.0": 27.8
+      }
+    },
+    {
+      "model_id": "moonshot-ai/kimi-k2-thinking",
+      "name": "Kimi K2 Thinking",
+      "developer": "Moonshot AI",
+      "scores": {
+        "terminal-bench-2.0": 35.7
+      }
+    },
+    {
+      "model_id": "moonshot-ai/kimi-k2.5",
+      "name": "Kimi K2.5",
+      "developer": "Kimi",
+      "scores": {
+        "terminal-bench-2.0": 43.2
+      }
+    },
+    {
+      "model_id": "multiple/multiple",
+      "name": "Multiple",
+      "developer": "Multiple",
+      "scores": {
+        "terminal-bench-2.0": 59.1
+      }
+    },
+    {
+      "model_id": "openai/gpt-5",
+      "name": "GPT-5",
+      "developer": "OpenAI",
+      "scores": {
+        "terminal-bench-2.0": 33.9
+      }
+    },
+    {
+      "model_id": "openai/gpt-5-codex",
+      "name": "GPT-5-Codex",
+      "developer": "OpenAI",
+      "scores": {
+        "terminal-bench-2.0": 43.4
+      }
+    },
+    {
+      "model_id": "openai/gpt-5-mini",
+      "name": "GPT-5-Mini",
+      "developer": "OpenAI",
+      "scores": {
+        "terminal-bench-2.0": 29.2
+      }
+    },
+    {
+      "model_id": "openai/gpt-5-nano",
+      "name": "GPT-5-Nano",
+      "developer": "OpenAI",
+      "scores": {
+        "terminal-bench-2.0": 9.9
+      }
+    },
+    {
+      "model_id": "openai/gpt-5.1",
+      "name": "GPT-5.1",
+      "developer": "OpenAI",
+      "scores": {
+        "terminal-bench-2.0": 47.6
+      }
+    },
+    {
+      "model_id": "openai/gpt-5.1-codex",
+      "name": "GPT-5.1-Codex",
+      "developer": "OpenAI",
+      "scores": {
+        "terminal-bench-2.0": 57.8
+      }
+    },
+    {
+      "model_id": "openai/gpt-5.1-codex-max",
+      "name": "GPT-5.1-Codex-Max",
+      "developer": "OpenAI",
+      "scores": {
+        "terminal-bench-2.0": 60.4
+      }
+    },
+    {
+      "model_id": "openai/gpt-5.1-codex-mini",
+      "name": "GPT-5.1-Codex-Mini",
+      "developer": "OpenAI",
+      "scores": {
+        "terminal-bench-2.0": 43.1
+      }
+    },
+    {
+      "model_id": "openai/gpt-5.2",
+      "name": "GPT-5.2",
+      "developer": "OpenAI",
+      "scores": {
+        "terminal-bench-2.0": 54.0
+      }
+    },
+    {
+      "model_id": "openai/gpt-5.2-codex",
+      "name": "GPT-5.2-Codex",
+      "developer": "OpenAI",
+      "scores": {
+        "terminal-bench-2.0": 66.5
+      }
+    },
+    {
+      "model_id": "openai/gpt-5.3-codex",
+      "name": "GPT-5.3-Codex",
+      "developer": "OpenAI",
+      "scores": {
+        "terminal-bench-2.0": 70.3
+      }
+    },
+    {
+      "model_id": "openai/gpt-oss-120b",
+      "name": "gpt-oss-120b",
+      "developer": "openai",
+      "scores": {
+        "terminal-bench-2.0": 14.2
+      }
+    },
+    {
+      "model_id": "openai/gpt-oss-20b",
+      "name": "gpt-oss-20b",
+      "developer": "openai",
+      "scores": {
+        "terminal-bench-2.0": 3.1
+      }
+    },
+    {
+      "model_id": "xai/grok-4",
+      "name": "Grok 4",
+      "developer": "xAI",
+      "scores": {
+        "terminal-bench-2.0": 25.4
+      }
+    },
+    {
+      "model_id": "xai/grok-code-fast-1",
+      "name": "Grok Code Fast 1",
+      "developer": "xAI",
+      "scores": {
+        "terminal-bench-2.0": 25.8
+      }
+    },
+    {
+      "model_id": "zhipu-ai/glm-4.6",
+      "name": "GLM 4.6",
+      "developer": "Z.ai",
+      "scores": {
+        "terminal-bench-2.0": 24.5
+      }
+    },
+    {
+      "model_id": "zhipu-ai/glm-4.7",
+      "name": "GLM 4.7",
+      "developer": "Z-AI",
+      "scores": {
+        "terminal-bench-2.0": 33.4
+      }
+    },
+    {
+      "model_id": "zhipu-ai/glm-5",
+      "name": "GLM 5",
+      "developer": "Z-AI",
+      "scores": {
+        "terminal-bench-2.0": 52.4
+      }
+    }
+  ]
+}

data/developers/0-hero.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "developer": "0-hero",
+  "models": [
+    {
+      "id": "0-hero/Matter-0.1-7B-DPO-preview",
+      "name": "0-hero/Matter-0.1-7B-DPO-preview",
+      "developer": "0-hero",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "reward-bench/Score": 0.7247,
+        "reward-bench/Chat": 0.8939,
+        "reward-bench/Chat Hard": 0.5768,
+        "reward-bench/Safety": 0.6378,
+        "reward-bench/Reasoning": 0.8854,
+        "reward-bench/Prior Sets (0.5 weight)": 0.5348
+      }
+    },
+    {
+      "id": "0-hero/Matter-0.1-7B-boost-DPO-preview",
+      "name": "0-hero/Matter-0.1-7B-boost-DPO-preview",
+      "developer": "0-hero",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "reward-bench/Score": 0.7448,
+        "reward-bench/Chat": 0.9106,
+        "reward-bench/Chat Hard": 0.6096,
+        "reward-bench/Safety": 0.7135,
+        "reward-bench/Reasoning": 0.8395,
+        "reward-bench/Prior Sets (0.5 weight)": 0.5566
+      }
+    },
+    {
+      "id": "0-hero/Matter-0.2-7B-DPO",
+      "name": "Matter-0.2-7B-DPO",
+      "developer": "0-hero",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3303,
+        "hfopenllm_v2/BBH": 0.3596,
+        "hfopenllm_v2/MATH Level 5": 0.0144,
+        "hfopenllm_v2/GPQA": 0.2592,
+        "hfopenllm_v2/MUSR": 0.3814,
+        "hfopenllm_v2/MMLU-PRO": 0.1164
+      }
+    }
+  ]
+}

data/developers/01-ai.json ADDED Viewed

	@@ -0,0 +1,433 @@

+{
+  "developer": "01-ai",
+  "models": [
+    {
+      "id": "01-ai/Yi-1.5-34B",
+      "name": "Yi-1.5-34B",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2841,
+        "hfopenllm_v2/BBH": 0.5976,
+        "hfopenllm_v2/MATH Level 5": 0.1533,
+        "hfopenllm_v2/GPQA": 0.3658,
+        "hfopenllm_v2/MUSR": 0.4236,
+        "hfopenllm_v2/MMLU-PRO": 0.4666
+      }
+    },
+    {
+      "id": "01-ai/Yi-1.5-34B-32K",
+      "name": "Yi-1.5-34B-32K",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3119,
+        "hfopenllm_v2/BBH": 0.6016,
+        "hfopenllm_v2/MATH Level 5": 0.1541,
+        "hfopenllm_v2/GPQA": 0.3633,
+        "hfopenllm_v2/MUSR": 0.4398,
+        "hfopenllm_v2/MMLU-PRO": 0.4709
+      }
+    },
+    {
+      "id": "01-ai/Yi-1.5-34B-Chat",
+      "name": "Yi-1.5-34B-Chat",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.6067,
+        "hfopenllm_v2/BBH": 0.6084,
+        "hfopenllm_v2/MATH Level 5": 0.2772,
+        "hfopenllm_v2/GPQA": 0.3649,
+        "hfopenllm_v2/MUSR": 0.4282,
+        "hfopenllm_v2/MMLU-PRO": 0.452
+      }
+    },
+    {
+      "id": "01-ai/Yi-1.5-34B-Chat-16K",
+      "name": "Yi-1.5-34B-Chat-16K",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.4564,
+        "hfopenllm_v2/BBH": 0.61,
+        "hfopenllm_v2/MATH Level 5": 0.2137,
+        "hfopenllm_v2/GPQA": 0.3381,
+        "hfopenllm_v2/MUSR": 0.4398,
+        "hfopenllm_v2/MMLU-PRO": 0.4545
+      }
+    },
+    {
+      "id": "01-ai/Yi-1.5-6B",
+      "name": "Yi-1.5-6B",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2617,
+        "hfopenllm_v2/BBH": 0.4493,
+        "hfopenllm_v2/MATH Level 5": 0.0665,
+        "hfopenllm_v2/GPQA": 0.3138,
+        "hfopenllm_v2/MUSR": 0.4374,
+        "hfopenllm_v2/MMLU-PRO": 0.3144
+      }
+    },
+    {
+      "id": "01-ai/Yi-1.5-6B-Chat",
+      "name": "Yi-1.5-6B-Chat",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.5145,
+        "hfopenllm_v2/BBH": 0.4571,
+        "hfopenllm_v2/MATH Level 5": 0.1624,
+        "hfopenllm_v2/GPQA": 0.302,
+        "hfopenllm_v2/MUSR": 0.4392,
+        "hfopenllm_v2/MMLU-PRO": 0.3193
+      }
+    },
+    {
+      "id": "01-ai/Yi-1.5-9B",
+      "name": "Yi-1.5-9B",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2936,
+        "hfopenllm_v2/BBH": 0.5143,
+        "hfopenllm_v2/MATH Level 5": 0.114,
+        "hfopenllm_v2/GPQA": 0.3792,
+        "hfopenllm_v2/MUSR": 0.4328,
+        "hfopenllm_v2/MMLU-PRO": 0.3916
+      }
+    },
+    {
+      "id": "01-ai/Yi-1.5-9B-32K",
+      "name": "Yi-1.5-9B-32K",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2303,
+        "hfopenllm_v2/BBH": 0.4963,
+        "hfopenllm_v2/MATH Level 5": 0.108,
+        "hfopenllm_v2/GPQA": 0.3591,
+        "hfopenllm_v2/MUSR": 0.4186,
+        "hfopenllm_v2/MMLU-PRO": 0.3765
+      }
+    },
+    {
+      "id": "01-ai/Yi-1.5-9B-Chat",
+      "name": "Yi-1.5-9B-Chat",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.6046,
+        "hfopenllm_v2/BBH": 0.5559,
+        "hfopenllm_v2/MATH Level 5": 0.2258,
+        "hfopenllm_v2/GPQA": 0.3347,
+        "hfopenllm_v2/MUSR": 0.4259,
+        "hfopenllm_v2/MMLU-PRO": 0.3975
+      }
+    },
+    {
+      "id": "01-ai/Yi-1.5-9B-Chat-16K",
+      "name": "Yi-1.5-9B-Chat-16K",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.4214,
+        "hfopenllm_v2/BBH": 0.5153,
+        "hfopenllm_v2/MATH Level 5": 0.1782,
+        "hfopenllm_v2/GPQA": 0.3087,
+        "hfopenllm_v2/MUSR": 0.4099,
+        "hfopenllm_v2/MMLU-PRO": 0.3994
+      }
+    },
+    {
+      "id": "01-ai/Yi-34B",
+      "name": "Yi-34B",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3046,
+        "hfopenllm_v2/BBH": 0.5457,
+        "hfopenllm_v2/MATH Level 5": 0.0514,
+        "hfopenllm_v2/GPQA": 0.3666,
+        "hfopenllm_v2/MUSR": 0.4119,
+        "hfopenllm_v2/MMLU-PRO": 0.4412
+      }
+    },
+    {
+      "id": "01-ai/Yi-34B-200K",
+      "name": "Yi-34B-200K",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.1542,
+        "hfopenllm_v2/BBH": 0.5442,
+        "hfopenllm_v2/MATH Level 5": 0.0574,
+        "hfopenllm_v2/GPQA": 0.3565,
+        "hfopenllm_v2/MUSR": 0.3817,
+        "hfopenllm_v2/MMLU-PRO": 0.4535
+      }
+    },
+    {
+      "id": "01-ai/Yi-34B-Chat",
+      "name": "Yi-34B-Chat",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.4699,
+        "hfopenllm_v2/BBH": 0.5561,
+        "hfopenllm_v2/MATH Level 5": 0.0627,
+        "hfopenllm_v2/GPQA": 0.3381,
+        "hfopenllm_v2/MUSR": 0.3978,
+        "hfopenllm_v2/MMLU-PRO": 0.4093
+      }
+    },
+    {
+      "id": "01-ai/Yi-6B",
+      "name": "Yi-6B",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2893,
+        "hfopenllm_v2/BBH": 0.4309,
+        "hfopenllm_v2/MATH Level 5": 0.0159,
+        "hfopenllm_v2/GPQA": 0.2693,
+        "hfopenllm_v2/MUSR": 0.3937,
+        "hfopenllm_v2/MMLU-PRO": 0.2991
+      }
+    },
+    {
+      "id": "01-ai/Yi-6B-200K",
+      "name": "Yi-6B-200K",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.0843,
+        "hfopenllm_v2/BBH": 0.4289,
+        "hfopenllm_v2/MATH Level 5": 0.0181,
+        "hfopenllm_v2/GPQA": 0.2819,
+        "hfopenllm_v2/MUSR": 0.4587,
+        "hfopenllm_v2/MMLU-PRO": 0.2844
+      }
+    },
+    {
+      "id": "01-ai/Yi-6B-Chat",
+      "name": "Yi-6B-Chat",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3395,
+        "hfopenllm_v2/BBH": 0.4133,
+        "hfopenllm_v2/MATH Level 5": 0.0136,
+        "hfopenllm_v2/GPQA": 0.2945,
+        "hfopenllm_v2/MUSR": 0.3688,
+        "hfopenllm_v2/MMLU-PRO": 0.3061
+      }
+    },
+    {
+      "id": "01-ai/Yi-9B",
+      "name": "Yi-9B",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2709,
+        "hfopenllm_v2/BBH": 0.494,
+        "hfopenllm_v2/MATH Level 5": 0.0559,
+        "hfopenllm_v2/GPQA": 0.318,
+        "hfopenllm_v2/MUSR": 0.4054,
+        "hfopenllm_v2/MMLU-PRO": 0.3574
+      }
+    },
+    {
+      "id": "01-ai/Yi-9B-200K",
+      "name": "Yi-9B-200K",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2327,
+        "hfopenllm_v2/BBH": 0.4793,
+        "hfopenllm_v2/MATH Level 5": 0.0665,
+        "hfopenllm_v2/GPQA": 0.3154,
+        "hfopenllm_v2/MUSR": 0.4294,
+        "hfopenllm_v2/MMLU-PRO": 0.3622
+      }
+    },
+    {
+      "id": "01-ai/Yi-Coder-9B-Chat",
+      "name": "Yi-Coder-9B-Chat",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.4817,
+        "hfopenllm_v2/BBH": 0.4814,
+        "hfopenllm_v2/MATH Level 5": 0.04,
+        "hfopenllm_v2/GPQA": 0.2475,
+        "hfopenllm_v2/MUSR": 0.3992,
+        "hfopenllm_v2/MMLU-PRO": 0.2425
+      }
+    },
+    {
+      "id": "01-ai/yi-34b",
+      "name": "Yi 34B",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "helm_lite/Mean win rate": 0.57,
+        "helm_lite/NarrativeQA": 0.782,
+        "helm_lite/NaturalQuestions (closed-book)": 0.443,
+        "helm_lite/OpenbookQA": 0.92,
+        "helm_lite/MMLU": 0.65,
+        "helm_lite/MATH": 0.375,
+        "helm_lite/GSM8K": 0.648,
+        "helm_lite/LegalBench": 0.618,
+        "helm_lite/MedQA": 0.656,
+        "helm_lite/WMT 2014": 0.172,
+        "helm_mmlu/MMLU All Subjects": 0.762,
+        "helm_mmlu/Abstract Algebra": 0.4,
+        "helm_mmlu/Anatomy": 0.748,
+        "helm_mmlu/College Physics": 0.5,
+        "helm_mmlu/Computer Security": 0.83,
+        "helm_mmlu/Econometrics": 0.588,
+        "helm_mmlu/Global Facts": 0.53,
+        "helm_mmlu/Jurisprudence": 0.898,
+        "helm_mmlu/Philosophy": 0.82,
+        "helm_mmlu/Professional Psychology": 0.835,
+        "helm_mmlu/Us Foreign Policy": 0.91,
+        "helm_mmlu/Astronomy": 0.901,
+        "helm_mmlu/Business Ethics": 0.75,
+        "helm_mmlu/Clinical Knowledge": 0.8,
+        "helm_mmlu/Conceptual Physics": 0.77,
+        "helm_mmlu/Electrical Engineering": 0.779,
+        "helm_mmlu/Elementary Mathematics": 0.656,
+        "helm_mmlu/Formal Logic": 0.548,
+        "helm_mmlu/High School World History": 0.907,
+        "helm_mmlu/Human Sexuality": 0.87,
+        "helm_mmlu/International Law": 0.909,
+        "helm_mmlu/Logical Fallacies": 0.883,
+        "helm_mmlu/Machine Learning": 0.58,
+        "helm_mmlu/Management": 0.893,
+        "helm_mmlu/Marketing": 0.936,
+        "helm_mmlu/Medical Genetics": 0.87,
+        "helm_mmlu/Miscellaneous": 0.902,
+        "helm_mmlu/Moral Scenarios": 0.606,
+        "helm_mmlu/Nutrition": 0.869,
+        "helm_mmlu/Prehistory": 0.877,
+        "helm_mmlu/Public Relations": 0.745,
+        "helm_mmlu/Security Studies": 0.833,
+        "helm_mmlu/Sociology": 0.9,
+        "helm_mmlu/Virology": 0.572,
+        "helm_mmlu/World Religions": 0.877,
+        "helm_mmlu/Mean win rate": 0.315
+      }
+    },
+    {
+      "id": "01-ai/yi-6b",
+      "name": "Yi 6B",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "helm_lite/Mean win rate": 0.253,
+        "helm_lite/NarrativeQA": 0.702,
+        "helm_lite/NaturalQuestions (closed-book)": 0.31,
+        "helm_lite/OpenbookQA": 0.8,
+        "helm_lite/MMLU": 0.53,
+        "helm_lite/MATH": 0.126,
+        "helm_lite/GSM8K": 0.375,
+        "helm_lite/LegalBench": 0.519,
+        "helm_lite/MedQA": 0.497,
+        "helm_lite/WMT 2014": 0.117,
+        "helm_mmlu/MMLU All Subjects": 0.64,
+        "helm_mmlu/Abstract Algebra": 0.3,
+        "helm_mmlu/Anatomy": 0.6,
+        "helm_mmlu/College Physics": 0.422,
+        "helm_mmlu/Computer Security": 0.73,
+        "helm_mmlu/Econometrics": 0.351,
+        "helm_mmlu/Global Facts": 0.43,
+        "helm_mmlu/Jurisprudence": 0.796,
+        "helm_mmlu/Philosophy": 0.678,
+        "helm_mmlu/Professional Psychology": 0.668,
+        "helm_mmlu/Us Foreign Policy": 0.87,
+        "helm_mmlu/Astronomy": 0.684,
+        "helm_mmlu/Business Ethics": 0.67,
+        "helm_mmlu/Clinical Knowledge": 0.66,
+        "helm_mmlu/Conceptual Physics": 0.621,
+        "helm_mmlu/Electrical Engineering": 0.662,
+        "helm_mmlu/Elementary Mathematics": 0.452,
+        "helm_mmlu/Formal Logic": 0.452,
+        "helm_mmlu/High School World History": 0.785,
+        "helm_mmlu/Human Sexuality": 0.763,
+        "helm_mmlu/International Law": 0.769,
+        "helm_mmlu/Logical Fallacies": 0.779,
+        "helm_mmlu/Machine Learning": 0.411,
+        "helm_mmlu/Management": 0.806,
+        "helm_mmlu/Marketing": 0.893,
+        "helm_mmlu/Medical Genetics": 0.77,
+        "helm_mmlu/Miscellaneous": 0.796,
+        "helm_mmlu/Moral Scenarios": 0.335,
+        "helm_mmlu/Nutrition": 0.739,
+        "helm_mmlu/Prehistory": 0.713,
+        "helm_mmlu/Public Relations": 0.718,
+        "helm_mmlu/Security Studies": 0.735,
+        "helm_mmlu/Sociology": 0.831,
+        "helm_mmlu/Virology": 0.452,
+        "helm_mmlu/World Religions": 0.836,
+        "helm_mmlu/Mean win rate": 0.651
+      }
+    },
+    {
+      "id": "01-ai/yi-large-preview",
+      "name": "Yi Large Preview",
+      "developer": "01-ai",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "helm_lite/Mean win rate": 0.471,
+        "helm_lite/NarrativeQA": 0.373,
+        "helm_lite/NaturalQuestions (closed-book)": 0.428,
+        "helm_lite/OpenbookQA": 0.946,
+        "helm_lite/MMLU": 0.712,
+        "helm_lite/MATH": 0.712,
+        "helm_lite/GSM8K": 0.69,
+        "helm_lite/LegalBench": 0.519,
+        "helm_lite/MedQA": 0.66,
+        "helm_lite/WMT 2014": 0.176,
+        "helm_mmlu/MMLU All Subjects": 0.793,
+        "helm_mmlu/Abstract Algebra": 0.6,
+        "helm_mmlu/Anatomy": 0.83,
+        "helm_mmlu/College Physics": 0.569,
+        "helm_mmlu/Computer Security": 0.86,
+        "helm_mmlu/Econometrics": 0.728,
+        "helm_mmlu/Global Facts": 0.52,
+        "helm_mmlu/Jurisprudence": 0.852,
+        "helm_mmlu/Philosophy": 0.842,
+        "helm_mmlu/Professional Psychology": 0.853,
+        "helm_mmlu/Us Foreign Policy": 0.85,
+        "helm_mmlu/Astronomy": 0.914,
+        "helm_mmlu/Business Ethics": 0.8,
+        "helm_mmlu/Clinical Knowledge": 0.857,
+        "helm_mmlu/Conceptual Physics": 0.864,
+        "helm_mmlu/Electrical Engineering": 0.779,
+        "helm_mmlu/Elementary Mathematics": 0.685,
+        "helm_mmlu/Formal Logic": 0.603,
+        "helm_mmlu/High School World History": 0.928,
+        "helm_mmlu/Human Sexuality": 0.901,
+        "helm_mmlu/International Law": 0.917,
+        "helm_mmlu/Logical Fallacies": 0.865,
+        "helm_mmlu/Machine Learning": 0.616,
+        "helm_mmlu/Management": 0.903,
+        "helm_mmlu/Marketing": 0.927,
+        "helm_mmlu/Medical Genetics": 0.83,
+        "helm_mmlu/Miscellaneous": 0.916,
+        "helm_mmlu/Moral Scenarios": 0.831,
+        "helm_mmlu/Nutrition": 0.846,
+        "helm_mmlu/Prehistory": 0.892,
+        "helm_mmlu/Public Relations": 0.827,
+        "helm_mmlu/Security Studies": 0.82,
+        "helm_mmlu/Sociology": 0.881,
+        "helm_mmlu/Virology": 0.59,
+        "helm_mmlu/World Religions": 0.871,
+        "helm_mmlu/Mean win rate": 0.258
+      }
+    }
+  ]
+}

data/developers/1-800-LLMs.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "developer": "1-800-LLMs",
+  "models": [
+    {
+      "id": "1-800-LLMs/Qwen-2.5-14B-Hindi",
+      "name": "Qwen-2.5-14B-Hindi",
+      "developer": "1-800-LLMs",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.5826,
+        "hfopenllm_v2/BBH": 0.6524,
+        "hfopenllm_v2/MATH Level 5": 0.3331,
+        "hfopenllm_v2/GPQA": 0.3624,
+        "hfopenllm_v2/MUSR": 0.4489,
+        "hfopenllm_v2/MMLU-PRO": 0.5263
+      }
+    },
+    {
+      "id": "1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct",
+      "name": "Qwen-2.5-14B-Hindi-Custom-Instruct",
+      "developer": "1-800-LLMs",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3077,
+        "hfopenllm_v2/BBH": 0.6284,
+        "hfopenllm_v2/MATH Level 5": 0.3112,
+        "hfopenllm_v2/GPQA": 0.37,
+        "hfopenllm_v2/MUSR": 0.4491,
+        "hfopenllm_v2/MMLU-PRO": 0.5164
+      }
+    }
+  ]
+}

data/developers/1024m.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "developer": "1024m",
+  "models": [
+    {
+      "id": "1024m/PHI-4-Hindi",
+      "name": "PHI-4-Hindi",
+      "developer": "1024m",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.0082,
+        "hfopenllm_v2/BBH": 0.671,
+        "hfopenllm_v2/MATH Level 5": 0.2334,
+        "hfopenllm_v2/GPQA": 0.3977,
+        "hfopenllm_v2/MUSR": 0.4914,
+        "hfopenllm_v2/MMLU-PRO": 0.5239
+      }
+    },
+    {
+      "id": "1024m/QWEN-14B-B100",
+      "name": "QWEN-14B-B100",
+      "developer": "1024m",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.7762,
+        "hfopenllm_v2/BBH": 0.6533,
+        "hfopenllm_v2/MATH Level 5": 0.5438,
+        "hfopenllm_v2/GPQA": 0.3507,
+        "hfopenllm_v2/MUSR": 0.41,
+        "hfopenllm_v2/MMLU-PRO": 0.5179
+      }
+    }
+  ]
+}

data/developers/152334H.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "developer": "152334H",
+  "models": [
+    {
+      "id": "152334H/miqu-1-70b-sf",
+      "name": "miqu-1-70b-sf",
+      "developer": "152334H",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.5182,
+        "hfopenllm_v2/BBH": 0.6102,
+        "hfopenllm_v2/MATH Level 5": 0.1246,
+        "hfopenllm_v2/GPQA": 0.3507,
+        "hfopenllm_v2/MUSR": 0.4582,
+        "hfopenllm_v2/MMLU-PRO": 0.4228
+      }
+    }
+  ]
+}

data/developers/1TuanPham.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "developer": "1TuanPham",
+  "models": [
+    {
+      "id": "1TuanPham/T-VisStar-7B-v0.1",
+      "name": "T-VisStar-7B-v0.1",
+      "developer": "1TuanPham",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3607,
+        "hfopenllm_v2/BBH": 0.5052,
+        "hfopenllm_v2/MATH Level 5": 0.0574,
+        "hfopenllm_v2/GPQA": 0.2852,
+        "hfopenllm_v2/MUSR": 0.4375,
+        "hfopenllm_v2/MMLU-PRO": 0.3211
+      }
+    },
+    {
+      "id": "1TuanPham/T-VisStar-v0.1",
+      "name": "T-VisStar-v0.1",
+      "developer": "1TuanPham",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3607,
+        "hfopenllm_v2/BBH": 0.5052,
+        "hfopenllm_v2/MATH Level 5": 0.0574,
+        "hfopenllm_v2/GPQA": 0.2852,
+        "hfopenllm_v2/MUSR": 0.4375,
+        "hfopenllm_v2/MMLU-PRO": 0.3211
+      }
+    }
+  ]
+}

data/developers/3rd-Degree-Burn.json ADDED Viewed

	@@ -0,0 +1,61 @@

+{
+  "developer": "3rd-Degree-Burn",
+  "models": [
+    {
+      "id": "3rd-Degree-Burn/L-3.1-Science-Writer-8B",
+      "name": "L-3.1-Science-Writer-8B",
+      "developer": "3rd-Degree-Burn",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.4263,
+        "hfopenllm_v2/BBH": 0.5041,
+        "hfopenllm_v2/MATH Level 5": 0.1035,
+        "hfopenllm_v2/GPQA": 0.2743,
+        "hfopenllm_v2/MUSR": 0.3959,
+        "hfopenllm_v2/MMLU-PRO": 0.3649
+      }
+    },
+    {
+      "id": "3rd-Degree-Burn/Llama-3.1-8B-Squareroot",
+      "name": "Llama-3.1-8B-Squareroot",
+      "developer": "3rd-Degree-Burn",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2213,
+        "hfopenllm_v2/BBH": 0.3461,
+        "hfopenllm_v2/MATH Level 5": 0.2659,
+        "hfopenllm_v2/GPQA": 0.2567,
+        "hfopenllm_v2/MUSR": 0.3089,
+        "hfopenllm_v2/MMLU-PRO": 0.175
+      }
+    },
+    {
+      "id": "3rd-Degree-Burn/Llama-3.1-8B-Squareroot-v1",
+      "name": "Llama-3.1-8B-Squareroot-v1",
+      "developer": "3rd-Degree-Burn",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2892,
+        "hfopenllm_v2/BBH": 0.3343,
+        "hfopenllm_v2/MATH Level 5": 0.0884,
+        "hfopenllm_v2/GPQA": 0.2559,
+        "hfopenllm_v2/MUSR": 0.3341,
+        "hfopenllm_v2/MMLU-PRO": 0.1127
+      }
+    },
+    {
+      "id": "3rd-Degree-Burn/Llama-Squared-8B",
+      "name": "Llama-Squared-8B",
+      "developer": "3rd-Degree-Burn",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2755,
+        "hfopenllm_v2/BBH": 0.4431,
+        "hfopenllm_v2/MATH Level 5": 0.0574,
+        "hfopenllm_v2/GPQA": 0.2718,
+        "hfopenllm_v2/MUSR": 0.3089,
+        "hfopenllm_v2/MMLU-PRO": 0.2366
+      }
+    }
+  ]
+}

data/developers/4season.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "developer": "4season",
+  "models": [
+    {
+      "id": "4season/final_model_test_v2",
+      "name": "final_model_test_v2",
+      "developer": "4season",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3191,
+        "hfopenllm_v2/BBH": 0.6342,
+        "hfopenllm_v2/MATH Level 5": 0.0838,
+        "hfopenllm_v2/GPQA": 0.3272,
+        "hfopenllm_v2/MUSR": 0.4314,
+        "hfopenllm_v2/MMLU-PRO": 0.3528
+      }
+    }
+  ]
+}

data/developers/AALF.json ADDED Viewed

	@@ -0,0 +1,61 @@

+{
+  "developer": "AALF",
+  "models": [
+    {
+      "id": "AALF/FuseChat-Llama-3.1-8B-Instruct-preview",
+      "name": "FuseChat-Llama-3.1-8B-Instruct-preview",
+      "developer": "AALF",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.719,
+        "hfopenllm_v2/BBH": 0.512,
+        "hfopenllm_v2/MATH Level 5": 0.2477,
+        "hfopenllm_v2/GPQA": 0.3054,
+        "hfopenllm_v2/MUSR": 0.382,
+        "hfopenllm_v2/MMLU-PRO": 0.3733
+      }
+    },
+    {
+      "id": "AALF/FuseChat-Llama-3.1-8B-SFT-preview",
+      "name": "FuseChat-Llama-3.1-8B-SFT-preview",
+      "developer": "AALF",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.7281,
+        "hfopenllm_v2/BBH": 0.524,
+        "hfopenllm_v2/MATH Level 5": 0.2251,
+        "hfopenllm_v2/GPQA": 0.3045,
+        "hfopenllm_v2/MUSR": 0.402,
+        "hfopenllm_v2/MMLU-PRO": 0.3743
+      }
+    },
+    {
+      "id": "AALF/gemma-2-27b-it-SimPO-37K",
+      "name": "gemma-2-27b-it-SimPO-37K",
+      "developer": "AALF",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2407,
+        "hfopenllm_v2/BBH": 0.3911,
+        "hfopenllm_v2/MATH Level 5": 0.0128,
+        "hfopenllm_v2/GPQA": 0.2802,
+        "hfopenllm_v2/MUSR": 0.3488,
+        "hfopenllm_v2/MMLU-PRO": 0.1971
+      }
+    },
+    {
+      "id": "AALF/gemma-2-27b-it-SimPO-37K-100steps",
+      "name": "gemma-2-27b-it-SimPO-37K-100steps",
+      "developer": "AALF",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2568,
+        "hfopenllm_v2/BBH": 0.3931,
+        "hfopenllm_v2/MATH Level 5": 0.0211,
+        "hfopenllm_v2/GPQA": 0.2886,
+        "hfopenllm_v2/MUSR": 0.3329,
+        "hfopenllm_v2/MMLU-PRO": 0.2125
+      }
+    }
+  ]
+}

data/developers/AELLM.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "developer": "AELLM",
+  "models": [
+    {
+      "id": "AELLM/gemma-2-aeria-infinity-9b",
+      "name": "gemma-2-aeria-infinity-9b",
+      "developer": "AELLM",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.7594,
+        "hfopenllm_v2/BBH": 0.5983,
+        "hfopenllm_v2/MATH Level 5": 0.2145,
+        "hfopenllm_v2/GPQA": 0.3339,
+        "hfopenllm_v2/MUSR": 0.402,
+        "hfopenllm_v2/MMLU-PRO": 0.3862
+      }
+    },
+    {
+      "id": "AELLM/gemma-2-lyco-infinity-9b",
+      "name": "gemma-2-lyco-infinity-9b",
+      "developer": "AELLM",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.7316,
+        "hfopenllm_v2/BBH": 0.584,
+        "hfopenllm_v2/MATH Level 5": 0.1707,
+        "hfopenllm_v2/GPQA": 0.328,
+        "hfopenllm_v2/MUSR": 0.4006,
+        "hfopenllm_v2/MMLU-PRO": 0.3787
+      }
+    }
+  ]
+}

data/developers/AGI-0.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "developer": "AGI-0",
+  "models": [
+    {
+      "id": "AGI-0/Art-v0-3B",
+      "name": "Art-v0-3B",
+      "developer": "AGI-0",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3192,
+        "hfopenllm_v2/BBH": 0.3401,
+        "hfopenllm_v2/MATH Level 5": 0.2462,
+        "hfopenllm_v2/GPQA": 0.2592,
+        "hfopenllm_v2/MUSR": 0.3768,
+        "hfopenllm_v2/MMLU-PRO": 0.1179
+      }
+    },
+    {
+      "id": "AGI-0/Artificium-llama3.1-8B-001",
+      "name": "Artificium-llama3.1-8B-001",
+      "developer": "AGI-0",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.5248,
+        "hfopenllm_v2/BBH": 0.4256,
+        "hfopenllm_v2/MATH Level 5": 0.136,
+        "hfopenllm_v2/GPQA": 0.2659,
+        "hfopenllm_v2/MUSR": 0.3795,
+        "hfopenllm_v2/MMLU-PRO": 0.3182
+      }
+    },
+    {
+      "id": "AGI-0/smartllama3.1-8B-001",
+      "name": "smartllama3.1-8B-001",
+      "developer": "AGI-0",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3518,
+        "hfopenllm_v2/BBH": 0.467,
+        "hfopenllm_v2/MATH Level 5": 0.1299,
+        "hfopenllm_v2/GPQA": 0.3062,
+        "hfopenllm_v2/MUSR": 0.4386,
+        "hfopenllm_v2/MMLU-PRO": 0.3487
+      }
+    }
+  ]
+}

data/developers/AI-MO.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "developer": "AI-MO",
+  "models": [
+    {
+      "id": "AI-MO/NuminaMath-7B-CoT",
+      "name": "NuminaMath-7B-CoT",
+      "developer": "AI-MO",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2689,
+        "hfopenllm_v2/BBH": 0.4314,
+        "hfopenllm_v2/MATH Level 5": 0.2696,
+        "hfopenllm_v2/GPQA": 0.2659,
+        "hfopenllm_v2/MUSR": 0.3303,
+        "hfopenllm_v2/MMLU-PRO": 0.2868
+      }
+    },
+    {
+      "id": "AI-MO/NuminaMath-7B-TIR",
+      "name": "NuminaMath-7B-TIR",
+      "developer": "AI-MO",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2756,
+        "hfopenllm_v2/BBH": 0.4144,
+        "hfopenllm_v2/MATH Level 5": 0.1609,
+        "hfopenllm_v2/GPQA": 0.2584,
+        "hfopenllm_v2/MUSR": 0.3509,
+        "hfopenllm_v2/MMLU-PRO": 0.2733
+      }
+    }
+  ]
+}

data/developers/AI-Sweden-Models.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "developer": "AI-Sweden-Models",
+  "models": [
+    {
+      "id": "AI-Sweden-Models/Llama-3-8B-instruct",
+      "name": "Llama-3-8B-instruct",
+      "developer": "AI-Sweden-Models",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2401,
+        "hfopenllm_v2/BBH": 0.4173,
+        "hfopenllm_v2/MATH Level 5": 0.0385,
+        "hfopenllm_v2/GPQA": 0.2659,
+        "hfopenllm_v2/MUSR": 0.4771,
+        "hfopenllm_v2/MMLU-PRO": 0.2597
+      }
+    },
+    {
+      "id": "AI-Sweden-Models/gpt-sw3-40b",
+      "name": "gpt-sw3-40b",
+      "developer": "AI-Sweden-Models",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.147,
+        "hfopenllm_v2/BBH": 0.3268,
+        "hfopenllm_v2/MATH Level 5": 0.0174,
+        "hfopenllm_v2/GPQA": 0.2349,
+        "hfopenllm_v2/MUSR": 0.3632,
+        "hfopenllm_v2/MMLU-PRO": 0.1276
+      }
+    }
+  ]
+}

data/developers/AI4free.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "developer": "AI4free",
+  "models": [
+    {
+      "id": "AI4free/Dhanishtha",
+      "name": "Dhanishtha",
+      "developer": "AI4free",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2451,
+        "hfopenllm_v2/BBH": 0.3404,
+        "hfopenllm_v2/MATH Level 5": 0.256,
+        "hfopenllm_v2/GPQA": 0.2525,
+        "hfopenllm_v2/MUSR": 0.3569,
+        "hfopenllm_v2/MMLU-PRO": 0.1643
+      }
+    },
+    {
+      "id": "AI4free/t2",
+      "name": "t2",
+      "developer": "AI4free",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3867,
+        "hfopenllm_v2/BBH": 0.291,
+        "hfopenllm_v2/MATH Level 5": 0.1896,
+        "hfopenllm_v2/GPQA": 0.2576,
+        "hfopenllm_v2/MUSR": 0.3846,
+        "hfopenllm_v2/MMLU-PRO": 0.1144
+      }
+    }
+  ]
+}

data/developers/AIDC-AI.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "developer": "AIDC-AI",
+  "models": [
+    {
+      "id": "AIDC-AI/Marco-o1",
+      "name": "Marco-o1",
+      "developer": "AIDC-AI",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.4771,
+        "hfopenllm_v2/BBH": 0.5364,
+        "hfopenllm_v2/MATH Level 5": 0.3746,
+        "hfopenllm_v2/GPQA": 0.2592,
+        "hfopenllm_v2/MUSR": 0.4138,
+        "hfopenllm_v2/MMLU-PRO": 0.4117
+      }
+    }
+  ]
+}

data/developers/Aashraf995.json ADDED Viewed

	@@ -0,0 +1,61 @@

+{
+  "developer": "Aashraf995",
+  "models": [
+    {
+      "id": "Aashraf995/Creative-7B-nerd",
+      "name": "Creative-7B-nerd",
+      "developer": "Aashraf995",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.4722,
+        "hfopenllm_v2/BBH": 0.5607,
+        "hfopenllm_v2/MATH Level 5": 0.3165,
+        "hfopenllm_v2/GPQA": 0.3263,
+        "hfopenllm_v2/MUSR": 0.4515,
+        "hfopenllm_v2/MMLU-PRO": 0.4492
+      }
+    },
+    {
+      "id": "Aashraf995/Gemma-Evo-10B",
+      "name": "Gemma-Evo-10B",
+      "developer": "Aashraf995",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.7332,
+        "hfopenllm_v2/BBH": 0.6044,
+        "hfopenllm_v2/MATH Level 5": 0.2228,
+        "hfopenllm_v2/GPQA": 0.354,
+        "hfopenllm_v2/MUSR": 0.4595,
+        "hfopenllm_v2/MMLU-PRO": 0.4275
+      }
+    },
+    {
+      "id": "Aashraf995/Qwen-Evo-7B",
+      "name": "Qwen-Evo-7B",
+      "developer": "Aashraf995",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.4757,
+        "hfopenllm_v2/BBH": 0.5709,
+        "hfopenllm_v2/MATH Level 5": 0.3142,
+        "hfopenllm_v2/GPQA": 0.3255,
+        "hfopenllm_v2/MUSR": 0.4541,
+        "hfopenllm_v2/MMLU-PRO": 0.4462
+      }
+    },
+    {
+      "id": "Aashraf995/QwenStock-14B",
+      "name": "QwenStock-14B",
+      "developer": "Aashraf995",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.5009,
+        "hfopenllm_v2/BBH": 0.655,
+        "hfopenllm_v2/MATH Level 5": 0.3573,
+        "hfopenllm_v2/GPQA": 0.3893,
+        "hfopenllm_v2/MUSR": 0.4793,
+        "hfopenllm_v2/MMLU-PRO": 0.5382
+      }
+    }
+  ]
+}

data/developers/AbacusResearch.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "developer": "AbacusResearch",
+  "models": [
+    {
+      "id": "AbacusResearch/Jallabi-34B",
+      "name": "Jallabi-34B",
+      "developer": "AbacusResearch",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3529,
+        "hfopenllm_v2/BBH": 0.6023,
+        "hfopenllm_v2/MATH Level 5": 0.0521,
+        "hfopenllm_v2/GPQA": 0.3389,
+        "hfopenllm_v2/MUSR": 0.4822,
+        "hfopenllm_v2/MMLU-PRO": 0.4682
+      }
+    }
+  ]
+}

data/developers/Ahdoot.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "developer": "Ahdoot",
+  "models": [
+    {
+      "id": "Ahdoot/StructuredThinker-v0.3-MoreStructure",
+      "name": "StructuredThinker-v0.3-MoreStructure",
+      "developer": "Ahdoot",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.4193,
+        "hfopenllm_v2/BBH": 0.4838,
+        "hfopenllm_v2/MATH Level 5": 0.2908,
+        "hfopenllm_v2/GPQA": 0.297,
+        "hfopenllm_v2/MUSR": 0.4158,
+        "hfopenllm_v2/MMLU-PRO": 0.361
+      }
+    },
+    {
+      "id": "Ahdoot/Test_StealthThinker",
+      "name": "Test_StealthThinker",
+      "developer": "Ahdoot",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.422,
+        "hfopenllm_v2/BBH": 0.4647,
+        "hfopenllm_v2/MATH Level 5": 0.179,
+        "hfopenllm_v2/GPQA": 0.2961,
+        "hfopenllm_v2/MUSR": 0.428,
+        "hfopenllm_v2/MMLU-PRO": 0.3597
+      }
+    }
+  ]
+}

data/developers/Ahjeong.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "developer": "Ahjeong",
+  "models": [
+    {
+      "id": "Ahjeong/MMPO_Gemma_7b",
+      "name": "Ahjeong/MMPO_Gemma_7b",
+      "developer": "Ahjeong",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "reward-bench/Score": 0.7587,
+        "reward-bench/Chat": 0.9693,
+        "reward-bench/Chat Hard": 0.614,
+        "reward-bench/Safety": 0.7135,
+        "reward-bench/Reasoning": 0.7756,
+        "reward-bench/Prior Sets (0.5 weight)": 0.6831
+      }
+    },
+    {
+      "id": "Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3",
+      "name": "Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3",
+      "developer": "Ahjeong",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "reward-bench/Score": 0.7652,
+        "reward-bench/Chat": 0.9721,
+        "reward-bench/Chat Hard": 0.6338,
+        "reward-bench/Safety": 0.7635,
+        "reward-bench/Reasoning": 0.7284,
+        "reward-bench/Prior Sets (0.5 weight)": 0.6913
+      }
+    }
+  ]
+}

data/developers/AicoresSecurity.json ADDED Viewed

	@@ -0,0 +1,61 @@

+{
+  "developer": "AicoresSecurity",
+  "models": [
+    {
+      "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V0",
+      "name": "Cybernet-Sec-3B-R1-V0",
+      "developer": "AicoresSecurity",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.6358,
+        "hfopenllm_v2/BBH": 0.4497,
+        "hfopenllm_v2/MATH Level 5": 0.1156,
+        "hfopenllm_v2/GPQA": 0.2634,
+        "hfopenllm_v2/MUSR": 0.3314,
+        "hfopenllm_v2/MMLU-PRO": 0.301
+      }
+    },
+    {
+      "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder",
+      "name": "Cybernet-Sec-3B-R1-V0-Coder",
+      "developer": "AicoresSecurity",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.7098,
+        "hfopenllm_v2/BBH": 0.4478,
+        "hfopenllm_v2/MATH Level 5": 0.1488,
+        "hfopenllm_v2/GPQA": 0.2718,
+        "hfopenllm_v2/MUSR": 0.3408,
+        "hfopenllm_v2/MMLU-PRO": 0.3178
+      }
+    },
+    {
+      "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V1",
+      "name": "Cybernet-Sec-3B-R1-V1",
+      "developer": "AicoresSecurity",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.6146,
+        "hfopenllm_v2/BBH": 0.4282,
+        "hfopenllm_v2/MATH Level 5": 0.1518,
+        "hfopenllm_v2/GPQA": 0.2609,
+        "hfopenllm_v2/MUSR": 0.3287,
+        "hfopenllm_v2/MMLU-PRO": 0.2876
+      }
+    },
+    {
+      "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V1.1",
+      "name": "Cybernet-Sec-3B-R1-V1.1",
+      "developer": "AicoresSecurity",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.673,
+        "hfopenllm_v2/BBH": 0.4392,
+        "hfopenllm_v2/MATH Level 5": 0.176,
+        "hfopenllm_v2/GPQA": 0.271,
+        "hfopenllm_v2/MUSR": 0.3541,
+        "hfopenllm_v2/MMLU-PRO": 0.3088
+      }
+    }
+  ]
+}

data/developers/Alepach.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "developer": "Alepach",
+  "models": [
+    {
+      "id": "Alepach/notHumpback-M0",
+      "name": "notHumpback-M0",
+      "developer": "Alepach",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.235,
+        "hfopenllm_v2/BBH": 0.2785,
+        "hfopenllm_v2/MATH Level 5": 0.0189,
+        "hfopenllm_v2/GPQA": 0.2492,
+        "hfopenllm_v2/MUSR": 0.3552,
+        "hfopenllm_v2/MMLU-PRO": 0.1119
+      }
+    },
+    {
+      "id": "Alepach/notHumpback-M1",
+      "name": "notHumpback-M1",
+      "developer": "Alepach",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2207,
+        "hfopenllm_v2/BBH": 0.2882,
+        "hfopenllm_v2/MATH Level 5": 0.0159,
+        "hfopenllm_v2/GPQA": 0.2374,
+        "hfopenllm_v2/MUSR": 0.342,
+        "hfopenllm_v2/MMLU-PRO": 0.1091
+      }
+    },
+    {
+      "id": "Alepach/notHumpback-M1-v2",
+      "name": "notHumpback-M1-v2",
+      "developer": "Alepach",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2277,
+        "hfopenllm_v2/BBH": 0.2776,
+        "hfopenllm_v2/MATH Level 5": 0.0219,
+        "hfopenllm_v2/GPQA": 0.2601,
+        "hfopenllm_v2/MUSR": 0.3473,
+        "hfopenllm_v2/MMLU-PRO": 0.1119
+      }
+    }
+  ]
+}

data/developers/AlephAlpha.json ADDED Viewed

	@@ -0,0 +1,59 @@

+{
+  "developer": "AlephAlpha",
+  "models": [
+    {
+      "id": "AlephAlpha/luminous-base",
+      "name": "Luminous Base 13B",
+      "developer": "AlephAlpha",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "helm_lite/Mean win rate": 0.041,
+        "helm_lite/NarrativeQA": 0.633,
+        "helm_lite/NaturalQuestions (closed-book)": 0.197,
+        "helm_lite/OpenbookQA": 0.286,
+        "helm_lite/MMLU": 0.243,
+        "helm_lite/MATH": 0.026,
+        "helm_lite/GSM8K": 0.028,
+        "helm_lite/LegalBench": 0.332,
+        "helm_lite/MedQA": 0.26,
+        "helm_lite/WMT 2014": 0.066
+      }
+    },
+    {
+      "id": "AlephAlpha/luminous-extended",
+      "name": "Luminous Extended 30B",
+      "developer": "AlephAlpha",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "helm_lite/Mean win rate": 0.078,
+        "helm_lite/NarrativeQA": 0.684,
+        "helm_lite/NaturalQuestions (closed-book)": 0.253,
+        "helm_lite/OpenbookQA": 0.272,
+        "helm_lite/MMLU": 0.248,
+        "helm_lite/MATH": 0.04,
+        "helm_lite/GSM8K": 0.075,
+        "helm_lite/LegalBench": 0.421,
+        "helm_lite/MedQA": 0.276,
+        "helm_lite/WMT 2014": 0.083
+      }
+    },
+    {
+      "id": "AlephAlpha/luminous-supreme",
+      "name": "Luminous Supreme 70B",
+      "developer": "AlephAlpha",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "helm_lite/Mean win rate": 0.145,
+        "helm_lite/NarrativeQA": 0.743,
+        "helm_lite/NaturalQuestions (closed-book)": 0.299,
+        "helm_lite/OpenbookQA": 0.284,
+        "helm_lite/MMLU": 0.316,
+        "helm_lite/MATH": 0.078,
+        "helm_lite/GSM8K": 0.137,
+        "helm_lite/LegalBench": 0.452,
+        "helm_lite/MedQA": 0.276,
+        "helm_lite/WMT 2014": 0.102
+      }
+    }
+  ]
+}

data/developers/Alibaba-NLP.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "developer": "Alibaba-NLP",
+  "models": [
+    {
+      "id": "Alibaba-NLP/gte-Qwen2-7B-instruct",
+      "name": "gte-Qwen2-7B-instruct",
+      "developer": "Alibaba-NLP",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2255,
+        "hfopenllm_v2/BBH": 0.4495,
+        "hfopenllm_v2/MATH Level 5": 0.0642,
+        "hfopenllm_v2/GPQA": 0.245,
+        "hfopenllm_v2/MUSR": 0.3559,
+        "hfopenllm_v2/MMLU-PRO": 0.3321
+      }
+    }
+  ]
+}

data/developers/Alibaba.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "developer": "Alibaba",
+  "models": [
+    {
+      "id": "alibaba/qwen-3-coder-480b",
+      "name": "Qwen 3 Coder 480B",
+      "developer": "Alibaba",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 23.9
+      }
+    },
+    {
+      "id": "alibaba/qwen3-235b-a22b-thinking-2507",
+      "name": "qwen3-235b-a22b-thinking-2507",
+      "developer": "Alibaba",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "livecodebenchpro/Hard Problems": 0.0,
+        "livecodebenchpro/Medium Problems": 0.1267605633802817,
+        "livecodebenchpro/Easy Problems": 0.7605633802816901
+      }
+    },
+    {
+      "id": "alibaba/qwen3-30b-a3b",
+      "name": "qwen3-30b-a3b",
+      "developer": "Alibaba",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "livecodebenchpro/Hard Problems": 0.0,
+        "livecodebenchpro/Medium Problems": 0.028169014084507043,
+        "livecodebenchpro/Easy Problems": 0.5774647887323944
+      }
+    },
+    {
+      "id": "alibaba/qwen3-max",
+      "name": "alibaba/qwen3-max",
+      "developer": "Alibaba",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "livecodebenchpro/Hard Problems": 0.0,
+        "livecodebenchpro/Medium Problems": 0.04225352112676056,
+        "livecodebenchpro/Easy Problems": 0.36619718309859156
+      }
+    },
+    {
+      "id": "alibaba/qwen3-next-80b-a3b-thinking",
+      "name": "qwen3-next-80b-a3b-thinking",
+      "developer": "Alibaba",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "livecodebenchpro/Hard Problems": 0.0,
+        "livecodebenchpro/Medium Problems": 0.14084507042253522,
+        "livecodebenchpro/Easy Problems": 0.7464788732394366
+      }
+    }
+  ]
+}

data/developers/Alsebay.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "developer": "Alsebay",
+  "models": [
+    {
+      "id": "Alsebay/Qwen2.5-7B-test-novelist",
+      "name": "Qwen2.5-7B-test-novelist",
+      "developer": "Alsebay",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.5352,
+        "hfopenllm_v2/BBH": 0.5151,
+        "hfopenllm_v2/MATH Level 5": 0.2349,
+        "hfopenllm_v2/GPQA": 0.2911,
+        "hfopenllm_v2/MUSR": 0.4749,
+        "hfopenllm_v2/MMLU-PRO": 0.3866
+      }
+    }
+  ]
+}

data/developers/Amaorynho.json ADDED Viewed

	@@ -0,0 +1,61 @@

+{
+  "developer": "Amaorynho",
+  "models": [
+    {
+      "id": "Amaorynho/BBAI2006",
+      "name": "BBAI2006",
+      "developer": "Amaorynho",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.1467,
+        "hfopenllm_v2/BBH": 0.2704,
+        "hfopenllm_v2/MATH Level 5": 0.0,
+        "hfopenllm_v2/GPQA": 0.2525,
+        "hfopenllm_v2/MUSR": 0.3605,
+        "hfopenllm_v2/MMLU-PRO": 0.1123
+      }
+    },
+    {
+      "id": "Amaorynho/BBAI270V4",
+      "name": "BBAI270V4",
+      "developer": "Amaorynho",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.199,
+        "hfopenllm_v2/BBH": 0.3071,
+        "hfopenllm_v2/MATH Level 5": 0.0083,
+        "hfopenllm_v2/GPQA": 0.2458,
+        "hfopenllm_v2/MUSR": 0.3314,
+        "hfopenllm_v2/MMLU-PRO": 0.1114
+      }
+    },
+    {
+      "id": "Amaorynho/BBAIIFEV1",
+      "name": "BBAIIFEV1",
+      "developer": "Amaorynho",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.8047,
+        "hfopenllm_v2/BBH": 0.5292,
+        "hfopenllm_v2/MATH Level 5": 0.1934,
+        "hfopenllm_v2/GPQA": 0.3104,
+        "hfopenllm_v2/MUSR": 0.4185,
+        "hfopenllm_v2/MMLU-PRO": 0.3857
+      }
+    },
+    {
+      "id": "Amaorynho/BBAI_375",
+      "name": "BBAI_375",
+      "developer": "Amaorynho",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.1467,
+        "hfopenllm_v2/BBH": 0.2704,
+        "hfopenllm_v2/MATH Level 5": 0.0,
+        "hfopenllm_v2/GPQA": 0.2525,
+        "hfopenllm_v2/MUSR": 0.3605,
+        "hfopenllm_v2/MMLU-PRO": 0.1123
+      }
+    }
+  ]
+}

data/developers/Amu.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "developer": "Amu",
+  "models": [
+    {
+      "id": "Amu/t1-1.5B",
+      "name": "t1-1.5B",
+      "developer": "Amu",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3394,
+        "hfopenllm_v2/BBH": 0.4008,
+        "hfopenllm_v2/MATH Level 5": 0.0514,
+        "hfopenllm_v2/GPQA": 0.2433,
+        "hfopenllm_v2/MUSR": 0.3517,
+        "hfopenllm_v2/MMLU-PRO": 0.2566
+      }
+    },
+    {
+      "id": "Amu/t1-3B",
+      "name": "t1-3B",
+      "developer": "Amu",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3328,
+        "hfopenllm_v2/BBH": 0.3999,
+        "hfopenllm_v2/MATH Level 5": 0.1375,
+        "hfopenllm_v2/GPQA": 0.2408,
+        "hfopenllm_v2/MUSR": 0.3435,
+        "hfopenllm_v2/MMLU-PRO": 0.1284
+      }
+    }
+  ]
+}

data/developers/Anthropic.json ADDED Viewed

	@@ -0,0 +1,129 @@

+{
+  "developer": "Anthropic",
+  "models": [
+    {
+      "id": "Anthropic/claude-3-5-sonnet-20240620",
+      "name": "Anthropic/claude-3-5-sonnet-20240620",
+      "developer": "Anthropic",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "reward-bench/Score": 0.8417,
+        "reward-bench/Chat": 0.9637,
+        "reward-bench/Chat Hard": 0.7401,
+        "reward-bench/Safety": 0.8162,
+        "reward-bench/Reasoning": 0.8469
+      }
+    },
+    {
+      "id": "Anthropic/claude-3-haiku-20240307",
+      "name": "Anthropic/claude-3-haiku-20240307",
+      "developer": "Anthropic",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "reward-bench/Score": 0.7289,
+        "reward-bench/Chat": 0.9274,
+        "reward-bench/Chat Hard": 0.5197,
+        "reward-bench/Safety": 0.7953,
+        "reward-bench/Reasoning": 0.706,
+        "reward-bench/Prior Sets (0.5 weight)": 0.6635
+      }
+    },
+    {
+      "id": "Anthropic/claude-3-opus-20240229",
+      "name": "Anthropic/claude-3-opus-20240229",
+      "developer": "Anthropic",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "reward-bench/Score": 0.8008,
+        "reward-bench/Chat": 0.9469,
+        "reward-bench/Chat Hard": 0.6031,
+        "reward-bench/Safety": 0.8662,
+        "reward-bench/Reasoning": 0.7868
+      }
+    },
+    {
+      "id": "Anthropic/claude-3-sonnet-20240229",
+      "name": "Anthropic/claude-3-sonnet-20240229",
+      "developer": "Anthropic",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "reward-bench/Score": 0.7458,
+        "reward-bench/Chat": 0.9344,
+        "reward-bench/Chat Hard": 0.5658,
+        "reward-bench/Safety": 0.8169,
+        "reward-bench/Reasoning": 0.6907,
+        "reward-bench/Prior Sets (0.5 weight)": 0.6963
+      }
+    },
+    {
+      "id": "anthropic/claude-3.7-sonnet",
+      "name": "anthropic/claude-3.7-sonnet",
+      "developer": "Anthropic",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "livecodebenchpro/Hard Problems": 0.0,
+        "livecodebenchpro/Medium Problems": 0.014084507042253521,
+        "livecodebenchpro/Easy Problems": 0.15492957746478872
+      }
+    },
+    {
+      "id": "anthropic/claude-haiku-4.5",
+      "name": "Claude Haiku 4.5",
+      "developer": "Anthropic",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 35.5
+      }
+    },
+    {
+      "id": "anthropic/claude-opus-4-5",
+      "name": "claude-opus-4-5",
+      "developer": "Anthropic",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "appworld_test_normal/appworld/test_normal": 0.66,
+        "browsecompplus/browsecompplus": 0.49,
+        "swe-bench/swe-bench": 0.65,
+        "tau-bench-2_airline/tau-bench-2/airline": 0.66,
+        "tau-bench-2_retail/tau-bench-2/retail": 0.85,
+        "tau-bench-2_telecom/tau-bench-2/telecom": 0.58
+      }
+    },
+    {
+      "id": "anthropic/claude-opus-4.1",
+      "name": "Claude Opus 4.1",
+      "developer": "Anthropic",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 38.0
+      }
+    },
+    {
+      "id": "anthropic/claude-opus-4.5",
+      "name": "Claude Opus 4.5",
+      "developer": "Anthropic",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 54.3
+      }
+    },
+    {
+      "id": "anthropic/claude-opus-4.6",
+      "name": "Claude Opus 4.6",
+      "developer": "Anthropic",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 69.9
+      }
+    },
+    {
+      "id": "anthropic/claude-sonnet-4.5",
+      "name": "Claude Sonnet 4.5",
+      "developer": "Anthropic",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 42.6
+      }
+    }
+  ]
+}

data/developers/ArliAI.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "developer": "ArliAI",
+  "models": [
+    {
+      "id": "ArliAI/ArliAI-RPMax-12B-v1.1",
+      "name": "ArliAI-RPMax-12B-v1.1",
+      "developer": "ArliAI",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.5349,
+        "hfopenllm_v2/BBH": 0.4752,
+        "hfopenllm_v2/MATH Level 5": 0.1125,
+        "hfopenllm_v2/GPQA": 0.2819,
+        "hfopenllm_v2/MUSR": 0.3618,
+        "hfopenllm_v2/MMLU-PRO": 0.3384
+      }
+    },
+    {
+      "id": "ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1",
+      "name": "Llama-3.1-8B-ArliAI-RPMax-v1.1",
+      "developer": "ArliAI",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.6359,
+        "hfopenllm_v2/BBH": 0.5016,
+        "hfopenllm_v2/MATH Level 5": 0.1314,
+        "hfopenllm_v2/GPQA": 0.2836,
+        "hfopenllm_v2/MUSR": 0.3577,
+        "hfopenllm_v2/MMLU-PRO": 0.3551
+      }
+    }
+  ]
+}

data/developers/Arthur-LAGACHERIE.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "developer": "Arthur-LAGACHERIE",
+  "models": [
+    {
+      "id": "Arthur-LAGACHERIE/Precis-1B-Instruct",
+      "name": "Precis-1B-Instruct",
+      "developer": "Arthur-LAGACHERIE",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3671,
+        "hfopenllm_v2/BBH": 0.3224,
+        "hfopenllm_v2/MATH Level 5": 0.0038,
+        "hfopenllm_v2/GPQA": 0.2659,
+        "hfopenllm_v2/MUSR": 0.3436,
+        "hfopenllm_v2/MMLU-PRO": 0.1426
+      }
+    }
+  ]
+}

data/developers/Artples.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "developer": "Artples",
+  "models": [
+    {
+      "id": "Artples/L-MChat-7b",
+      "name": "L-MChat-7b",
+      "developer": "Artples",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.5297,
+        "hfopenllm_v2/BBH": 0.46,
+        "hfopenllm_v2/MATH Level 5": 0.0921,
+        "hfopenllm_v2/GPQA": 0.3054,
+        "hfopenllm_v2/MUSR": 0.4029,
+        "hfopenllm_v2/MMLU-PRO": 0.3299
+      }
+    },
+    {
+      "id": "Artples/L-MChat-Small",
+      "name": "L-MChat-Small",
+      "developer": "Artples",
+      "evaluator_relationship": null,
+      "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3287,
+        "hfopenllm_v2/BBH": 0.4823,
+        "hfopenllm_v2/MATH Level 5": 0.0378,
+        "hfopenllm_v2/GPQA": 0.2676,
+        "hfopenllm_v2/MUSR": 0.3696,
+        "hfopenllm_v2/MMLU-PRO": 0.2464
+      }
+    }
+  ]
+}