Spaces:

evaleval
/

general-eval-card

Running on CPU Spr

App Files Files Community

GitHub Actions commited on Apr 2

Commit

49596d9

1 Parent(s): d8be99e

chore: sync EEE pipeline output [2026-04-02 05:07 UTC]

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

data/benchmarks.json +5 -1
data/benchmarks/appworld_test_normal.json +2 -2
data/benchmarks/browsecompplus.json +2 -2
data/benchmarks/hfopenllm_v2.json +205 -218
data/benchmarks/livecodebenchpro.json +3 -3
data/benchmarks/reward-bench.json +174 -174
data/benchmarks/swe-bench.json +2 -2
data/benchmarks/tau-bench-2_airline.json +2 -2
data/benchmarks/tau-bench-2_retail.json +1 -1
data/benchmarks/tau-bench-2_telecom.json +2 -2
data/benchmarks/terminal-bench-2.0.json +20 -20
data/benchmarks/theory_of_mind.json +12 -0
data/developers.json +1 -1
data/developers/adriszmar.json +6 -6
data/developers/ai2.json +3 -3
data/developers/akjindal53244.json +5 -5
data/developers/allenai.json +33 -33
data/developers/anthropic.json +10 -10
data/developers/cognitivecomputations.json +6 -6
data/developers/columbia-nlp.json +6 -6
data/developers/cpayne1303.json +5 -5
data/developers/daemontatox.json +6 -6
data/developers/deepmount00.json +6 -6
data/developers/dfurman.json +6 -6
data/developers/doppelreflex.json +6 -6
data/developers/google.json +20 -20
data/developers/huggingfacetb.json +6 -6
data/developers/infly.json +6 -6
data/developers/internlm.json +6 -6
data/developers/jaspionjader.json +5 -5
data/developers/leroydyer.json +6 -6
data/developers/llmat.json +6 -6
data/developers/lxzgordon.json +6 -6
data/developers/meta.json +22 -22
data/developers/minimax.json +1 -1
data/developers/mistralai.json +16 -16
data/developers/mlabonne.json +6 -6
data/developers/moonshot_ai.json +1 -1
data/developers/multiple.json +1 -1
data/developers/nazimali.json +6 -6
data/developers/nicolinho.json +12 -12
data/developers/nisten.json +6 -6
data/developers/nousresearch.json +0 -14
data/developers/omkar1102.json +5 -5
data/developers/openai.json +53 -53
data/developers/openassistant.json +14 -14
data/developers/openbmb.json +7 -7
data/developers/pku-alignment.json +21 -21
data/developers/primeintellect.json +4 -4
data/developers/princeton-nlp.json +6 -6

data/benchmarks.json CHANGED Viewed

@@ -45,7 +45,7 @@
   },
   {
     "benchmark": "hfopenllm_v2",
-    "model_count": 4494
   },
   {
     "benchmark": "la_leaderboard",
@@ -78,5 +78,9 @@
   {
     "benchmark": "terminal-bench-2.0",
     "model_count": 37
   }
 ]

   },
   {
     "benchmark": "hfopenllm_v2",
+    "model_count": 4493
   },
   {
     "benchmark": "la_leaderboard",
   {
     "benchmark": "terminal-bench-2.0",
     "model_count": 37
+  },
+  {
+    "benchmark": "theory_of_mind",
+    "model_count": 1
   }
 ]

data/benchmarks/appworld_test_normal.json CHANGED Viewed

@@ -5,7 +5,7 @@
       "name": "claude-opus-4-5",
       "developer": "Anthropic",
       "scores": {
-        "appworld/test_normal": 0.68
       }
     },
     {
@@ -13,7 +13,7 @@
       "name": "gemini-3-pro-preview",
       "developer": "Google",
       "scores": {
-        "appworld/test_normal": 0.13
       }
     },
     {

       "name": "claude-opus-4-5",
       "developer": "Anthropic",
       "scores": {
+        "appworld/test_normal": 0.7
       }
     },
     {
       "name": "gemini-3-pro-preview",
       "developer": "Google",
       "scores": {
+        "appworld/test_normal": 0.55
       }
     },
     {

data/benchmarks/browsecompplus.json CHANGED Viewed

@@ -13,7 +13,7 @@
       "name": "gemini-3-pro-preview",
       "developer": "Google",
       "scores": {
-        "browsecompplus": 0.48
       }
     },
     {
@@ -21,7 +21,7 @@
       "name": "gpt-5.2-2025-12-11",
       "developer": "OpenAI",
       "scores": {
-        "browsecompplus": 0.48
       }
     }
   ]

       "name": "gemini-3-pro-preview",
       "developer": "Google",
       "scores": {
+        "browsecompplus": 0.3333
       }
     },
     {
       "name": "gpt-5.2-2025-12-11",
       "developer": "OpenAI",
       "scores": {
+        "browsecompplus": 0.43
       }
     }
   ]

data/benchmarks/hfopenllm_v2.json CHANGED Viewed

@@ -2176,12 +2176,12 @@
       "name": "LION-Gemma-2b-dpo-v1.0",
       "developer": "Columbia-NLP",
       "scores": {
-        "IFEval": 0.3278,
-        "BBH": 0.392,
-        "MATH Level 5": 0.0431,
-        "GPQA": 0.2492,
-        "MUSR": 0.412,
-        "MMLU-PRO": 0.1666
       }
     },
     {
@@ -3229,12 +3229,12 @@
       "name": "PathfinderAI",
       "developer": "Daemontatox",
       "scores": {
-        "IFEval": 0.3745,
-        "BBH": 0.6668,
-        "MATH Level 5": 0.4758,
-        "GPQA": 0.3943,
-        "MUSR": 0.4858,
-        "MMLU-PRO": 0.5593
       }
     },
     {
@@ -4321,12 +4321,12 @@
       "name": "Llama-3.1-8b-ITA",
       "developer": "DeepMount00",
       "scores": {
-        "IFEval": 0.7917,
-        "BBH": 0.5109,
-        "MATH Level 5": 0.1088,
-        "GPQA": 0.2878,
-        "MUSR": 0.4136,
-        "MMLU-PRO": 0.3876
       }
     },
     {
@@ -4646,12 +4646,12 @@
       "name": "MN-12B-LilithFrame",
       "developer": "DoppelReflEx",
       "scores": {
-        "IFEval": 0.451,
-        "BBH": 0.4944,
-        "MATH Level 5": 0.1156,
-        "GPQA": 0.3196,
-        "MUSR": 0.3896,
-        "MMLU-PRO": 0.3256
       }
     },
     {
@@ -9144,12 +9144,12 @@
       "name": "SmolLM2-135M-Instruct",
       "developer": "HuggingFaceTB",
       "scores": {
-        "IFEval": 0.0593,
-        "BBH": 0.3135,
-        "MATH Level 5": 0.0144,
-        "GPQA": 0.2341,
-        "MUSR": 0.3871,
-        "MMLU-PRO": 0.1092
       }
     },
     {
@@ -13057,12 +13057,12 @@
       "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_XA",
       "developer": "LeroyDyer",
       "scores": {
-        "IFEval": 0.3579,
-        "BBH": 0.4477,
-        "MATH Level 5": 0.0423,
-        "GPQA": 0.3096,
-        "MUSR": 0.4134,
-        "MMLU-PRO": 0.2376
       }
     },
     {
@@ -16874,19 +16874,6 @@
         "MMLU-PRO": 0.232
       }
     },
-    {
-      "model_id": "NousResearch/Yarn-Llama-2-7b-128k",
-      "name": "Yarn-Llama-2-7b-128k",
-      "developer": "NousResearch",
-      "scores": {
-        "IFEval": 0.1485,
-        "BBH": 0.3248,
-        "MATH Level 5": 0.0151,
-        "GPQA": 0.2601,
-        "MUSR": 0.3967,
-        "MMLU-PRO": 0.1791
-      }
-    },
     {
       "model_id": "NousResearch/Yarn-Llama-2-7b-64k",
       "name": "Yarn-Llama-2-7b-64k",
@@ -17204,12 +17191,12 @@
       "name": "code-yi",
       "developer": "Omkar1102",
       "scores": {
-        "IFEval": 0.2254,
-        "BBH": 0.275,
         "MATH Level 5": 0.0,
-        "GPQA": 0.2576,
-        "MUSR": 0.3762,
-        "MMLU-PRO": 0.1123
       }
     },
     {
@@ -18141,11 +18128,11 @@
       "developer": "PrimeIntellect",
       "scores": {
         "IFEval": 0.1757,
-        "BBH": 0.276,
         "MATH Level 5": 0.0,
-        "GPQA": 0.2534,
-        "MUSR": 0.3339,
-        "MMLU-PRO": 0.1123
       }
     },
     {
@@ -18712,12 +18699,12 @@
       "name": "ODB-14B-sce",
       "developer": "Quazim0t0",
       "scores": {
-        "IFEval": 0.7016,
-        "BBH": 0.6942,
-        "MATH Level 5": 0.4116,
-        "GPQA": 0.3624,
-        "MUSR": 0.4571,
-        "MMLU-PRO": 0.5411
       }
     },
     {
@@ -19466,12 +19453,12 @@
       "name": "Qwen2.5-0.5B-Instruct",
       "developer": "Qwen",
       "scores": {
-        "IFEval": 0.3153,
-        "BBH": 0.3322,
-        "MATH Level 5": 0.1035,
-        "GPQA": 0.2592,
-        "MUSR": 0.3342,
-        "MMLU-PRO": 0.172
       }
     },
     {
@@ -19726,12 +19713,12 @@
       "name": "Qwen2.5-Coder-7B-Instruct",
       "developer": "Qwen",
       "scores": {
-        "IFEval": 0.6147,
-        "BBH": 0.4999,
-        "MATH Level 5": 0.031,
-        "GPQA": 0.2936,
-        "MUSR": 0.4099,
-        "MMLU-PRO": 0.3354
       }
     },
     {
@@ -19986,12 +19973,12 @@
       "name": "Replete-LLM-Qwen2-7b",
       "developer": "Replete-AI",
       "scores": {
-        "IFEval": 0.0932,
-        "BBH": 0.2977,
         "MATH Level 5": 0.0,
-        "GPQA": 0.2475,
-        "MUSR": 0.3941,
-        "MMLU-PRO": 0.1157
       }
     },
     {
@@ -24653,12 +24640,12 @@
       "name": "Llama-3-Instruct-8B-SPPO-Iter3",
       "developer": "UCLA-AGI",
       "scores": {
-        "IFEval": 0.6834,
-        "BBH": 0.508,
-        "MATH Level 5": 0.0959,
         "GPQA": 0.2651,
-        "MUSR": 0.3661,
-        "MMLU-PRO": 0.3644
       }
     },
     {
@@ -25004,12 +24991,12 @@
       "name": "llama-3-Korean-8B",
       "developer": "VIRNECT",
       "scores": {
-        "IFEval": 0.5021,
-        "BBH": 0.4918,
-        "MATH Level 5": 0.108,
         "GPQA": 0.271,
-        "MUSR": 0.3648,
-        "MMLU-PRO": 0.3536
       }
     },
     {
@@ -25108,12 +25095,12 @@
       "name": "Llama3.1-8B-Fireplace2",
       "developer": "ValiantLabs",
       "scores": {
-        "IFEval": 0.5328,
-        "BBH": 0.4613,
-        "MATH Level 5": 0.0876,
-        "GPQA": 0.2894,
-        "MUSR": 0.3367,
-        "MMLU-PRO": 0.2424
       }
     },
     {
@@ -25121,12 +25108,12 @@
       "name": "Llama3.1-8B-ShiningValiant2",
       "developer": "ValiantLabs",
       "scores": {
-        "IFEval": 0.6496,
-        "BBH": 0.4774,
-        "MATH Level 5": 0.0566,
-        "GPQA": 0.3104,
-        "MUSR": 0.3909,
-        "MMLU-PRO": 0.3382
       }
     },
     {
@@ -25654,12 +25641,12 @@
       "name": "Qwen2.5-14B-YOYO-1010",
       "developer": "YOYO-AI",
       "scores": {
-        "IFEval": 0.5899,
-        "BBH": 0.654,
-        "MATH Level 5": 0.4509,
-        "GPQA": 0.3834,
-        "MUSR": 0.4744,
-        "MMLU-PRO": 0.5376
       }
     },
     {
@@ -26603,12 +26590,12 @@
       "name": "QAIMath-Qwen2.5-7B-TIES",
       "developer": "adriszmar",
       "scores": {
-        "IFEval": 0.1685,
-        "BBH": 0.3124,
-        "MATH Level 5": 0.0015,
-        "GPQA": 0.2492,
-        "MUSR": 0.3963,
-        "MMLU-PRO": 0.1066
       }
     },
     {
@@ -26889,12 +26876,12 @@
       "name": "Llama-3.1-Storm-8B",
       "developer": "akjindal53244",
       "scores": {
-        "IFEval": 0.8051,
-        "BBH": 0.5189,
-        "MATH Level 5": 0.1722,
-        "GPQA": 0.3263,
         "MUSR": 0.4028,
-        "MMLU-PRO": 0.3803
       }
     },
     {
@@ -26915,12 +26902,12 @@
       "name": "Llama-3.1-Tulu-3-70B",
       "developer": "allenai",
       "scores": {
-        "IFEval": 0.8379,
-        "BBH": 0.6157,
-        "MATH Level 5": 0.3829,
         "GPQA": 0.3733,
-        "MUSR": 0.4988,
-        "MMLU-PRO": 0.4656
       }
     },
     {
@@ -31647,12 +31634,12 @@
       "name": "dolphin-2.9.2-Phi-3-Medium-abliterated",
       "developer": "cognitivecomputations",
       "scores": {
-        "IFEval": 0.4124,
-        "BBH": 0.6383,
-        "MATH Level 5": 0.182,
-        "GPQA": 0.3289,
-        "MUSR": 0.4349,
-        "MMLU-PRO": 0.4525
       }
     },
     {
@@ -31790,12 +31777,12 @@
       "name": "llama-43m-beta",
       "developer": "cpayne1303",
       "scores": {
-        "IFEval": 0.1916,
-        "BBH": 0.2977,
-        "MATH Level 5": 0.0,
         "GPQA": 0.2685,
-        "MUSR": 0.3872,
-        "MMLU-PRO": 0.1132
       }
     },
     {
@@ -32167,12 +32154,12 @@
       "name": "Llama-3-8B-Orpo-v0.1",
       "developer": "dfurman",
       "scores": {
-        "IFEval": 0.2835,
-        "BBH": 0.3842,
-        "MATH Level 5": 0.0521,
-        "GPQA": 0.2609,
-        "MUSR": 0.3566,
-        "MMLU-PRO": 0.2298
       }
     },
     {
@@ -34663,12 +34650,12 @@
       "name": "gemma-2-2b",
       "developer": "Google",
       "scores": {
-        "IFEval": 0.2018,
-        "BBH": 0.3709,
-        "MATH Level 5": 0.0302,
         "GPQA": 0.2626,
-        "MUSR": 0.4219,
-        "MMLU-PRO": 0.2217
       }
     },
     {
@@ -34689,12 +34676,12 @@
       "name": "gemma-2-2b-jpn-it",
       "developer": "Google",
       "scores": {
-        "IFEval": 0.5078,
-        "BBH": 0.4226,
-        "MATH Level 5": 0.0347,
-        "GPQA": 0.2852,
-        "MUSR": 0.3964,
-        "MMLU-PRO": 0.2578
       }
     },
     {
@@ -37705,12 +37692,12 @@
       "name": "Kosmos-EVAA-Fusion-8B",
       "developer": "jaspionjader",
       "scores": {
-        "IFEval": 0.4345,
-        "BBH": 0.5419,
-        "MATH Level 5": 0.1292,
-        "GPQA": 0.3087,
         "MUSR": 0.4277,
-        "MMLU-PRO": 0.3854
       }
     },
     {
@@ -42359,12 +42346,12 @@
       "name": "Mistral-v0.3-7B-ORPO",
       "developer": "llmat",
       "scores": {
-        "IFEval": 0.364,
-        "BBH": 0.4005,
-        "MATH Level 5": 0.0015,
-        "GPQA": 0.2693,
-        "MUSR": 0.3529,
-        "MMLU-PRO": 0.2301
       }
     },
     {
@@ -44478,12 +44465,12 @@
       "name": "Mixtral-8x7B-v0.1",
       "developer": "mistralai",
       "scores": {
-        "IFEval": 0.2326,
-        "BBH": 0.5098,
-        "MATH Level 5": 0.0937,
-        "GPQA": 0.3205,
-        "MUSR": 0.4413,
-        "MMLU-PRO": 0.3871
       }
     },
     {
@@ -44738,12 +44725,12 @@
       "name": "NeuralDaredevil-8B-abliterated",
       "developer": "mlabonne",
       "scores": {
-        "IFEval": 0.4162,
-        "BBH": 0.5124,
-        "MATH Level 5": 0.0853,
-        "GPQA": 0.3029,
-        "MUSR": 0.415,
-        "MMLU-PRO": 0.3802
       }
     },
     {
@@ -45076,12 +45063,12 @@
       "name": "Mistral-Nemo-Kurdish-Instruct",
       "developer": "nazimali",
       "scores": {
-        "IFEval": 0.4964,
-        "BBH": 0.4699,
-        "MATH Level 5": 0.0045,
-        "GPQA": 0.2827,
-        "MUSR": 0.3979,
-        "MMLU-PRO": 0.3063
       }
     },
     {
@@ -46779,12 +46766,12 @@
       "name": "franqwenstein-35b",
       "developer": "nisten",
       "scores": {
-        "IFEval": 0.3914,
-        "BBH": 0.6591,
-        "MATH Level 5": 0.3044,
-        "GPQA": 0.3591,
-        "MUSR": 0.4681,
-        "MMLU-PRO": 0.5611
       }
     },
     {
@@ -48729,12 +48716,12 @@
       "name": "Llama-3-8B-ProLong-512k-Instruct",
       "developer": "princeton-nlp",
       "scores": {
-        "IFEval": 0.5508,
-        "BBH": 0.5028,
-        "MATH Level 5": 0.0529,
-        "GPQA": 0.2861,
-        "MUSR": 0.4266,
-        "MMLU-PRO": 0.3231
       }
     },
     {
@@ -51303,12 +51290,12 @@
       "name": "Gemma-2-Ataraxy-Gemmasutra-9B-slerp",
       "developer": "recoilme",
       "scores": {
-        "IFEval": 0.7649,
-        "BBH": 0.5974,
-        "MATH Level 5": 0.0174,
-        "GPQA": 0.3305,
-        "MUSR": 0.4245,
-        "MMLU-PRO": 0.4207
       }
     },
     {
@@ -51329,12 +51316,12 @@
       "name": "recoilme-gemma-2-9B-v0.2",
       "developer": "recoilme",
       "scores": {
-        "IFEval": 0.2747,
-        "BBH": 0.6031,
-        "MATH Level 5": 0.0831,
-        "GPQA": 0.3305,
-        "MUSR": 0.4686,
-        "MMLU-PRO": 0.4122
       }
     },
     {
@@ -51342,12 +51329,12 @@
       "name": "recoilme-gemma-2-9B-v0.3",
       "developer": "recoilme",
       "scores": {
-        "IFEval": 0.7439,
-        "BBH": 0.5993,
-        "MATH Level 5": 0.0876,
-        "GPQA": 0.3238,
-        "MUSR": 0.4204,
-        "MMLU-PRO": 0.4072
       }
     },
     {
@@ -56997,12 +56984,12 @@
       "name": "BagelMIsteryTour-v2-8x7B",
       "developer": "ycros",
       "scores": {
-        "IFEval": 0.6262,
-        "BBH": 0.5142,
-        "MATH Level 5": 0.0937,
-        "GPQA": 0.3079,
-        "MUSR": 0.4138,
-        "MMLU-PRO": 0.3481
       }
     },
     {

       "name": "LION-Gemma-2b-dpo-v1.0",
       "developer": "Columbia-NLP",
       "scores": {
+        "IFEval": 0.3102,
+        "BBH": 0.3881,
+        "MATH Level 5": 0.0536,
+        "GPQA": 0.2534,
+        "MUSR": 0.4081,
+        "MMLU-PRO": 0.1665
       }
     },
     {
       "name": "PathfinderAI",
       "developer": "Daemontatox",
       "scores": {
+        "IFEval": 0.4855,
+        "BBH": 0.6627,
+        "MATH Level 5": 0.4841,
+        "GPQA": 0.3096,
+        "MUSR": 0.4256,
+        "MMLU-PRO": 0.5542
       }
     },
     {
       "name": "Llama-3.1-8b-ITA",
       "developer": "DeepMount00",
       "scores": {
+        "IFEval": 0.5365,
+        "BBH": 0.517,
+        "MATH Level 5": 0.1707,
+        "GPQA": 0.3062,
+        "MUSR": 0.4487,
+        "MMLU-PRO": 0.396
       }
     },
     {
       "name": "MN-12B-LilithFrame",
       "developer": "DoppelReflEx",
       "scores": {
+        "IFEval": 0.436,
+        "BBH": 0.4956,
+        "MATH Level 5": 0.0589,
+        "GPQA": 0.3205,
+        "MUSR": 0.3843,
+        "MMLU-PRO": 0.3237
       }
     },
     {
       "name": "SmolLM2-135M-Instruct",
       "developer": "HuggingFaceTB",
       "scores": {
+        "IFEval": 0.2883,
+        "BBH": 0.3124,
+        "MATH Level 5": 0.003,
+        "GPQA": 0.2357,
+        "MUSR": 0.3662,
+        "MMLU-PRO": 0.1115
       }
     },
     {
       "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_XA",
       "developer": "LeroyDyer",
       "scores": {
+        "IFEval": 0.3798,
+        "BBH": 0.4483,
+        "MATH Level 5": 0.04,
+        "GPQA": 0.3129,
+        "MUSR": 0.4148,
+        "MMLU-PRO": 0.2389
       }
     },
     {
         "MMLU-PRO": 0.232
       }
     },
     {
       "model_id": "NousResearch/Yarn-Llama-2-7b-64k",
       "name": "Yarn-Llama-2-7b-64k",
       "name": "code-yi",
       "developer": "Omkar1102",
       "scores": {
+        "IFEval": 0.2148,
+        "BBH": 0.276,
         "MATH Level 5": 0.0,
+        "GPQA": 0.2508,
+        "MUSR": 0.3802,
+        "MMLU-PRO": 0.1126
       }
     },
     {
       "developer": "PrimeIntellect",
       "scores": {
         "IFEval": 0.1757,
+        "BBH": 0.274,
         "MATH Level 5": 0.0,
+        "GPQA": 0.25,
+        "MUSR": 0.3753,
+        "MMLU-PRO": 0.112
       }
     },
     {
       "name": "ODB-14B-sce",
       "developer": "Quazim0t0",
       "scores": {
+        "IFEval": 0.2922,
+        "BBH": 0.6559,
+        "MATH Level 5": 0.2545,
+        "GPQA": 0.2659,
+        "MUSR": 0.3929,
+        "MMLU-PRO": 0.5207
       }
     },
     {
       "name": "Qwen2.5-0.5B-Instruct",
       "developer": "Qwen",
       "scores": {
+        "IFEval": 0.3071,
+        "BBH": 0.3341,
+        "MATH Level 5": 0.0,
+        "GPQA": 0.2576,
+        "MUSR": 0.3329,
+        "MMLU-PRO": 0.1697
       }
     },
     {
       "name": "Qwen2.5-Coder-7B-Instruct",
       "developer": "Qwen",
       "scores": {
+        "IFEval": 0.6101,
+        "BBH": 0.5008,
+        "MATH Level 5": 0.3716,
+        "GPQA": 0.2919,
+        "MUSR": 0.4073,
+        "MMLU-PRO": 0.3352
       }
     },
     {
       "name": "Replete-LLM-Qwen2-7b",
       "developer": "Replete-AI",
       "scores": {
+        "IFEval": 0.0905,
+        "BBH": 0.2985,
         "MATH Level 5": 0.0,
+        "GPQA": 0.2534,
+        "MUSR": 0.3848,
+        "MMLU-PRO": 0.1158
       }
     },
     {
       "name": "Llama-3-Instruct-8B-SPPO-Iter3",
       "developer": "UCLA-AGI",
       "scores": {
+        "IFEval": 0.6703,
+        "BBH": 0.5076,
+        "MATH Level 5": 0.0718,
         "GPQA": 0.2651,
+        "MUSR": 0.3647,
+        "MMLU-PRO": 0.3658
       }
     },
     {
       "name": "llama-3-Korean-8B",
       "developer": "VIRNECT",
       "scores": {
+        "IFEval": 0.5058,
+        "BBH": 0.4908,
+        "MATH Level 5": 0.0929,
         "GPQA": 0.271,
+        "MUSR": 0.3662,
+        "MMLU-PRO": 0.3539
       }
     },
     {
       "name": "Llama3.1-8B-Fireplace2",
       "developer": "ValiantLabs",
       "scores": {
+        "IFEval": 0.5483,
+        "BBH": 0.461,
+        "MATH Level 5": 0.0582,
+        "GPQA": 0.2886,
+        "MUSR": 0.3433,
+        "MMLU-PRO": 0.2407
       }
     },
     {
       "name": "Llama3.1-8B-ShiningValiant2",
       "developer": "ValiantLabs",
       "scores": {
+        "IFEval": 0.2678,
+        "BBH": 0.4429,
+        "MATH Level 5": 0.0521,
+        "GPQA": 0.302,
+        "MUSR": 0.3959,
+        "MMLU-PRO": 0.2927
       }
     },
     {
       "name": "Qwen2.5-14B-YOYO-1010",
       "developer": "YOYO-AI",
       "scores": {
+        "IFEval": 0.7905,
+        "BBH": 0.6406,
+        "MATH Level 5": 0.0,
+        "GPQA": 0.3163,
+        "MUSR": 0.4181,
+        "MMLU-PRO": 0.4944
       }
     },
     {
       "name": "QAIMath-Qwen2.5-7B-TIES",
       "developer": "adriszmar",
       "scores": {
+        "IFEval": 0.1746,
+        "BBH": 0.3126,
+        "MATH Level 5": 0.0,
+        "GPQA": 0.245,
+        "MUSR": 0.4096,
+        "MMLU-PRO": 0.1087
       }
     },
     {
       "name": "Llama-3.1-Storm-8B",
       "developer": "akjindal53244",
       "scores": {
+        "IFEval": 0.8033,
+        "BBH": 0.5196,
+        "MATH Level 5": 0.1624,
+        "GPQA": 0.3096,
         "MUSR": 0.4028,
+        "MMLU-PRO": 0.3812
       }
     },
     {
       "name": "Llama-3.1-Tulu-3-70B",
       "developer": "allenai",
       "scores": {
+        "IFEval": 0.8291,
+        "BBH": 0.6164,
+        "MATH Level 5": 0.4502,
         "GPQA": 0.3733,
+        "MUSR": 0.4948,
+        "MMLU-PRO": 0.4645
       }
     },
     {
       "name": "dolphin-2.9.2-Phi-3-Medium-abliterated",
       "developer": "cognitivecomputations",
       "scores": {
+        "IFEval": 0.3613,
+        "BBH": 0.6123,
+        "MATH Level 5": 0.1239,
+        "GPQA": 0.328,
+        "MUSR": 0.4112,
+        "MMLU-PRO": 0.4494
       }
     },
     {
       "name": "llama-43m-beta",
       "developer": "cpayne1303",
       "scores": {
+        "IFEval": 0.1949,
+        "BBH": 0.2965,
+        "MATH Level 5": 0.0045,
         "GPQA": 0.2685,
+        "MUSR": 0.3885,
+        "MMLU-PRO": 0.1111
       }
     },
     {
       "name": "Llama-3-8B-Orpo-v0.1",
       "developer": "dfurman",
       "scores": {
+        "IFEval": 0.3,
+        "BBH": 0.3853,
+        "MATH Level 5": 0.0415,
+        "GPQA": 0.2617,
+        "MUSR": 0.3579,
+        "MMLU-PRO": 0.2281
       }
     },
     {
       "name": "gemma-2-2b",
       "developer": "Google",
       "scores": {
+        "IFEval": 0.1993,
+        "BBH": 0.3656,
+        "MATH Level 5": 0.0287,
         "GPQA": 0.2626,
+        "MUSR": 0.4232,
+        "MMLU-PRO": 0.218
       }
     },
     {
       "name": "gemma-2-2b-jpn-it",
       "developer": "Google",
       "scores": {
+        "IFEval": 0.5288,
+        "BBH": 0.4178,
+        "MATH Level 5": 0.0476,
+        "GPQA": 0.2752,
+        "MUSR": 0.3728,
+        "MMLU-PRO": 0.2467
       }
     },
     {
       "name": "Kosmos-EVAA-Fusion-8B",
       "developer": "jaspionjader",
       "scores": {
+        "IFEval": 0.4418,
+        "BBH": 0.5406,
+        "MATH Level 5": 0.1352,
+        "GPQA": 0.3062,
         "MUSR": 0.4277,
+        "MMLU-PRO": 0.386
       }
     },
     {
       "name": "Mistral-v0.3-7B-ORPO",
       "developer": "llmat",
       "scores": {
+        "IFEval": 0.377,
+        "BBH": 0.3978,
+        "MATH Level 5": 0.0242,
+        "GPQA": 0.2668,
+        "MUSR": 0.3555,
+        "MMLU-PRO": 0.2278
       }
     },
     {
       "name": "Mixtral-8x7B-v0.1",
       "developer": "mistralai",
       "scores": {
+        "IFEval": 0.2415,
+        "BBH": 0.5087,
+        "MATH Level 5": 0.102,
+        "GPQA": 0.3138,
+        "MUSR": 0.4321,
+        "MMLU-PRO": 0.385
       }
     },
     {
       "name": "NeuralDaredevil-8B-abliterated",
       "developer": "mlabonne",
       "scores": {
+        "IFEval": 0.7561,
+        "BBH": 0.5111,
+        "MATH Level 5": 0.0906,
+        "GPQA": 0.3062,
+        "MUSR": 0.4019,
+        "MMLU-PRO": 0.3841
       }
     },
     {
       "name": "Mistral-Nemo-Kurdish-Instruct",
       "developer": "nazimali",
       "scores": {
+        "IFEval": 0.486,
+        "BBH": 0.4721,
+        "MATH Level 5": 0.0846,
+        "GPQA": 0.2844,
+        "MUSR": 0.4006,
+        "MMLU-PRO": 0.3087
       }
     },
     {
       "name": "franqwenstein-35b",
       "developer": "nisten",
       "scores": {
+        "IFEval": 0.3799,
+        "BBH": 0.6647,
+        "MATH Level 5": 0.3406,
+        "GPQA": 0.4035,
+        "MUSR": 0.494,
+        "MMLU-PRO": 0.5731
       }
     },
     {
       "name": "Llama-3-8B-ProLong-512k-Instruct",
       "developer": "princeton-nlp",
       "scores": {
+        "IFEval": 0.3978,
+        "BBH": 0.4983,
+        "MATH Level 5": 0.0582,
+        "GPQA": 0.281,
+        "MUSR": 0.425,
+        "MMLU-PRO": 0.3246
       }
     },
     {
       "name": "Gemma-2-Ataraxy-Gemmasutra-9B-slerp",
       "developer": "recoilme",
       "scores": {
+        "IFEval": 0.2854,
+        "BBH": 0.5984,
+        "MATH Level 5": 0.1005,
+        "GPQA": 0.3297,
+        "MUSR": 0.4607,
+        "MMLU-PRO": 0.4162
       }
     },
     {
       "name": "recoilme-gemma-2-9B-v0.2",
       "developer": "recoilme",
       "scores": {
+        "IFEval": 0.7592,
+        "BBH": 0.6026,
+        "MATH Level 5": 0.0529,
+        "GPQA": 0.3289,
+        "MUSR": 0.4099,
+        "MMLU-PRO": 0.4163
       }
     },
     {
       "name": "recoilme-gemma-2-9B-v0.3",
       "developer": "recoilme",
       "scores": {
+        "IFEval": 0.5761,
+        "BBH": 0.602,
+        "MATH Level 5": 0.1888,
+        "GPQA": 0.3372,
+        "MUSR": 0.4632,
+        "MMLU-PRO": 0.4039
       }
     },
     {
       "name": "BagelMIsteryTour-v2-8x7B",
       "developer": "ycros",
       "scores": {
+        "IFEval": 0.5994,
+        "BBH": 0.5159,
+        "MATH Level 5": 0.0785,
+        "GPQA": 0.3045,
+        "MUSR": 0.4203,
+        "MMLU-PRO": 0.3473
       }
     },
     {

data/benchmarks/livecodebenchpro.json CHANGED Viewed

@@ -205,9 +205,9 @@
       "name": "gpt-5-2025-08-07",
       "developer": "OpenAI",
       "scores": {
-        "Hard Problems": 0.0423,
-        "Medium Problems": 0.4085,
-        "Easy Problems": 0.9014
       }
     },
     {

       "name": "gpt-5-2025-08-07",
       "developer": "OpenAI",
       "scores": {
+        "Hard Problems": 0.04225352112676056,
+        "Medium Problems": 0.4084507042253521,
+        "Easy Problems": 0.8873239436619719
       }
     },
     {

data/benchmarks/reward-bench.json CHANGED Viewed

@@ -453,16 +453,16 @@
       "name": "LxzGordon/URM-LLaMa-3.1-8B",
       "developer": "LxzGordon",
       "scores": {
-        "Score": 0.9294,
         "Factuality": 0.6884,
         "Precise IF": 0.45,
         "Math": 0.6393,
-        "Safety": 0.9108,
         "Focus": 0.9758,
-        "Ties": 0.7653,
-        "Chat": 0.9553,
-        "Chat Hard": 0.8816,
-        "Reasoning": 0.9698
       }
     },
     {
@@ -555,17 +555,17 @@
       "name": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1",
       "developer": "OpenAssistant",
       "scores": {
-        "Score": 0.615,
         "Factuality": 0.3979,
         "Precise IF": 0.2875,
         "Math": 0.377,
-        "Safety": 0.5446,
         "Focus": 0.1535,
-        "Ties": 0.047,
-        "Chat": 0.9246,
-        "Chat Hard": 0.3728,
-        "Reasoning": 0.5855,
-        "Prior Sets (0.5 weight)": 0.6801
       }
     },
     {
@@ -573,17 +573,17 @@
       "name": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5",
       "developer": "OpenAssistant",
       "scores": {
-        "Score": 0.2648,
-        "Chat": 0.8855,
-        "Chat Hard": 0.4868,
-        "Safety": 0.3244,
-        "Reasoning": 0.7752,
-        "Prior Sets (0.5 weight)": 0.6533,
         "Factuality": 0.3179,
         "Precise IF": 0.2625,
         "Math": 0.3934,
         "Focus": 0.2707,
-        "Ties": 0.0198
       }
     },
     {
@@ -609,17 +609,17 @@
       "name": "PKU-Alignment/beaver-7b-v1.0-cost",
       "developer": "PKU-Alignment",
       "scores": {
-        "Score": 0.3332,
-        "Chat": 0.6173,
-        "Chat Hard": 0.4232,
-        "Safety": 0.7589,
-        "Reasoning": 0.5482,
-        "Prior Sets (0.5 weight)": 0.57,
         "Factuality": 0.3263,
         "Precise IF": 0.2313,
         "Math": 0.3989,
         "Focus": 0.2939,
-        "Ties": -0.01
       }
     },
     {
@@ -627,17 +627,17 @@
       "name": "PKU-Alignment/beaver-7b-v1.0-reward",
       "developer": "PKU-Alignment",
       "scores": {
-        "Score": 0.4727,
         "Factuality": 0.2105,
         "Precise IF": 0.2938,
         "Math": 0.2623,
-        "Safety": 0.3757,
         "Focus": 0.0646,
-        "Ties": -0.01,
-        "Chat": 0.8184,
-        "Chat Hard": 0.2873,
-        "Reasoning": 0.346,
-        "Prior Sets (0.5 weight)": 0.5993
       }
     },
     {
@@ -663,17 +663,17 @@
       "name": "PKU-Alignment/beaver-7b-v2.0-reward",
       "developer": "PKU-Alignment",
       "scores": {
-        "Score": 0.6366,
         "Factuality": 0.2168,
         "Precise IF": 0.2562,
         "Math": 0.3825,
-        "Safety": 0.6041,
         "Focus": 0.2606,
-        "Ties": 0.0944,
-        "Chat": 0.8994,
-        "Chat Hard": 0.364,
-        "Reasoning": 0.6887,
-        "Prior Sets (0.5 weight)": 0.6171
       }
     },
     {
@@ -921,16 +921,16 @@
       "name": "Ray2333/GRM-gemma2-2B-rewardmodel-ft",
       "developer": "Ray2333",
       "scores": {
-        "Score": 0.8839,
         "Factuality": 0.5305,
         "Precise IF": 0.3125,
         "Math": 0.5902,
-        "Safety": 0.9216,
         "Focus": 0.7455,
-        "Ties": 0.4788,
-        "Chat": 0.9302,
-        "Chat Hard": 0.7719,
-        "Reasoning": 0.912
       }
     },
     {
@@ -956,17 +956,17 @@
       "name": "Ray2333/GRM-llama3-8B-sftreg",
       "developer": "Ray2333",
       "scores": {
-        "Score": 0.6089,
-        "Chat": 0.986,
-        "Chat Hard": 0.6776,
-        "Safety": 0.7867,
-        "Reasoning": 0.9229,
-        "Prior Sets (0.5 weight)": 0.7309,
         "Factuality": 0.6189,
         "Precise IF": 0.3875,
         "Math": 0.5792,
         "Focus": 0.6828,
-        "Ties": 0.5981
       }
     },
     {
@@ -1139,16 +1139,16 @@
       "name": "Skywork/Skywork-Reward-Gemma-2-27B",
       "developer": "Skywork",
       "scores": {
-        "Score": 0.938,
         "Factuality": 0.7368,
         "Precise IF": 0.4031,
         "Math": 0.7049,
-        "Safety": 0.9189,
         "Focus": 0.9323,
-        "Ties": 0.8261,
-        "Chat": 0.9581,
-        "Chat Hard": 0.9145,
-        "Reasoning": 0.9606
       }
     },
     {
@@ -1156,16 +1156,16 @@
       "name": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2",
       "developer": "Skywork",
       "scores": {
-        "Score": 0.7531,
-        "Chat": 0.9609,
-        "Chat Hard": 0.8991,
-        "Safety": 0.9689,
-        "Reasoning": 0.9807,
         "Factuality": 0.7674,
         "Precise IF": 0.375,
         "Math": 0.6721,
         "Focus": 0.9172,
-        "Ties": 0.8182
       }
     },
     {
@@ -1173,16 +1173,16 @@
       "name": "Skywork/Skywork-Reward-Llama-3.1-8B",
       "developer": "Skywork",
       "scores": {
-        "Score": 0.7314,
-        "Chat": 0.9581,
-        "Chat Hard": 0.8728,
-        "Safety": 0.9333,
-        "Reasoning": 0.962,
         "Factuality": 0.6989,
         "Precise IF": 0.425,
         "Math": 0.6284,
         "Focus": 0.9616,
-        "Ties": 0.741
       }
     },
     {
@@ -1305,16 +1305,16 @@
       "name": "Skywork/Skywork-VL-Reward-7B",
       "developer": "Skywork",
       "scores": {
-        "Score": 0.9007,
         "Factuality": 0.6063,
         "Precise IF": 0.35,
         "Math": 0.6339,
-        "Safety": 0.9108,
         "Focus": 0.8909,
-        "Ties": 0.7586,
-        "Chat": 0.8994,
-        "Chat Hard": 0.875,
-        "Reasoning": 0.9176
       }
     },
     {
@@ -1379,9 +1379,9 @@
       "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...",
       "developer": "AI2",
       "scores": {
-        "Score": 0.7008,
-        "Chat": 0.9385,
-        "Chat Hard": 0.3882,
         "Safety": 0.7757
       }
     },
@@ -1423,17 +1423,17 @@
       "name": "allenai/Llama-3.1-70B-Instruct-RM-RB2",
       "developer": "allenai",
       "scores": {
-        "Score": 0.9021,
         "Factuality": 0.8126,
         "Precise IF": 0.4188,
         "Math": 0.6995,
-        "Safety": 0.9095,
         "Focus": 0.8646,
-        "Ties": 0.8835,
-        "Chat": 0.9665,
-        "Chat Hard": 0.8355,
-        "Reasoning": 0.8969,
-        "Prior Sets (0.5 weight)": 0.0
       }
     },
     {
@@ -1459,17 +1459,17 @@
       "name": "allenai/Llama-3.1-8B-Instruct-RM-RB2",
       "developer": "allenai",
       "scores": {
-        "Score": 0.8885,
         "Factuality": 0.7432,
         "Precise IF": 0.4437,
         "Math": 0.6175,
-        "Safety": 0.8932,
         "Focus": 0.9071,
-        "Ties": 0.7638,
-        "Chat": 0.9581,
-        "Chat Hard": 0.8158,
-        "Reasoning": 0.887,
-        "Prior Sets (0.5 weight)": 0.0
       }
     },
     {
@@ -1477,17 +1477,17 @@
       "name": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2",
       "developer": "allenai",
       "scores": {
-        "Score": 0.722,
-        "Chat": 0.9693,
-        "Chat Hard": 0.8268,
-        "Safety": 0.8689,
-        "Reasoning": 0.8583,
-        "Prior Sets (0.5 weight)": 0.0,
         "Factuality": 0.8084,
         "Precise IF": 0.3688,
         "Math": 0.6776,
         "Focus": 0.7778,
-        "Ties": 0.8308
       }
     },
     {
@@ -1495,17 +1495,17 @@
       "name": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2",
       "developer": "allenai",
       "scores": {
-        "Score": 0.687,
-        "Chat": 0.9553,
-        "Chat Hard": 0.761,
-        "Safety": 0.86,
-        "Reasoning": 0.7898,
-        "Prior Sets (0.5 weight)": 0.0,
         "Factuality": 0.7516,
         "Precise IF": 0.3875,
         "Math": 0.6284,
         "Focus": 0.8545,
-        "Ties": 0.6397
       }
     },
     {
@@ -3784,16 +3784,16 @@
       "name": "infly/INF-ORM-Llama3.1-70B",
       "developer": "infly",
       "scores": {
-        "Score": 0.7648,
-        "Chat": 0.9665,
-        "Chat Hard": 0.9101,
-        "Safety": 0.9644,
-        "Reasoning": 0.9912,
         "Factuality": 0.7411,
         "Precise IF": 0.4188,
         "Math": 0.6995,
         "Focus": 0.903,
-        "Ties": 0.8622
       }
     },
     {
@@ -3835,16 +3835,16 @@
       "name": "internlm/internlm2-7b-reward",
       "developer": "internlm",
       "scores": {
-        "Score": 0.8759,
         "Factuality": 0.4211,
         "Precise IF": 0.4,
         "Math": 0.5628,
-        "Safety": 0.8716,
         "Focus": 0.7051,
-        "Ties": 0.5164,
-        "Chat": 0.9916,
-        "Chat Hard": 0.6952,
-        "Reasoning": 0.9453
       }
     },
     {
@@ -4014,16 +4014,16 @@
       "name": "nicolinho/QRM-Gemma-2-27B",
       "developer": "nicolinho",
       "scores": {
-        "Score": 0.9444,
         "Factuality": 0.7853,
         "Precise IF": 0.3719,
         "Math": 0.6995,
-        "Safety": 0.927,
         "Focus": 0.9535,
-        "Ties": 0.8321,
-        "Chat": 0.9665,
-        "Chat Hard": 0.9013,
-        "Reasoning": 0.9826
       }
     },
     {
@@ -4055,16 +4055,16 @@
       "name": "nicolinho/QRM-Llama3.1-8B-v2",
       "developer": "nicolinho",
       "scores": {
-        "Score": 0.9314,
         "Factuality": 0.6653,
         "Precise IF": 0.4062,
         "Math": 0.612,
-        "Safety": 0.9257,
         "Focus": 0.8909,
-        "Ties": 0.7234,
-        "Chat": 0.9637,
-        "Chat Hard": 0.8684,
-        "Reasoning": 0.9677
       }
     },
     {
@@ -4202,16 +4202,16 @@
       "name": "GPT-4o 2024-08-06",
       "developer": "OpenAI",
       "scores": {
-        "Score": 0.6493,
-        "Chat": 0.9609,
-        "Chat Hard": 0.761,
-        "Safety": 0.8619,
-        "Reasoning": 0.8661,
         "Factuality": 0.5684,
         "Precise IF": 0.3312,
         "Math": 0.623,
         "Focus": 0.7293,
-        "Ties": 0.7819
       }
     },
     {
@@ -4249,17 +4249,17 @@
       "name": "openbmb/Eurus-RM-7b",
       "developer": "openbmb",
       "scores": {
-        "Score": 0.5806,
-        "Chat": 0.9804,
-        "Chat Hard": 0.6557,
-        "Safety": 0.6267,
-        "Reasoning": 0.8633,
-        "Prior Sets (0.5 weight)": 0.7172,
         "Factuality": 0.6,
         "Precise IF": 0.3438,
         "Math": 0.5683,
         "Focus": 0.7475,
-        "Ties": 0.5972
       }
     },
     {
@@ -4370,17 +4370,17 @@
       "name": "sfairXC/FsfairX-LLaMA3-RM-v0.1",
       "developer": "sfairXC",
       "scores": {
-        "Score": 0.6292,
-        "Chat": 0.9944,
-        "Chat Hard": 0.6513,
-        "Safety": 0.7667,
-        "Reasoning": 0.8644,
-        "Prior Sets (0.5 weight)": 0.7492,
         "Factuality": 0.5916,
         "Precise IF": 0.4188,
         "Math": 0.6284,
         "Focus": 0.7051,
-        "Ties": 0.6647
       }
     },
     {
@@ -4492,17 +4492,17 @@
       "name": "weqweasdas/RM-Gemma-2B",
       "developer": "weqweasdas",
       "scores": {
-        "Score": 0.3057,
-        "Chat": 0.9441,
-        "Chat Hard": 0.4079,
-        "Safety": 0.3311,
-        "Reasoning": 0.7637,
-        "Prior Sets (0.5 weight)": 0.6652,
         "Factuality": 0.3705,
         "Precise IF": 0.2812,
         "Math": 0.4317,
         "Focus": 0.2343,
-        "Ties": 0.1851
       }
     },
     {
@@ -4541,17 +4541,17 @@
       "name": "weqweasdas/RM-Mistral-7B",
       "developer": "weqweasdas",
       "scores": {
-        "Score": 0.596,
-        "Chat": 0.9665,
-        "Chat Hard": 0.6053,
-        "Safety": 0.6911,
-        "Reasoning": 0.7736,
-        "Prior Sets (0.5 weight)": 0.753,
         "Factuality": 0.5937,
         "Precise IF": 0.3438,
         "Math": 0.5956,
         "Focus": 0.7293,
-        "Ties": 0.6226
       }
     },
     {
@@ -4559,17 +4559,17 @@
       "name": "weqweasdas/hh_rlhf_rm_open_llama_3b",
       "developer": "weqweasdas",
       "scores": {
-        "Score": 0.2498,
-        "Chat": 0.8184,
-        "Chat Hard": 0.3728,
-        "Safety": 0.24,
-        "Reasoning": 0.3281,
-        "Prior Sets (0.5 weight)": 0.6564,
         "Factuality": 0.3642,
         "Precise IF": 0.275,
         "Math": 0.3497,
         "Focus": 0.2384,
-        "Ties": 0.0315
       }
     }
   ]

       "name": "LxzGordon/URM-LLaMa-3.1-8B",
       "developer": "LxzGordon",
       "scores": {
+        "Score": 0.7394,
+        "Chat": 0.9553,
+        "Chat Hard": 0.8816,
+        "Safety": 0.9178,
+        "Reasoning": 0.9698,
         "Factuality": 0.6884,
         "Precise IF": 0.45,
         "Math": 0.6393,
         "Focus": 0.9758,
+        "Ties": 0.7653
       }
     },
     {
       "name": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1",
       "developer": "OpenAssistant",
       "scores": {
+        "Score": 0.2653,
+        "Chat": 0.9246,
+        "Chat Hard": 0.3728,
+        "Safety": 0.3289,
+        "Reasoning": 0.5855,
+        "Prior Sets (0.5 weight)": 0.6801,
         "Factuality": 0.3979,
         "Precise IF": 0.2875,
         "Math": 0.377,
         "Focus": 0.1535,
+        "Ties": 0.047
       }
     },
     {
       "name": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5",
       "developer": "OpenAssistant",
       "scores": {
+        "Score": 0.6901,
         "Factuality": 0.3179,
         "Precise IF": 0.2625,
         "Math": 0.3934,
+        "Safety": 0.6311,
         "Focus": 0.2707,
+        "Ties": 0.0198,
+        "Chat": 0.8855,
+        "Chat Hard": 0.4868,
+        "Reasoning": 0.7752,
+        "Prior Sets (0.5 weight)": 0.6533
       }
     },
     {
       "name": "PKU-Alignment/beaver-7b-v1.0-cost",
       "developer": "PKU-Alignment",
       "scores": {
+        "Score": 0.5798,
         "Factuality": 0.3263,
         "Precise IF": 0.2313,
         "Math": 0.3989,
+        "Safety": 0.7351,
         "Focus": 0.2939,
+        "Ties": -0.01,
+        "Chat": 0.6173,
+        "Chat Hard": 0.4232,
+        "Reasoning": 0.5482,
+        "Prior Sets (0.5 weight)": 0.57
       }
     },
     {
       "name": "PKU-Alignment/beaver-7b-v1.0-reward",
       "developer": "PKU-Alignment",
       "scores": {
+        "Score": 0.1606,
+        "Chat": 0.8184,
+        "Chat Hard": 0.2873,
+        "Safety": 0.1422,
+        "Reasoning": 0.346,
+        "Prior Sets (0.5 weight)": 0.5993,
         "Factuality": 0.2105,
         "Precise IF": 0.2938,
         "Math": 0.2623,
         "Focus": 0.0646,
+        "Ties": -0.01
       }
     },
     {
       "name": "PKU-Alignment/beaver-7b-v2.0-reward",
       "developer": "PKU-Alignment",
       "scores": {
+        "Score": 0.2544,
+        "Chat": 0.8994,
+        "Chat Hard": 0.364,
+        "Safety": 0.3156,
+        "Reasoning": 0.6887,
+        "Prior Sets (0.5 weight)": 0.6171,
         "Factuality": 0.2168,
         "Precise IF": 0.2562,
         "Math": 0.3825,
         "Focus": 0.2606,
+        "Ties": 0.0944
       }
     },
     {
       "name": "Ray2333/GRM-gemma2-2B-rewardmodel-ft",
       "developer": "Ray2333",
       "scores": {
+        "Score": 0.5966,
+        "Chat": 0.9302,
+        "Chat Hard": 0.7719,
+        "Safety": 0.9222,
+        "Reasoning": 0.912,
         "Factuality": 0.5305,
         "Precise IF": 0.3125,
         "Math": 0.5902,
         "Focus": 0.7455,
+        "Ties": 0.4788
       }
     },
     {
       "name": "Ray2333/GRM-llama3-8B-sftreg",
       "developer": "Ray2333",
       "scores": {
+        "Score": 0.8542,
         "Factuality": 0.6189,
         "Precise IF": 0.3875,
         "Math": 0.5792,
+        "Safety": 0.8919,
         "Focus": 0.6828,
+        "Ties": 0.5981,
+        "Chat": 0.986,
+        "Chat Hard": 0.6776,
+        "Reasoning": 0.9229,
+        "Prior Sets (0.5 weight)": 0.7309
       }
     },
     {
       "name": "Skywork/Skywork-Reward-Gemma-2-27B",
       "developer": "Skywork",
       "scores": {
+        "Score": 0.7576,
+        "Chat": 0.9581,
+        "Chat Hard": 0.9145,
+        "Safety": 0.9422,
+        "Reasoning": 0.9606,
         "Factuality": 0.7368,
         "Precise IF": 0.4031,
         "Math": 0.7049,
         "Focus": 0.9323,
+        "Ties": 0.8261
       }
     },
     {
       "name": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2",
       "developer": "Skywork",
       "scores": {
+        "Score": 0.9426,
         "Factuality": 0.7674,
         "Precise IF": 0.375,
         "Math": 0.6721,
+        "Safety": 0.9297,
         "Focus": 0.9172,
+        "Ties": 0.8182,
+        "Chat": 0.9609,
+        "Chat Hard": 0.8991,
+        "Reasoning": 0.9807
       }
     },
     {
       "name": "Skywork/Skywork-Reward-Llama-3.1-8B",
       "developer": "Skywork",
       "scores": {
+        "Score": 0.9252,
         "Factuality": 0.6989,
         "Precise IF": 0.425,
         "Math": 0.6284,
+        "Safety": 0.9081,
         "Focus": 0.9616,
+        "Ties": 0.741,
+        "Chat": 0.9581,
+        "Chat Hard": 0.8728,
+        "Reasoning": 0.962
       }
     },
     {
       "name": "Skywork/Skywork-VL-Reward-7B",
       "developer": "Skywork",
       "scores": {
+        "Score": 0.6885,
+        "Chat": 0.8994,
+        "Chat Hard": 0.875,
+        "Safety": 0.8911,
+        "Reasoning": 0.9176,
         "Factuality": 0.6063,
         "Precise IF": 0.35,
         "Math": 0.6339,
         "Focus": 0.8909,
+        "Ties": 0.7586
       }
     },
     {
       "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...",
       "developer": "AI2",
       "scores": {
+        "Score": 0.6924,
+        "Chat": 0.9441,
+        "Chat Hard": 0.3575,
         "Safety": 0.7757
       }
     },
       "name": "allenai/Llama-3.1-70B-Instruct-RM-RB2",
       "developer": "allenai",
       "scores": {
+        "Score": 0.7606,
+        "Chat": 0.9665,
+        "Chat Hard": 0.8355,
+        "Safety": 0.8844,
+        "Reasoning": 0.8969,
+        "Prior Sets (0.5 weight)": 0.0,
         "Factuality": 0.8126,
         "Precise IF": 0.4188,
         "Math": 0.6995,
         "Focus": 0.8646,
+        "Ties": 0.8835
       }
     },
     {
       "name": "allenai/Llama-3.1-8B-Instruct-RM-RB2",
       "developer": "allenai",
       "scores": {
+        "Score": 0.7285,
+        "Chat": 0.9581,
+        "Chat Hard": 0.8158,
+        "Safety": 0.8956,
+        "Reasoning": 0.887,
+        "Prior Sets (0.5 weight)": 0.0,
         "Factuality": 0.7432,
         "Precise IF": 0.4437,
         "Math": 0.6175,
         "Focus": 0.9071,
+        "Ties": 0.7638
       }
     },
     {
       "name": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2",
       "developer": "allenai",
       "scores": {
+        "Score": 0.8892,
         "Factuality": 0.8084,
         "Precise IF": 0.3688,
         "Math": 0.6776,
+        "Safety": 0.9027,
         "Focus": 0.7778,
+        "Ties": 0.8308,
+        "Chat": 0.9693,
+        "Chat Hard": 0.8268,
+        "Reasoning": 0.8583,
+        "Prior Sets (0.5 weight)": 0.0
       }
     },
     {
       "name": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2",
       "developer": "allenai",
       "scores": {
+        "Score": 0.8431,
         "Factuality": 0.7516,
         "Precise IF": 0.3875,
         "Math": 0.6284,
+        "Safety": 0.8662,
         "Focus": 0.8545,
+        "Ties": 0.6397,
+        "Chat": 0.9553,
+        "Chat Hard": 0.761,
+        "Reasoning": 0.7898,
+        "Prior Sets (0.5 weight)": 0.0
       }
     },
     {
       "name": "infly/INF-ORM-Llama3.1-70B",
       "developer": "infly",
       "scores": {
+        "Score": 0.9511,
         "Factuality": 0.7411,
         "Precise IF": 0.4188,
         "Math": 0.6995,
+        "Safety": 0.9365,
         "Focus": 0.903,
+        "Ties": 0.8622,
+        "Chat": 0.9665,
+        "Chat Hard": 0.9101,
+        "Reasoning": 0.9912
       }
     },
     {
       "name": "internlm/internlm2-7b-reward",
       "developer": "internlm",
       "scores": {
+        "Score": 0.5335,
+        "Chat": 0.9916,
+        "Chat Hard": 0.6952,
+        "Safety": 0.5956,
+        "Reasoning": 0.9453,
         "Factuality": 0.4211,
         "Precise IF": 0.4,
         "Math": 0.5628,
         "Focus": 0.7051,
+        "Ties": 0.5164
       }
     },
     {
       "name": "nicolinho/QRM-Gemma-2-27B",
       "developer": "nicolinho",
       "scores": {
+        "Score": 0.7667,
+        "Chat": 0.9665,
+        "Chat Hard": 0.9013,
+        "Safety": 0.9578,
+        "Reasoning": 0.9826,
         "Factuality": 0.7853,
         "Precise IF": 0.3719,
         "Math": 0.6995,
         "Focus": 0.9535,
+        "Ties": 0.8321
       }
     },
     {
       "name": "nicolinho/QRM-Llama3.1-8B-v2",
       "developer": "nicolinho",
       "scores": {
+        "Score": 0.7074,
+        "Chat": 0.9637,
+        "Chat Hard": 0.8684,
+        "Safety": 0.9467,
+        "Reasoning": 0.9677,
         "Factuality": 0.6653,
         "Precise IF": 0.4062,
         "Math": 0.612,
         "Focus": 0.8909,
+        "Ties": 0.7234
       }
     },
     {
       "name": "GPT-4o 2024-08-06",
       "developer": "OpenAI",
       "scores": {
+        "Score": 0.8673,
         "Factuality": 0.5684,
         "Precise IF": 0.3312,
         "Math": 0.623,
+        "Safety": 0.8811,
         "Focus": 0.7293,
+        "Ties": 0.7819,
+        "Chat": 0.9609,
+        "Chat Hard": 0.761,
+        "Reasoning": 0.8661
       }
     },
     {
       "name": "openbmb/Eurus-RM-7b",
       "developer": "openbmb",
       "scores": {
+        "Score": 0.8159,
         "Factuality": 0.6,
         "Precise IF": 0.3438,
         "Math": 0.5683,
+        "Safety": 0.8135,
         "Focus": 0.7475,
+        "Ties": 0.5972,
+        "Chat": 0.9804,
+        "Chat Hard": 0.6557,
+        "Reasoning": 0.8633,
+        "Prior Sets (0.5 weight)": 0.7172
       }
     },
     {
       "name": "sfairXC/FsfairX-LLaMA3-RM-v0.1",
       "developer": "sfairXC",
       "scores": {
+        "Score": 0.8338,
         "Factuality": 0.5916,
         "Precise IF": 0.4188,
         "Math": 0.6284,
+        "Safety": 0.8676,
         "Focus": 0.7051,
+        "Ties": 0.6647,
+        "Chat": 0.9944,
+        "Chat Hard": 0.6513,
+        "Reasoning": 0.8644,
+        "Prior Sets (0.5 weight)": 0.7492
       }
     },
     {
       "name": "weqweasdas/RM-Gemma-2B",
       "developer": "weqweasdas",
       "scores": {
+        "Score": 0.6549,
         "Factuality": 0.3705,
         "Precise IF": 0.2812,
         "Math": 0.4317,
+        "Safety": 0.4986,
         "Focus": 0.2343,
+        "Ties": 0.1851,
+        "Chat": 0.9441,
+        "Chat Hard": 0.4079,
+        "Reasoning": 0.7637,
+        "Prior Sets (0.5 weight)": 0.6652
       }
     },
     {
       "name": "weqweasdas/RM-Mistral-7B",
       "developer": "weqweasdas",
       "scores": {
+        "Score": 0.7982,
         "Factuality": 0.5937,
         "Precise IF": 0.3438,
         "Math": 0.5956,
+        "Safety": 0.8703,
         "Focus": 0.7293,
+        "Ties": 0.6226,
+        "Chat": 0.9665,
+        "Chat Hard": 0.6053,
+        "Reasoning": 0.7736,
+        "Prior Sets (0.5 weight)": 0.753
       }
     },
     {
       "name": "weqweasdas/hh_rlhf_rm_open_llama_3b",
       "developer": "weqweasdas",
       "scores": {
+        "Score": 0.5027,
         "Factuality": 0.3642,
         "Precise IF": 0.275,
         "Math": 0.3497,
+        "Safety": 0.4149,
         "Focus": 0.2384,
+        "Ties": 0.0315,
+        "Chat": 0.8184,
+        "Chat Hard": 0.3728,
+        "Reasoning": 0.3281,
+        "Prior Sets (0.5 weight)": 0.6564
       }
     }
   ]

data/benchmarks/swe-bench.json CHANGED Viewed

@@ -5,7 +5,7 @@
       "name": "claude-opus-4-5",
       "developer": "Anthropic",
       "scores": {
-        "swe-bench": 0.65
       }
     },
     {
@@ -13,7 +13,7 @@
       "name": "gemini-3-pro-preview",
       "developer": "Google",
       "scores": {
-        "swe-bench": 0.7234
       }
     },
     {

       "name": "claude-opus-4-5",
       "developer": "Anthropic",
       "scores": {
+        "swe-bench": 0.6061
       }
     },
     {
       "name": "gemini-3-pro-preview",
       "developer": "Google",
       "scores": {
+        "swe-bench": 0.71
       }
     },
     {

data/benchmarks/tau-bench-2_airline.json CHANGED Viewed

@@ -5,7 +5,7 @@
       "name": "claude-opus-4-5",
       "developer": "Anthropic",
       "scores": {
-        "tau-bench-2/airline": 0.72
       }
     },
     {
@@ -13,7 +13,7 @@
       "name": "gemini-3-pro-preview",
       "developer": "Google",
       "scores": {
-        "tau-bench-2/airline": 0.7
       }
     },
     {

       "name": "claude-opus-4-5",
       "developer": "Anthropic",
       "scores": {
+        "tau-bench-2/airline": 0.66
       }
     },
     {
       "name": "gemini-3-pro-preview",
       "developer": "Google",
       "scores": {
+        "tau-bench-2/airline": 0.68
       }
     },
     {

data/benchmarks/tau-bench-2_retail.json CHANGED Viewed

@@ -21,7 +21,7 @@
       "name": "gpt-5.2-2025-12-11",
       "developer": "OpenAI",
       "scores": {
-        "tau-bench-2/retail": 0.68
       }
     }
   ]

       "name": "gpt-5.2-2025-12-11",
       "developer": "OpenAI",
       "scores": {
+        "tau-bench-2/retail": 0.73
       }
     }
   ]

data/benchmarks/tau-bench-2_telecom.json CHANGED Viewed

@@ -5,7 +5,7 @@
       "name": "claude-opus-4-5",
       "developer": "Anthropic",
       "scores": {
-        "tau-bench-2/telecom": 0.76
       }
     },
     {
@@ -21,7 +21,7 @@
       "name": "gpt-5.2-2025-12-11",
       "developer": "OpenAI",
       "scores": {
-        "tau-bench-2/telecom": 0.5354
       }
     }
   ]

       "name": "claude-opus-4-5",
       "developer": "Anthropic",
       "scores": {
+        "tau-bench-2/telecom": 0.84
       }
     },
     {
       "name": "gpt-5.2-2025-12-11",
       "developer": "OpenAI",
       "scores": {
+        "tau-bench-2/telecom": 0.71
       }
     }
   ]

data/benchmarks/terminal-bench-2.0.json CHANGED Viewed

@@ -21,7 +21,7 @@
       "name": "Claude Opus 4.1",
       "developer": "Anthropic",
       "scores": {
-        "terminal-bench-2.0": 38.0
       }
     },
     {
@@ -29,7 +29,7 @@
       "name": "Claude Opus 4.5",
       "developer": "Anthropic",
       "scores": {
-        "terminal-bench-2.0": 59.1
       }
     },
     {
@@ -37,7 +37,7 @@
       "name": "Claude Opus 4.6",
       "developer": "Anthropic",
       "scores": {
-        "terminal-bench-2.0": 58.0
       }
     },
     {
@@ -45,7 +45,7 @@
       "name": "Claude Sonnet 4.5",
       "developer": "Anthropic",
       "scores": {
-        "terminal-bench-2.0": 43.1
       }
     },
     {
@@ -61,7 +61,7 @@
       "name": "Gemini 2.5 Flash",
       "developer": "Google",
       "scores": {
-        "terminal-bench-2.0": 17.1
       }
     },
     {
@@ -77,7 +77,7 @@
       "name": "Gemini 3 Flash",
       "developer": "Google",
       "scores": {
-        "terminal-bench-2.0": 51.0
       }
     },
     {
@@ -109,7 +109,7 @@
       "name": "MiniMax M2.1",
       "developer": "MiniMax",
       "scores": {
-        "terminal-bench-2.0": 29.2
       }
     },
     {
@@ -125,7 +125,7 @@
       "name": "Kimi K2 Instruct",
       "developer": "Moonshot AI",
       "scores": {
-        "terminal-bench-2.0": 26.7
       }
     },
     {
@@ -149,7 +149,7 @@
       "name": "Multiple",
       "developer": "Multiple",
       "scores": {
-        "terminal-bench-2.0": 71.0
       }
     },
     {
@@ -157,7 +157,7 @@
       "name": "GPT-5",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 35.2
       }
     },
     {
@@ -165,7 +165,7 @@
       "name": "GPT-5-Codex",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 44.3
       }
     },
     {
@@ -173,7 +173,7 @@
       "name": "GPT-5-Mini",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 34.8
       }
     },
     {
@@ -181,7 +181,7 @@
       "name": "GPT-5-Nano",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 9.9
       }
     },
     {
@@ -197,7 +197,7 @@
       "name": "GPT-5.1-Codex",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 53.5
       }
     },
     {
@@ -221,7 +221,7 @@
       "name": "GPT-5.2",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 60.7
       }
     },
     {
@@ -237,7 +237,7 @@
       "name": "GPT-5.3-Codex",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 64.7
       }
     },
     {
@@ -245,7 +245,7 @@
       "name": "GPT-OSS-120B",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 14.2
       }
     },
     {
@@ -253,7 +253,7 @@
       "name": "GPT-OSS-20B",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 3.1
       }
     },
     {
@@ -261,7 +261,7 @@
       "name": "Grok 4",
       "developer": "xAI",
       "scores": {
-        "terminal-bench-2.0": 25.4
       }
     },
     {
@@ -269,7 +269,7 @@
       "name": "Grok Code Fast 1",
       "developer": "xAI",
       "scores": {
-        "terminal-bench-2.0": 25.8
       }
     },
     {

       "name": "Claude Opus 4.1",
       "developer": "Anthropic",
       "scores": {
+        "terminal-bench-2.0": 35.1
       }
     },
     {
       "name": "Claude Opus 4.5",
       "developer": "Anthropic",
       "scores": {
+        "terminal-bench-2.0": 52.1
       }
     },
     {
       "name": "Claude Opus 4.6",
       "developer": "Anthropic",
       "scores": {
+        "terminal-bench-2.0": 62.9
       }
     },
     {
       "name": "Claude Sonnet 4.5",
       "developer": "Anthropic",
       "scores": {
+        "terminal-bench-2.0": 42.6
       }
     },
     {
       "name": "Gemini 2.5 Flash",
       "developer": "Google",
       "scores": {
+        "terminal-bench-2.0": 16.9
       }
     },
     {
       "name": "Gemini 3 Flash",
       "developer": "Google",
       "scores": {
+        "terminal-bench-2.0": 47.4
       }
     },
     {
       "name": "MiniMax M2.1",
       "developer": "MiniMax",
       "scores": {
+        "terminal-bench-2.0": 36.6
       }
     },
     {
       "name": "Kimi K2 Instruct",
       "developer": "Moonshot AI",
       "scores": {
+        "terminal-bench-2.0": 27.8
       }
     },
     {
       "name": "Multiple",
       "developer": "Multiple",
       "scores": {
+        "terminal-bench-2.0": 72.4
       }
     },
     {
       "name": "GPT-5",
       "developer": "OpenAI",
       "scores": {
+        "terminal-bench-2.0": 49.6
       }
     },
     {
       "name": "GPT-5-Codex",
       "developer": "OpenAI",
       "scores": {
+        "terminal-bench-2.0": 43.4
       }
     },
     {
       "name": "GPT-5-Mini",
       "developer": "OpenAI",
       "scores": {
+        "terminal-bench-2.0": 24.0
       }
     },
     {
       "name": "GPT-5-Nano",
       "developer": "OpenAI",
       "scores": {
+        "terminal-bench-2.0": 11.5
       }
     },
     {
       "name": "GPT-5.1-Codex",
       "developer": "OpenAI",
       "scores": {
+        "terminal-bench-2.0": 57.8
       }
     },
     {
       "name": "GPT-5.2",
       "developer": "OpenAI",
       "scores": {
+        "terminal-bench-2.0": 62.9
       }
     },
     {
       "name": "GPT-5.3-Codex",
       "developer": "OpenAI",
       "scores": {
+        "terminal-bench-2.0": 77.3
       }
     },
     {
       "name": "GPT-OSS-120B",
       "developer": "OpenAI",
       "scores": {
+        "terminal-bench-2.0": 18.7
       }
     },
     {
       "name": "GPT-OSS-20B",
       "developer": "OpenAI",
       "scores": {
+        "terminal-bench-2.0": 3.4
       }
     },
     {
       "name": "Grok 4",
       "developer": "xAI",
       "scores": {
+        "terminal-bench-2.0": 23.1
       }
     },
     {
       "name": "Grok Code Fast 1",
       "developer": "xAI",
       "scores": {
+        "terminal-bench-2.0": 14.2
       }
     },
     {

data/benchmarks/theory_of_mind.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "models": [
+    {
+      "model_id": "Qwen/Qwen2.5-3B-Instruct",
+      "name": "Qwen2.5-3B-Instruct",
+      "developer": "Qwen",
+      "scores": {
+        "accuracy on theory_of_mind for scorer model_graded_fact": 0.78
+      }
+    }
+  ]
+}

data/developers.json CHANGED Viewed

@@ -1917,7 +1917,7 @@
   },
   {
     "developer": "NousResearch",
-    "model_count": 19
   },
   {
     "developer": "Novaciano",

   },
   {
     "developer": "NousResearch",
+    "model_count": 18
   },
   {
     "developer": "Novaciano",

data/developers/adriszmar.json CHANGED Viewed

@@ -7,12 +7,12 @@
       "developer": "adriszmar",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.1685,
-        "hfopenllm_v2/BBH": 0.3124,
-        "hfopenllm_v2/MATH Level 5": 0.0015,
-        "hfopenllm_v2/GPQA": 0.2492,
-        "hfopenllm_v2/MUSR": 0.3963,
-        "hfopenllm_v2/MMLU-PRO": 0.1066
       }
     }
   ]

       "developer": "adriszmar",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.1746,
+        "hfopenllm_v2/BBH": 0.3126,
+        "hfopenllm_v2/MATH Level 5": 0.0,
+        "hfopenllm_v2/GPQA": 0.245,
+        "hfopenllm_v2/MUSR": 0.4096,
+        "hfopenllm_v2/MMLU-PRO": 0.1087
       }
     }
   ]

data/developers/ai2.json CHANGED Viewed

@@ -43,9 +43,9 @@
       "developer": "AI2",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.7008,
-        "reward-bench/Chat": 0.9385,
-        "reward-bench/Chat Hard": 0.3882,
         "reward-bench/Safety": 0.7757
       }
     },

       "developer": "AI2",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "reward-bench/Score": 0.6924,
+        "reward-bench/Chat": 0.9441,
+        "reward-bench/Chat Hard": 0.3575,
         "reward-bench/Safety": 0.7757
       }
     },

data/developers/akjindal53244.json CHANGED Viewed

@@ -7,12 +7,12 @@
       "developer": "akjindal53244",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.8051,
-        "hfopenllm_v2/BBH": 0.5189,
-        "hfopenllm_v2/MATH Level 5": 0.1722,
-        "hfopenllm_v2/GPQA": 0.3263,
         "hfopenllm_v2/MUSR": 0.4028,
-        "hfopenllm_v2/MMLU-PRO": 0.3803
       }
     }
   ]

       "developer": "akjindal53244",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.8033,
+        "hfopenllm_v2/BBH": 0.5196,
+        "hfopenllm_v2/MATH Level 5": 0.1624,
+        "hfopenllm_v2/GPQA": 0.3096,
         "hfopenllm_v2/MUSR": 0.4028,
+        "hfopenllm_v2/MMLU-PRO": 0.3812
       }
     }
   ]

data/developers/allenai.json CHANGED Viewed

@@ -63,17 +63,17 @@
       "developer": "allenai",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.9021,
         "reward-bench/Factuality": 0.8126,
         "reward-bench/Precise IF": 0.4188,
         "reward-bench/Math": 0.6995,
-        "reward-bench/Safety": 0.9095,
         "reward-bench/Focus": 0.8646,
-        "reward-bench/Ties": 0.8835,
-        "reward-bench/Chat": 0.9665,
-        "reward-bench/Chat Hard": 0.8355,
-        "reward-bench/Reasoning": 0.8969,
-        "reward-bench/Prior Sets (0.5 weight)": 0.0
       }
     },
     {
@@ -101,17 +101,17 @@
       "developer": "allenai",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.8885,
         "reward-bench/Factuality": 0.7432,
         "reward-bench/Precise IF": 0.4437,
         "reward-bench/Math": 0.6175,
-        "reward-bench/Safety": 0.8932,
         "reward-bench/Focus": 0.9071,
-        "reward-bench/Ties": 0.7638,
-        "reward-bench/Chat": 0.9581,
-        "reward-bench/Chat Hard": 0.8158,
-        "reward-bench/Reasoning": 0.887,
-        "reward-bench/Prior Sets (0.5 weight)": 0.0
       }
     },
     {
@@ -120,12 +120,12 @@
       "developer": "allenai",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.8379,
-        "hfopenllm_v2/BBH": 0.6157,
-        "hfopenllm_v2/MATH Level 5": 0.3829,
         "hfopenllm_v2/GPQA": 0.3733,
-        "hfopenllm_v2/MUSR": 0.4988,
-        "hfopenllm_v2/MMLU-PRO": 0.4656
       }
     },
     {
@@ -162,17 +162,17 @@
       "developer": "allenai",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.722,
-        "reward-bench/Chat": 0.9693,
-        "reward-bench/Chat Hard": 0.8268,
-        "reward-bench/Safety": 0.8689,
-        "reward-bench/Reasoning": 0.8583,
-        "reward-bench/Prior Sets (0.5 weight)": 0.0,
         "reward-bench/Factuality": 0.8084,
         "reward-bench/Precise IF": 0.3688,
         "reward-bench/Math": 0.6776,
         "reward-bench/Focus": 0.7778,
-        "reward-bench/Ties": 0.8308
       }
     },
     {
@@ -209,17 +209,17 @@
       "developer": "allenai",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.687,
-        "reward-bench/Chat": 0.9553,
-        "reward-bench/Chat Hard": 0.761,
-        "reward-bench/Safety": 0.86,
-        "reward-bench/Reasoning": 0.7898,
-        "reward-bench/Prior Sets (0.5 weight)": 0.0,
         "reward-bench/Factuality": 0.7516,
         "reward-bench/Precise IF": 0.3875,
         "reward-bench/Math": 0.6284,
         "reward-bench/Focus": 0.8545,
-        "reward-bench/Ties": 0.6397
       }
     },
     {

       "developer": "allenai",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "reward-bench/Score": 0.7606,
+        "reward-bench/Chat": 0.9665,
+        "reward-bench/Chat Hard": 0.8355,
+        "reward-bench/Safety": 0.8844,
+        "reward-bench/Reasoning": 0.8969,
+        "reward-bench/Prior Sets (0.5 weight)": 0.0,
         "reward-bench/Factuality": 0.8126,
         "reward-bench/Precise IF": 0.4188,
         "reward-bench/Math": 0.6995,
         "reward-bench/Focus": 0.8646,
+        "reward-bench/Ties": 0.8835
       }
     },
     {
       "developer": "allenai",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "reward-bench/Score": 0.7285,
+        "reward-bench/Chat": 0.9581,
+        "reward-bench/Chat Hard": 0.8158,
+        "reward-bench/Safety": 0.8956,
+        "reward-bench/Reasoning": 0.887,
+        "reward-bench/Prior Sets (0.5 weight)": 0.0,
         "reward-bench/Factuality": 0.7432,
         "reward-bench/Precise IF": 0.4437,
         "reward-bench/Math": 0.6175,
         "reward-bench/Focus": 0.9071,
+        "reward-bench/Ties": 0.7638
       }
     },
     {
       "developer": "allenai",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.8291,
+        "hfopenllm_v2/BBH": 0.6164,
+        "hfopenllm_v2/MATH Level 5": 0.4502,
         "hfopenllm_v2/GPQA": 0.3733,
+        "hfopenllm_v2/MUSR": 0.4948,
+        "hfopenllm_v2/MMLU-PRO": 0.4645
       }
     },
     {
       "developer": "allenai",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "reward-bench/Score": 0.8892,
         "reward-bench/Factuality": 0.8084,
         "reward-bench/Precise IF": 0.3688,
         "reward-bench/Math": 0.6776,
+        "reward-bench/Safety": 0.9027,
         "reward-bench/Focus": 0.7778,
+        "reward-bench/Ties": 0.8308,
+        "reward-bench/Chat": 0.9693,
+        "reward-bench/Chat Hard": 0.8268,
+        "reward-bench/Reasoning": 0.8583,
+        "reward-bench/Prior Sets (0.5 weight)": 0.0
       }
     },
     {
       "developer": "allenai",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "reward-bench/Score": 0.8431,
         "reward-bench/Factuality": 0.7516,
         "reward-bench/Precise IF": 0.3875,
         "reward-bench/Math": 0.6284,
+        "reward-bench/Safety": 0.8662,
         "reward-bench/Focus": 0.8545,
+        "reward-bench/Ties": 0.6397,
+        "reward-bench/Chat": 0.9553,
+        "reward-bench/Chat Hard": 0.761,
+        "reward-bench/Reasoning": 0.7898,
+        "reward-bench/Prior Sets (0.5 weight)": 0.0
       }
     },
     {

data/developers/anthropic.json CHANGED Viewed

@@ -650,12 +650,12 @@
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "appworld_test_normal/appworld/test_normal": 0.68,
         "browsecompplus/browsecompplus": 0.61,
-        "swe-bench/swe-bench": 0.65,
-        "tau-bench-2_airline/tau-bench-2/airline": 0.72,
         "tau-bench-2_retail/tau-bench-2/retail": 0.78,
-        "tau-bench-2_telecom/tau-bench-2/telecom": 0.76
       }
     },
     {
@@ -664,7 +664,7 @@
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 38.0
       }
     },
     {
@@ -673,7 +673,7 @@
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 59.1
       }
     },
     {
@@ -682,7 +682,7 @@
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 58.0
       }
     },
     {
@@ -756,7 +756,7 @@
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 43.1
       }
     },
     {
@@ -800,8 +800,6 @@
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "ace/Overall Score": 0.478,
-        "ace/Gaming Score": 0.391,
         "apex-agents/Overall Pass@1": 0.184,
         "apex-agents/Overall Pass@8": 0.34,
         "apex-agents/Overall Mean Score": 0.348,
@@ -809,6 +807,8 @@
         "apex-agents/Management Consulting Pass@1": 0.132,
         "apex-agents/Corporate Law Pass@1": 0.202,
         "apex-agents/Corporate Lawyer Mean Score": 0.471,
         "apex-v1/Medicine (MD) Score": 0.65
       }
     },

       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "appworld_test_normal/appworld/test_normal": 0.7,
         "browsecompplus/browsecompplus": 0.61,
+        "swe-bench/swe-bench": 0.6061,
+        "tau-bench-2_airline/tau-bench-2/airline": 0.66,
         "tau-bench-2_retail/tau-bench-2/retail": 0.78,
+        "tau-bench-2_telecom/tau-bench-2/telecom": 0.84
       }
     },
     {
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 35.1
       }
     },
     {
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 52.1
       }
     },
     {
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 62.9
       }
     },
     {
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 42.6
       }
     },
     {
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
         "apex-agents/Overall Pass@1": 0.184,
         "apex-agents/Overall Pass@8": 0.34,
         "apex-agents/Overall Mean Score": 0.348,
         "apex-agents/Management Consulting Pass@1": 0.132,
         "apex-agents/Corporate Law Pass@1": 0.202,
         "apex-agents/Corporate Lawyer Mean Score": 0.471,
+        "ace/Overall Score": 0.478,
+        "ace/Gaming Score": 0.391,
         "apex-v1/Medicine (MD) Score": 0.65
       }
     },

data/developers/cognitivecomputations.json CHANGED Viewed

@@ -77,12 +77,12 @@
       "developer": "cognitivecomputations",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.4124,
-        "hfopenllm_v2/BBH": 0.6383,
-        "hfopenllm_v2/MATH Level 5": 0.182,
-        "hfopenllm_v2/GPQA": 0.3289,
-        "hfopenllm_v2/MUSR": 0.4349,
-        "hfopenllm_v2/MMLU-PRO": 0.4525
       }
     },
     {

       "developer": "cognitivecomputations",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3613,
+        "hfopenllm_v2/BBH": 0.6123,
+        "hfopenllm_v2/MATH Level 5": 0.1239,
+        "hfopenllm_v2/GPQA": 0.328,
+        "hfopenllm_v2/MUSR": 0.4112,
+        "hfopenllm_v2/MMLU-PRO": 0.4494
       }
     },
     {

data/developers/columbia-nlp.json CHANGED Viewed

@@ -7,12 +7,12 @@
       "developer": "Columbia-NLP",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.3278,
-        "hfopenllm_v2/BBH": 0.392,
-        "hfopenllm_v2/MATH Level 5": 0.0431,
-        "hfopenllm_v2/GPQA": 0.2492,
-        "hfopenllm_v2/MUSR": 0.412,
-        "hfopenllm_v2/MMLU-PRO": 0.1666
       }
     },
     {

       "developer": "Columbia-NLP",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3102,
+        "hfopenllm_v2/BBH": 0.3881,
+        "hfopenllm_v2/MATH Level 5": 0.0536,
+        "hfopenllm_v2/GPQA": 0.2534,
+        "hfopenllm_v2/MUSR": 0.4081,
+        "hfopenllm_v2/MMLU-PRO": 0.1665
       }
     },
     {

data/developers/cpayne1303.json CHANGED Viewed

@@ -35,12 +35,12 @@
       "developer": "cpayne1303",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.1916,
-        "hfopenllm_v2/BBH": 0.2977,
-        "hfopenllm_v2/MATH Level 5": 0.0,
         "hfopenllm_v2/GPQA": 0.2685,
-        "hfopenllm_v2/MUSR": 0.3872,
-        "hfopenllm_v2/MMLU-PRO": 0.1132
       }
     },
     {

       "developer": "cpayne1303",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.1949,
+        "hfopenllm_v2/BBH": 0.2965,
+        "hfopenllm_v2/MATH Level 5": 0.0045,
         "hfopenllm_v2/GPQA": 0.2685,
+        "hfopenllm_v2/MUSR": 0.3885,
+        "hfopenllm_v2/MMLU-PRO": 0.1111
       }
     },
     {

data/developers/daemontatox.json CHANGED Viewed

@@ -231,12 +231,12 @@
       "developer": "Daemontatox",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.3745,
-        "hfopenllm_v2/BBH": 0.6668,
-        "hfopenllm_v2/MATH Level 5": 0.4758,
-        "hfopenllm_v2/GPQA": 0.3943,
-        "hfopenllm_v2/MUSR": 0.4858,
-        "hfopenllm_v2/MMLU-PRO": 0.5593
       }
     },
     {

       "developer": "Daemontatox",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.4855,
+        "hfopenllm_v2/BBH": 0.6627,
+        "hfopenllm_v2/MATH Level 5": 0.4841,
+        "hfopenllm_v2/GPQA": 0.3096,
+        "hfopenllm_v2/MUSR": 0.4256,
+        "hfopenllm_v2/MMLU-PRO": 0.5542
       }
     },
     {

data/developers/deepmount00.json CHANGED Viewed

@@ -63,12 +63,12 @@
       "developer": "DeepMount00",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.7917,
-        "hfopenllm_v2/BBH": 0.5109,
-        "hfopenllm_v2/MATH Level 5": 0.1088,
-        "hfopenllm_v2/GPQA": 0.2878,
-        "hfopenllm_v2/MUSR": 0.4136,
-        "hfopenllm_v2/MMLU-PRO": 0.3876
       }
     },
     {

       "developer": "DeepMount00",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.5365,
+        "hfopenllm_v2/BBH": 0.517,
+        "hfopenllm_v2/MATH Level 5": 0.1707,
+        "hfopenllm_v2/GPQA": 0.3062,
+        "hfopenllm_v2/MUSR": 0.4487,
+        "hfopenllm_v2/MMLU-PRO": 0.396
       }
     },
     {

data/developers/dfurman.json CHANGED Viewed

@@ -35,12 +35,12 @@
       "developer": "dfurman",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.2835,
-        "hfopenllm_v2/BBH": 0.3842,
-        "hfopenllm_v2/MATH Level 5": 0.0521,
-        "hfopenllm_v2/GPQA": 0.2609,
-        "hfopenllm_v2/MUSR": 0.3566,
-        "hfopenllm_v2/MMLU-PRO": 0.2298
       }
     },
     {

       "developer": "dfurman",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3,
+        "hfopenllm_v2/BBH": 0.3853,
+        "hfopenllm_v2/MATH Level 5": 0.0415,
+        "hfopenllm_v2/GPQA": 0.2617,
+        "hfopenllm_v2/MUSR": 0.3579,
+        "hfopenllm_v2/MMLU-PRO": 0.2281
       }
     },
     {

data/developers/doppelreflex.json CHANGED Viewed

@@ -175,12 +175,12 @@
       "developer": "DoppelReflEx",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.451,
-        "hfopenllm_v2/BBH": 0.4944,
-        "hfopenllm_v2/MATH Level 5": 0.1156,
-        "hfopenllm_v2/GPQA": 0.3196,
-        "hfopenllm_v2/MUSR": 0.3896,
-        "hfopenllm_v2/MMLU-PRO": 0.3256
       }
     },
     {

       "developer": "DoppelReflEx",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.436,
+        "hfopenllm_v2/BBH": 0.4956,
+        "hfopenllm_v2/MATH Level 5": 0.0589,
+        "hfopenllm_v2/GPQA": 0.3205,
+        "hfopenllm_v2/MUSR": 0.3843,
+        "hfopenllm_v2/MMLU-PRO": 0.3237
       }
     },
     {

data/developers/google.json CHANGED Viewed

@@ -139,6 +139,7 @@
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
         "apex-agents/Overall Pass@1": 0.24,
         "apex-agents/Overall Pass@8": 0.367,
         "apex-agents/Overall Mean Score": 0.395,
@@ -146,7 +147,6 @@
         "apex-agents/Management Consulting Pass@1": 0.193,
         "apex-agents/Corporate Law Pass@1": 0.259,
         "apex-agents/Corporate Lawyer Mean Score": 0.524,
-        "ace/Gaming Score": 0.415,
         "apex-v1/Overall Score": 0.64,
         "apex-v1/Consulting Score": 0.64
       }
@@ -157,6 +157,8 @@
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
         "apex-agents/Overall Pass@1": 0.184,
         "apex-agents/Overall Pass@8": 0.373,
         "apex-agents/Overall Mean Score": 0.341,
@@ -164,8 +166,6 @@
         "apex-agents/Management Consulting Pass@1": 0.124,
         "apex-agents/Corporate Law Pass@1": 0.239,
         "apex-agents/Corporate Lawyer Mean Score": 0.487,
-        "ace/Overall Score": 0.47,
-        "ace/Gaming Score": 0.509,
         "apex-v1/Overall Score": 0.643,
         "apex-v1/Consulting Score": 0.64,
         "apex-v1/Investment Banking Score": 0.63
@@ -723,7 +723,7 @@
         "reward-bench/Safety": 0.909,
         "reward-bench/Focus": 0.841,
         "reward-bench/Ties": 0.809,
-        "terminal-bench-2.0/terminal-bench-2.0": 17.1
       }
     },
     {
@@ -861,7 +861,7 @@
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 51.0
       }
     },
     {
@@ -879,8 +879,8 @@
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "appworld_test_normal/appworld/test_normal": 0.13,
-        "browsecompplus/browsecompplus": 0.48,
         "global-mmlu-lite/Global MMLU Lite": 0.9453,
         "global-mmlu-lite/Culturally Sensitive": 0.9397,
         "global-mmlu-lite/Culturally Agnostic": 0.9509,
@@ -900,8 +900,8 @@
         "global-mmlu-lite/Yoruba": 0.9425,
         "global-mmlu-lite/Chinese": 0.9475,
         "global-mmlu-lite/Burmese": 0.9425,
-        "swe-bench/swe-bench": 0.7234,
-        "tau-bench-2_airline/tau-bench-2/airline": 0.7,
         "tau-bench-2_retail/tau-bench-2/retail": 0.73,
         "tau-bench-2_telecom/tau-bench-2/telecom": 0.73
       }
@@ -1028,12 +1028,12 @@
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.2018,
-        "hfopenllm_v2/BBH": 0.3709,
-        "hfopenllm_v2/MATH Level 5": 0.0302,
         "hfopenllm_v2/GPQA": 0.2626,
-        "hfopenllm_v2/MUSR": 0.4219,
-        "hfopenllm_v2/MMLU-PRO": 0.2217
       }
     },
     {
@@ -1056,12 +1056,12 @@
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.5078,
-        "hfopenllm_v2/BBH": 0.4226,
-        "hfopenllm_v2/MATH Level 5": 0.0347,
-        "hfopenllm_v2/GPQA": 0.2852,
-        "hfopenllm_v2/MUSR": 0.3964,
-        "hfopenllm_v2/MMLU-PRO": 0.2578
       }
     },
     {

       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "ace/Gaming Score": 0.415,
         "apex-agents/Overall Pass@1": 0.24,
         "apex-agents/Overall Pass@8": 0.367,
         "apex-agents/Overall Mean Score": 0.395,
         "apex-agents/Management Consulting Pass@1": 0.193,
         "apex-agents/Corporate Law Pass@1": 0.259,
         "apex-agents/Corporate Lawyer Mean Score": 0.524,
         "apex-v1/Overall Score": 0.64,
         "apex-v1/Consulting Score": 0.64
       }
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "ace/Overall Score": 0.47,
+        "ace/Gaming Score": 0.509,
         "apex-agents/Overall Pass@1": 0.184,
         "apex-agents/Overall Pass@8": 0.373,
         "apex-agents/Overall Mean Score": 0.341,
         "apex-agents/Management Consulting Pass@1": 0.124,
         "apex-agents/Corporate Law Pass@1": 0.239,
         "apex-agents/Corporate Lawyer Mean Score": 0.487,
         "apex-v1/Overall Score": 0.643,
         "apex-v1/Consulting Score": 0.64,
         "apex-v1/Investment Banking Score": 0.63
         "reward-bench/Safety": 0.909,
         "reward-bench/Focus": 0.841,
         "reward-bench/Ties": 0.809,
+        "terminal-bench-2.0/terminal-bench-2.0": 16.9
       }
     },
     {
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 47.4
       }
     },
     {
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "appworld_test_normal/appworld/test_normal": 0.55,
+        "browsecompplus/browsecompplus": 0.3333,
         "global-mmlu-lite/Global MMLU Lite": 0.9453,
         "global-mmlu-lite/Culturally Sensitive": 0.9397,
         "global-mmlu-lite/Culturally Agnostic": 0.9509,
         "global-mmlu-lite/Yoruba": 0.9425,
         "global-mmlu-lite/Chinese": 0.9475,
         "global-mmlu-lite/Burmese": 0.9425,
+        "swe-bench/swe-bench": 0.71,
+        "tau-bench-2_airline/tau-bench-2/airline": 0.68,
         "tau-bench-2_retail/tau-bench-2/retail": 0.73,
         "tau-bench-2_telecom/tau-bench-2/telecom": 0.73
       }
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.1993,
+        "hfopenllm_v2/BBH": 0.3656,
+        "hfopenllm_v2/MATH Level 5": 0.0287,
         "hfopenllm_v2/GPQA": 0.2626,
+        "hfopenllm_v2/MUSR": 0.4232,
+        "hfopenllm_v2/MMLU-PRO": 0.218
       }
     },
     {
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.5288,
+        "hfopenllm_v2/BBH": 0.4178,
+        "hfopenllm_v2/MATH Level 5": 0.0476,
+        "hfopenllm_v2/GPQA": 0.2752,
+        "hfopenllm_v2/MUSR": 0.3728,
+        "hfopenllm_v2/MMLU-PRO": 0.2467
       }
     },
     {

data/developers/huggingfacetb.json CHANGED Viewed

@@ -133,12 +133,12 @@
       "developer": "HuggingFaceTB",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.0593,
-        "hfopenllm_v2/BBH": 0.3135,
-        "hfopenllm_v2/MATH Level 5": 0.0144,
-        "hfopenllm_v2/GPQA": 0.2341,
-        "hfopenllm_v2/MUSR": 0.3871,
-        "hfopenllm_v2/MMLU-PRO": 0.1092
       }
     },
     {

       "developer": "HuggingFaceTB",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2883,
+        "hfopenllm_v2/BBH": 0.3124,
+        "hfopenllm_v2/MATH Level 5": 0.003,
+        "hfopenllm_v2/GPQA": 0.2357,
+        "hfopenllm_v2/MUSR": 0.3662,
+        "hfopenllm_v2/MMLU-PRO": 0.1115
       }
     },
     {

data/developers/infly.json CHANGED Viewed

@@ -7,16 +7,16 @@
       "developer": "infly",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.7648,
-        "reward-bench/Chat": 0.9665,
-        "reward-bench/Chat Hard": 0.9101,
-        "reward-bench/Safety": 0.9644,
-        "reward-bench/Reasoning": 0.9912,
         "reward-bench/Factuality": 0.7411,
         "reward-bench/Precise IF": 0.4188,
         "reward-bench/Math": 0.6995,
         "reward-bench/Focus": 0.903,
-        "reward-bench/Ties": 0.8622
       }
     }
   ]

       "developer": "infly",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "reward-bench/Score": 0.9511,
         "reward-bench/Factuality": 0.7411,
         "reward-bench/Precise IF": 0.4188,
         "reward-bench/Math": 0.6995,
+        "reward-bench/Safety": 0.9365,
         "reward-bench/Focus": 0.903,
+        "reward-bench/Ties": 0.8622,
+        "reward-bench/Chat": 0.9665,
+        "reward-bench/Chat Hard": 0.9101,
+        "reward-bench/Reasoning": 0.9912
       }
     }
   ]

data/developers/internlm.json CHANGED Viewed

@@ -71,16 +71,16 @@
       "developer": "internlm",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.8759,
         "reward-bench/Factuality": 0.4211,
         "reward-bench/Precise IF": 0.4,
         "reward-bench/Math": 0.5628,
-        "reward-bench/Safety": 0.8716,
         "reward-bench/Focus": 0.7051,
-        "reward-bench/Ties": 0.5164,
-        "reward-bench/Chat": 0.9916,
-        "reward-bench/Chat Hard": 0.6952,
-        "reward-bench/Reasoning": 0.9453
       }
     },
     {

       "developer": "internlm",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "reward-bench/Score": 0.5335,
+        "reward-bench/Chat": 0.9916,
+        "reward-bench/Chat Hard": 0.6952,
+        "reward-bench/Safety": 0.5956,
+        "reward-bench/Reasoning": 0.9453,
         "reward-bench/Factuality": 0.4211,
         "reward-bench/Precise IF": 0.4,
         "reward-bench/Math": 0.5628,
         "reward-bench/Focus": 0.7051,
+        "reward-bench/Ties": 0.5164
       }
     },
     {

data/developers/jaspionjader.json CHANGED Viewed

@@ -1477,12 +1477,12 @@
       "developer": "jaspionjader",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.4345,
-        "hfopenllm_v2/BBH": 0.5419,
-        "hfopenllm_v2/MATH Level 5": 0.1292,
-        "hfopenllm_v2/GPQA": 0.3087,
         "hfopenllm_v2/MUSR": 0.4277,
-        "hfopenllm_v2/MMLU-PRO": 0.3854
       }
     },
     {

       "developer": "jaspionjader",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.4418,
+        "hfopenllm_v2/BBH": 0.5406,
+        "hfopenllm_v2/MATH Level 5": 0.1352,
+        "hfopenllm_v2/GPQA": 0.3062,
         "hfopenllm_v2/MUSR": 0.4277,
+        "hfopenllm_v2/MMLU-PRO": 0.386
       }
     },
     {

data/developers/leroydyer.json CHANGED Viewed

@@ -707,12 +707,12 @@
       "developer": "LeroyDyer",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.3579,
-        "hfopenllm_v2/BBH": 0.4477,
-        "hfopenllm_v2/MATH Level 5": 0.0423,
-        "hfopenllm_v2/GPQA": 0.3096,
-        "hfopenllm_v2/MUSR": 0.4134,
-        "hfopenllm_v2/MMLU-PRO": 0.2376
       }
     },
     {

       "developer": "LeroyDyer",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3798,
+        "hfopenllm_v2/BBH": 0.4483,
+        "hfopenllm_v2/MATH Level 5": 0.04,
+        "hfopenllm_v2/GPQA": 0.3129,
+        "hfopenllm_v2/MUSR": 0.4148,
+        "hfopenllm_v2/MMLU-PRO": 0.2389
       }
     },
     {

data/developers/llmat.json CHANGED Viewed

@@ -7,12 +7,12 @@
       "developer": "llmat",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.364,
-        "hfopenllm_v2/BBH": 0.4005,
-        "hfopenllm_v2/MATH Level 5": 0.0015,
-        "hfopenllm_v2/GPQA": 0.2693,
-        "hfopenllm_v2/MUSR": 0.3529,
-        "hfopenllm_v2/MMLU-PRO": 0.2301
       }
     }
   ]

       "developer": "llmat",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.377,
+        "hfopenllm_v2/BBH": 0.3978,
+        "hfopenllm_v2/MATH Level 5": 0.0242,
+        "hfopenllm_v2/GPQA": 0.2668,
+        "hfopenllm_v2/MUSR": 0.3555,
+        "hfopenllm_v2/MMLU-PRO": 0.2278
       }
     }
   ]

data/developers/lxzgordon.json CHANGED Viewed

@@ -20,16 +20,16 @@
       "developer": "LxzGordon",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.9294,
         "reward-bench/Factuality": 0.6884,
         "reward-bench/Precise IF": 0.45,
         "reward-bench/Math": 0.6393,
-        "reward-bench/Safety": 0.9108,
         "reward-bench/Focus": 0.9758,
-        "reward-bench/Ties": 0.7653,
-        "reward-bench/Chat": 0.9553,
-        "reward-bench/Chat Hard": 0.8816,
-        "reward-bench/Reasoning": 0.9698
       }
     }
   ]

       "developer": "LxzGordon",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "reward-bench/Score": 0.7394,
+        "reward-bench/Chat": 0.9553,
+        "reward-bench/Chat Hard": 0.8816,
+        "reward-bench/Safety": 0.9178,
+        "reward-bench/Reasoning": 0.9698,
         "reward-bench/Factuality": 0.6884,
         "reward-bench/Precise IF": 0.45,
         "reward-bench/Math": 0.6393,
         "reward-bench/Focus": 0.9758,
+        "reward-bench/Ties": 0.7653
       }
     }
   ]

data/developers/meta.json CHANGED Viewed

@@ -471,6 +471,16 @@
         "helm_capabilities/IFEval": 0.743,
         "helm_capabilities/WildBench": 0.686,
         "helm_capabilities/Omni-MATH": 0.137,
         "helm_mmlu/MMLU All Subjects": 0.561,
         "helm_mmlu/Abstract Algebra": 0.26,
         "helm_mmlu/Anatomy": 0.459,
@@ -506,17 +516,7 @@
         "helm_mmlu/Sociology": 0.701,
         "helm_mmlu/Virology": 0.446,
         "helm_mmlu/World Religions": 0.789,
-        "helm_mmlu/Mean win rate": 0.475,
-        "helm_lite/Mean win rate": 0.303,
-        "helm_lite/NarrativeQA": 0.756,
-        "helm_lite/NaturalQuestions (closed-book)": 0.209,
-        "helm_lite/OpenbookQA": 0.74,
-        "helm_lite/MMLU": 0.5,
-        "helm_lite/MATH": 0.703,
-        "helm_lite/GSM8K": 0.798,
-        "helm_lite/LegalBench": 0.342,
-        "helm_lite/MedQA": 0.245,
-        "helm_lite/WMT 2014": 0.181
       }
     },
     {
@@ -579,6 +579,16 @@
       "developer": "Meta",
       "evaluator_relationship": null,
       "benchmark_scores": {
         "helm_mmlu/MMLU All Subjects": 0.803,
         "helm_mmlu/Abstract Algebra": 0.52,
         "helm_mmlu/Anatomy": 0.8,
@@ -614,17 +624,7 @@
         "helm_mmlu/Sociology": 0.92,
         "helm_mmlu/Virology": 0.584,
         "helm_mmlu/World Religions": 0.901,
-        "helm_mmlu/Mean win rate": 0.773,
-        "helm_lite/Mean win rate": 0.819,
-        "helm_lite/NarrativeQA": 0.777,
-        "helm_lite/NaturalQuestions (closed-book)": 0.457,
-        "helm_lite/OpenbookQA": 0.942,
-        "helm_lite/MMLU": 0.703,
-        "helm_lite/MATH": 0.791,
-        "helm_lite/GSM8K": 0.936,
-        "helm_lite/LegalBench": 0.68,
-        "helm_lite/MedQA": 0.769,
-        "helm_lite/WMT 2014": 0.224
       }
     },
     {

         "helm_capabilities/IFEval": 0.743,
         "helm_capabilities/WildBench": 0.686,
         "helm_capabilities/Omni-MATH": 0.137,
+        "helm_lite/Mean win rate": 0.303,
+        "helm_lite/NarrativeQA": 0.756,
+        "helm_lite/NaturalQuestions (closed-book)": 0.209,
+        "helm_lite/OpenbookQA": 0.74,
+        "helm_lite/MMLU": 0.5,
+        "helm_lite/MATH": 0.703,
+        "helm_lite/GSM8K": 0.798,
+        "helm_lite/LegalBench": 0.342,
+        "helm_lite/MedQA": 0.245,
+        "helm_lite/WMT 2014": 0.181,
         "helm_mmlu/MMLU All Subjects": 0.561,
         "helm_mmlu/Abstract Algebra": 0.26,
         "helm_mmlu/Anatomy": 0.459,
         "helm_mmlu/Sociology": 0.701,
         "helm_mmlu/Virology": 0.446,
         "helm_mmlu/World Religions": 0.789,
+        "helm_mmlu/Mean win rate": 0.475
       }
     },
     {
       "developer": "Meta",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "helm_lite/Mean win rate": 0.819,
+        "helm_lite/NarrativeQA": 0.777,
+        "helm_lite/NaturalQuestions (closed-book)": 0.457,
+        "helm_lite/OpenbookQA": 0.942,
+        "helm_lite/MMLU": 0.703,
+        "helm_lite/MATH": 0.791,
+        "helm_lite/GSM8K": 0.936,
+        "helm_lite/LegalBench": 0.68,
+        "helm_lite/MedQA": 0.769,
+        "helm_lite/WMT 2014": 0.224,
         "helm_mmlu/MMLU All Subjects": 0.803,
         "helm_mmlu/Abstract Algebra": 0.52,
         "helm_mmlu/Anatomy": 0.8,
         "helm_mmlu/Sociology": 0.92,
         "helm_mmlu/Virology": 0.584,
         "helm_mmlu/World Religions": 0.901,
+        "helm_mmlu/Mean win rate": 0.773
       }
     },
     {

data/developers/minimax.json CHANGED Viewed

@@ -25,7 +25,7 @@
       "developer": "MiniMax",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 29.2
       }
     },
     {

       "developer": "MiniMax",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 36.6
       }
     },
     {

data/developers/mistralai.json CHANGED Viewed

@@ -69,6 +69,16 @@
         "helm_capabilities/IFEval": 0.567,
         "helm_capabilities/WildBench": 0.66,
         "helm_capabilities/Omni-MATH": 0.072,
         "helm_mmlu/MMLU All Subjects": 0.599,
         "helm_mmlu/Abstract Algebra": 0.27,
         "helm_mmlu/Anatomy": 0.585,
@@ -105,16 +115,6 @@
         "helm_mmlu/Virology": 0.47,
         "helm_mmlu/World Religions": 0.825,
         "helm_mmlu/Mean win rate": 0.509,
-        "helm_lite/Mean win rate": 0.196,
-        "helm_lite/NarrativeQA": 0.716,
-        "helm_lite/NaturalQuestions (closed-book)": 0.253,
-        "helm_lite/OpenbookQA": 0.79,
-        "helm_lite/MMLU": 0.51,
-        "helm_lite/MATH": 0.289,
-        "helm_lite/GSM8K": 0.538,
-        "helm_lite/LegalBench": 0.331,
-        "helm_lite/MedQA": 0.517,
-        "helm_lite/WMT 2014": 0.142,
         "hfopenllm_v2/IFEval": 0.5465,
         "hfopenllm_v2/BBH": 0.4722,
         "hfopenllm_v2/MATH Level 5": 0.0385,
@@ -718,12 +718,12 @@
       "developer": "mistralai",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.2326,
-        "hfopenllm_v2/BBH": 0.5098,
-        "hfopenllm_v2/MATH Level 5": 0.0937,
-        "hfopenllm_v2/GPQA": 0.3205,
-        "hfopenllm_v2/MUSR": 0.4413,
-        "hfopenllm_v2/MMLU-PRO": 0.3871
       }
     },
     {

         "helm_capabilities/IFEval": 0.567,
         "helm_capabilities/WildBench": 0.66,
         "helm_capabilities/Omni-MATH": 0.072,
+        "helm_lite/Mean win rate": 0.196,
+        "helm_lite/NarrativeQA": 0.716,
+        "helm_lite/NaturalQuestions (closed-book)": 0.253,
+        "helm_lite/OpenbookQA": 0.79,
+        "helm_lite/MMLU": 0.51,
+        "helm_lite/MATH": 0.289,
+        "helm_lite/GSM8K": 0.538,
+        "helm_lite/LegalBench": 0.331,
+        "helm_lite/MedQA": 0.517,
+        "helm_lite/WMT 2014": 0.142,
         "helm_mmlu/MMLU All Subjects": 0.599,
         "helm_mmlu/Abstract Algebra": 0.27,
         "helm_mmlu/Anatomy": 0.585,
         "helm_mmlu/Virology": 0.47,
         "helm_mmlu/World Religions": 0.825,
         "helm_mmlu/Mean win rate": 0.509,
         "hfopenllm_v2/IFEval": 0.5465,
         "hfopenllm_v2/BBH": 0.4722,
         "hfopenllm_v2/MATH Level 5": 0.0385,
       "developer": "mistralai",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2415,
+        "hfopenllm_v2/BBH": 0.5087,
+        "hfopenllm_v2/MATH Level 5": 0.102,
+        "hfopenllm_v2/GPQA": 0.3138,
+        "hfopenllm_v2/MUSR": 0.4321,
+        "hfopenllm_v2/MMLU-PRO": 0.385
       }
     },
     {

data/developers/mlabonne.json CHANGED Viewed

@@ -161,12 +161,12 @@
       "developer": "mlabonne",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.4162,
-        "hfopenllm_v2/BBH": 0.5124,
-        "hfopenllm_v2/MATH Level 5": 0.0853,
-        "hfopenllm_v2/GPQA": 0.3029,
-        "hfopenllm_v2/MUSR": 0.415,
-        "hfopenllm_v2/MMLU-PRO": 0.3802
       }
     },
     {

       "developer": "mlabonne",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.7561,
+        "hfopenllm_v2/BBH": 0.5111,
+        "hfopenllm_v2/MATH Level 5": 0.0906,
+        "hfopenllm_v2/GPQA": 0.3062,
+        "hfopenllm_v2/MUSR": 0.4019,
+        "hfopenllm_v2/MMLU-PRO": 0.3841
       }
     },
     {

data/developers/moonshot_ai.json CHANGED Viewed

@@ -7,7 +7,7 @@
       "developer": "Moonshot AI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 26.7
       }
     },
     {

       "developer": "Moonshot AI",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 27.8
       }
     },
     {

data/developers/multiple.json CHANGED Viewed

@@ -7,7 +7,7 @@
       "developer": "Multiple",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 71.0
       }
     }
   ]

       "developer": "Multiple",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 72.4
       }
     }
   ]

data/developers/nazimali.json CHANGED Viewed

@@ -21,12 +21,12 @@
       "developer": "nazimali",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.4964,
-        "hfopenllm_v2/BBH": 0.4699,
-        "hfopenllm_v2/MATH Level 5": 0.0045,
-        "hfopenllm_v2/GPQA": 0.2827,
-        "hfopenllm_v2/MUSR": 0.3979,
-        "hfopenllm_v2/MMLU-PRO": 0.3063
       }
     }
   ]

       "developer": "nazimali",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.486,
+        "hfopenllm_v2/BBH": 0.4721,
+        "hfopenllm_v2/MATH Level 5": 0.0846,
+        "hfopenllm_v2/GPQA": 0.2844,
+        "hfopenllm_v2/MUSR": 0.4006,
+        "hfopenllm_v2/MMLU-PRO": 0.3087
       }
     }
   ]

data/developers/nicolinho.json CHANGED Viewed

@@ -7,16 +7,16 @@
       "developer": "nicolinho",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.9444,
         "reward-bench/Factuality": 0.7853,
         "reward-bench/Precise IF": 0.3719,
         "reward-bench/Math": 0.6995,
-        "reward-bench/Safety": 0.927,
         "reward-bench/Focus": 0.9535,
-        "reward-bench/Ties": 0.8321,
-        "reward-bench/Chat": 0.9665,
-        "reward-bench/Chat Hard": 0.9013,
-        "reward-bench/Reasoning": 0.9826
       }
     },
     {
@@ -51,16 +51,16 @@
       "developer": "nicolinho",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.9314,
         "reward-bench/Factuality": 0.6653,
         "reward-bench/Precise IF": 0.4062,
         "reward-bench/Math": 0.612,
-        "reward-bench/Safety": 0.9257,
         "reward-bench/Focus": 0.8909,
-        "reward-bench/Ties": 0.7234,
-        "reward-bench/Chat": 0.9637,
-        "reward-bench/Chat Hard": 0.8684,
-        "reward-bench/Reasoning": 0.9677
       }
     }
   ]

       "developer": "nicolinho",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "reward-bench/Score": 0.7667,
+        "reward-bench/Chat": 0.9665,
+        "reward-bench/Chat Hard": 0.9013,
+        "reward-bench/Safety": 0.9578,
+        "reward-bench/Reasoning": 0.9826,
         "reward-bench/Factuality": 0.7853,
         "reward-bench/Precise IF": 0.3719,
         "reward-bench/Math": 0.6995,
         "reward-bench/Focus": 0.9535,
+        "reward-bench/Ties": 0.8321
       }
     },
     {
       "developer": "nicolinho",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "reward-bench/Score": 0.7074,
+        "reward-bench/Chat": 0.9637,
+        "reward-bench/Chat Hard": 0.8684,
+        "reward-bench/Safety": 0.9467,
+        "reward-bench/Reasoning": 0.9677,
         "reward-bench/Factuality": 0.6653,
         "reward-bench/Precise IF": 0.4062,
         "reward-bench/Math": 0.612,
         "reward-bench/Focus": 0.8909,
+        "reward-bench/Ties": 0.7234
       }
     }
   ]

data/developers/nisten.json CHANGED Viewed

@@ -7,12 +7,12 @@
       "developer": "nisten",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.3914,
-        "hfopenllm_v2/BBH": 0.6591,
-        "hfopenllm_v2/MATH Level 5": 0.3044,
-        "hfopenllm_v2/GPQA": 0.3591,
-        "hfopenllm_v2/MUSR": 0.4681,
-        "hfopenllm_v2/MMLU-PRO": 0.5611
       }
     },
     {

       "developer": "nisten",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3799,
+        "hfopenllm_v2/BBH": 0.6647,
+        "hfopenllm_v2/MATH Level 5": 0.3406,
+        "hfopenllm_v2/GPQA": 0.4035,
+        "hfopenllm_v2/MUSR": 0.494,
+        "hfopenllm_v2/MMLU-PRO": 0.5731
       }
     },
     {

data/developers/nousresearch.json CHANGED Viewed

@@ -200,20 +200,6 @@
         "hfopenllm_v2/MMLU-PRO": 0.232
       }
     },
-    {
-      "id": "NousResearch/Yarn-Llama-2-7b-128k",
-      "name": "Yarn-Llama-2-7b-128k",
-      "developer": "NousResearch",
-      "evaluator_relationship": null,
-      "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.1485,
-        "hfopenllm_v2/BBH": 0.3248,
-        "hfopenllm_v2/MATH Level 5": 0.0151,
-        "hfopenllm_v2/GPQA": 0.2601,
-        "hfopenllm_v2/MUSR": 0.3967,
-        "hfopenllm_v2/MMLU-PRO": 0.1791
-      }
-    },
     {
       "id": "NousResearch/Yarn-Llama-2-7b-64k",
       "name": "Yarn-Llama-2-7b-64k",

         "hfopenllm_v2/MMLU-PRO": 0.232
       }
     },
     {
       "id": "NousResearch/Yarn-Llama-2-7b-64k",
       "name": "Yarn-Llama-2-7b-64k",

data/developers/omkar1102.json CHANGED Viewed

@@ -7,12 +7,12 @@
       "developer": "Omkar1102",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.2254,
-        "hfopenllm_v2/BBH": 0.275,
         "hfopenllm_v2/MATH Level 5": 0.0,
-        "hfopenllm_v2/GPQA": 0.2576,
-        "hfopenllm_v2/MUSR": 0.3762,
-        "hfopenllm_v2/MMLU-PRO": 0.1123
       }
     }
   ]

       "developer": "Omkar1102",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.2148,
+        "hfopenllm_v2/BBH": 0.276,
         "hfopenllm_v2/MATH Level 5": 0.0,
+        "hfopenllm_v2/GPQA": 0.2508,
+        "hfopenllm_v2/MUSR": 0.3802,
+        "hfopenllm_v2/MMLU-PRO": 0.1126
       }
     }
   ]

data/developers/openai.json CHANGED Viewed

@@ -163,16 +163,16 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
         "apex-agents/Overall Pass@1": 0.23,
         "apex-agents/Overall Pass@8": 0.4,
         "apex-agents/Overall Mean Score": 0.387,
         "apex-agents/Investment Banking Pass@1": 0.273,
         "apex-agents/Management Consulting Pass@1": 0.227,
         "apex-agents/Corporate Law Pass@1": 0.189,
-        "apex-agents/Corporate Lawyer Mean Score": 0.443,
-        "ace/Overall Score": 0.515,
-        "ace/Food Score": 0.65,
-        "ace/Gaming Score": 0.578
       }
     },
     {
@@ -300,13 +300,6 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "helm_instruct/Mean win rate": 0.689,
-        "helm_instruct/Anthropic RLHF dataset": 4.964,
-        "helm_instruct/Best ChatGPT Prompts": 4.986,
-        "helm_instruct/Koala test dataset": 4.987,
-        "helm_instruct/Open Assistant": 4.987,
-        "helm_instruct/Self Instruct": 4.99,
-        "helm_instruct/Vicuna": 4.992,
         "helm_classic/Mean win rate": 0.783,
         "helm_classic/MMLU": 0.391,
         "helm_classic/BoolQ": 0.87,
@@ -322,6 +315,13 @@
         "helm_classic/IMDB": 0.943,
         "helm_classic/CivilComments": 0.696,
         "helm_classic/RAFT": 0.748,
         "helm_lite/Mean win rate": 0.358,
         "helm_lite/NarrativeQA": 0.655,
         "helm_lite/NaturalQuestions (closed-book)": 0.335,
@@ -405,6 +405,16 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
         "helm_mmlu/MMLU All Subjects": 0.824,
         "helm_mmlu/Abstract Algebra": 0.63,
         "helm_mmlu/Anatomy": 0.8,
@@ -440,17 +450,7 @@
         "helm_mmlu/Sociology": 0.93,
         "helm_mmlu/Virology": 0.596,
         "helm_mmlu/World Religions": 0.877,
-        "helm_mmlu/Mean win rate": 0.517,
-        "helm_lite/Mean win rate": 0.867,
-        "helm_lite/NarrativeQA": 0.768,
-        "helm_lite/NaturalQuestions (closed-book)": 0.457,
-        "helm_lite/OpenbookQA": 0.96,
-        "helm_lite/MMLU": 0.735,
-        "helm_lite/MATH": 0.802,
-        "helm_lite/GSM8K": 0.932,
-        "helm_lite/LegalBench": 0.713,
-        "helm_lite/MedQA": 0.815,
-        "helm_lite/WMT 2014": 0.211
       }
     },
     {
@@ -513,6 +513,16 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
         "helm_mmlu/MMLU All Subjects": 0.813,
         "helm_mmlu/Abstract Algebra": 0.56,
         "helm_mmlu/Anatomy": 0.822,
@@ -549,16 +559,6 @@
         "helm_mmlu/Virology": 0.602,
         "helm_mmlu/World Religions": 0.848,
         "helm_mmlu/Mean win rate": 0.351,
-        "helm_lite/Mean win rate": 0.864,
-        "helm_lite/NarrativeQA": 0.761,
-        "helm_lite/NaturalQuestions (closed-book)": 0.482,
-        "helm_lite/OpenbookQA": 0.97,
-        "helm_lite/MMLU": 0.711,
-        "helm_lite/MATH": 0.833,
-        "helm_lite/GSM8K": 0.824,
-        "helm_lite/LegalBench": 0.727,
-        "helm_lite/MedQA": 0.783,
-        "helm_lite/WMT 2014": 0.218,
         "reward-bench/Score": 0.8395,
         "reward-bench/Chat": 0.9525,
         "reward-bench/Chat Hard": 0.7544,
@@ -772,16 +772,16 @@
         "helm_mmlu/Virology": 0.578,
         "helm_mmlu/World Religions": 0.883,
         "helm_mmlu/Mean win rate": 0.52,
-        "reward-bench/Score": 0.6493,
-        "reward-bench/Chat": 0.9609,
-        "reward-bench/Chat Hard": 0.761,
-        "reward-bench/Safety": 0.8619,
-        "reward-bench/Reasoning": 0.8661,
         "reward-bench/Factuality": 0.5684,
         "reward-bench/Precise IF": 0.3312,
         "reward-bench/Math": 0.623,
         "reward-bench/Focus": 0.7293,
-        "reward-bench/Ties": 0.7819
       }
     },
     {
@@ -877,7 +877,7 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 35.2
       }
     },
     {
@@ -911,9 +911,9 @@
         "helm_capabilities/IFEval": 0.875,
         "helm_capabilities/WildBench": 0.857,
         "helm_capabilities/Omni-MATH": 0.647,
-        "livecodebenchpro/Hard Problems": 0.0423,
-        "livecodebenchpro/Medium Problems": 0.4085,
-        "livecodebenchpro/Easy Problems": 0.9014
       }
     },
     {
@@ -922,7 +922,7 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 44.3
       }
     },
     {
@@ -931,7 +931,7 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 34.8
       }
     },
     {
@@ -954,7 +954,7 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 9.9
       }
     },
     {
@@ -986,7 +986,7 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 53.5
       }
     },
     {
@@ -1013,7 +1013,7 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 60.7
       }
     },
     {
@@ -1023,14 +1023,14 @@
       "evaluator_relationship": null,
       "benchmark_scores": {
         "appworld_test_normal/appworld/test_normal": 0.0,
-        "browsecompplus/browsecompplus": 0.48,
         "livecodebenchpro/Hard Problems": 0.1594,
         "livecodebenchpro/Medium Problems": 0.5211,
         "livecodebenchpro/Easy Problems": 0.9014,
         "swe-bench/swe-bench": 0.5455,
         "tau-bench-2_airline/tau-bench-2/airline": 0.6,
-        "tau-bench-2_retail/tau-bench-2/retail": 0.68,
-        "tau-bench-2_telecom/tau-bench-2/telecom": 0.5354
       }
     },
     {
@@ -1048,7 +1048,7 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 64.7
       }
     },
     {
@@ -1112,7 +1112,7 @@
         "livecodebenchpro/Hard Problems": 0.0,
         "livecodebenchpro/Medium Problems": 0.11267605633802817,
         "livecodebenchpro/Easy Problems": 0.6619718309859155,
-        "terminal-bench-2.0/terminal-bench-2.0": 14.2
       }
     },
     {
@@ -1130,7 +1130,7 @@
         "livecodebenchpro/Hard Problems": 0.0,
         "livecodebenchpro/Medium Problems": 0.056338028169014086,
         "livecodebenchpro/Easy Problems": 0.5070422535211268,
-        "terminal-bench-2.0/terminal-bench-2.0": 3.1
       }
     },
     {

       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "ace/Overall Score": 0.515,
+        "ace/Food Score": 0.65,
+        "ace/Gaming Score": 0.578,
         "apex-agents/Overall Pass@1": 0.23,
         "apex-agents/Overall Pass@8": 0.4,
         "apex-agents/Overall Mean Score": 0.387,
         "apex-agents/Investment Banking Pass@1": 0.273,
         "apex-agents/Management Consulting Pass@1": 0.227,
         "apex-agents/Corporate Law Pass@1": 0.189,
+        "apex-agents/Corporate Lawyer Mean Score": 0.443
       }
     },
     {
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
         "helm_classic/Mean win rate": 0.783,
         "helm_classic/MMLU": 0.391,
         "helm_classic/BoolQ": 0.87,
         "helm_classic/IMDB": 0.943,
         "helm_classic/CivilComments": 0.696,
         "helm_classic/RAFT": 0.748,
+        "helm_instruct/Mean win rate": 0.689,
+        "helm_instruct/Anthropic RLHF dataset": 4.964,
+        "helm_instruct/Best ChatGPT Prompts": 4.986,
+        "helm_instruct/Koala test dataset": 4.987,
+        "helm_instruct/Open Assistant": 4.987,
+        "helm_instruct/Self Instruct": 4.99,
+        "helm_instruct/Vicuna": 4.992,
         "helm_lite/Mean win rate": 0.358,
         "helm_lite/NarrativeQA": 0.655,
         "helm_lite/NaturalQuestions (closed-book)": 0.335,
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "helm_lite/Mean win rate": 0.867,
+        "helm_lite/NarrativeQA": 0.768,
+        "helm_lite/NaturalQuestions (closed-book)": 0.457,
+        "helm_lite/OpenbookQA": 0.96,
+        "helm_lite/MMLU": 0.735,
+        "helm_lite/MATH": 0.802,
+        "helm_lite/GSM8K": 0.932,
+        "helm_lite/LegalBench": 0.713,
+        "helm_lite/MedQA": 0.815,
+        "helm_lite/WMT 2014": 0.211,
         "helm_mmlu/MMLU All Subjects": 0.824,
         "helm_mmlu/Abstract Algebra": 0.63,
         "helm_mmlu/Anatomy": 0.8,
         "helm_mmlu/Sociology": 0.93,
         "helm_mmlu/Virology": 0.596,
         "helm_mmlu/World Religions": 0.877,
+        "helm_mmlu/Mean win rate": 0.517
       }
     },
     {
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "helm_lite/Mean win rate": 0.864,
+        "helm_lite/NarrativeQA": 0.761,
+        "helm_lite/NaturalQuestions (closed-book)": 0.482,
+        "helm_lite/OpenbookQA": 0.97,
+        "helm_lite/MMLU": 0.711,
+        "helm_lite/MATH": 0.833,
+        "helm_lite/GSM8K": 0.824,
+        "helm_lite/LegalBench": 0.727,
+        "helm_lite/MedQA": 0.783,
+        "helm_lite/WMT 2014": 0.218,
         "helm_mmlu/MMLU All Subjects": 0.813,
         "helm_mmlu/Abstract Algebra": 0.56,
         "helm_mmlu/Anatomy": 0.822,
         "helm_mmlu/Virology": 0.602,
         "helm_mmlu/World Religions": 0.848,
         "helm_mmlu/Mean win rate": 0.351,
         "reward-bench/Score": 0.8395,
         "reward-bench/Chat": 0.9525,
         "reward-bench/Chat Hard": 0.7544,
         "helm_mmlu/Virology": 0.578,
         "helm_mmlu/World Religions": 0.883,
         "helm_mmlu/Mean win rate": 0.52,
+        "reward-bench/Score": 0.8673,
         "reward-bench/Factuality": 0.5684,
         "reward-bench/Precise IF": 0.3312,
         "reward-bench/Math": 0.623,
+        "reward-bench/Safety": 0.8811,
         "reward-bench/Focus": 0.7293,
+        "reward-bench/Ties": 0.7819,
+        "reward-bench/Chat": 0.9609,
+        "reward-bench/Chat Hard": 0.761,
+        "reward-bench/Reasoning": 0.8661
       }
     },
     {
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 49.6
       }
     },
     {
         "helm_capabilities/IFEval": 0.875,
         "helm_capabilities/WildBench": 0.857,
         "helm_capabilities/Omni-MATH": 0.647,
+        "livecodebenchpro/Hard Problems": 0.04225352112676056,
+        "livecodebenchpro/Medium Problems": 0.4084507042253521,
+        "livecodebenchpro/Easy Problems": 0.8873239436619719
       }
     },
     {
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 43.4
       }
     },
     {
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 24.0
       }
     },
     {
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 11.5
       }
     },
     {
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 57.8
       }
     },
     {
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 62.9
       }
     },
     {
       "evaluator_relationship": null,
       "benchmark_scores": {
         "appworld_test_normal/appworld/test_normal": 0.0,
+        "browsecompplus/browsecompplus": 0.43,
         "livecodebenchpro/Hard Problems": 0.1594,
         "livecodebenchpro/Medium Problems": 0.5211,
         "livecodebenchpro/Easy Problems": 0.9014,
         "swe-bench/swe-bench": 0.5455,
         "tau-bench-2_airline/tau-bench-2/airline": 0.6,
+        "tau-bench-2_retail/tau-bench-2/retail": 0.73,
+        "tau-bench-2_telecom/tau-bench-2/telecom": 0.71
       }
     },
     {
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "terminal-bench-2.0/terminal-bench-2.0": 77.3
       }
     },
     {
         "livecodebenchpro/Hard Problems": 0.0,
         "livecodebenchpro/Medium Problems": 0.11267605633802817,
         "livecodebenchpro/Easy Problems": 0.6619718309859155,
+        "terminal-bench-2.0/terminal-bench-2.0": 18.7
       }
     },
     {
         "livecodebenchpro/Hard Problems": 0.0,
         "livecodebenchpro/Medium Problems": 0.056338028169014086,
         "livecodebenchpro/Easy Problems": 0.5070422535211268,
+        "terminal-bench-2.0/terminal-bench-2.0": 3.4
       }
     },
     {

data/developers/openassistant.json CHANGED Viewed

@@ -7,17 +7,17 @@
       "developer": "OpenAssistant",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.615,
         "reward-bench/Factuality": 0.3979,
         "reward-bench/Precise IF": 0.2875,
         "reward-bench/Math": 0.377,
-        "reward-bench/Safety": 0.5446,
         "reward-bench/Focus": 0.1535,
-        "reward-bench/Ties": 0.047,
-        "reward-bench/Chat": 0.9246,
-        "reward-bench/Chat Hard": 0.3728,
-        "reward-bench/Reasoning": 0.5855,
-        "reward-bench/Prior Sets (0.5 weight)": 0.6801
       }
     },
     {
@@ -26,17 +26,17 @@
       "developer": "OpenAssistant",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.2648,
-        "reward-bench/Chat": 0.8855,
-        "reward-bench/Chat Hard": 0.4868,
-        "reward-bench/Safety": 0.3244,
-        "reward-bench/Reasoning": 0.7752,
-        "reward-bench/Prior Sets (0.5 weight)": 0.6533,
         "reward-bench/Factuality": 0.3179,
         "reward-bench/Precise IF": 0.2625,
         "reward-bench/Math": 0.3934,
         "reward-bench/Focus": 0.2707,
-        "reward-bench/Ties": 0.0198
       }
     },
     {

       "developer": "OpenAssistant",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "reward-bench/Score": 0.2653,
+        "reward-bench/Chat": 0.9246,
+        "reward-bench/Chat Hard": 0.3728,
+        "reward-bench/Safety": 0.3289,
+        "reward-bench/Reasoning": 0.5855,
+        "reward-bench/Prior Sets (0.5 weight)": 0.6801,
         "reward-bench/Factuality": 0.3979,
         "reward-bench/Precise IF": 0.2875,
         "reward-bench/Math": 0.377,
         "reward-bench/Focus": 0.1535,
+        "reward-bench/Ties": 0.047
       }
     },
     {
       "developer": "OpenAssistant",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "reward-bench/Score": 0.6901,
         "reward-bench/Factuality": 0.3179,
         "reward-bench/Precise IF": 0.2625,
         "reward-bench/Math": 0.3934,
+        "reward-bench/Safety": 0.6311,
         "reward-bench/Focus": 0.2707,
+        "reward-bench/Ties": 0.0198,
+        "reward-bench/Chat": 0.8855,
+        "reward-bench/Chat Hard": 0.4868,
+        "reward-bench/Reasoning": 0.7752,
+        "reward-bench/Prior Sets (0.5 weight)": 0.6533
       }
     },
     {

data/developers/openbmb.json CHANGED Viewed

@@ -21,17 +21,17 @@
       "developer": "openbmb",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.5806,
-        "reward-bench/Chat": 0.9804,
-        "reward-bench/Chat Hard": 0.6557,
-        "reward-bench/Safety": 0.6267,
-        "reward-bench/Reasoning": 0.8633,
-        "reward-bench/Prior Sets (0.5 weight)": 0.7172,
         "reward-bench/Factuality": 0.6,
         "reward-bench/Precise IF": 0.3438,
         "reward-bench/Math": 0.5683,
         "reward-bench/Focus": 0.7475,
-        "reward-bench/Ties": 0.5972
       }
     },
     {

       "developer": "openbmb",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "reward-bench/Score": 0.8159,
         "reward-bench/Factuality": 0.6,
         "reward-bench/Precise IF": 0.3438,
         "reward-bench/Math": 0.5683,
+        "reward-bench/Safety": 0.8135,
         "reward-bench/Focus": 0.7475,
+        "reward-bench/Ties": 0.5972,
+        "reward-bench/Chat": 0.9804,
+        "reward-bench/Chat Hard": 0.6557,
+        "reward-bench/Reasoning": 0.8633,
+        "reward-bench/Prior Sets (0.5 weight)": 0.7172
       }
     },
     {

data/developers/pku-alignment.json CHANGED Viewed

@@ -7,17 +7,17 @@
       "developer": "PKU-Alignment",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.3332,
-        "reward-bench/Chat": 0.6173,
-        "reward-bench/Chat Hard": 0.4232,
-        "reward-bench/Safety": 0.7589,
-        "reward-bench/Reasoning": 0.5482,
-        "reward-bench/Prior Sets (0.5 weight)": 0.57,
         "reward-bench/Factuality": 0.3263,
         "reward-bench/Precise IF": 0.2313,
         "reward-bench/Math": 0.3989,
         "reward-bench/Focus": 0.2939,
-        "reward-bench/Ties": -0.01
       }
     },
     {
@@ -26,17 +26,17 @@
       "developer": "PKU-Alignment",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.4727,
         "reward-bench/Factuality": 0.2105,
         "reward-bench/Precise IF": 0.2938,
         "reward-bench/Math": 0.2623,
-        "reward-bench/Safety": 0.3757,
         "reward-bench/Focus": 0.0646,
-        "reward-bench/Ties": -0.01,
-        "reward-bench/Chat": 0.8184,
-        "reward-bench/Chat Hard": 0.2873,
-        "reward-bench/Reasoning": 0.346,
-        "reward-bench/Prior Sets (0.5 weight)": 0.5993
       }
     },
     {
@@ -64,17 +64,17 @@
       "developer": "PKU-Alignment",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.6366,
         "reward-bench/Factuality": 0.2168,
         "reward-bench/Precise IF": 0.2562,
         "reward-bench/Math": 0.3825,
-        "reward-bench/Safety": 0.6041,
         "reward-bench/Focus": 0.2606,
-        "reward-bench/Ties": 0.0944,
-        "reward-bench/Chat": 0.8994,
-        "reward-bench/Chat Hard": 0.364,
-        "reward-bench/Reasoning": 0.6887,
-        "reward-bench/Prior Sets (0.5 weight)": 0.6171
       }
     }
   ]

       "developer": "PKU-Alignment",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "reward-bench/Score": 0.5798,
         "reward-bench/Factuality": 0.3263,
         "reward-bench/Precise IF": 0.2313,
         "reward-bench/Math": 0.3989,
+        "reward-bench/Safety": 0.7351,
         "reward-bench/Focus": 0.2939,
+        "reward-bench/Ties": -0.01,
+        "reward-bench/Chat": 0.6173,
+        "reward-bench/Chat Hard": 0.4232,
+        "reward-bench/Reasoning": 0.5482,
+        "reward-bench/Prior Sets (0.5 weight)": 0.57
       }
     },
     {
       "developer": "PKU-Alignment",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "reward-bench/Score": 0.1606,
+        "reward-bench/Chat": 0.8184,
+        "reward-bench/Chat Hard": 0.2873,
+        "reward-bench/Safety": 0.1422,
+        "reward-bench/Reasoning": 0.346,
+        "reward-bench/Prior Sets (0.5 weight)": 0.5993,
         "reward-bench/Factuality": 0.2105,
         "reward-bench/Precise IF": 0.2938,
         "reward-bench/Math": 0.2623,
         "reward-bench/Focus": 0.0646,
+        "reward-bench/Ties": -0.01
       }
     },
     {
       "developer": "PKU-Alignment",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "reward-bench/Score": 0.2544,
+        "reward-bench/Chat": 0.8994,
+        "reward-bench/Chat Hard": 0.364,
+        "reward-bench/Safety": 0.3156,
+        "reward-bench/Reasoning": 0.6887,
+        "reward-bench/Prior Sets (0.5 weight)": 0.6171,
         "reward-bench/Factuality": 0.2168,
         "reward-bench/Precise IF": 0.2562,
         "reward-bench/Math": 0.3825,
         "reward-bench/Focus": 0.2606,
+        "reward-bench/Ties": 0.0944
       }
     }
   ]

data/developers/primeintellect.json CHANGED Viewed

@@ -8,11 +8,11 @@
       "evaluator_relationship": null,
       "benchmark_scores": {
         "hfopenllm_v2/IFEval": 0.1757,
-        "hfopenllm_v2/BBH": 0.276,
         "hfopenllm_v2/MATH Level 5": 0.0,
-        "hfopenllm_v2/GPQA": 0.2534,
-        "hfopenllm_v2/MUSR": 0.3339,
-        "hfopenllm_v2/MMLU-PRO": 0.1123
       }
     },
     {

       "evaluator_relationship": null,
       "benchmark_scores": {
         "hfopenllm_v2/IFEval": 0.1757,
+        "hfopenllm_v2/BBH": 0.274,
         "hfopenllm_v2/MATH Level 5": 0.0,
+        "hfopenllm_v2/GPQA": 0.25,
+        "hfopenllm_v2/MUSR": 0.3753,
+        "hfopenllm_v2/MMLU-PRO": 0.112
       }
     },
     {

data/developers/princeton-nlp.json CHANGED Viewed

@@ -49,12 +49,12 @@
       "developer": "princeton-nlp",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.5508,
-        "hfopenllm_v2/BBH": 0.5028,
-        "hfopenllm_v2/MATH Level 5": 0.0529,
-        "hfopenllm_v2/GPQA": 0.2861,
-        "hfopenllm_v2/MUSR": 0.4266,
-        "hfopenllm_v2/MMLU-PRO": 0.3231
       }
     },
     {

       "developer": "princeton-nlp",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "hfopenllm_v2/IFEval": 0.3978,
+        "hfopenllm_v2/BBH": 0.4983,
+        "hfopenllm_v2/MATH Level 5": 0.0582,
+        "hfopenllm_v2/GPQA": 0.281,
+        "hfopenllm_v2/MUSR": 0.425,
+        "hfopenllm_v2/MMLU-PRO": 0.3246
       }
     },
     {