diff --git a/data/benchmarks.json b/data/benchmarks.json
index b2c2822ebba1561c909be7c113905d12c56e46d9..b89fad043205746a1fe3c3d97fb375c96123e519 100644
--- a/data/benchmarks.json
+++ b/data/benchmarks.json
@@ -45,7 +45,7 @@
   },
   {
     "benchmark": "hfopenllm_v2",
-    "model_count": 4494
+    "model_count": 4493
   },
   {
     "benchmark": "la_leaderboard",
@@ -78,5 +78,9 @@
   {
     "benchmark": "terminal-bench-2.0",
     "model_count": 37
+  },
+  {
+    "benchmark": "theory_of_mind",
+    "model_count": 1
   }
 ]
\ No newline at end of file
diff --git a/data/benchmarks/appworld_test_normal.json b/data/benchmarks/appworld_test_normal.json
index 6b56627ff81e57642084626a8f27419d8d068837..27bfa4d8d1ae3df861f951b1a84e2236e37e45c3 100644
--- a/data/benchmarks/appworld_test_normal.json
+++ b/data/benchmarks/appworld_test_normal.json
@@ -5,7 +5,7 @@
       "name": "claude-opus-4-5",
       "developer": "Anthropic",
       "scores": {
-        "appworld/test_normal": 0.68
+        "appworld/test_normal": 0.7
       }
     },
     {
@@ -13,7 +13,7 @@
       "name": "gemini-3-pro-preview",
       "developer": "Google",
       "scores": {
-        "appworld/test_normal": 0.13
+        "appworld/test_normal": 0.55
       }
     },
     {
diff --git a/data/benchmarks/browsecompplus.json b/data/benchmarks/browsecompplus.json
index cbe338e54d9ae32cab6bb8e3bb3af6af8c5948a3..fa5cad4bfa9a7ba3be365ad38a5955a6ca39cc9b 100644
--- a/data/benchmarks/browsecompplus.json
+++ b/data/benchmarks/browsecompplus.json
@@ -13,7 +13,7 @@
       "name": "gemini-3-pro-preview",
       "developer": "Google",
       "scores": {
-        "browsecompplus": 0.48
+        "browsecompplus": 0.3333
       }
     },
     {
@@ -21,7 +21,7 @@
       "name": "gpt-5.2-2025-12-11",
       "developer": "OpenAI",
       "scores": {
-        "browsecompplus": 0.48
+        "browsecompplus": 0.43
       }
     }
   ]
diff --git a/data/benchmarks/hfopenllm_v2.json b/data/benchmarks/hfopenllm_v2.json
index b2e8b3aab80c98d2c6986f81a5c4e1c543ca42ae..caf5db92e1eb1042b92e8b5d4090c0a48c4f57e2 100644
--- a/data/benchmarks/hfopenllm_v2.json
+++ b/data/benchmarks/hfopenllm_v2.json
@@ -2176,12 +2176,12 @@
       "name": "LION-Gemma-2b-dpo-v1.0",
       "developer": "Columbia-NLP",
       "scores": {
-        "IFEval": 0.3278,
-        "BBH": 0.392,
-        "MATH Level 5": 0.0431,
-        "GPQA": 0.2492,
-        "MUSR": 0.412,
-        "MMLU-PRO": 0.1666
+        "IFEval": 0.3102,
+        "BBH": 0.3881,
+        "MATH Level 5": 0.0536,
+        "GPQA": 0.2534,
+        "MUSR": 0.4081,
+        "MMLU-PRO": 0.1665
       }
     },
     {
@@ -3229,12 +3229,12 @@
       "name": "PathfinderAI",
       "developer": "Daemontatox",
       "scores": {
-        "IFEval": 0.3745,
-        "BBH": 0.6668,
-        "MATH Level 5": 0.4758,
-        "GPQA": 0.3943,
-        "MUSR": 0.4858,
-        "MMLU-PRO": 0.5593
+        "IFEval": 0.4855,
+        "BBH": 0.6627,
+        "MATH Level 5": 0.4841,
+        "GPQA": 0.3096,
+        "MUSR": 0.4256,
+        "MMLU-PRO": 0.5542
       }
     },
     {
@@ -4321,12 +4321,12 @@
       "name": "Llama-3.1-8b-ITA",
       "developer": "DeepMount00",
       "scores": {
-        "IFEval": 0.7917,
-        "BBH": 0.5109,
-        "MATH Level 5": 0.1088,
-        "GPQA": 0.2878,
-        "MUSR": 0.4136,
-        "MMLU-PRO": 0.3876
+        "IFEval": 0.5365,
+        "BBH": 0.517,
+        "MATH Level 5": 0.1707,
+        "GPQA": 0.3062,
+        "MUSR": 0.4487,
+        "MMLU-PRO": 0.396
       }
     },
     {
@@ -4646,12 +4646,12 @@
       "name": "MN-12B-LilithFrame",
       "developer": "DoppelReflEx",
       "scores": {
-        "IFEval": 0.451,
-        "BBH": 0.4944,
-        "MATH Level 5": 0.1156,
-        "GPQA": 0.3196,
-        "MUSR": 0.3896,
-        "MMLU-PRO": 0.3256
+        "IFEval": 0.436,
+        "BBH": 0.4956,
+        "MATH Level 5": 0.0589,
+        "GPQA": 0.3205,
+        "MUSR": 0.3843,
+        "MMLU-PRO": 0.3237
       }
     },
     {
@@ -9144,12 +9144,12 @@
       "name": "SmolLM2-135M-Instruct",
       "developer": "HuggingFaceTB",
       "scores": {
-        "IFEval": 0.0593,
-        "BBH": 0.3135,
-        "MATH Level 5": 0.0144,
-        "GPQA": 0.2341,
-        "MUSR": 0.3871,
-        "MMLU-PRO": 0.1092
+        "IFEval": 0.2883,
+        "BBH": 0.3124,
+        "MATH Level 5": 0.003,
+        "GPQA": 0.2357,
+        "MUSR": 0.3662,
+        "MMLU-PRO": 0.1115
       }
     },
     {
@@ -13057,12 +13057,12 @@
       "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_XA",
       "developer": "LeroyDyer",
       "scores": {
-        "IFEval": 0.3579,
-        "BBH": 0.4477,
-        "MATH Level 5": 0.0423,
-        "GPQA": 0.3096,
-        "MUSR": 0.4134,
-        "MMLU-PRO": 0.2376
+        "IFEval": 0.3798,
+        "BBH": 0.4483,
+        "MATH Level 5": 0.04,
+        "GPQA": 0.3129,
+        "MUSR": 0.4148,
+        "MMLU-PRO": 0.2389
       }
     },
     {
@@ -16874,19 +16874,6 @@
         "MMLU-PRO": 0.232
       }
     },
-    {
-      "model_id": "NousResearch/Yarn-Llama-2-7b-128k",
-      "name": "Yarn-Llama-2-7b-128k",
-      "developer": "NousResearch",
-      "scores": {
-        "IFEval": 0.1485,
-        "BBH": 0.3248,
-        "MATH Level 5": 0.0151,
-        "GPQA": 0.2601,
-        "MUSR": 0.3967,
-        "MMLU-PRO": 0.1791
-      }
-    },
     {
       "model_id": "NousResearch/Yarn-Llama-2-7b-64k",
       "name": "Yarn-Llama-2-7b-64k",
@@ -17204,12 +17191,12 @@
       "name": "code-yi",
       "developer": "Omkar1102",
       "scores": {
-        "IFEval": 0.2254,
-        "BBH": 0.275,
+        "IFEval": 0.2148,
+        "BBH": 0.276,
         "MATH Level 5": 0.0,
-        "GPQA": 0.2576,
-        "MUSR": 0.3762,
-        "MMLU-PRO": 0.1123
+        "GPQA": 0.2508,
+        "MUSR": 0.3802,
+        "MMLU-PRO": 0.1126
       }
     },
     {
@@ -18141,11 +18128,11 @@
       "developer": "PrimeIntellect",
       "scores": {
         "IFEval": 0.1757,
-        "BBH": 0.276,
+        "BBH": 0.274,
         "MATH Level 5": 0.0,
-        "GPQA": 0.2534,
-        "MUSR": 0.3339,
-        "MMLU-PRO": 0.1123
+        "GPQA": 0.25,
+        "MUSR": 0.3753,
+        "MMLU-PRO": 0.112
       }
     },
     {
@@ -18712,12 +18699,12 @@
       "name": "ODB-14B-sce",
       "developer": "Quazim0t0",
       "scores": {
-        "IFEval": 0.7016,
-        "BBH": 0.6942,
-        "MATH Level 5": 0.4116,
-        "GPQA": 0.3624,
-        "MUSR": 0.4571,
-        "MMLU-PRO": 0.5411
+        "IFEval": 0.2922,
+        "BBH": 0.6559,
+        "MATH Level 5": 0.2545,
+        "GPQA": 0.2659,
+        "MUSR": 0.3929,
+        "MMLU-PRO": 0.5207
       }
     },
     {
@@ -19466,12 +19453,12 @@
       "name": "Qwen2.5-0.5B-Instruct",
       "developer": "Qwen",
       "scores": {
-        "IFEval": 0.3153,
-        "BBH": 0.3322,
-        "MATH Level 5": 0.1035,
-        "GPQA": 0.2592,
-        "MUSR": 0.3342,
-        "MMLU-PRO": 0.172
+        "IFEval": 0.3071,
+        "BBH": 0.3341,
+        "MATH Level 5": 0.0,
+        "GPQA": 0.2576,
+        "MUSR": 0.3329,
+        "MMLU-PRO": 0.1697
       }
     },
     {
@@ -19726,12 +19713,12 @@
       "name": "Qwen2.5-Coder-7B-Instruct",
       "developer": "Qwen",
       "scores": {
-        "IFEval": 0.6147,
-        "BBH": 0.4999,
-        "MATH Level 5": 0.031,
-        "GPQA": 0.2936,
-        "MUSR": 0.4099,
-        "MMLU-PRO": 0.3354
+        "IFEval": 0.6101,
+        "BBH": 0.5008,
+        "MATH Level 5": 0.3716,
+        "GPQA": 0.2919,
+        "MUSR": 0.4073,
+        "MMLU-PRO": 0.3352
       }
     },
     {
@@ -19986,12 +19973,12 @@
       "name": "Replete-LLM-Qwen2-7b",
       "developer": "Replete-AI",
       "scores": {
-        "IFEval": 0.0932,
-        "BBH": 0.2977,
+        "IFEval": 0.0905,
+        "BBH": 0.2985,
         "MATH Level 5": 0.0,
-        "GPQA": 0.2475,
-        "MUSR": 0.3941,
-        "MMLU-PRO": 0.1157
+        "GPQA": 0.2534,
+        "MUSR": 0.3848,
+        "MMLU-PRO": 0.1158
       }
     },
     {
@@ -24653,12 +24640,12 @@
       "name": "Llama-3-Instruct-8B-SPPO-Iter3",
       "developer": "UCLA-AGI",
       "scores": {
-        "IFEval": 0.6834,
-        "BBH": 0.508,
-        "MATH Level 5": 0.0959,
+        "IFEval": 0.6703,
+        "BBH": 0.5076,
+        "MATH Level 5": 0.0718,
         "GPQA": 0.2651,
-        "MUSR": 0.3661,
-        "MMLU-PRO": 0.3644
+        "MUSR": 0.3647,
+        "MMLU-PRO": 0.3658
       }
     },
     {
@@ -25004,12 +24991,12 @@
       "name": "llama-3-Korean-8B",
       "developer": "VIRNECT",
       "scores": {
-        "IFEval": 0.5021,
-        "BBH": 0.4918,
-        "MATH Level 5": 0.108,
+        "IFEval": 0.5058,
+        "BBH": 0.4908,
+        "MATH Level 5": 0.0929,
         "GPQA": 0.271,
-        "MUSR": 0.3648,
-        "MMLU-PRO": 0.3536
+        "MUSR": 0.3662,
+        "MMLU-PRO": 0.3539
       }
     },
     {
@@ -25108,12 +25095,12 @@
       "name": "Llama3.1-8B-Fireplace2",
       "developer": "ValiantLabs",
       "scores": {
-        "IFEval": 0.5328,
-        "BBH": 0.4613,
-        "MATH Level 5": 0.0876,
-        "GPQA": 0.2894,
-        "MUSR": 0.3367,
-        "MMLU-PRO": 0.2424
+        "IFEval": 0.5483,
+        "BBH": 0.461,
+        "MATH Level 5": 0.0582,
+        "GPQA": 0.2886,
+        "MUSR": 0.3433,
+        "MMLU-PRO": 0.2407
       }
     },
     {
@@ -25121,12 +25108,12 @@
       "name": "Llama3.1-8B-ShiningValiant2",
       "developer": "ValiantLabs",
       "scores": {
-        "IFEval": 0.6496,
-        "BBH": 0.4774,
-        "MATH Level 5": 0.0566,
-        "GPQA": 0.3104,
-        "MUSR": 0.3909,
-        "MMLU-PRO": 0.3382
+        "IFEval": 0.2678,
+        "BBH": 0.4429,
+        "MATH Level 5": 0.0521,
+        "GPQA": 0.302,
+        "MUSR": 0.3959,
+        "MMLU-PRO": 0.2927
       }
     },
     {
@@ -25654,12 +25641,12 @@
       "name": "Qwen2.5-14B-YOYO-1010",
       "developer": "YOYO-AI",
       "scores": {
-        "IFEval": 0.5899,
-        "BBH": 0.654,
-        "MATH Level 5": 0.4509,
-        "GPQA": 0.3834,
-        "MUSR": 0.4744,
-        "MMLU-PRO": 0.5376
+        "IFEval": 0.7905,
+        "BBH": 0.6406,
+        "MATH Level 5": 0.0,
+        "GPQA": 0.3163,
+        "MUSR": 0.4181,
+        "MMLU-PRO": 0.4944
       }
     },
     {
@@ -26603,12 +26590,12 @@
       "name": "QAIMath-Qwen2.5-7B-TIES",
       "developer": "adriszmar",
       "scores": {
-        "IFEval": 0.1685,
-        "BBH": 0.3124,
-        "MATH Level 5": 0.0015,
-        "GPQA": 0.2492,
-        "MUSR": 0.3963,
-        "MMLU-PRO": 0.1066
+        "IFEval": 0.1746,
+        "BBH": 0.3126,
+        "MATH Level 5": 0.0,
+        "GPQA": 0.245,
+        "MUSR": 0.4096,
+        "MMLU-PRO": 0.1087
       }
     },
     {
@@ -26889,12 +26876,12 @@
       "name": "Llama-3.1-Storm-8B",
       "developer": "akjindal53244",
       "scores": {
-        "IFEval": 0.8051,
-        "BBH": 0.5189,
-        "MATH Level 5": 0.1722,
-        "GPQA": 0.3263,
+        "IFEval": 0.8033,
+        "BBH": 0.5196,
+        "MATH Level 5": 0.1624,
+        "GPQA": 0.3096,
         "MUSR": 0.4028,
-        "MMLU-PRO": 0.3803
+        "MMLU-PRO": 0.3812
       }
     },
     {
@@ -26915,12 +26902,12 @@
       "name": "Llama-3.1-Tulu-3-70B",
       "developer": "allenai",
       "scores": {
-        "IFEval": 0.8379,
-        "BBH": 0.6157,
-        "MATH Level 5": 0.3829,
+        "IFEval": 0.8291,
+        "BBH": 0.6164,
+        "MATH Level 5": 0.4502,
         "GPQA": 0.3733,
-        "MUSR": 0.4988,
-        "MMLU-PRO": 0.4656
+        "MUSR": 0.4948,
+        "MMLU-PRO": 0.4645
       }
     },
     {
@@ -31647,12 +31634,12 @@
       "name": "dolphin-2.9.2-Phi-3-Medium-abliterated",
       "developer": "cognitivecomputations",
       "scores": {
-        "IFEval": 0.4124,
-        "BBH": 0.6383,
-        "MATH Level 5": 0.182,
-        "GPQA": 0.3289,
-        "MUSR": 0.4349,
-        "MMLU-PRO": 0.4525
+        "IFEval": 0.3613,
+        "BBH": 0.6123,
+        "MATH Level 5": 0.1239,
+        "GPQA": 0.328,
+        "MUSR": 0.4112,
+        "MMLU-PRO": 0.4494
       }
     },
     {
@@ -31790,12 +31777,12 @@
       "name": "llama-43m-beta",
       "developer": "cpayne1303",
       "scores": {
-        "IFEval": 0.1916,
-        "BBH": 0.2977,
-        "MATH Level 5": 0.0,
+        "IFEval": 0.1949,
+        "BBH": 0.2965,
+        "MATH Level 5": 0.0045,
         "GPQA": 0.2685,
-        "MUSR": 0.3872,
-        "MMLU-PRO": 0.1132
+        "MUSR": 0.3885,
+        "MMLU-PRO": 0.1111
       }
     },
     {
@@ -32167,12 +32154,12 @@
       "name": "Llama-3-8B-Orpo-v0.1",
       "developer": "dfurman",
       "scores": {
-        "IFEval": 0.2835,
-        "BBH": 0.3842,
-        "MATH Level 5": 0.0521,
-        "GPQA": 0.2609,
-        "MUSR": 0.3566,
-        "MMLU-PRO": 0.2298
+        "IFEval": 0.3,
+        "BBH": 0.3853,
+        "MATH Level 5": 0.0415,
+        "GPQA": 0.2617,
+        "MUSR": 0.3579,
+        "MMLU-PRO": 0.2281
       }
     },
     {
@@ -34663,12 +34650,12 @@
       "name": "gemma-2-2b",
       "developer": "Google",
       "scores": {
-        "IFEval": 0.2018,
-        "BBH": 0.3709,
-        "MATH Level 5": 0.0302,
+        "IFEval": 0.1993,
+        "BBH": 0.3656,
+        "MATH Level 5": 0.0287,
         "GPQA": 0.2626,
-        "MUSR": 0.4219,
-        "MMLU-PRO": 0.2217
+        "MUSR": 0.4232,
+        "MMLU-PRO": 0.218
       }
     },
     {
@@ -34689,12 +34676,12 @@
       "name": "gemma-2-2b-jpn-it",
       "developer": "Google",
       "scores": {
-        "IFEval": 0.5078,
-        "BBH": 0.4226,
-        "MATH Level 5": 0.0347,
-        "GPQA": 0.2852,
-        "MUSR": 0.3964,
-        "MMLU-PRO": 0.2578
+        "IFEval": 0.5288,
+        "BBH": 0.4178,
+        "MATH Level 5": 0.0476,
+        "GPQA": 0.2752,
+        "MUSR": 0.3728,
+        "MMLU-PRO": 0.2467
       }
     },
     {
@@ -37705,12 +37692,12 @@
       "name": "Kosmos-EVAA-Fusion-8B",
       "developer": "jaspionjader",
       "scores": {
-        "IFEval": 0.4345,
-        "BBH": 0.5419,
-        "MATH Level 5": 0.1292,
-        "GPQA": 0.3087,
+        "IFEval": 0.4418,
+        "BBH": 0.5406,
+        "MATH Level 5": 0.1352,
+        "GPQA": 0.3062,
         "MUSR": 0.4277,
-        "MMLU-PRO": 0.3854
+        "MMLU-PRO": 0.386
       }
     },
     {
@@ -42359,12 +42346,12 @@
       "name": "Mistral-v0.3-7B-ORPO",
       "developer": "llmat",
       "scores": {
-        "IFEval": 0.364,
-        "BBH": 0.4005,
-        "MATH Level 5": 0.0015,
-        "GPQA": 0.2693,
-        "MUSR": 0.3529,
-        "MMLU-PRO": 0.2301
+        "IFEval": 0.377,
+        "BBH": 0.3978,
+        "MATH Level 5": 0.0242,
+        "GPQA": 0.2668,
+        "MUSR": 0.3555,
+        "MMLU-PRO": 0.2278
       }
     },
     {
@@ -44478,12 +44465,12 @@
       "name": "Mixtral-8x7B-v0.1",
       "developer": "mistralai",
       "scores": {
-        "IFEval": 0.2326,
-        "BBH": 0.5098,
-        "MATH Level 5": 0.0937,
-        "GPQA": 0.3205,
-        "MUSR": 0.4413,
-        "MMLU-PRO": 0.3871
+        "IFEval": 0.2415,
+        "BBH": 0.5087,
+        "MATH Level 5": 0.102,
+        "GPQA": 0.3138,
+        "MUSR": 0.4321,
+        "MMLU-PRO": 0.385
       }
     },
     {
@@ -44738,12 +44725,12 @@
       "name": "NeuralDaredevil-8B-abliterated",
       "developer": "mlabonne",
       "scores": {
-        "IFEval": 0.4162,
-        "BBH": 0.5124,
-        "MATH Level 5": 0.0853,
-        "GPQA": 0.3029,
-        "MUSR": 0.415,
-        "MMLU-PRO": 0.3802
+        "IFEval": 0.7561,
+        "BBH": 0.5111,
+        "MATH Level 5": 0.0906,
+        "GPQA": 0.3062,
+        "MUSR": 0.4019,
+        "MMLU-PRO": 0.3841
       }
     },
     {
@@ -45076,12 +45063,12 @@
       "name": "Mistral-Nemo-Kurdish-Instruct",
       "developer": "nazimali",
       "scores": {
-        "IFEval": 0.4964,
-        "BBH": 0.4699,
-        "MATH Level 5": 0.0045,
-        "GPQA": 0.2827,
-        "MUSR": 0.3979,
-        "MMLU-PRO": 0.3063
+        "IFEval": 0.486,
+        "BBH": 0.4721,
+        "MATH Level 5": 0.0846,
+        "GPQA": 0.2844,
+        "MUSR": 0.4006,
+        "MMLU-PRO": 0.3087
       }
     },
     {
@@ -46779,12 +46766,12 @@
       "name": "franqwenstein-35b",
       "developer": "nisten",
       "scores": {
-        "IFEval": 0.3914,
-        "BBH": 0.6591,
-        "MATH Level 5": 0.3044,
-        "GPQA": 0.3591,
-        "MUSR": 0.4681,
-        "MMLU-PRO": 0.5611
+        "IFEval": 0.3799,
+        "BBH": 0.6647,
+        "MATH Level 5": 0.3406,
+        "GPQA": 0.4035,
+        "MUSR": 0.494,
+        "MMLU-PRO": 0.5731
       }
     },
     {
@@ -48729,12 +48716,12 @@
       "name": "Llama-3-8B-ProLong-512k-Instruct",
       "developer": "princeton-nlp",
       "scores": {
-        "IFEval": 0.5508,
-        "BBH": 0.5028,
-        "MATH Level 5": 0.0529,
-        "GPQA": 0.2861,
-        "MUSR": 0.4266,
-        "MMLU-PRO": 0.3231
+        "IFEval": 0.3978,
+        "BBH": 0.4983,
+        "MATH Level 5": 0.0582,
+        "GPQA": 0.281,
+        "MUSR": 0.425,
+        "MMLU-PRO": 0.3246
       }
     },
     {
@@ -51303,12 +51290,12 @@
       "name": "Gemma-2-Ataraxy-Gemmasutra-9B-slerp",
       "developer": "recoilme",
       "scores": {
-        "IFEval": 0.7649,
-        "BBH": 0.5974,
-        "MATH Level 5": 0.0174,
-        "GPQA": 0.3305,
-        "MUSR": 0.4245,
-        "MMLU-PRO": 0.4207
+        "IFEval": 0.2854,
+        "BBH": 0.5984,
+        "MATH Level 5": 0.1005,
+        "GPQA": 0.3297,
+        "MUSR": 0.4607,
+        "MMLU-PRO": 0.4162
       }
     },
     {
@@ -51329,12 +51316,12 @@
       "name": "recoilme-gemma-2-9B-v0.2",
       "developer": "recoilme",
       "scores": {
-        "IFEval": 0.2747,
-        "BBH": 0.6031,
-        "MATH Level 5": 0.0831,
-        "GPQA": 0.3305,
-        "MUSR": 0.4686,
-        "MMLU-PRO": 0.4122
+        "IFEval": 0.7592,
+        "BBH": 0.6026,
+        "MATH Level 5": 0.0529,
+        "GPQA": 0.3289,
+        "MUSR": 0.4099,
+        "MMLU-PRO": 0.4163
       }
     },
     {
@@ -51342,12 +51329,12 @@
       "name": "recoilme-gemma-2-9B-v0.3",
       "developer": "recoilme",
       "scores": {
-        "IFEval": 0.7439,
-        "BBH": 0.5993,
-        "MATH Level 5": 0.0876,
-        "GPQA": 0.3238,
-        "MUSR": 0.4204,
-        "MMLU-PRO": 0.4072
+        "IFEval": 0.5761,
+        "BBH": 0.602,
+        "MATH Level 5": 0.1888,
+        "GPQA": 0.3372,
+        "MUSR": 0.4632,
+        "MMLU-PRO": 0.4039
       }
     },
     {
@@ -56997,12 +56984,12 @@
       "name": "BagelMIsteryTour-v2-8x7B",
       "developer": "ycros",
       "scores": {
-        "IFEval": 0.6262,
-        "BBH": 0.5142,
-        "MATH Level 5": 0.0937,
-        "GPQA": 0.3079,
-        "MUSR": 0.4138,
-        "MMLU-PRO": 0.3481
+        "IFEval": 0.5994,
+        "BBH": 0.5159,
+        "MATH Level 5": 0.0785,
+        "GPQA": 0.3045,
+        "MUSR": 0.4203,
+        "MMLU-PRO": 0.3473
       }
     },
     {
diff --git a/data/benchmarks/livecodebenchpro.json b/data/benchmarks/livecodebenchpro.json
index 449e36cb0d36e98776192be2ede4fd274d74ec7e..896b3c77b4f3fcb0324b8662401bdd6aa70bb19b 100644
--- a/data/benchmarks/livecodebenchpro.json
+++ b/data/benchmarks/livecodebenchpro.json
@@ -205,9 +205,9 @@
       "name": "gpt-5-2025-08-07",
       "developer": "OpenAI",
       "scores": {
-        "Hard Problems": 0.0423,
-        "Medium Problems": 0.4085,
-        "Easy Problems": 0.9014
+        "Hard Problems": 0.04225352112676056,
+        "Medium Problems": 0.4084507042253521,
+        "Easy Problems": 0.8873239436619719
       }
     },
     {
diff --git a/data/benchmarks/reward-bench.json b/data/benchmarks/reward-bench.json
index 02bfaa8b54fd08d9d25532a11affc44ecd74ea63..d41f3c4a1d71f52cfb866cd53b58580a0eae02dd 100644
--- a/data/benchmarks/reward-bench.json
+++ b/data/benchmarks/reward-bench.json
@@ -453,16 +453,16 @@
       "name": "LxzGordon/URM-LLaMa-3.1-8B",
       "developer": "LxzGordon",
       "scores": {
-        "Score": 0.9294,
+        "Score": 0.7394,
+        "Chat": 0.9553,
+        "Chat Hard": 0.8816,
+        "Safety": 0.9178,
+        "Reasoning": 0.9698,
         "Factuality": 0.6884,
         "Precise IF": 0.45,
         "Math": 0.6393,
-        "Safety": 0.9108,
         "Focus": 0.9758,
-        "Ties": 0.7653,
-        "Chat": 0.9553,
-        "Chat Hard": 0.8816,
-        "Reasoning": 0.9698
+        "Ties": 0.7653
       }
     },
     {
@@ -555,17 +555,17 @@
       "name": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1",
       "developer": "OpenAssistant",
       "scores": {
-        "Score": 0.615,
+        "Score": 0.2653,
+        "Chat": 0.9246,
+        "Chat Hard": 0.3728,
+        "Safety": 0.3289,
+        "Reasoning": 0.5855,
+        "Prior Sets (0.5 weight)": 0.6801,
         "Factuality": 0.3979,
         "Precise IF": 0.2875,
         "Math": 0.377,
-        "Safety": 0.5446,
         "Focus": 0.1535,
-        "Ties": 0.047,
-        "Chat": 0.9246,
-        "Chat Hard": 0.3728,
-        "Reasoning": 0.5855,
-        "Prior Sets (0.5 weight)": 0.6801
+        "Ties": 0.047
       }
     },
     {
@@ -573,17 +573,17 @@
       "name": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5",
       "developer": "OpenAssistant",
       "scores": {
-        "Score": 0.2648,
-        "Chat": 0.8855,
-        "Chat Hard": 0.4868,
-        "Safety": 0.3244,
-        "Reasoning": 0.7752,
-        "Prior Sets (0.5 weight)": 0.6533,
+        "Score": 0.6901,
         "Factuality": 0.3179,
         "Precise IF": 0.2625,
         "Math": 0.3934,
+        "Safety": 0.6311,
         "Focus": 0.2707,
-        "Ties": 0.0198
+        "Ties": 0.0198,
+        "Chat": 0.8855,
+        "Chat Hard": 0.4868,
+        "Reasoning": 0.7752,
+        "Prior Sets (0.5 weight)": 0.6533
       }
     },
     {
@@ -609,17 +609,17 @@
       "name": "PKU-Alignment/beaver-7b-v1.0-cost",
       "developer": "PKU-Alignment",
       "scores": {
-        "Score": 0.3332,
-        "Chat": 0.6173,
-        "Chat Hard": 0.4232,
-        "Safety": 0.7589,
-        "Reasoning": 0.5482,
-        "Prior Sets (0.5 weight)": 0.57,
+        "Score": 0.5798,
         "Factuality": 0.3263,
         "Precise IF": 0.2313,
         "Math": 0.3989,
+        "Safety": 0.7351,
         "Focus": 0.2939,
-        "Ties": -0.01
+        "Ties": -0.01,
+        "Chat": 0.6173,
+        "Chat Hard": 0.4232,
+        "Reasoning": 0.5482,
+        "Prior Sets (0.5 weight)": 0.57
       }
     },
     {
@@ -627,17 +627,17 @@
       "name": "PKU-Alignment/beaver-7b-v1.0-reward",
       "developer": "PKU-Alignment",
       "scores": {
-        "Score": 0.4727,
+        "Score": 0.1606,
+        "Chat": 0.8184,
+        "Chat Hard": 0.2873,
+        "Safety": 0.1422,
+        "Reasoning": 0.346,
+        "Prior Sets (0.5 weight)": 0.5993,
         "Factuality": 0.2105,
         "Precise IF": 0.2938,
         "Math": 0.2623,
-        "Safety": 0.3757,
         "Focus": 0.0646,
-        "Ties": -0.01,
-        "Chat": 0.8184,
-        "Chat Hard": 0.2873,
-        "Reasoning": 0.346,
-        "Prior Sets (0.5 weight)": 0.5993
+        "Ties": -0.01
       }
     },
     {
@@ -663,17 +663,17 @@
       "name": "PKU-Alignment/beaver-7b-v2.0-reward",
       "developer": "PKU-Alignment",
       "scores": {
-        "Score": 0.6366,
+        "Score": 0.2544,
+        "Chat": 0.8994,
+        "Chat Hard": 0.364,
+        "Safety": 0.3156,
+        "Reasoning": 0.6887,
+        "Prior Sets (0.5 weight)": 0.6171,
         "Factuality": 0.2168,
         "Precise IF": 0.2562,
         "Math": 0.3825,
-        "Safety": 0.6041,
         "Focus": 0.2606,
-        "Ties": 0.0944,
-        "Chat": 0.8994,
-        "Chat Hard": 0.364,
-        "Reasoning": 0.6887,
-        "Prior Sets (0.5 weight)": 0.6171
+        "Ties": 0.0944
       }
     },
     {
@@ -921,16 +921,16 @@
       "name": "Ray2333/GRM-gemma2-2B-rewardmodel-ft",
       "developer": "Ray2333",
       "scores": {
-        "Score": 0.8839,
+        "Score": 0.5966,
+        "Chat": 0.9302,
+        "Chat Hard": 0.7719,
+        "Safety": 0.9222,
+        "Reasoning": 0.912,
         "Factuality": 0.5305,
         "Precise IF": 0.3125,
         "Math": 0.5902,
-        "Safety": 0.9216,
         "Focus": 0.7455,
-        "Ties": 0.4788,
-        "Chat": 0.9302,
-        "Chat Hard": 0.7719,
-        "Reasoning": 0.912
+        "Ties": 0.4788
       }
     },
     {
@@ -956,17 +956,17 @@
       "name": "Ray2333/GRM-llama3-8B-sftreg",
       "developer": "Ray2333",
       "scores": {
-        "Score": 0.6089,
-        "Chat": 0.986,
-        "Chat Hard": 0.6776,
-        "Safety": 0.7867,
-        "Reasoning": 0.9229,
-        "Prior Sets (0.5 weight)": 0.7309,
+        "Score": 0.8542,
         "Factuality": 0.6189,
         "Precise IF": 0.3875,
         "Math": 0.5792,
+        "Safety": 0.8919,
         "Focus": 0.6828,
-        "Ties": 0.5981
+        "Ties": 0.5981,
+        "Chat": 0.986,
+        "Chat Hard": 0.6776,
+        "Reasoning": 0.9229,
+        "Prior Sets (0.5 weight)": 0.7309
       }
     },
     {
@@ -1139,16 +1139,16 @@
       "name": "Skywork/Skywork-Reward-Gemma-2-27B",
       "developer": "Skywork",
       "scores": {
-        "Score": 0.938,
+        "Score": 0.7576,
+        "Chat": 0.9581,
+        "Chat Hard": 0.9145,
+        "Safety": 0.9422,
+        "Reasoning": 0.9606,
         "Factuality": 0.7368,
         "Precise IF": 0.4031,
         "Math": 0.7049,
-        "Safety": 0.9189,
         "Focus": 0.9323,
-        "Ties": 0.8261,
-        "Chat": 0.9581,
-        "Chat Hard": 0.9145,
-        "Reasoning": 0.9606
+        "Ties": 0.8261
       }
     },
     {
@@ -1156,16 +1156,16 @@
       "name": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2",
       "developer": "Skywork",
       "scores": {
-        "Score": 0.7531,
-        "Chat": 0.9609,
-        "Chat Hard": 0.8991,
-        "Safety": 0.9689,
-        "Reasoning": 0.9807,
+        "Score": 0.9426,
         "Factuality": 0.7674,
         "Precise IF": 0.375,
         "Math": 0.6721,
+        "Safety": 0.9297,
         "Focus": 0.9172,
-        "Ties": 0.8182
+        "Ties": 0.8182,
+        "Chat": 0.9609,
+        "Chat Hard": 0.8991,
+        "Reasoning": 0.9807
       }
     },
     {
@@ -1173,16 +1173,16 @@
       "name": "Skywork/Skywork-Reward-Llama-3.1-8B",
       "developer": "Skywork",
       "scores": {
-        "Score": 0.7314,
-        "Chat": 0.9581,
-        "Chat Hard": 0.8728,
-        "Safety": 0.9333,
-        "Reasoning": 0.962,
+        "Score": 0.9252,
         "Factuality": 0.6989,
         "Precise IF": 0.425,
         "Math": 0.6284,
+        "Safety": 0.9081,
         "Focus": 0.9616,
-        "Ties": 0.741
+        "Ties": 0.741,
+        "Chat": 0.9581,
+        "Chat Hard": 0.8728,
+        "Reasoning": 0.962
       }
     },
     {
@@ -1305,16 +1305,16 @@
       "name": "Skywork/Skywork-VL-Reward-7B",
       "developer": "Skywork",
       "scores": {
-        "Score": 0.9007,
+        "Score": 0.6885,
+        "Chat": 0.8994,
+        "Chat Hard": 0.875,
+        "Safety": 0.8911,
+        "Reasoning": 0.9176,
         "Factuality": 0.6063,
         "Precise IF": 0.35,
         "Math": 0.6339,
-        "Safety": 0.9108,
         "Focus": 0.8909,
-        "Ties": 0.7586,
-        "Chat": 0.8994,
-        "Chat Hard": 0.875,
-        "Reasoning": 0.9176
+        "Ties": 0.7586
       }
     },
     {
@@ -1379,9 +1379,9 @@
       "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...",
       "developer": "AI2",
       "scores": {
-        "Score": 0.7008,
-        "Chat": 0.9385,
-        "Chat Hard": 0.3882,
+        "Score": 0.6924,
+        "Chat": 0.9441,
+        "Chat Hard": 0.3575,
         "Safety": 0.7757
       }
     },
@@ -1423,17 +1423,17 @@
       "name": "allenai/Llama-3.1-70B-Instruct-RM-RB2",
       "developer": "allenai",
       "scores": {
-        "Score": 0.9021,
+        "Score": 0.7606,
+        "Chat": 0.9665,
+        "Chat Hard": 0.8355,
+        "Safety": 0.8844,
+        "Reasoning": 0.8969,
+        "Prior Sets (0.5 weight)": 0.0,
         "Factuality": 0.8126,
         "Precise IF": 0.4188,
         "Math": 0.6995,
-        "Safety": 0.9095,
         "Focus": 0.8646,
-        "Ties": 0.8835,
-        "Chat": 0.9665,
-        "Chat Hard": 0.8355,
-        "Reasoning": 0.8969,
-        "Prior Sets (0.5 weight)": 0.0
+        "Ties": 0.8835
       }
     },
     {
@@ -1459,17 +1459,17 @@
       "name": "allenai/Llama-3.1-8B-Instruct-RM-RB2",
       "developer": "allenai",
       "scores": {
-        "Score": 0.8885,
+        "Score": 0.7285,
+        "Chat": 0.9581,
+        "Chat Hard": 0.8158,
+        "Safety": 0.8956,
+        "Reasoning": 0.887,
+        "Prior Sets (0.5 weight)": 0.0,
         "Factuality": 0.7432,
         "Precise IF": 0.4437,
         "Math": 0.6175,
-        "Safety": 0.8932,
         "Focus": 0.9071,
-        "Ties": 0.7638,
-        "Chat": 0.9581,
-        "Chat Hard": 0.8158,
-        "Reasoning": 0.887,
-        "Prior Sets (0.5 weight)": 0.0
+        "Ties": 0.7638
       }
     },
     {
@@ -1477,17 +1477,17 @@
       "name": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2",
       "developer": "allenai",
       "scores": {
-        "Score": 0.722,
-        "Chat": 0.9693,
-        "Chat Hard": 0.8268,
-        "Safety": 0.8689,
-        "Reasoning": 0.8583,
-        "Prior Sets (0.5 weight)": 0.0,
+        "Score": 0.8892,
         "Factuality": 0.8084,
         "Precise IF": 0.3688,
         "Math": 0.6776,
+        "Safety": 0.9027,
         "Focus": 0.7778,
-        "Ties": 0.8308
+        "Ties": 0.8308,
+        "Chat": 0.9693,
+        "Chat Hard": 0.8268,
+        "Reasoning": 0.8583,
+        "Prior Sets (0.5 weight)": 0.0
       }
     },
     {
@@ -1495,17 +1495,17 @@
       "name": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2",
       "developer": "allenai",
       "scores": {
-        "Score": 0.687,
-        "Chat": 0.9553,
-        "Chat Hard": 0.761,
-        "Safety": 0.86,
-        "Reasoning": 0.7898,
-        "Prior Sets (0.5 weight)": 0.0,
+        "Score": 0.8431,
         "Factuality": 0.7516,
         "Precise IF": 0.3875,
         "Math": 0.6284,
+        "Safety": 0.8662,
         "Focus": 0.8545,
-        "Ties": 0.6397
+        "Ties": 0.6397,
+        "Chat": 0.9553,
+        "Chat Hard": 0.761,
+        "Reasoning": 0.7898,
+        "Prior Sets (0.5 weight)": 0.0
       }
     },
     {
@@ -3784,16 +3784,16 @@
       "name": "infly/INF-ORM-Llama3.1-70B",
       "developer": "infly",
       "scores": {
-        "Score": 0.7648,
-        "Chat": 0.9665,
-        "Chat Hard": 0.9101,
-        "Safety": 0.9644,
-        "Reasoning": 0.9912,
+        "Score": 0.9511,
         "Factuality": 0.7411,
         "Precise IF": 0.4188,
         "Math": 0.6995,
+        "Safety": 0.9365,
         "Focus": 0.903,
-        "Ties": 0.8622
+        "Ties": 0.8622,
+        "Chat": 0.9665,
+        "Chat Hard": 0.9101,
+        "Reasoning": 0.9912
       }
     },
     {
@@ -3835,16 +3835,16 @@
       "name": "internlm/internlm2-7b-reward",
       "developer": "internlm",
       "scores": {
-        "Score": 0.8759,
+        "Score": 0.5335,
+        "Chat": 0.9916,
+        "Chat Hard": 0.6952,
+        "Safety": 0.5956,
+        "Reasoning": 0.9453,
         "Factuality": 0.4211,
         "Precise IF": 0.4,
         "Math": 0.5628,
-        "Safety": 0.8716,
         "Focus": 0.7051,
-        "Ties": 0.5164,
-        "Chat": 0.9916,
-        "Chat Hard": 0.6952,
-        "Reasoning": 0.9453
+        "Ties": 0.5164
       }
     },
     {
@@ -4014,16 +4014,16 @@
       "name": "nicolinho/QRM-Gemma-2-27B",
       "developer": "nicolinho",
       "scores": {
-        "Score": 0.9444,
+        "Score": 0.7667,
+        "Chat": 0.9665,
+        "Chat Hard": 0.9013,
+        "Safety": 0.9578,
+        "Reasoning": 0.9826,
         "Factuality": 0.7853,
         "Precise IF": 0.3719,
         "Math": 0.6995,
-        "Safety": 0.927,
         "Focus": 0.9535,
-        "Ties": 0.8321,
-        "Chat": 0.9665,
-        "Chat Hard": 0.9013,
-        "Reasoning": 0.9826
+        "Ties": 0.8321
       }
     },
     {
@@ -4055,16 +4055,16 @@
       "name": "nicolinho/QRM-Llama3.1-8B-v2",
       "developer": "nicolinho",
       "scores": {
-        "Score": 0.9314,
+        "Score": 0.7074,
+        "Chat": 0.9637,
+        "Chat Hard": 0.8684,
+        "Safety": 0.9467,
+        "Reasoning": 0.9677,
         "Factuality": 0.6653,
         "Precise IF": 0.4062,
         "Math": 0.612,
-        "Safety": 0.9257,
         "Focus": 0.8909,
-        "Ties": 0.7234,
-        "Chat": 0.9637,
-        "Chat Hard": 0.8684,
-        "Reasoning": 0.9677
+        "Ties": 0.7234
       }
     },
     {
@@ -4202,16 +4202,16 @@
       "name": "GPT-4o 2024-08-06",
       "developer": "OpenAI",
       "scores": {
-        "Score": 0.6493,
-        "Chat": 0.9609,
-        "Chat Hard": 0.761,
-        "Safety": 0.8619,
-        "Reasoning": 0.8661,
+        "Score": 0.8673,
         "Factuality": 0.5684,
         "Precise IF": 0.3312,
         "Math": 0.623,
+        "Safety": 0.8811,
         "Focus": 0.7293,
-        "Ties": 0.7819
+        "Ties": 0.7819,
+        "Chat": 0.9609,
+        "Chat Hard": 0.761,
+        "Reasoning": 0.8661
       }
     },
     {
@@ -4249,17 +4249,17 @@
       "name": "openbmb/Eurus-RM-7b",
       "developer": "openbmb",
       "scores": {
-        "Score": 0.5806,
-        "Chat": 0.9804,
-        "Chat Hard": 0.6557,
-        "Safety": 0.6267,
-        "Reasoning": 0.8633,
-        "Prior Sets (0.5 weight)": 0.7172,
+        "Score": 0.8159,
         "Factuality": 0.6,
         "Precise IF": 0.3438,
         "Math": 0.5683,
+        "Safety": 0.8135,
         "Focus": 0.7475,
-        "Ties": 0.5972
+        "Ties": 0.5972,
+        "Chat": 0.9804,
+        "Chat Hard": 0.6557,
+        "Reasoning": 0.8633,
+        "Prior Sets (0.5 weight)": 0.7172
       }
     },
     {
@@ -4370,17 +4370,17 @@
       "name": "sfairXC/FsfairX-LLaMA3-RM-v0.1",
       "developer": "sfairXC",
       "scores": {
-        "Score": 0.6292,
-        "Chat": 0.9944,
-        "Chat Hard": 0.6513,
-        "Safety": 0.7667,
-        "Reasoning": 0.8644,
-        "Prior Sets (0.5 weight)": 0.7492,
+        "Score": 0.8338,
         "Factuality": 0.5916,
         "Precise IF": 0.4188,
         "Math": 0.6284,
+        "Safety": 0.8676,
         "Focus": 0.7051,
-        "Ties": 0.6647
+        "Ties": 0.6647,
+        "Chat": 0.9944,
+        "Chat Hard": 0.6513,
+        "Reasoning": 0.8644,
+        "Prior Sets (0.5 weight)": 0.7492
       }
     },
     {
@@ -4492,17 +4492,17 @@
       "name": "weqweasdas/RM-Gemma-2B",
       "developer": "weqweasdas",
       "scores": {
-        "Score": 0.3057,
-        "Chat": 0.9441,
-        "Chat Hard": 0.4079,
-        "Safety": 0.3311,
-        "Reasoning": 0.7637,
-        "Prior Sets (0.5 weight)": 0.6652,
+        "Score": 0.6549,
         "Factuality": 0.3705,
         "Precise IF": 0.2812,
         "Math": 0.4317,
+        "Safety": 0.4986,
         "Focus": 0.2343,
-        "Ties": 0.1851
+        "Ties": 0.1851,
+        "Chat": 0.9441,
+        "Chat Hard": 0.4079,
+        "Reasoning": 0.7637,
+        "Prior Sets (0.5 weight)": 0.6652
       }
     },
     {
@@ -4541,17 +4541,17 @@
       "name": "weqweasdas/RM-Mistral-7B",
       "developer": "weqweasdas",
       "scores": {
-        "Score": 0.596,
-        "Chat": 0.9665,
-        "Chat Hard": 0.6053,
-        "Safety": 0.6911,
-        "Reasoning": 0.7736,
-        "Prior Sets (0.5 weight)": 0.753,
+        "Score": 0.7982,
         "Factuality": 0.5937,
         "Precise IF": 0.3438,
         "Math": 0.5956,
+        "Safety": 0.8703,
         "Focus": 0.7293,
-        "Ties": 0.6226
+        "Ties": 0.6226,
+        "Chat": 0.9665,
+        "Chat Hard": 0.6053,
+        "Reasoning": 0.7736,
+        "Prior Sets (0.5 weight)": 0.753
       }
     },
     {
@@ -4559,17 +4559,17 @@
       "name": "weqweasdas/hh_rlhf_rm_open_llama_3b",
       "developer": "weqweasdas",
       "scores": {
-        "Score": 0.2498,
-        "Chat": 0.8184,
-        "Chat Hard": 0.3728,
-        "Safety": 0.24,
-        "Reasoning": 0.3281,
-        "Prior Sets (0.5 weight)": 0.6564,
+        "Score": 0.5027,
         "Factuality": 0.3642,
         "Precise IF": 0.275,
         "Math": 0.3497,
+        "Safety": 0.4149,
         "Focus": 0.2384,
-        "Ties": 0.0315
+        "Ties": 0.0315,
+        "Chat": 0.8184,
+        "Chat Hard": 0.3728,
+        "Reasoning": 0.3281,
+        "Prior Sets (0.5 weight)": 0.6564
       }
     }
   ]
diff --git a/data/benchmarks/swe-bench.json b/data/benchmarks/swe-bench.json
index 1411ad930fa9e027d02d79c5b478c955d8c4e629..c5ac3821d8cf33006a01a757ed86c4602272b367 100644
--- a/data/benchmarks/swe-bench.json
+++ b/data/benchmarks/swe-bench.json
@@ -5,7 +5,7 @@
       "name": "claude-opus-4-5",
       "developer": "Anthropic",
       "scores": {
-        "swe-bench": 0.65
+        "swe-bench": 0.6061
       }
     },
     {
@@ -13,7 +13,7 @@
       "name": "gemini-3-pro-preview",
       "developer": "Google",
       "scores": {
-        "swe-bench": 0.7234
+        "swe-bench": 0.71
       }
     },
     {
diff --git a/data/benchmarks/tau-bench-2_airline.json b/data/benchmarks/tau-bench-2_airline.json
index 8d2ca0689ae7644754daa6ce07597d00fd2d892b..f12d28637a77b58341be6f902aadfc1d22527d1a 100644
--- a/data/benchmarks/tau-bench-2_airline.json
+++ b/data/benchmarks/tau-bench-2_airline.json
@@ -5,7 +5,7 @@
       "name": "claude-opus-4-5",
       "developer": "Anthropic",
       "scores": {
-        "tau-bench-2/airline": 0.72
+        "tau-bench-2/airline": 0.66
       }
     },
     {
@@ -13,7 +13,7 @@
       "name": "gemini-3-pro-preview",
       "developer": "Google",
       "scores": {
-        "tau-bench-2/airline": 0.7
+        "tau-bench-2/airline": 0.68
       }
     },
     {
diff --git a/data/benchmarks/tau-bench-2_retail.json b/data/benchmarks/tau-bench-2_retail.json
index 1f141a12075d4e893a2a3bae0a3a7637670fb4c8..8567872e9ca3424b440aa66923f7ea7a010a7290 100644
--- a/data/benchmarks/tau-bench-2_retail.json
+++ b/data/benchmarks/tau-bench-2_retail.json
@@ -21,7 +21,7 @@
       "name": "gpt-5.2-2025-12-11",
       "developer": "OpenAI",
       "scores": {
-        "tau-bench-2/retail": 0.68
+        "tau-bench-2/retail": 0.73
       }
     }
   ]
diff --git a/data/benchmarks/tau-bench-2_telecom.json b/data/benchmarks/tau-bench-2_telecom.json
index 5e2e97c5a63c814404bfd0e936bb7f41ce63593e..717a8c139daad73d2bba1920f3f4fdded08dd42b 100644
--- a/data/benchmarks/tau-bench-2_telecom.json
+++ b/data/benchmarks/tau-bench-2_telecom.json
@@ -5,7 +5,7 @@
       "name": "claude-opus-4-5",
       "developer": "Anthropic",
       "scores": {
-        "tau-bench-2/telecom": 0.76
+        "tau-bench-2/telecom": 0.84
       }
     },
     {
@@ -21,7 +21,7 @@
       "name": "gpt-5.2-2025-12-11",
       "developer": "OpenAI",
       "scores": {
-        "tau-bench-2/telecom": 0.5354
+        "tau-bench-2/telecom": 0.71
       }
     }
   ]
diff --git a/data/benchmarks/terminal-bench-2.0.json b/data/benchmarks/terminal-bench-2.0.json
index 2a3783a3e17d1f920cd8ee2720ab236eb67f76b1..10b4a1e53538ae0245444d9327835077907520b5 100644
--- a/data/benchmarks/terminal-bench-2.0.json
+++ b/data/benchmarks/terminal-bench-2.0.json
@@ -21,7 +21,7 @@
       "name": "Claude Opus 4.1",
       "developer": "Anthropic",
       "scores": {
-        "terminal-bench-2.0": 38.0
+        "terminal-bench-2.0": 35.1
       }
     },
     {
@@ -29,7 +29,7 @@
       "name": "Claude Opus 4.5",
       "developer": "Anthropic",
       "scores": {
-        "terminal-bench-2.0": 59.1
+        "terminal-bench-2.0": 52.1
       }
     },
     {
@@ -37,7 +37,7 @@
       "name": "Claude Opus 4.6",
       "developer": "Anthropic",
       "scores": {
-        "terminal-bench-2.0": 58.0
+        "terminal-bench-2.0": 62.9
       }
     },
     {
@@ -45,7 +45,7 @@
       "name": "Claude Sonnet 4.5",
       "developer": "Anthropic",
       "scores": {
-        "terminal-bench-2.0": 43.1
+        "terminal-bench-2.0": 42.6
       }
     },
     {
@@ -61,7 +61,7 @@
       "name": "Gemini 2.5 Flash",
       "developer": "Google",
       "scores": {
-        "terminal-bench-2.0": 17.1
+        "terminal-bench-2.0": 16.9
       }
     },
     {
@@ -77,7 +77,7 @@
       "name": "Gemini 3 Flash",
       "developer": "Google",
       "scores": {
-        "terminal-bench-2.0": 51.0
+        "terminal-bench-2.0": 47.4
       }
     },
     {
@@ -109,7 +109,7 @@
       "name": "MiniMax M2.1",
       "developer": "MiniMax",
       "scores": {
-        "terminal-bench-2.0": 29.2
+        "terminal-bench-2.0": 36.6
       }
     },
     {
@@ -125,7 +125,7 @@
       "name": "Kimi K2 Instruct",
       "developer": "Moonshot AI",
       "scores": {
-        "terminal-bench-2.0": 26.7
+        "terminal-bench-2.0": 27.8
       }
     },
     {
@@ -149,7 +149,7 @@
       "name": "Multiple",
       "developer": "Multiple",
       "scores": {
-        "terminal-bench-2.0": 71.0
+        "terminal-bench-2.0": 72.4
       }
     },
     {
@@ -157,7 +157,7 @@
       "name": "GPT-5",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 35.2
+        "terminal-bench-2.0": 49.6
       }
     },
     {
@@ -165,7 +165,7 @@
       "name": "GPT-5-Codex",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 44.3
+        "terminal-bench-2.0": 43.4
       }
     },
     {
@@ -173,7 +173,7 @@
       "name": "GPT-5-Mini",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 34.8
+        "terminal-bench-2.0": 24.0
       }
     },
     {
@@ -181,7 +181,7 @@
       "name": "GPT-5-Nano",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 9.9
+        "terminal-bench-2.0": 11.5
       }
     },
     {
@@ -197,7 +197,7 @@
       "name": "GPT-5.1-Codex",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 53.5
+        "terminal-bench-2.0": 57.8
       }
     },
     {
@@ -221,7 +221,7 @@
       "name": "GPT-5.2",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 60.7
+        "terminal-bench-2.0": 62.9
       }
     },
     {
@@ -237,7 +237,7 @@
       "name": "GPT-5.3-Codex",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 64.7
+        "terminal-bench-2.0": 77.3
       }
     },
     {
@@ -245,7 +245,7 @@
       "name": "GPT-OSS-120B",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 14.2
+        "terminal-bench-2.0": 18.7
       }
     },
     {
@@ -253,7 +253,7 @@
       "name": "GPT-OSS-20B",
       "developer": "OpenAI",
       "scores": {
-        "terminal-bench-2.0": 3.1
+        "terminal-bench-2.0": 3.4
       }
     },
     {
@@ -261,7 +261,7 @@
       "name": "Grok 4",
       "developer": "xAI",
       "scores": {
-        "terminal-bench-2.0": 25.4
+        "terminal-bench-2.0": 23.1
       }
     },
     {
@@ -269,7 +269,7 @@
       "name": "Grok Code Fast 1",
       "developer": "xAI",
       "scores": {
-        "terminal-bench-2.0": 25.8
+        "terminal-bench-2.0": 14.2
       }
     },
     {
diff --git a/data/benchmarks/theory_of_mind.json b/data/benchmarks/theory_of_mind.json
new file mode 100644
index 0000000000000000000000000000000000000000..8af892835f3a3bd725f97d45e09284631904bf11
--- /dev/null
+++ b/data/benchmarks/theory_of_mind.json
@@ -0,0 +1,12 @@
+{
+  "models": [
+    {
+      "model_id": "Qwen/Qwen2.5-3B-Instruct",
+      "name": "Qwen2.5-3B-Instruct",
+      "developer": "Qwen",
+      "scores": {
+        "accuracy on theory_of_mind for scorer model_graded_fact": 0.78
+      }
+    }
+  ]
+}
\ No newline at end of file
diff --git a/data/developers.json b/data/developers.json
index 003e462b3fb10e7ebce2d70b3ed9418882ec3a93..f579192a07283e4405b9950165cfa67af8d0dcda 100644
--- a/data/developers.json
+++ b/data/developers.json
@@ -1917,7 +1917,7 @@
   },
   {
     "developer": "NousResearch",
-    "model_count": 19
+    "model_count": 18
   },
   {
     "developer": "Novaciano",
diff --git a/data/developers/adriszmar.json b/data/developers/adriszmar.json
index acb90d745752909d8f96f323acb9caa9b19061ae..1f1d39916960942963a9c3c265196aea3657be38 100644
--- a/data/developers/adriszmar.json
+++ b/data/developers/adriszmar.json
@@ -7,12 +7,12 @@
       "developer": "adriszmar",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.1685,
-        "hfopenllm_v2/BBH": 0.3124,
-        "hfopenllm_v2/MATH Level 5": 0.0015,
-        "hfopenllm_v2/GPQA": 0.2492,
-        "hfopenllm_v2/MUSR": 0.3963,
-        "hfopenllm_v2/MMLU-PRO": 0.1066
+        "hfopenllm_v2/IFEval": 0.1746,
+        "hfopenllm_v2/BBH": 0.3126,
+        "hfopenllm_v2/MATH Level 5": 0.0,
+        "hfopenllm_v2/GPQA": 0.245,
+        "hfopenllm_v2/MUSR": 0.4096,
+        "hfopenllm_v2/MMLU-PRO": 0.1087
       }
     }
   ]
diff --git a/data/developers/ai2.json b/data/developers/ai2.json
index 4934c11b7806e647da8c3821dcfccba2566bb947..6ae2e91a5d501e1d313c59819f3bee806d5615b0 100644
--- a/data/developers/ai2.json
+++ b/data/developers/ai2.json
@@ -43,9 +43,9 @@
       "developer": "AI2",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.7008,
-        "reward-bench/Chat": 0.9385,
-        "reward-bench/Chat Hard": 0.3882,
+        "reward-bench/Score": 0.6924,
+        "reward-bench/Chat": 0.9441,
+        "reward-bench/Chat Hard": 0.3575,
         "reward-bench/Safety": 0.7757
       }
     },
diff --git a/data/developers/akjindal53244.json b/data/developers/akjindal53244.json
index 237ea0357d953fdc2d416f7c27241c406836e723..86acdd918ca996450aa81c0111cbad62da1b8b17 100644
--- a/data/developers/akjindal53244.json
+++ b/data/developers/akjindal53244.json
@@ -7,12 +7,12 @@
       "developer": "akjindal53244",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.8051,
-        "hfopenllm_v2/BBH": 0.5189,
-        "hfopenllm_v2/MATH Level 5": 0.1722,
-        "hfopenllm_v2/GPQA": 0.3263,
+        "hfopenllm_v2/IFEval": 0.8033,
+        "hfopenllm_v2/BBH": 0.5196,
+        "hfopenllm_v2/MATH Level 5": 0.1624,
+        "hfopenllm_v2/GPQA": 0.3096,
         "hfopenllm_v2/MUSR": 0.4028,
-        "hfopenllm_v2/MMLU-PRO": 0.3803
+        "hfopenllm_v2/MMLU-PRO": 0.3812
       }
     }
   ]
diff --git a/data/developers/allenai.json b/data/developers/allenai.json
index 00cefcb6fd468cf299bbaf9888936aea413ac714..4d8014825fe4883af161e8da92543c33ea544e4b 100644
--- a/data/developers/allenai.json
+++ b/data/developers/allenai.json
@@ -63,17 +63,17 @@
       "developer": "allenai",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.9021,
+        "reward-bench/Score": 0.7606,
+        "reward-bench/Chat": 0.9665,
+        "reward-bench/Chat Hard": 0.8355,
+        "reward-bench/Safety": 0.8844,
+        "reward-bench/Reasoning": 0.8969,
+        "reward-bench/Prior Sets (0.5 weight)": 0.0,
         "reward-bench/Factuality": 0.8126,
         "reward-bench/Precise IF": 0.4188,
         "reward-bench/Math": 0.6995,
-        "reward-bench/Safety": 0.9095,
         "reward-bench/Focus": 0.8646,
-        "reward-bench/Ties": 0.8835,
-        "reward-bench/Chat": 0.9665,
-        "reward-bench/Chat Hard": 0.8355,
-        "reward-bench/Reasoning": 0.8969,
-        "reward-bench/Prior Sets (0.5 weight)": 0.0
+        "reward-bench/Ties": 0.8835
       }
     },
     {
@@ -101,17 +101,17 @@
       "developer": "allenai",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.8885,
+        "reward-bench/Score": 0.7285,
+        "reward-bench/Chat": 0.9581,
+        "reward-bench/Chat Hard": 0.8158,
+        "reward-bench/Safety": 0.8956,
+        "reward-bench/Reasoning": 0.887,
+        "reward-bench/Prior Sets (0.5 weight)": 0.0,
         "reward-bench/Factuality": 0.7432,
         "reward-bench/Precise IF": 0.4437,
         "reward-bench/Math": 0.6175,
-        "reward-bench/Safety": 0.8932,
         "reward-bench/Focus": 0.9071,
-        "reward-bench/Ties": 0.7638,
-        "reward-bench/Chat": 0.9581,
-        "reward-bench/Chat Hard": 0.8158,
-        "reward-bench/Reasoning": 0.887,
-        "reward-bench/Prior Sets (0.5 weight)": 0.0
+        "reward-bench/Ties": 0.7638
       }
     },
     {
@@ -120,12 +120,12 @@
       "developer": "allenai",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.8379,
-        "hfopenllm_v2/BBH": 0.6157,
-        "hfopenllm_v2/MATH Level 5": 0.3829,
+        "hfopenllm_v2/IFEval": 0.8291,
+        "hfopenllm_v2/BBH": 0.6164,
+        "hfopenllm_v2/MATH Level 5": 0.4502,
         "hfopenllm_v2/GPQA": 0.3733,
-        "hfopenllm_v2/MUSR": 0.4988,
-        "hfopenllm_v2/MMLU-PRO": 0.4656
+        "hfopenllm_v2/MUSR": 0.4948,
+        "hfopenllm_v2/MMLU-PRO": 0.4645
       }
     },
     {
@@ -162,17 +162,17 @@
       "developer": "allenai",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.722,
-        "reward-bench/Chat": 0.9693,
-        "reward-bench/Chat Hard": 0.8268,
-        "reward-bench/Safety": 0.8689,
-        "reward-bench/Reasoning": 0.8583,
-        "reward-bench/Prior Sets (0.5 weight)": 0.0,
+        "reward-bench/Score": 0.8892,
         "reward-bench/Factuality": 0.8084,
         "reward-bench/Precise IF": 0.3688,
         "reward-bench/Math": 0.6776,
+        "reward-bench/Safety": 0.9027,
         "reward-bench/Focus": 0.7778,
-        "reward-bench/Ties": 0.8308
+        "reward-bench/Ties": 0.8308,
+        "reward-bench/Chat": 0.9693,
+        "reward-bench/Chat Hard": 0.8268,
+        "reward-bench/Reasoning": 0.8583,
+        "reward-bench/Prior Sets (0.5 weight)": 0.0
       }
     },
     {
@@ -209,17 +209,17 @@
       "developer": "allenai",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.687,
-        "reward-bench/Chat": 0.9553,
-        "reward-bench/Chat Hard": 0.761,
-        "reward-bench/Safety": 0.86,
-        "reward-bench/Reasoning": 0.7898,
-        "reward-bench/Prior Sets (0.5 weight)": 0.0,
+        "reward-bench/Score": 0.8431,
         "reward-bench/Factuality": 0.7516,
         "reward-bench/Precise IF": 0.3875,
         "reward-bench/Math": 0.6284,
+        "reward-bench/Safety": 0.8662,
         "reward-bench/Focus": 0.8545,
-        "reward-bench/Ties": 0.6397
+        "reward-bench/Ties": 0.6397,
+        "reward-bench/Chat": 0.9553,
+        "reward-bench/Chat Hard": 0.761,
+        "reward-bench/Reasoning": 0.7898,
+        "reward-bench/Prior Sets (0.5 weight)": 0.0
       }
     },
     {
diff --git a/data/developers/anthropic.json b/data/developers/anthropic.json
index 674f01d5340b017e6a95b1abb6e16b7a263cdb47..e348f58bb2289a52a6840206a2b95ae3acade6c2 100644
--- a/data/developers/anthropic.json
+++ b/data/developers/anthropic.json
@@ -650,12 +650,12 @@
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "appworld_test_normal/appworld/test_normal": 0.68,
+        "appworld_test_normal/appworld/test_normal": 0.7,
         "browsecompplus/browsecompplus": 0.61,
-        "swe-bench/swe-bench": 0.65,
-        "tau-bench-2_airline/tau-bench-2/airline": 0.72,
+        "swe-bench/swe-bench": 0.6061,
+        "tau-bench-2_airline/tau-bench-2/airline": 0.66,
         "tau-bench-2_retail/tau-bench-2/retail": 0.78,
-        "tau-bench-2_telecom/tau-bench-2/telecom": 0.76
+        "tau-bench-2_telecom/tau-bench-2/telecom": 0.84
       }
     },
     {
@@ -664,7 +664,7 @@
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 38.0
+        "terminal-bench-2.0/terminal-bench-2.0": 35.1
       }
     },
     {
@@ -673,7 +673,7 @@
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 59.1
+        "terminal-bench-2.0/terminal-bench-2.0": 52.1
       }
     },
     {
@@ -682,7 +682,7 @@
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 58.0
+        "terminal-bench-2.0/terminal-bench-2.0": 62.9
       }
     },
     {
@@ -756,7 +756,7 @@
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 43.1
+        "terminal-bench-2.0/terminal-bench-2.0": 42.6
       }
     },
     {
@@ -800,8 +800,6 @@
       "developer": "Anthropic",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "ace/Overall Score": 0.478,
-        "ace/Gaming Score": 0.391,
         "apex-agents/Overall Pass@1": 0.184,
         "apex-agents/Overall Pass@8": 0.34,
         "apex-agents/Overall Mean Score": 0.348,
@@ -809,6 +807,8 @@
         "apex-agents/Management Consulting Pass@1": 0.132,
         "apex-agents/Corporate Law Pass@1": 0.202,
         "apex-agents/Corporate Lawyer Mean Score": 0.471,
+        "ace/Overall Score": 0.478,
+        "ace/Gaming Score": 0.391,
         "apex-v1/Medicine (MD) Score": 0.65
       }
     },
diff --git a/data/developers/cognitivecomputations.json b/data/developers/cognitivecomputations.json
index 27ef3ede420acfeab83ed5bb754062e32374c41a..292d5e513d244c8ebb441a809017ff16919c9354 100644
--- a/data/developers/cognitivecomputations.json
+++ b/data/developers/cognitivecomputations.json
@@ -77,12 +77,12 @@
       "developer": "cognitivecomputations",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.4124,
-        "hfopenllm_v2/BBH": 0.6383,
-        "hfopenllm_v2/MATH Level 5": 0.182,
-        "hfopenllm_v2/GPQA": 0.3289,
-        "hfopenllm_v2/MUSR": 0.4349,
-        "hfopenllm_v2/MMLU-PRO": 0.4525
+        "hfopenllm_v2/IFEval": 0.3613,
+        "hfopenllm_v2/BBH": 0.6123,
+        "hfopenllm_v2/MATH Level 5": 0.1239,
+        "hfopenllm_v2/GPQA": 0.328,
+        "hfopenllm_v2/MUSR": 0.4112,
+        "hfopenllm_v2/MMLU-PRO": 0.4494
       }
     },
     {
diff --git a/data/developers/columbia-nlp.json b/data/developers/columbia-nlp.json
index b04d1f97bca6939c03a53d86cd396214edf82f72..11f5fed45eb39522787aef4e964f3d3e28d320c0 100644
--- a/data/developers/columbia-nlp.json
+++ b/data/developers/columbia-nlp.json
@@ -7,12 +7,12 @@
       "developer": "Columbia-NLP",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.3278,
-        "hfopenllm_v2/BBH": 0.392,
-        "hfopenllm_v2/MATH Level 5": 0.0431,
-        "hfopenllm_v2/GPQA": 0.2492,
-        "hfopenllm_v2/MUSR": 0.412,
-        "hfopenllm_v2/MMLU-PRO": 0.1666
+        "hfopenllm_v2/IFEval": 0.3102,
+        "hfopenllm_v2/BBH": 0.3881,
+        "hfopenllm_v2/MATH Level 5": 0.0536,
+        "hfopenllm_v2/GPQA": 0.2534,
+        "hfopenllm_v2/MUSR": 0.4081,
+        "hfopenllm_v2/MMLU-PRO": 0.1665
       }
     },
     {
diff --git a/data/developers/cpayne1303.json b/data/developers/cpayne1303.json
index 6d735bd94a67b9fc86d407e5a74d4ec119a21a01..878ab50b2c306395a2d661382c84d0660b0f0d51 100644
--- a/data/developers/cpayne1303.json
+++ b/data/developers/cpayne1303.json
@@ -35,12 +35,12 @@
       "developer": "cpayne1303",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.1916,
-        "hfopenllm_v2/BBH": 0.2977,
-        "hfopenllm_v2/MATH Level 5": 0.0,
+        "hfopenllm_v2/IFEval": 0.1949,
+        "hfopenllm_v2/BBH": 0.2965,
+        "hfopenllm_v2/MATH Level 5": 0.0045,
         "hfopenllm_v2/GPQA": 0.2685,
-        "hfopenllm_v2/MUSR": 0.3872,
-        "hfopenllm_v2/MMLU-PRO": 0.1132
+        "hfopenllm_v2/MUSR": 0.3885,
+        "hfopenllm_v2/MMLU-PRO": 0.1111
       }
     },
     {
diff --git a/data/developers/daemontatox.json b/data/developers/daemontatox.json
index 09a88a4f4f23ea30a1c40f4893201987d0954b30..3de1c87bc255ec29e11a7fcd9434ebc4d17ff27a 100644
--- a/data/developers/daemontatox.json
+++ b/data/developers/daemontatox.json
@@ -231,12 +231,12 @@
       "developer": "Daemontatox",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.3745,
-        "hfopenllm_v2/BBH": 0.6668,
-        "hfopenllm_v2/MATH Level 5": 0.4758,
-        "hfopenllm_v2/GPQA": 0.3943,
-        "hfopenllm_v2/MUSR": 0.4858,
-        "hfopenllm_v2/MMLU-PRO": 0.5593
+        "hfopenllm_v2/IFEval": 0.4855,
+        "hfopenllm_v2/BBH": 0.6627,
+        "hfopenllm_v2/MATH Level 5": 0.4841,
+        "hfopenllm_v2/GPQA": 0.3096,
+        "hfopenllm_v2/MUSR": 0.4256,
+        "hfopenllm_v2/MMLU-PRO": 0.5542
       }
     },
     {
diff --git a/data/developers/deepmount00.json b/data/developers/deepmount00.json
index 5505c28134c7ac10ef3a27978dd2745253c793a1..e898074e4a61782e05002f7b47eb2ee0411313aa 100644
--- a/data/developers/deepmount00.json
+++ b/data/developers/deepmount00.json
@@ -63,12 +63,12 @@
       "developer": "DeepMount00",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.7917,
-        "hfopenllm_v2/BBH": 0.5109,
-        "hfopenllm_v2/MATH Level 5": 0.1088,
-        "hfopenllm_v2/GPQA": 0.2878,
-        "hfopenllm_v2/MUSR": 0.4136,
-        "hfopenllm_v2/MMLU-PRO": 0.3876
+        "hfopenllm_v2/IFEval": 0.5365,
+        "hfopenllm_v2/BBH": 0.517,
+        "hfopenllm_v2/MATH Level 5": 0.1707,
+        "hfopenllm_v2/GPQA": 0.3062,
+        "hfopenllm_v2/MUSR": 0.4487,
+        "hfopenllm_v2/MMLU-PRO": 0.396
       }
     },
     {
diff --git a/data/developers/dfurman.json b/data/developers/dfurman.json
index 2947dc3ef503295f24886c787e729305dcebb026..7e28f4da929deb69b4028e08d70bcf7cf516d943 100644
--- a/data/developers/dfurman.json
+++ b/data/developers/dfurman.json
@@ -35,12 +35,12 @@
       "developer": "dfurman",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.2835,
-        "hfopenllm_v2/BBH": 0.3842,
-        "hfopenllm_v2/MATH Level 5": 0.0521,
-        "hfopenllm_v2/GPQA": 0.2609,
-        "hfopenllm_v2/MUSR": 0.3566,
-        "hfopenllm_v2/MMLU-PRO": 0.2298
+        "hfopenllm_v2/IFEval": 0.3,
+        "hfopenllm_v2/BBH": 0.3853,
+        "hfopenllm_v2/MATH Level 5": 0.0415,
+        "hfopenllm_v2/GPQA": 0.2617,
+        "hfopenllm_v2/MUSR": 0.3579,
+        "hfopenllm_v2/MMLU-PRO": 0.2281
       }
     },
     {
diff --git a/data/developers/doppelreflex.json b/data/developers/doppelreflex.json
index 4e478be7761570eec41b8a497db82c0ab081c8b0..0b78fc1cbec001d1df6b60b1fab265b2eab6799e 100644
--- a/data/developers/doppelreflex.json
+++ b/data/developers/doppelreflex.json
@@ -175,12 +175,12 @@
       "developer": "DoppelReflEx",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.451,
-        "hfopenllm_v2/BBH": 0.4944,
-        "hfopenllm_v2/MATH Level 5": 0.1156,
-        "hfopenllm_v2/GPQA": 0.3196,
-        "hfopenllm_v2/MUSR": 0.3896,
-        "hfopenllm_v2/MMLU-PRO": 0.3256
+        "hfopenllm_v2/IFEval": 0.436,
+        "hfopenllm_v2/BBH": 0.4956,
+        "hfopenllm_v2/MATH Level 5": 0.0589,
+        "hfopenllm_v2/GPQA": 0.3205,
+        "hfopenllm_v2/MUSR": 0.3843,
+        "hfopenllm_v2/MMLU-PRO": 0.3237
       }
     },
     {
diff --git a/data/developers/google.json b/data/developers/google.json
index d8eb903319736c192b1ab41a5a5c1c01cb963ec6..a447ddbc58f3e10c0347885b529d40f4033f2cb3 100644
--- a/data/developers/google.json
+++ b/data/developers/google.json
@@ -139,6 +139,7 @@
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "ace/Gaming Score": 0.415,
         "apex-agents/Overall Pass@1": 0.24,
         "apex-agents/Overall Pass@8": 0.367,
         "apex-agents/Overall Mean Score": 0.395,
@@ -146,7 +147,6 @@
         "apex-agents/Management Consulting Pass@1": 0.193,
         "apex-agents/Corporate Law Pass@1": 0.259,
         "apex-agents/Corporate Lawyer Mean Score": 0.524,
-        "ace/Gaming Score": 0.415,
         "apex-v1/Overall Score": 0.64,
         "apex-v1/Consulting Score": 0.64
       }
@@ -157,6 +157,8 @@
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "ace/Overall Score": 0.47,
+        "ace/Gaming Score": 0.509,
         "apex-agents/Overall Pass@1": 0.184,
         "apex-agents/Overall Pass@8": 0.373,
         "apex-agents/Overall Mean Score": 0.341,
@@ -164,8 +166,6 @@
         "apex-agents/Management Consulting Pass@1": 0.124,
         "apex-agents/Corporate Law Pass@1": 0.239,
         "apex-agents/Corporate Lawyer Mean Score": 0.487,
-        "ace/Overall Score": 0.47,
-        "ace/Gaming Score": 0.509,
         "apex-v1/Overall Score": 0.643,
         "apex-v1/Consulting Score": 0.64,
         "apex-v1/Investment Banking Score": 0.63
@@ -723,7 +723,7 @@
         "reward-bench/Safety": 0.909,
         "reward-bench/Focus": 0.841,
         "reward-bench/Ties": 0.809,
-        "terminal-bench-2.0/terminal-bench-2.0": 17.1
+        "terminal-bench-2.0/terminal-bench-2.0": 16.9
       }
     },
     {
@@ -861,7 +861,7 @@
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 51.0
+        "terminal-bench-2.0/terminal-bench-2.0": 47.4
       }
     },
     {
@@ -879,8 +879,8 @@
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "appworld_test_normal/appworld/test_normal": 0.13,
-        "browsecompplus/browsecompplus": 0.48,
+        "appworld_test_normal/appworld/test_normal": 0.55,
+        "browsecompplus/browsecompplus": 0.3333,
         "global-mmlu-lite/Global MMLU Lite": 0.9453,
         "global-mmlu-lite/Culturally Sensitive": 0.9397,
         "global-mmlu-lite/Culturally Agnostic": 0.9509,
@@ -900,8 +900,8 @@
         "global-mmlu-lite/Yoruba": 0.9425,
         "global-mmlu-lite/Chinese": 0.9475,
         "global-mmlu-lite/Burmese": 0.9425,
-        "swe-bench/swe-bench": 0.7234,
-        "tau-bench-2_airline/tau-bench-2/airline": 0.7,
+        "swe-bench/swe-bench": 0.71,
+        "tau-bench-2_airline/tau-bench-2/airline": 0.68,
         "tau-bench-2_retail/tau-bench-2/retail": 0.73,
         "tau-bench-2_telecom/tau-bench-2/telecom": 0.73
       }
@@ -1028,12 +1028,12 @@
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.2018,
-        "hfopenllm_v2/BBH": 0.3709,
-        "hfopenllm_v2/MATH Level 5": 0.0302,
+        "hfopenllm_v2/IFEval": 0.1993,
+        "hfopenllm_v2/BBH": 0.3656,
+        "hfopenllm_v2/MATH Level 5": 0.0287,
         "hfopenllm_v2/GPQA": 0.2626,
-        "hfopenllm_v2/MUSR": 0.4219,
-        "hfopenllm_v2/MMLU-PRO": 0.2217
+        "hfopenllm_v2/MUSR": 0.4232,
+        "hfopenllm_v2/MMLU-PRO": 0.218
       }
     },
     {
@@ -1056,12 +1056,12 @@
       "developer": "Google",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.5078,
-        "hfopenllm_v2/BBH": 0.4226,
-        "hfopenllm_v2/MATH Level 5": 0.0347,
-        "hfopenllm_v2/GPQA": 0.2852,
-        "hfopenllm_v2/MUSR": 0.3964,
-        "hfopenllm_v2/MMLU-PRO": 0.2578
+        "hfopenllm_v2/IFEval": 0.5288,
+        "hfopenllm_v2/BBH": 0.4178,
+        "hfopenllm_v2/MATH Level 5": 0.0476,
+        "hfopenllm_v2/GPQA": 0.2752,
+        "hfopenllm_v2/MUSR": 0.3728,
+        "hfopenllm_v2/MMLU-PRO": 0.2467
       }
     },
     {
diff --git a/data/developers/huggingfacetb.json b/data/developers/huggingfacetb.json
index bed31781473fb30427be579aab45ef01bf5054ce..1df25de945ccaf98a06fb9f3b47618bba05613d9 100644
--- a/data/developers/huggingfacetb.json
+++ b/data/developers/huggingfacetb.json
@@ -133,12 +133,12 @@
       "developer": "HuggingFaceTB",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.0593,
-        "hfopenllm_v2/BBH": 0.3135,
-        "hfopenllm_v2/MATH Level 5": 0.0144,
-        "hfopenllm_v2/GPQA": 0.2341,
-        "hfopenllm_v2/MUSR": 0.3871,
-        "hfopenllm_v2/MMLU-PRO": 0.1092
+        "hfopenllm_v2/IFEval": 0.2883,
+        "hfopenllm_v2/BBH": 0.3124,
+        "hfopenllm_v2/MATH Level 5": 0.003,
+        "hfopenllm_v2/GPQA": 0.2357,
+        "hfopenllm_v2/MUSR": 0.3662,
+        "hfopenllm_v2/MMLU-PRO": 0.1115
       }
     },
     {
diff --git a/data/developers/infly.json b/data/developers/infly.json
index fe3f0dc6f7a4b2c08dd2895544fd05de4f16df3c..d497bf1e2632542284e99f21127cba81c8ed1b97 100644
--- a/data/developers/infly.json
+++ b/data/developers/infly.json
@@ -7,16 +7,16 @@
       "developer": "infly",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.7648,
-        "reward-bench/Chat": 0.9665,
-        "reward-bench/Chat Hard": 0.9101,
-        "reward-bench/Safety": 0.9644,
-        "reward-bench/Reasoning": 0.9912,
+        "reward-bench/Score": 0.9511,
         "reward-bench/Factuality": 0.7411,
         "reward-bench/Precise IF": 0.4188,
         "reward-bench/Math": 0.6995,
+        "reward-bench/Safety": 0.9365,
         "reward-bench/Focus": 0.903,
-        "reward-bench/Ties": 0.8622
+        "reward-bench/Ties": 0.8622,
+        "reward-bench/Chat": 0.9665,
+        "reward-bench/Chat Hard": 0.9101,
+        "reward-bench/Reasoning": 0.9912
       }
     }
   ]
diff --git a/data/developers/internlm.json b/data/developers/internlm.json
index 69708dbd584389b0c656d19a5340943c92e82210..fbcc7249ad65cfb92e1e09809d6765ef54a982e2 100644
--- a/data/developers/internlm.json
+++ b/data/developers/internlm.json
@@ -71,16 +71,16 @@
       "developer": "internlm",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.8759,
+        "reward-bench/Score": 0.5335,
+        "reward-bench/Chat": 0.9916,
+        "reward-bench/Chat Hard": 0.6952,
+        "reward-bench/Safety": 0.5956,
+        "reward-bench/Reasoning": 0.9453,
         "reward-bench/Factuality": 0.4211,
         "reward-bench/Precise IF": 0.4,
         "reward-bench/Math": 0.5628,
-        "reward-bench/Safety": 0.8716,
         "reward-bench/Focus": 0.7051,
-        "reward-bench/Ties": 0.5164,
-        "reward-bench/Chat": 0.9916,
-        "reward-bench/Chat Hard": 0.6952,
-        "reward-bench/Reasoning": 0.9453
+        "reward-bench/Ties": 0.5164
       }
     },
     {
diff --git a/data/developers/jaspionjader.json b/data/developers/jaspionjader.json
index 053d128582b4aa02040ae51cd0577288669f17e5..9d9d1e268e56a9945ae657deca0493de6a22ce3d 100644
--- a/data/developers/jaspionjader.json
+++ b/data/developers/jaspionjader.json
@@ -1477,12 +1477,12 @@
       "developer": "jaspionjader",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.4345,
-        "hfopenllm_v2/BBH": 0.5419,
-        "hfopenllm_v2/MATH Level 5": 0.1292,
-        "hfopenllm_v2/GPQA": 0.3087,
+        "hfopenllm_v2/IFEval": 0.4418,
+        "hfopenllm_v2/BBH": 0.5406,
+        "hfopenllm_v2/MATH Level 5": 0.1352,
+        "hfopenllm_v2/GPQA": 0.3062,
         "hfopenllm_v2/MUSR": 0.4277,
-        "hfopenllm_v2/MMLU-PRO": 0.3854
+        "hfopenllm_v2/MMLU-PRO": 0.386
       }
     },
     {
diff --git a/data/developers/leroydyer.json b/data/developers/leroydyer.json
index e1aa95462e1fed8d4e011735c7846112031a21a1..119d29bab457804b8d64637d339b1f72d7389de3 100644
--- a/data/developers/leroydyer.json
+++ b/data/developers/leroydyer.json
@@ -707,12 +707,12 @@
       "developer": "LeroyDyer",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.3579,
-        "hfopenllm_v2/BBH": 0.4477,
-        "hfopenllm_v2/MATH Level 5": 0.0423,
-        "hfopenllm_v2/GPQA": 0.3096,
-        "hfopenllm_v2/MUSR": 0.4134,
-        "hfopenllm_v2/MMLU-PRO": 0.2376
+        "hfopenllm_v2/IFEval": 0.3798,
+        "hfopenllm_v2/BBH": 0.4483,
+        "hfopenllm_v2/MATH Level 5": 0.04,
+        "hfopenllm_v2/GPQA": 0.3129,
+        "hfopenllm_v2/MUSR": 0.4148,
+        "hfopenllm_v2/MMLU-PRO": 0.2389
       }
     },
     {
diff --git a/data/developers/llmat.json b/data/developers/llmat.json
index 95633d3199310803501c256677a0fb788d0a08f7..d073eb81547c22e04e5363702192b3a9654d7362 100644
--- a/data/developers/llmat.json
+++ b/data/developers/llmat.json
@@ -7,12 +7,12 @@
       "developer": "llmat",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.364,
-        "hfopenllm_v2/BBH": 0.4005,
-        "hfopenllm_v2/MATH Level 5": 0.0015,
-        "hfopenllm_v2/GPQA": 0.2693,
-        "hfopenllm_v2/MUSR": 0.3529,
-        "hfopenllm_v2/MMLU-PRO": 0.2301
+        "hfopenllm_v2/IFEval": 0.377,
+        "hfopenllm_v2/BBH": 0.3978,
+        "hfopenllm_v2/MATH Level 5": 0.0242,
+        "hfopenllm_v2/GPQA": 0.2668,
+        "hfopenllm_v2/MUSR": 0.3555,
+        "hfopenllm_v2/MMLU-PRO": 0.2278
       }
     }
   ]
diff --git a/data/developers/lxzgordon.json b/data/developers/lxzgordon.json
index e4ace3cc8f7193c8c711403c980535ee73fdd6d3..7f802cf733857054e01537f3ecf745a3fdb38a05 100644
--- a/data/developers/lxzgordon.json
+++ b/data/developers/lxzgordon.json
@@ -20,16 +20,16 @@
       "developer": "LxzGordon",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.9294,
+        "reward-bench/Score": 0.7394,
+        "reward-bench/Chat": 0.9553,
+        "reward-bench/Chat Hard": 0.8816,
+        "reward-bench/Safety": 0.9178,
+        "reward-bench/Reasoning": 0.9698,
         "reward-bench/Factuality": 0.6884,
         "reward-bench/Precise IF": 0.45,
         "reward-bench/Math": 0.6393,
-        "reward-bench/Safety": 0.9108,
         "reward-bench/Focus": 0.9758,
-        "reward-bench/Ties": 0.7653,
-        "reward-bench/Chat": 0.9553,
-        "reward-bench/Chat Hard": 0.8816,
-        "reward-bench/Reasoning": 0.9698
+        "reward-bench/Ties": 0.7653
       }
     }
   ]
diff --git a/data/developers/meta.json b/data/developers/meta.json
index ed6538df893d5e651c8ce4719f630ed6d36178ec..801662ea5cb32e2dbb8c066d026c908449d6cd1d 100644
--- a/data/developers/meta.json
+++ b/data/developers/meta.json
@@ -471,6 +471,16 @@
         "helm_capabilities/IFEval": 0.743,
         "helm_capabilities/WildBench": 0.686,
         "helm_capabilities/Omni-MATH": 0.137,
+        "helm_lite/Mean win rate": 0.303,
+        "helm_lite/NarrativeQA": 0.756,
+        "helm_lite/NaturalQuestions (closed-book)": 0.209,
+        "helm_lite/OpenbookQA": 0.74,
+        "helm_lite/MMLU": 0.5,
+        "helm_lite/MATH": 0.703,
+        "helm_lite/GSM8K": 0.798,
+        "helm_lite/LegalBench": 0.342,
+        "helm_lite/MedQA": 0.245,
+        "helm_lite/WMT 2014": 0.181,
         "helm_mmlu/MMLU All Subjects": 0.561,
         "helm_mmlu/Abstract Algebra": 0.26,
         "helm_mmlu/Anatomy": 0.459,
@@ -506,17 +516,7 @@
         "helm_mmlu/Sociology": 0.701,
         "helm_mmlu/Virology": 0.446,
         "helm_mmlu/World Religions": 0.789,
-        "helm_mmlu/Mean win rate": 0.475,
-        "helm_lite/Mean win rate": 0.303,
-        "helm_lite/NarrativeQA": 0.756,
-        "helm_lite/NaturalQuestions (closed-book)": 0.209,
-        "helm_lite/OpenbookQA": 0.74,
-        "helm_lite/MMLU": 0.5,
-        "helm_lite/MATH": 0.703,
-        "helm_lite/GSM8K": 0.798,
-        "helm_lite/LegalBench": 0.342,
-        "helm_lite/MedQA": 0.245,
-        "helm_lite/WMT 2014": 0.181
+        "helm_mmlu/Mean win rate": 0.475
       }
     },
     {
@@ -579,6 +579,16 @@
       "developer": "Meta",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "helm_lite/Mean win rate": 0.819,
+        "helm_lite/NarrativeQA": 0.777,
+        "helm_lite/NaturalQuestions (closed-book)": 0.457,
+        "helm_lite/OpenbookQA": 0.942,
+        "helm_lite/MMLU": 0.703,
+        "helm_lite/MATH": 0.791,
+        "helm_lite/GSM8K": 0.936,
+        "helm_lite/LegalBench": 0.68,
+        "helm_lite/MedQA": 0.769,
+        "helm_lite/WMT 2014": 0.224,
         "helm_mmlu/MMLU All Subjects": 0.803,
         "helm_mmlu/Abstract Algebra": 0.52,
         "helm_mmlu/Anatomy": 0.8,
@@ -614,17 +624,7 @@
         "helm_mmlu/Sociology": 0.92,
         "helm_mmlu/Virology": 0.584,
         "helm_mmlu/World Religions": 0.901,
-        "helm_mmlu/Mean win rate": 0.773,
-        "helm_lite/Mean win rate": 0.819,
-        "helm_lite/NarrativeQA": 0.777,
-        "helm_lite/NaturalQuestions (closed-book)": 0.457,
-        "helm_lite/OpenbookQA": 0.942,
-        "helm_lite/MMLU": 0.703,
-        "helm_lite/MATH": 0.791,
-        "helm_lite/GSM8K": 0.936,
-        "helm_lite/LegalBench": 0.68,
-        "helm_lite/MedQA": 0.769,
-        "helm_lite/WMT 2014": 0.224
+        "helm_mmlu/Mean win rate": 0.773
       }
     },
     {
diff --git a/data/developers/minimax.json b/data/developers/minimax.json
index b575a16ccd7fb272ceb4b3067b0a9e48f65cff08..3eb98fb6a6609e5f1cc4d74a47ecc2b74aaa9bb0 100644
--- a/data/developers/minimax.json
+++ b/data/developers/minimax.json
@@ -25,7 +25,7 @@
       "developer": "MiniMax",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 29.2
+        "terminal-bench-2.0/terminal-bench-2.0": 36.6
       }
     },
     {
diff --git a/data/developers/mistralai.json b/data/developers/mistralai.json
index 57b0b83dacb50720e9d21af9c48cd23ccbf0f7b9..168bb98ff1b313fc7d40f024df899fec3a02671f 100644
--- a/data/developers/mistralai.json
+++ b/data/developers/mistralai.json
@@ -69,6 +69,16 @@
         "helm_capabilities/IFEval": 0.567,
         "helm_capabilities/WildBench": 0.66,
         "helm_capabilities/Omni-MATH": 0.072,
+        "helm_lite/Mean win rate": 0.196,
+        "helm_lite/NarrativeQA": 0.716,
+        "helm_lite/NaturalQuestions (closed-book)": 0.253,
+        "helm_lite/OpenbookQA": 0.79,
+        "helm_lite/MMLU": 0.51,
+        "helm_lite/MATH": 0.289,
+        "helm_lite/GSM8K": 0.538,
+        "helm_lite/LegalBench": 0.331,
+        "helm_lite/MedQA": 0.517,
+        "helm_lite/WMT 2014": 0.142,
         "helm_mmlu/MMLU All Subjects": 0.599,
         "helm_mmlu/Abstract Algebra": 0.27,
         "helm_mmlu/Anatomy": 0.585,
@@ -105,16 +115,6 @@
         "helm_mmlu/Virology": 0.47,
         "helm_mmlu/World Religions": 0.825,
         "helm_mmlu/Mean win rate": 0.509,
-        "helm_lite/Mean win rate": 0.196,
-        "helm_lite/NarrativeQA": 0.716,
-        "helm_lite/NaturalQuestions (closed-book)": 0.253,
-        "helm_lite/OpenbookQA": 0.79,
-        "helm_lite/MMLU": 0.51,
-        "helm_lite/MATH": 0.289,
-        "helm_lite/GSM8K": 0.538,
-        "helm_lite/LegalBench": 0.331,
-        "helm_lite/MedQA": 0.517,
-        "helm_lite/WMT 2014": 0.142,
         "hfopenllm_v2/IFEval": 0.5465,
         "hfopenllm_v2/BBH": 0.4722,
         "hfopenllm_v2/MATH Level 5": 0.0385,
@@ -718,12 +718,12 @@
       "developer": "mistralai",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.2326,
-        "hfopenllm_v2/BBH": 0.5098,
-        "hfopenllm_v2/MATH Level 5": 0.0937,
-        "hfopenllm_v2/GPQA": 0.3205,
-        "hfopenllm_v2/MUSR": 0.4413,
-        "hfopenllm_v2/MMLU-PRO": 0.3871
+        "hfopenllm_v2/IFEval": 0.2415,
+        "hfopenllm_v2/BBH": 0.5087,
+        "hfopenllm_v2/MATH Level 5": 0.102,
+        "hfopenllm_v2/GPQA": 0.3138,
+        "hfopenllm_v2/MUSR": 0.4321,
+        "hfopenllm_v2/MMLU-PRO": 0.385
       }
     },
     {
diff --git a/data/developers/mlabonne.json b/data/developers/mlabonne.json
index be86bd7fe732025f133b06dc7c412aa5e56b7119..2620a8c4e8931697abdcd44e4a4aae7c1e430da5 100644
--- a/data/developers/mlabonne.json
+++ b/data/developers/mlabonne.json
@@ -161,12 +161,12 @@
       "developer": "mlabonne",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.4162,
-        "hfopenllm_v2/BBH": 0.5124,
-        "hfopenllm_v2/MATH Level 5": 0.0853,
-        "hfopenllm_v2/GPQA": 0.3029,
-        "hfopenllm_v2/MUSR": 0.415,
-        "hfopenllm_v2/MMLU-PRO": 0.3802
+        "hfopenllm_v2/IFEval": 0.7561,
+        "hfopenllm_v2/BBH": 0.5111,
+        "hfopenllm_v2/MATH Level 5": 0.0906,
+        "hfopenllm_v2/GPQA": 0.3062,
+        "hfopenllm_v2/MUSR": 0.4019,
+        "hfopenllm_v2/MMLU-PRO": 0.3841
       }
     },
     {
diff --git a/data/developers/moonshot_ai.json b/data/developers/moonshot_ai.json
index d83f11402fcca0c39c33d09eed495f2aefd69384..746185ce773a539bd025922ab856fdcf2f8a1d9f 100644
--- a/data/developers/moonshot_ai.json
+++ b/data/developers/moonshot_ai.json
@@ -7,7 +7,7 @@
       "developer": "Moonshot AI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 26.7
+        "terminal-bench-2.0/terminal-bench-2.0": 27.8
       }
     },
     {
diff --git a/data/developers/multiple.json b/data/developers/multiple.json
index e235ffc0287578be5fd9fdf3ba4e4e1b232b5df8..34cdb844d495e12fd3a3820204fbda313306e211 100644
--- a/data/developers/multiple.json
+++ b/data/developers/multiple.json
@@ -7,7 +7,7 @@
       "developer": "Multiple",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 71.0
+        "terminal-bench-2.0/terminal-bench-2.0": 72.4
       }
     }
   ]
diff --git a/data/developers/nazimali.json b/data/developers/nazimali.json
index 07c42beb48b2fc9e0ad1a1059178a9ce4071a591..34d47c9647d462b17a0fe6015c5c4f9fc00264e7 100644
--- a/data/developers/nazimali.json
+++ b/data/developers/nazimali.json
@@ -21,12 +21,12 @@
       "developer": "nazimali",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.4964,
-        "hfopenllm_v2/BBH": 0.4699,
-        "hfopenllm_v2/MATH Level 5": 0.0045,
-        "hfopenllm_v2/GPQA": 0.2827,
-        "hfopenllm_v2/MUSR": 0.3979,
-        "hfopenllm_v2/MMLU-PRO": 0.3063
+        "hfopenllm_v2/IFEval": 0.486,
+        "hfopenllm_v2/BBH": 0.4721,
+        "hfopenllm_v2/MATH Level 5": 0.0846,
+        "hfopenllm_v2/GPQA": 0.2844,
+        "hfopenllm_v2/MUSR": 0.4006,
+        "hfopenllm_v2/MMLU-PRO": 0.3087
       }
     }
   ]
diff --git a/data/developers/nicolinho.json b/data/developers/nicolinho.json
index 551d4a5de698babd0e830b509f51bb11f4dd2ac7..79bf445ae201aa8b9add0559d92e4abd4fd3bebb 100644
--- a/data/developers/nicolinho.json
+++ b/data/developers/nicolinho.json
@@ -7,16 +7,16 @@
       "developer": "nicolinho",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.9444,
+        "reward-bench/Score": 0.7667,
+        "reward-bench/Chat": 0.9665,
+        "reward-bench/Chat Hard": 0.9013,
+        "reward-bench/Safety": 0.9578,
+        "reward-bench/Reasoning": 0.9826,
         "reward-bench/Factuality": 0.7853,
         "reward-bench/Precise IF": 0.3719,
         "reward-bench/Math": 0.6995,
-        "reward-bench/Safety": 0.927,
         "reward-bench/Focus": 0.9535,
-        "reward-bench/Ties": 0.8321,
-        "reward-bench/Chat": 0.9665,
-        "reward-bench/Chat Hard": 0.9013,
-        "reward-bench/Reasoning": 0.9826
+        "reward-bench/Ties": 0.8321
       }
     },
     {
@@ -51,16 +51,16 @@
       "developer": "nicolinho",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.9314,
+        "reward-bench/Score": 0.7074,
+        "reward-bench/Chat": 0.9637,
+        "reward-bench/Chat Hard": 0.8684,
+        "reward-bench/Safety": 0.9467,
+        "reward-bench/Reasoning": 0.9677,
         "reward-bench/Factuality": 0.6653,
         "reward-bench/Precise IF": 0.4062,
         "reward-bench/Math": 0.612,
-        "reward-bench/Safety": 0.9257,
         "reward-bench/Focus": 0.8909,
-        "reward-bench/Ties": 0.7234,
-        "reward-bench/Chat": 0.9637,
-        "reward-bench/Chat Hard": 0.8684,
-        "reward-bench/Reasoning": 0.9677
+        "reward-bench/Ties": 0.7234
       }
     }
   ]
diff --git a/data/developers/nisten.json b/data/developers/nisten.json
index 7b275a3c64fa267662b1c7ec09c2c6db9c0fbfc6..785709badefb68361515ca958f0d589a8268f14c 100644
--- a/data/developers/nisten.json
+++ b/data/developers/nisten.json
@@ -7,12 +7,12 @@
       "developer": "nisten",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.3914,
-        "hfopenllm_v2/BBH": 0.6591,
-        "hfopenllm_v2/MATH Level 5": 0.3044,
-        "hfopenllm_v2/GPQA": 0.3591,
-        "hfopenllm_v2/MUSR": 0.4681,
-        "hfopenllm_v2/MMLU-PRO": 0.5611
+        "hfopenllm_v2/IFEval": 0.3799,
+        "hfopenllm_v2/BBH": 0.6647,
+        "hfopenllm_v2/MATH Level 5": 0.3406,
+        "hfopenllm_v2/GPQA": 0.4035,
+        "hfopenllm_v2/MUSR": 0.494,
+        "hfopenllm_v2/MMLU-PRO": 0.5731
       }
     },
     {
diff --git a/data/developers/nousresearch.json b/data/developers/nousresearch.json
index 5eca3534d830aded2e15a419e7392ebd605b4769..68e17c3374e0831b38026cc5c7fe37546bb1fc55 100644
--- a/data/developers/nousresearch.json
+++ b/data/developers/nousresearch.json
@@ -200,20 +200,6 @@
         "hfopenllm_v2/MMLU-PRO": 0.232
       }
     },
-    {
-      "id": "NousResearch/Yarn-Llama-2-7b-128k",
-      "name": "Yarn-Llama-2-7b-128k",
-      "developer": "NousResearch",
-      "evaluator_relationship": null,
-      "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.1485,
-        "hfopenllm_v2/BBH": 0.3248,
-        "hfopenllm_v2/MATH Level 5": 0.0151,
-        "hfopenllm_v2/GPQA": 0.2601,
-        "hfopenllm_v2/MUSR": 0.3967,
-        "hfopenllm_v2/MMLU-PRO": 0.1791
-      }
-    },
     {
       "id": "NousResearch/Yarn-Llama-2-7b-64k",
       "name": "Yarn-Llama-2-7b-64k",
diff --git a/data/developers/omkar1102.json b/data/developers/omkar1102.json
index 1d044781189744d770af993cfdb651c1e02eee6f..ca0270b469e58068ee5242b6801e2fbe782ddc0a 100644
--- a/data/developers/omkar1102.json
+++ b/data/developers/omkar1102.json
@@ -7,12 +7,12 @@
       "developer": "Omkar1102",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.2254,
-        "hfopenllm_v2/BBH": 0.275,
+        "hfopenllm_v2/IFEval": 0.2148,
+        "hfopenllm_v2/BBH": 0.276,
         "hfopenllm_v2/MATH Level 5": 0.0,
-        "hfopenllm_v2/GPQA": 0.2576,
-        "hfopenllm_v2/MUSR": 0.3762,
-        "hfopenllm_v2/MMLU-PRO": 0.1123
+        "hfopenllm_v2/GPQA": 0.2508,
+        "hfopenllm_v2/MUSR": 0.3802,
+        "hfopenllm_v2/MMLU-PRO": 0.1126
       }
     }
   ]
diff --git a/data/developers/openai.json b/data/developers/openai.json
index 3a51dd8f7575d45f6d9b06bface9fb31355ba46d..463a516253b52dd581eb26a0054c21d8f415cdc5 100644
--- a/data/developers/openai.json
+++ b/data/developers/openai.json
@@ -163,16 +163,16 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "ace/Overall Score": 0.515,
+        "ace/Food Score": 0.65,
+        "ace/Gaming Score": 0.578,
         "apex-agents/Overall Pass@1": 0.23,
         "apex-agents/Overall Pass@8": 0.4,
         "apex-agents/Overall Mean Score": 0.387,
         "apex-agents/Investment Banking Pass@1": 0.273,
         "apex-agents/Management Consulting Pass@1": 0.227,
         "apex-agents/Corporate Law Pass@1": 0.189,
-        "apex-agents/Corporate Lawyer Mean Score": 0.443,
-        "ace/Overall Score": 0.515,
-        "ace/Food Score": 0.65,
-        "ace/Gaming Score": 0.578
+        "apex-agents/Corporate Lawyer Mean Score": 0.443
       }
     },
     {
@@ -300,13 +300,6 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "helm_instruct/Mean win rate": 0.689,
-        "helm_instruct/Anthropic RLHF dataset": 4.964,
-        "helm_instruct/Best ChatGPT Prompts": 4.986,
-        "helm_instruct/Koala test dataset": 4.987,
-        "helm_instruct/Open Assistant": 4.987,
-        "helm_instruct/Self Instruct": 4.99,
-        "helm_instruct/Vicuna": 4.992,
         "helm_classic/Mean win rate": 0.783,
         "helm_classic/MMLU": 0.391,
         "helm_classic/BoolQ": 0.87,
@@ -322,6 +315,13 @@
         "helm_classic/IMDB": 0.943,
         "helm_classic/CivilComments": 0.696,
         "helm_classic/RAFT": 0.748,
+        "helm_instruct/Mean win rate": 0.689,
+        "helm_instruct/Anthropic RLHF dataset": 4.964,
+        "helm_instruct/Best ChatGPT Prompts": 4.986,
+        "helm_instruct/Koala test dataset": 4.987,
+        "helm_instruct/Open Assistant": 4.987,
+        "helm_instruct/Self Instruct": 4.99,
+        "helm_instruct/Vicuna": 4.992,
         "helm_lite/Mean win rate": 0.358,
         "helm_lite/NarrativeQA": 0.655,
         "helm_lite/NaturalQuestions (closed-book)": 0.335,
@@ -405,6 +405,16 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "helm_lite/Mean win rate": 0.867,
+        "helm_lite/NarrativeQA": 0.768,
+        "helm_lite/NaturalQuestions (closed-book)": 0.457,
+        "helm_lite/OpenbookQA": 0.96,
+        "helm_lite/MMLU": 0.735,
+        "helm_lite/MATH": 0.802,
+        "helm_lite/GSM8K": 0.932,
+        "helm_lite/LegalBench": 0.713,
+        "helm_lite/MedQA": 0.815,
+        "helm_lite/WMT 2014": 0.211,
         "helm_mmlu/MMLU All Subjects": 0.824,
         "helm_mmlu/Abstract Algebra": 0.63,
         "helm_mmlu/Anatomy": 0.8,
@@ -440,17 +450,7 @@
         "helm_mmlu/Sociology": 0.93,
         "helm_mmlu/Virology": 0.596,
         "helm_mmlu/World Religions": 0.877,
-        "helm_mmlu/Mean win rate": 0.517,
-        "helm_lite/Mean win rate": 0.867,
-        "helm_lite/NarrativeQA": 0.768,
-        "helm_lite/NaturalQuestions (closed-book)": 0.457,
-        "helm_lite/OpenbookQA": 0.96,
-        "helm_lite/MMLU": 0.735,
-        "helm_lite/MATH": 0.802,
-        "helm_lite/GSM8K": 0.932,
-        "helm_lite/LegalBench": 0.713,
-        "helm_lite/MedQA": 0.815,
-        "helm_lite/WMT 2014": 0.211
+        "helm_mmlu/Mean win rate": 0.517
       }
     },
     {
@@ -513,6 +513,16 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "helm_lite/Mean win rate": 0.864,
+        "helm_lite/NarrativeQA": 0.761,
+        "helm_lite/NaturalQuestions (closed-book)": 0.482,
+        "helm_lite/OpenbookQA": 0.97,
+        "helm_lite/MMLU": 0.711,
+        "helm_lite/MATH": 0.833,
+        "helm_lite/GSM8K": 0.824,
+        "helm_lite/LegalBench": 0.727,
+        "helm_lite/MedQA": 0.783,
+        "helm_lite/WMT 2014": 0.218,
         "helm_mmlu/MMLU All Subjects": 0.813,
         "helm_mmlu/Abstract Algebra": 0.56,
         "helm_mmlu/Anatomy": 0.822,
@@ -549,16 +559,6 @@
         "helm_mmlu/Virology": 0.602,
         "helm_mmlu/World Religions": 0.848,
         "helm_mmlu/Mean win rate": 0.351,
-        "helm_lite/Mean win rate": 0.864,
-        "helm_lite/NarrativeQA": 0.761,
-        "helm_lite/NaturalQuestions (closed-book)": 0.482,
-        "helm_lite/OpenbookQA": 0.97,
-        "helm_lite/MMLU": 0.711,
-        "helm_lite/MATH": 0.833,
-        "helm_lite/GSM8K": 0.824,
-        "helm_lite/LegalBench": 0.727,
-        "helm_lite/MedQA": 0.783,
-        "helm_lite/WMT 2014": 0.218,
         "reward-bench/Score": 0.8395,
         "reward-bench/Chat": 0.9525,
         "reward-bench/Chat Hard": 0.7544,
@@ -772,16 +772,16 @@
         "helm_mmlu/Virology": 0.578,
         "helm_mmlu/World Religions": 0.883,
         "helm_mmlu/Mean win rate": 0.52,
-        "reward-bench/Score": 0.6493,
-        "reward-bench/Chat": 0.9609,
-        "reward-bench/Chat Hard": 0.761,
-        "reward-bench/Safety": 0.8619,
-        "reward-bench/Reasoning": 0.8661,
+        "reward-bench/Score": 0.8673,
         "reward-bench/Factuality": 0.5684,
         "reward-bench/Precise IF": 0.3312,
         "reward-bench/Math": 0.623,
+        "reward-bench/Safety": 0.8811,
         "reward-bench/Focus": 0.7293,
-        "reward-bench/Ties": 0.7819
+        "reward-bench/Ties": 0.7819,
+        "reward-bench/Chat": 0.9609,
+        "reward-bench/Chat Hard": 0.761,
+        "reward-bench/Reasoning": 0.8661
       }
     },
     {
@@ -877,7 +877,7 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 35.2
+        "terminal-bench-2.0/terminal-bench-2.0": 49.6
       }
     },
     {
@@ -911,9 +911,9 @@
         "helm_capabilities/IFEval": 0.875,
         "helm_capabilities/WildBench": 0.857,
         "helm_capabilities/Omni-MATH": 0.647,
-        "livecodebenchpro/Hard Problems": 0.0423,
-        "livecodebenchpro/Medium Problems": 0.4085,
-        "livecodebenchpro/Easy Problems": 0.9014
+        "livecodebenchpro/Hard Problems": 0.04225352112676056,
+        "livecodebenchpro/Medium Problems": 0.4084507042253521,
+        "livecodebenchpro/Easy Problems": 0.8873239436619719
       }
     },
     {
@@ -922,7 +922,7 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 44.3
+        "terminal-bench-2.0/terminal-bench-2.0": 43.4
       }
     },
     {
@@ -931,7 +931,7 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 34.8
+        "terminal-bench-2.0/terminal-bench-2.0": 24.0
       }
     },
     {
@@ -954,7 +954,7 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 9.9
+        "terminal-bench-2.0/terminal-bench-2.0": 11.5
       }
     },
     {
@@ -986,7 +986,7 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 53.5
+        "terminal-bench-2.0/terminal-bench-2.0": 57.8
       }
     },
     {
@@ -1013,7 +1013,7 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 60.7
+        "terminal-bench-2.0/terminal-bench-2.0": 62.9
       }
     },
     {
@@ -1023,14 +1023,14 @@
       "evaluator_relationship": null,
       "benchmark_scores": {
         "appworld_test_normal/appworld/test_normal": 0.0,
-        "browsecompplus/browsecompplus": 0.48,
+        "browsecompplus/browsecompplus": 0.43,
         "livecodebenchpro/Hard Problems": 0.1594,
         "livecodebenchpro/Medium Problems": 0.5211,
         "livecodebenchpro/Easy Problems": 0.9014,
         "swe-bench/swe-bench": 0.5455,
         "tau-bench-2_airline/tau-bench-2/airline": 0.6,
-        "tau-bench-2_retail/tau-bench-2/retail": 0.68,
-        "tau-bench-2_telecom/tau-bench-2/telecom": 0.5354
+        "tau-bench-2_retail/tau-bench-2/retail": 0.73,
+        "tau-bench-2_telecom/tau-bench-2/telecom": 0.71
       }
     },
     {
@@ -1048,7 +1048,7 @@
       "developer": "OpenAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 64.7
+        "terminal-bench-2.0/terminal-bench-2.0": 77.3
       }
     },
     {
@@ -1112,7 +1112,7 @@
         "livecodebenchpro/Hard Problems": 0.0,
         "livecodebenchpro/Medium Problems": 0.11267605633802817,
         "livecodebenchpro/Easy Problems": 0.6619718309859155,
-        "terminal-bench-2.0/terminal-bench-2.0": 14.2
+        "terminal-bench-2.0/terminal-bench-2.0": 18.7
       }
     },
     {
@@ -1130,7 +1130,7 @@
         "livecodebenchpro/Hard Problems": 0.0,
         "livecodebenchpro/Medium Problems": 0.056338028169014086,
         "livecodebenchpro/Easy Problems": 0.5070422535211268,
-        "terminal-bench-2.0/terminal-bench-2.0": 3.1
+        "terminal-bench-2.0/terminal-bench-2.0": 3.4
       }
     },
     {
diff --git a/data/developers/openassistant.json b/data/developers/openassistant.json
index ad2c9bbf5e049296e5d368971f1365bc748e1226..7e4cf042377b82fe85282ade845062094d3d9218 100644
--- a/data/developers/openassistant.json
+++ b/data/developers/openassistant.json
@@ -7,17 +7,17 @@
       "developer": "OpenAssistant",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.615,
+        "reward-bench/Score": 0.2653,
+        "reward-bench/Chat": 0.9246,
+        "reward-bench/Chat Hard": 0.3728,
+        "reward-bench/Safety": 0.3289,
+        "reward-bench/Reasoning": 0.5855,
+        "reward-bench/Prior Sets (0.5 weight)": 0.6801,
         "reward-bench/Factuality": 0.3979,
         "reward-bench/Precise IF": 0.2875,
         "reward-bench/Math": 0.377,
-        "reward-bench/Safety": 0.5446,
         "reward-bench/Focus": 0.1535,
-        "reward-bench/Ties": 0.047,
-        "reward-bench/Chat": 0.9246,
-        "reward-bench/Chat Hard": 0.3728,
-        "reward-bench/Reasoning": 0.5855,
-        "reward-bench/Prior Sets (0.5 weight)": 0.6801
+        "reward-bench/Ties": 0.047
       }
     },
     {
@@ -26,17 +26,17 @@
       "developer": "OpenAssistant",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.2648,
-        "reward-bench/Chat": 0.8855,
-        "reward-bench/Chat Hard": 0.4868,
-        "reward-bench/Safety": 0.3244,
-        "reward-bench/Reasoning": 0.7752,
-        "reward-bench/Prior Sets (0.5 weight)": 0.6533,
+        "reward-bench/Score": 0.6901,
         "reward-bench/Factuality": 0.3179,
         "reward-bench/Precise IF": 0.2625,
         "reward-bench/Math": 0.3934,
+        "reward-bench/Safety": 0.6311,
         "reward-bench/Focus": 0.2707,
-        "reward-bench/Ties": 0.0198
+        "reward-bench/Ties": 0.0198,
+        "reward-bench/Chat": 0.8855,
+        "reward-bench/Chat Hard": 0.4868,
+        "reward-bench/Reasoning": 0.7752,
+        "reward-bench/Prior Sets (0.5 weight)": 0.6533
       }
     },
     {
diff --git a/data/developers/openbmb.json b/data/developers/openbmb.json
index b9795ed819d46987af7a443e489394fa08308cdd..d8dae84054074ce01b5c47fc58b69a148fdc99c0 100644
--- a/data/developers/openbmb.json
+++ b/data/developers/openbmb.json
@@ -21,17 +21,17 @@
       "developer": "openbmb",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.5806,
-        "reward-bench/Chat": 0.9804,
-        "reward-bench/Chat Hard": 0.6557,
-        "reward-bench/Safety": 0.6267,
-        "reward-bench/Reasoning": 0.8633,
-        "reward-bench/Prior Sets (0.5 weight)": 0.7172,
+        "reward-bench/Score": 0.8159,
         "reward-bench/Factuality": 0.6,
         "reward-bench/Precise IF": 0.3438,
         "reward-bench/Math": 0.5683,
+        "reward-bench/Safety": 0.8135,
         "reward-bench/Focus": 0.7475,
-        "reward-bench/Ties": 0.5972
+        "reward-bench/Ties": 0.5972,
+        "reward-bench/Chat": 0.9804,
+        "reward-bench/Chat Hard": 0.6557,
+        "reward-bench/Reasoning": 0.8633,
+        "reward-bench/Prior Sets (0.5 weight)": 0.7172
       }
     },
     {
diff --git a/data/developers/pku-alignment.json b/data/developers/pku-alignment.json
index 0ae80803f93980ecd8558a983e84a1646a1bd7d6..76e1f41b6171c4fd2a3d35d25df175a75a77a416 100644
--- a/data/developers/pku-alignment.json
+++ b/data/developers/pku-alignment.json
@@ -7,17 +7,17 @@
       "developer": "PKU-Alignment",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.3332,
-        "reward-bench/Chat": 0.6173,
-        "reward-bench/Chat Hard": 0.4232,
-        "reward-bench/Safety": 0.7589,
-        "reward-bench/Reasoning": 0.5482,
-        "reward-bench/Prior Sets (0.5 weight)": 0.57,
+        "reward-bench/Score": 0.5798,
         "reward-bench/Factuality": 0.3263,
         "reward-bench/Precise IF": 0.2313,
         "reward-bench/Math": 0.3989,
+        "reward-bench/Safety": 0.7351,
         "reward-bench/Focus": 0.2939,
-        "reward-bench/Ties": -0.01
+        "reward-bench/Ties": -0.01,
+        "reward-bench/Chat": 0.6173,
+        "reward-bench/Chat Hard": 0.4232,
+        "reward-bench/Reasoning": 0.5482,
+        "reward-bench/Prior Sets (0.5 weight)": 0.57
       }
     },
     {
@@ -26,17 +26,17 @@
       "developer": "PKU-Alignment",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.4727,
+        "reward-bench/Score": 0.1606,
+        "reward-bench/Chat": 0.8184,
+        "reward-bench/Chat Hard": 0.2873,
+        "reward-bench/Safety": 0.1422,
+        "reward-bench/Reasoning": 0.346,
+        "reward-bench/Prior Sets (0.5 weight)": 0.5993,
         "reward-bench/Factuality": 0.2105,
         "reward-bench/Precise IF": 0.2938,
         "reward-bench/Math": 0.2623,
-        "reward-bench/Safety": 0.3757,
         "reward-bench/Focus": 0.0646,
-        "reward-bench/Ties": -0.01,
-        "reward-bench/Chat": 0.8184,
-        "reward-bench/Chat Hard": 0.2873,
-        "reward-bench/Reasoning": 0.346,
-        "reward-bench/Prior Sets (0.5 weight)": 0.5993
+        "reward-bench/Ties": -0.01
       }
     },
     {
@@ -64,17 +64,17 @@
       "developer": "PKU-Alignment",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.6366,
+        "reward-bench/Score": 0.2544,
+        "reward-bench/Chat": 0.8994,
+        "reward-bench/Chat Hard": 0.364,
+        "reward-bench/Safety": 0.3156,
+        "reward-bench/Reasoning": 0.6887,
+        "reward-bench/Prior Sets (0.5 weight)": 0.6171,
         "reward-bench/Factuality": 0.2168,
         "reward-bench/Precise IF": 0.2562,
         "reward-bench/Math": 0.3825,
-        "reward-bench/Safety": 0.6041,
         "reward-bench/Focus": 0.2606,
-        "reward-bench/Ties": 0.0944,
-        "reward-bench/Chat": 0.8994,
-        "reward-bench/Chat Hard": 0.364,
-        "reward-bench/Reasoning": 0.6887,
-        "reward-bench/Prior Sets (0.5 weight)": 0.6171
+        "reward-bench/Ties": 0.0944
       }
     }
   ]
diff --git a/data/developers/primeintellect.json b/data/developers/primeintellect.json
index 674a0e3b141480d7e0d33d0a2fe9b205b710216f..160722785b06f80d9b220dee435fad1245d45495 100644
--- a/data/developers/primeintellect.json
+++ b/data/developers/primeintellect.json
@@ -8,11 +8,11 @@
       "evaluator_relationship": null,
       "benchmark_scores": {
         "hfopenllm_v2/IFEval": 0.1757,
-        "hfopenllm_v2/BBH": 0.276,
+        "hfopenllm_v2/BBH": 0.274,
         "hfopenllm_v2/MATH Level 5": 0.0,
-        "hfopenllm_v2/GPQA": 0.2534,
-        "hfopenllm_v2/MUSR": 0.3339,
-        "hfopenllm_v2/MMLU-PRO": 0.1123
+        "hfopenllm_v2/GPQA": 0.25,
+        "hfopenllm_v2/MUSR": 0.3753,
+        "hfopenllm_v2/MMLU-PRO": 0.112
       }
     },
     {
diff --git a/data/developers/princeton-nlp.json b/data/developers/princeton-nlp.json
index c2ef64ee6de3b0368749e3cc226eadfed8e75e4e..10775d33e9c59e099c9bb1a2b84aa429c8a82f44 100644
--- a/data/developers/princeton-nlp.json
+++ b/data/developers/princeton-nlp.json
@@ -49,12 +49,12 @@
       "developer": "princeton-nlp",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.5508,
-        "hfopenllm_v2/BBH": 0.5028,
-        "hfopenllm_v2/MATH Level 5": 0.0529,
-        "hfopenllm_v2/GPQA": 0.2861,
-        "hfopenllm_v2/MUSR": 0.4266,
-        "hfopenllm_v2/MMLU-PRO": 0.3231
+        "hfopenllm_v2/IFEval": 0.3978,
+        "hfopenllm_v2/BBH": 0.4983,
+        "hfopenllm_v2/MATH Level 5": 0.0582,
+        "hfopenllm_v2/GPQA": 0.281,
+        "hfopenllm_v2/MUSR": 0.425,
+        "hfopenllm_v2/MMLU-PRO": 0.3246
       }
     },
     {
diff --git a/data/developers/quazim0t0.json b/data/developers/quazim0t0.json
index c70b926d08d065ebcbe4ee7b0de956265e80bd56..496082d204a36ed57a388cd540f10080c702a420 100644
--- a/data/developers/quazim0t0.json
+++ b/data/developers/quazim0t0.json
@@ -637,12 +637,12 @@
       "developer": "Quazim0t0",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.7016,
-        "hfopenllm_v2/BBH": 0.6942,
-        "hfopenllm_v2/MATH Level 5": 0.4116,
-        "hfopenllm_v2/GPQA": 0.3624,
-        "hfopenllm_v2/MUSR": 0.4571,
-        "hfopenllm_v2/MMLU-PRO": 0.5411
+        "hfopenllm_v2/IFEval": 0.2922,
+        "hfopenllm_v2/BBH": 0.6559,
+        "hfopenllm_v2/MATH Level 5": 0.2545,
+        "hfopenllm_v2/GPQA": 0.2659,
+        "hfopenllm_v2/MUSR": 0.3929,
+        "hfopenllm_v2/MMLU-PRO": 0.5207
       }
     },
     {
diff --git a/data/developers/qwen.json b/data/developers/qwen.json
index b583c293c02ad4ba7bcdda51cb7b094b097d1824..8996ee46a22c0193f6bf8577a319d9b66b681c8b 100644
--- a/data/developers/qwen.json
+++ b/data/developers/qwen.json
@@ -775,12 +775,12 @@
       "developer": "Qwen",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.3153,
-        "hfopenllm_v2/BBH": 0.3322,
-        "hfopenllm_v2/MATH Level 5": 0.1035,
-        "hfopenllm_v2/GPQA": 0.2592,
-        "hfopenllm_v2/MUSR": 0.3342,
-        "hfopenllm_v2/MMLU-PRO": 0.172
+        "hfopenllm_v2/IFEval": 0.3071,
+        "hfopenllm_v2/BBH": 0.3341,
+        "hfopenllm_v2/MATH Level 5": 0.0,
+        "hfopenllm_v2/GPQA": 0.2576,
+        "hfopenllm_v2/MUSR": 0.3329,
+        "hfopenllm_v2/MMLU-PRO": 0.1697
       }
     },
     {
@@ -906,7 +906,8 @@
         "hfopenllm_v2/MATH Level 5": 0.3678,
         "hfopenllm_v2/GPQA": 0.2727,
         "hfopenllm_v2/MUSR": 0.3968,
-        "hfopenllm_v2/MMLU-PRO": 0.3255
+        "hfopenllm_v2/MMLU-PRO": 0.3255,
+        "theory_of_mind/accuracy on theory_of_mind for scorer model_graded_fact": 0.78
       }
     },
     {
@@ -1176,12 +1177,12 @@
       "developer": "Qwen",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.6147,
-        "hfopenllm_v2/BBH": 0.4999,
-        "hfopenllm_v2/MATH Level 5": 0.031,
-        "hfopenllm_v2/GPQA": 0.2936,
-        "hfopenllm_v2/MUSR": 0.4099,
-        "hfopenllm_v2/MMLU-PRO": 0.3354
+        "hfopenllm_v2/IFEval": 0.6101,
+        "hfopenllm_v2/BBH": 0.5008,
+        "hfopenllm_v2/MATH Level 5": 0.3716,
+        "hfopenllm_v2/GPQA": 0.2919,
+        "hfopenllm_v2/MUSR": 0.4073,
+        "hfopenllm_v2/MMLU-PRO": 0.3352
       }
     },
     {
diff --git a/data/developers/ray2333.json b/data/developers/ray2333.json
index 709d24161c8237750555868d00eabe667376cddb..0d33fd83045f7f1f2a0352b101b8f653545ad511 100644
--- a/data/developers/ray2333.json
+++ b/data/developers/ray2333.json
@@ -61,16 +61,16 @@
       "developer": "Ray2333",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.8839,
+        "reward-bench/Score": 0.5966,
+        "reward-bench/Chat": 0.9302,
+        "reward-bench/Chat Hard": 0.7719,
+        "reward-bench/Safety": 0.9222,
+        "reward-bench/Reasoning": 0.912,
         "reward-bench/Factuality": 0.5305,
         "reward-bench/Precise IF": 0.3125,
         "reward-bench/Math": 0.5902,
-        "reward-bench/Safety": 0.9216,
         "reward-bench/Focus": 0.7455,
-        "reward-bench/Ties": 0.4788,
-        "reward-bench/Chat": 0.9302,
-        "reward-bench/Chat Hard": 0.7719,
-        "reward-bench/Reasoning": 0.912
+        "reward-bench/Ties": 0.4788
       }
     },
     {
@@ -116,17 +116,17 @@
       "developer": "Ray2333",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.6089,
-        "reward-bench/Chat": 0.986,
-        "reward-bench/Chat Hard": 0.6776,
-        "reward-bench/Safety": 0.7867,
-        "reward-bench/Reasoning": 0.9229,
-        "reward-bench/Prior Sets (0.5 weight)": 0.7309,
+        "reward-bench/Score": 0.8542,
         "reward-bench/Factuality": 0.6189,
         "reward-bench/Precise IF": 0.3875,
         "reward-bench/Math": 0.5792,
+        "reward-bench/Safety": 0.8919,
         "reward-bench/Focus": 0.6828,
-        "reward-bench/Ties": 0.5981
+        "reward-bench/Ties": 0.5981,
+        "reward-bench/Chat": 0.986,
+        "reward-bench/Chat Hard": 0.6776,
+        "reward-bench/Reasoning": 0.9229,
+        "reward-bench/Prior Sets (0.5 weight)": 0.7309
       }
     },
     {
diff --git a/data/developers/recoilme.json b/data/developers/recoilme.json
index 1e7efb22560a32fb85354296593a7cbce1d04e91..862469c1653f0d1414e3f6dabd97ef04b6d4f9e2 100644
--- a/data/developers/recoilme.json
+++ b/data/developers/recoilme.json
@@ -7,12 +7,12 @@
       "developer": "recoilme",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.7649,
-        "hfopenllm_v2/BBH": 0.5974,
-        "hfopenllm_v2/MATH Level 5": 0.0174,
-        "hfopenllm_v2/GPQA": 0.3305,
-        "hfopenllm_v2/MUSR": 0.4245,
-        "hfopenllm_v2/MMLU-PRO": 0.4207
+        "hfopenllm_v2/IFEval": 0.2854,
+        "hfopenllm_v2/BBH": 0.5984,
+        "hfopenllm_v2/MATH Level 5": 0.1005,
+        "hfopenllm_v2/GPQA": 0.3297,
+        "hfopenllm_v2/MUSR": 0.4607,
+        "hfopenllm_v2/MMLU-PRO": 0.4162
       }
     },
     {
@@ -35,12 +35,12 @@
       "developer": "recoilme",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.2747,
-        "hfopenllm_v2/BBH": 0.6031,
-        "hfopenllm_v2/MATH Level 5": 0.0831,
-        "hfopenllm_v2/GPQA": 0.3305,
-        "hfopenllm_v2/MUSR": 0.4686,
-        "hfopenllm_v2/MMLU-PRO": 0.4122
+        "hfopenllm_v2/IFEval": 0.7592,
+        "hfopenllm_v2/BBH": 0.6026,
+        "hfopenllm_v2/MATH Level 5": 0.0529,
+        "hfopenllm_v2/GPQA": 0.3289,
+        "hfopenllm_v2/MUSR": 0.4099,
+        "hfopenllm_v2/MMLU-PRO": 0.4163
       }
     },
     {
@@ -49,12 +49,12 @@
       "developer": "recoilme",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.7439,
-        "hfopenllm_v2/BBH": 0.5993,
-        "hfopenllm_v2/MATH Level 5": 0.0876,
-        "hfopenllm_v2/GPQA": 0.3238,
-        "hfopenllm_v2/MUSR": 0.4204,
-        "hfopenllm_v2/MMLU-PRO": 0.4072
+        "hfopenllm_v2/IFEval": 0.5761,
+        "hfopenllm_v2/BBH": 0.602,
+        "hfopenllm_v2/MATH Level 5": 0.1888,
+        "hfopenllm_v2/GPQA": 0.3372,
+        "hfopenllm_v2/MUSR": 0.4632,
+        "hfopenllm_v2/MMLU-PRO": 0.4039
       }
     },
     {
diff --git a/data/developers/replete-ai.json b/data/developers/replete-ai.json
index dbb06f00736a7fcddf76b24a5c7673e098ecab57..0f038b31b0c0cf28a02b6c25fb4fd9bd374c118c 100644
--- a/data/developers/replete-ai.json
+++ b/data/developers/replete-ai.json
@@ -91,12 +91,12 @@
       "developer": "Replete-AI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.0932,
-        "hfopenllm_v2/BBH": 0.2977,
+        "hfopenllm_v2/IFEval": 0.0905,
+        "hfopenllm_v2/BBH": 0.2985,
         "hfopenllm_v2/MATH Level 5": 0.0,
-        "hfopenllm_v2/GPQA": 0.2475,
-        "hfopenllm_v2/MUSR": 0.3941,
-        "hfopenllm_v2/MMLU-PRO": 0.1157
+        "hfopenllm_v2/GPQA": 0.2534,
+        "hfopenllm_v2/MUSR": 0.3848,
+        "hfopenllm_v2/MMLU-PRO": 0.1158
       }
     },
     {
diff --git a/data/developers/sfairxc.json b/data/developers/sfairxc.json
index f511504fa472f754b653d8fff6432038f2fa5642..0f83a5aa819fcf69906563e63fd5c5a7cc1f8ce0 100644
--- a/data/developers/sfairxc.json
+++ b/data/developers/sfairxc.json
@@ -7,17 +7,17 @@
       "developer": "sfairXC",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.6292,
-        "reward-bench/Chat": 0.9944,
-        "reward-bench/Chat Hard": 0.6513,
-        "reward-bench/Safety": 0.7667,
-        "reward-bench/Reasoning": 0.8644,
-        "reward-bench/Prior Sets (0.5 weight)": 0.7492,
+        "reward-bench/Score": 0.8338,
         "reward-bench/Factuality": 0.5916,
         "reward-bench/Precise IF": 0.4188,
         "reward-bench/Math": 0.6284,
+        "reward-bench/Safety": 0.8676,
         "reward-bench/Focus": 0.7051,
-        "reward-bench/Ties": 0.6647
+        "reward-bench/Ties": 0.6647,
+        "reward-bench/Chat": 0.9944,
+        "reward-bench/Chat Hard": 0.6513,
+        "reward-bench/Reasoning": 0.8644,
+        "reward-bench/Prior Sets (0.5 weight)": 0.7492
       }
     }
   ]
diff --git a/data/developers/skywork.json b/data/developers/skywork.json
index a1886dbde5ab2f08ceeb175a3c92e5b11655f24d..310fc474921f3ff87a2e73d5f74077c21e25d2fe 100644
--- a/data/developers/skywork.json
+++ b/data/developers/skywork.json
@@ -47,16 +47,16 @@
       "developer": "Skywork",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.938,
+        "reward-bench/Score": 0.7576,
+        "reward-bench/Chat": 0.9581,
+        "reward-bench/Chat Hard": 0.9145,
+        "reward-bench/Safety": 0.9422,
+        "reward-bench/Reasoning": 0.9606,
         "reward-bench/Factuality": 0.7368,
         "reward-bench/Precise IF": 0.4031,
         "reward-bench/Math": 0.7049,
-        "reward-bench/Safety": 0.9189,
         "reward-bench/Focus": 0.9323,
-        "reward-bench/Ties": 0.8261,
-        "reward-bench/Chat": 0.9581,
-        "reward-bench/Chat Hard": 0.9145,
-        "reward-bench/Reasoning": 0.9606
+        "reward-bench/Ties": 0.8261
       }
     },
     {
@@ -71,16 +71,16 @@
         "hfopenllm_v2/GPQA": 0.344,
         "hfopenllm_v2/MUSR": 0.4231,
         "hfopenllm_v2/MMLU-PRO": 0.4103,
-        "reward-bench/Score": 0.7531,
-        "reward-bench/Chat": 0.9609,
-        "reward-bench/Chat Hard": 0.8991,
-        "reward-bench/Safety": 0.9689,
-        "reward-bench/Reasoning": 0.9807,
+        "reward-bench/Score": 0.9426,
         "reward-bench/Factuality": 0.7674,
         "reward-bench/Precise IF": 0.375,
         "reward-bench/Math": 0.6721,
+        "reward-bench/Safety": 0.9297,
         "reward-bench/Focus": 0.9172,
-        "reward-bench/Ties": 0.8182
+        "reward-bench/Ties": 0.8182,
+        "reward-bench/Chat": 0.9609,
+        "reward-bench/Chat Hard": 0.8991,
+        "reward-bench/Reasoning": 0.9807
       }
     },
     {
@@ -89,16 +89,16 @@
       "developer": "Skywork",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.7314,
-        "reward-bench/Chat": 0.9581,
-        "reward-bench/Chat Hard": 0.8728,
-        "reward-bench/Safety": 0.9333,
-        "reward-bench/Reasoning": 0.962,
+        "reward-bench/Score": 0.9252,
         "reward-bench/Factuality": 0.6989,
         "reward-bench/Precise IF": 0.425,
         "reward-bench/Math": 0.6284,
+        "reward-bench/Safety": 0.9081,
         "reward-bench/Focus": 0.9616,
-        "reward-bench/Ties": 0.741
+        "reward-bench/Ties": 0.741,
+        "reward-bench/Chat": 0.9581,
+        "reward-bench/Chat Hard": 0.8728,
+        "reward-bench/Reasoning": 0.962
       }
     },
     {
@@ -230,16 +230,16 @@
       "developer": "Skywork",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.9007,
+        "reward-bench/Score": 0.6885,
+        "reward-bench/Chat": 0.8994,
+        "reward-bench/Chat Hard": 0.875,
+        "reward-bench/Safety": 0.8911,
+        "reward-bench/Reasoning": 0.9176,
         "reward-bench/Factuality": 0.6063,
         "reward-bench/Precise IF": 0.35,
         "reward-bench/Math": 0.6339,
-        "reward-bench/Safety": 0.9108,
         "reward-bench/Focus": 0.8909,
-        "reward-bench/Ties": 0.7586,
-        "reward-bench/Chat": 0.8994,
-        "reward-bench/Chat Hard": 0.875,
-        "reward-bench/Reasoning": 0.9176
+        "reward-bench/Ties": 0.7586
       }
     }
   ]
diff --git a/data/developers/snowflake.json b/data/developers/snowflake.json
index ea10d8d8be19d81cae9a42ff10694a41ac772666..d36a86d899a25b9271cd05a105d08de1ef5c396f 100644
--- a/data/developers/snowflake.json
+++ b/data/developers/snowflake.json
@@ -7,6 +7,16 @@
       "developer": "snowflake",
       "evaluator_relationship": null,
       "benchmark_scores": {
+        "helm_lite/Mean win rate": 0.338,
+        "helm_lite/NarrativeQA": 0.654,
+        "helm_lite/NaturalQuestions (closed-book)": 0.39,
+        "helm_lite/OpenbookQA": 0.828,
+        "helm_lite/MMLU": 0.575,
+        "helm_lite/MATH": 0.519,
+        "helm_lite/GSM8K": 0.768,
+        "helm_lite/LegalBench": 0.588,
+        "helm_lite/MedQA": 0.581,
+        "helm_lite/WMT 2014": 0.172,
         "helm_mmlu/MMLU All Subjects": 0.677,
         "helm_mmlu/Abstract Algebra": 0.35,
         "helm_mmlu/Anatomy": 0.652,
@@ -42,17 +52,7 @@
         "helm_mmlu/Sociology": 0.891,
         "helm_mmlu/Virology": 0.536,
         "helm_mmlu/World Religions": 0.854,
-        "helm_mmlu/Mean win rate": 0.565,
-        "helm_lite/Mean win rate": 0.338,
-        "helm_lite/NarrativeQA": 0.654,
-        "helm_lite/NaturalQuestions (closed-book)": 0.39,
-        "helm_lite/OpenbookQA": 0.828,
-        "helm_lite/MMLU": 0.575,
-        "helm_lite/MATH": 0.519,
-        "helm_lite/GSM8K": 0.768,
-        "helm_lite/LegalBench": 0.588,
-        "helm_lite/MedQA": 0.581,
-        "helm_lite/WMT 2014": 0.172
+        "helm_mmlu/Mean win rate": 0.565
       }
     }
   ]
diff --git a/data/developers/ucla-agi.json b/data/developers/ucla-agi.json
index 75dd11b6fe190b43556ce10dc96eff4392988245..93fb82e5b4d9ef216b964e7c543f4f7a063aa502 100644
--- a/data/developers/ucla-agi.json
+++ b/data/developers/ucla-agi.json
@@ -77,12 +77,12 @@
       "developer": "UCLA-AGI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.6834,
-        "hfopenllm_v2/BBH": 0.508,
-        "hfopenllm_v2/MATH Level 5": 0.0959,
+        "hfopenllm_v2/IFEval": 0.6703,
+        "hfopenllm_v2/BBH": 0.5076,
+        "hfopenllm_v2/MATH Level 5": 0.0718,
         "hfopenllm_v2/GPQA": 0.2651,
-        "hfopenllm_v2/MUSR": 0.3661,
-        "hfopenllm_v2/MMLU-PRO": 0.3644
+        "hfopenllm_v2/MUSR": 0.3647,
+        "hfopenllm_v2/MMLU-PRO": 0.3658
       }
     },
     {
diff --git a/data/developers/valiantlabs.json b/data/developers/valiantlabs.json
index 28817feab6d48bbca18c53e003fe218c174dc60a..a0fc2b3d81b7ddfe31ae4c63f4f370e45c708501 100644
--- a/data/developers/valiantlabs.json
+++ b/data/developers/valiantlabs.json
@@ -91,12 +91,12 @@
       "developer": "ValiantLabs",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.5328,
-        "hfopenllm_v2/BBH": 0.4613,
-        "hfopenllm_v2/MATH Level 5": 0.0876,
-        "hfopenllm_v2/GPQA": 0.2894,
-        "hfopenllm_v2/MUSR": 0.3367,
-        "hfopenllm_v2/MMLU-PRO": 0.2424
+        "hfopenllm_v2/IFEval": 0.5483,
+        "hfopenllm_v2/BBH": 0.461,
+        "hfopenllm_v2/MATH Level 5": 0.0582,
+        "hfopenllm_v2/GPQA": 0.2886,
+        "hfopenllm_v2/MUSR": 0.3433,
+        "hfopenllm_v2/MMLU-PRO": 0.2407
       }
     },
     {
@@ -105,12 +105,12 @@
       "developer": "ValiantLabs",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.6496,
-        "hfopenllm_v2/BBH": 0.4774,
-        "hfopenllm_v2/MATH Level 5": 0.0566,
-        "hfopenllm_v2/GPQA": 0.3104,
-        "hfopenllm_v2/MUSR": 0.3909,
-        "hfopenllm_v2/MMLU-PRO": 0.3382
+        "hfopenllm_v2/IFEval": 0.2678,
+        "hfopenllm_v2/BBH": 0.4429,
+        "hfopenllm_v2/MATH Level 5": 0.0521,
+        "hfopenllm_v2/GPQA": 0.302,
+        "hfopenllm_v2/MUSR": 0.3959,
+        "hfopenllm_v2/MMLU-PRO": 0.2927
       }
     },
     {
diff --git a/data/developers/virnect.json b/data/developers/virnect.json
index 42ebd864a2c9cd593e1e2721cb69ddb9031aa22d..c4d108ef590950a6696d9775575feeb937623918 100644
--- a/data/developers/virnect.json
+++ b/data/developers/virnect.json
@@ -7,12 +7,12 @@
       "developer": "VIRNECT",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.5021,
-        "hfopenllm_v2/BBH": 0.4918,
-        "hfopenllm_v2/MATH Level 5": 0.108,
+        "hfopenllm_v2/IFEval": 0.5058,
+        "hfopenllm_v2/BBH": 0.4908,
+        "hfopenllm_v2/MATH Level 5": 0.0929,
         "hfopenllm_v2/GPQA": 0.271,
-        "hfopenllm_v2/MUSR": 0.3648,
-        "hfopenllm_v2/MMLU-PRO": 0.3536
+        "hfopenllm_v2/MUSR": 0.3662,
+        "hfopenllm_v2/MMLU-PRO": 0.3539
       }
     },
     {
diff --git a/data/developers/weqweasdas.json b/data/developers/weqweasdas.json
index 36aa4e34297ffab30b15be8c96f1b385a32071bf..fd061c03aa20db7f3ab1d7ab2e9b6a6249c65553 100644
--- a/data/developers/weqweasdas.json
+++ b/data/developers/weqweasdas.json
@@ -7,17 +7,17 @@
       "developer": "weqweasdas",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.2498,
-        "reward-bench/Chat": 0.8184,
-        "reward-bench/Chat Hard": 0.3728,
-        "reward-bench/Safety": 0.24,
-        "reward-bench/Reasoning": 0.3281,
-        "reward-bench/Prior Sets (0.5 weight)": 0.6564,
+        "reward-bench/Score": 0.5027,
         "reward-bench/Factuality": 0.3642,
         "reward-bench/Precise IF": 0.275,
         "reward-bench/Math": 0.3497,
+        "reward-bench/Safety": 0.4149,
         "reward-bench/Focus": 0.2384,
-        "reward-bench/Ties": 0.0315
+        "reward-bench/Ties": 0.0315,
+        "reward-bench/Chat": 0.8184,
+        "reward-bench/Chat Hard": 0.3728,
+        "reward-bench/Reasoning": 0.3281,
+        "reward-bench/Prior Sets (0.5 weight)": 0.6564
       }
     },
     {
@@ -26,17 +26,17 @@
       "developer": "weqweasdas",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.3057,
-        "reward-bench/Chat": 0.9441,
-        "reward-bench/Chat Hard": 0.4079,
-        "reward-bench/Safety": 0.3311,
-        "reward-bench/Reasoning": 0.7637,
-        "reward-bench/Prior Sets (0.5 weight)": 0.6652,
+        "reward-bench/Score": 0.6549,
         "reward-bench/Factuality": 0.3705,
         "reward-bench/Precise IF": 0.2812,
         "reward-bench/Math": 0.4317,
+        "reward-bench/Safety": 0.4986,
         "reward-bench/Focus": 0.2343,
-        "reward-bench/Ties": 0.1851
+        "reward-bench/Ties": 0.1851,
+        "reward-bench/Chat": 0.9441,
+        "reward-bench/Chat Hard": 0.4079,
+        "reward-bench/Reasoning": 0.7637,
+        "reward-bench/Prior Sets (0.5 weight)": 0.6652
       }
     },
     {
@@ -78,17 +78,17 @@
       "developer": "weqweasdas",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "reward-bench/Score": 0.596,
-        "reward-bench/Chat": 0.9665,
-        "reward-bench/Chat Hard": 0.6053,
-        "reward-bench/Safety": 0.6911,
-        "reward-bench/Reasoning": 0.7736,
-        "reward-bench/Prior Sets (0.5 weight)": 0.753,
+        "reward-bench/Score": 0.7982,
         "reward-bench/Factuality": 0.5937,
         "reward-bench/Precise IF": 0.3438,
         "reward-bench/Math": 0.5956,
+        "reward-bench/Safety": 0.8703,
         "reward-bench/Focus": 0.7293,
-        "reward-bench/Ties": 0.6226
+        "reward-bench/Ties": 0.6226,
+        "reward-bench/Chat": 0.9665,
+        "reward-bench/Chat Hard": 0.6053,
+        "reward-bench/Reasoning": 0.7736,
+        "reward-bench/Prior Sets (0.5 weight)": 0.753
       }
     }
   ]
diff --git a/data/developers/xai.json b/data/developers/xai.json
index 4539ab8505af399ecb6f06dee4d4f2618de3e3bc..5bf50e8754b14e71e2153e4564b26c11231d3d27 100644
--- a/data/developers/xai.json
+++ b/data/developers/xai.json
@@ -78,7 +78,7 @@
       "developer": "xAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 25.4
+        "terminal-bench-2.0/terminal-bench-2.0": 23.1
       }
     },
     {
@@ -120,7 +120,7 @@
       "developer": "xAI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "terminal-bench-2.0/terminal-bench-2.0": 25.8
+        "terminal-bench-2.0/terminal-bench-2.0": 14.2
       }
     }
   ]
diff --git a/data/developers/ycros.json b/data/developers/ycros.json
index e95400442aba80bbdb2ffa85224e7399ce946b0d..9b83f7f872f197f98fd71538cda202af618f5659 100644
--- a/data/developers/ycros.json
+++ b/data/developers/ycros.json
@@ -7,12 +7,12 @@
       "developer": "ycros",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.6262,
-        "hfopenllm_v2/BBH": 0.5142,
-        "hfopenllm_v2/MATH Level 5": 0.0937,
-        "hfopenllm_v2/GPQA": 0.3079,
-        "hfopenllm_v2/MUSR": 0.4138,
-        "hfopenllm_v2/MMLU-PRO": 0.3481
+        "hfopenllm_v2/IFEval": 0.5994,
+        "hfopenllm_v2/BBH": 0.5159,
+        "hfopenllm_v2/MATH Level 5": 0.0785,
+        "hfopenllm_v2/GPQA": 0.3045,
+        "hfopenllm_v2/MUSR": 0.4203,
+        "hfopenllm_v2/MMLU-PRO": 0.3473
       }
     }
   ]
diff --git a/data/developers/yoyo-ai.json b/data/developers/yoyo-ai.json
index 9dcbc81e86094f67bbf3f24e1917205fb4cc44f8..d0e307257898b20d6b2c21faa52d04a28d7cca51 100644
--- a/data/developers/yoyo-ai.json
+++ b/data/developers/yoyo-ai.json
@@ -105,12 +105,12 @@
       "developer": "YOYO-AI",
       "evaluator_relationship": null,
       "benchmark_scores": {
-        "hfopenllm_v2/IFEval": 0.5899,
-        "hfopenllm_v2/BBH": 0.654,
-        "hfopenllm_v2/MATH Level 5": 0.4509,
-        "hfopenllm_v2/GPQA": 0.3834,
-        "hfopenllm_v2/MUSR": 0.4744,
-        "hfopenllm_v2/MMLU-PRO": 0.5376
+        "hfopenllm_v2/IFEval": 0.7905,
+        "hfopenllm_v2/BBH": 0.6406,
+        "hfopenllm_v2/MATH Level 5": 0.0,
+        "hfopenllm_v2/GPQA": 0.3163,
+        "hfopenllm_v2/MUSR": 0.4181,
+        "hfopenllm_v2/MMLU-PRO": 0.4944
       }
     },
     {
diff --git a/data/models.json b/data/models.json
index 1c62b750346e574615920f5e5cddd93c6e67872c..2b93623cbdf71160760101030d0eb4a96cc29224 100644
--- a/data/models.json
+++ b/data/models.json
@@ -1005,12 +1005,12 @@
     "developer": "adriszmar",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.1685,
-      "hfopenllm_v2/BBH": 0.3124,
-      "hfopenllm_v2/MATH Level 5": 0.0015,
-      "hfopenllm_v2/GPQA": 0.2492,
-      "hfopenllm_v2/MUSR": 0.3963,
-      "hfopenllm_v2/MMLU-PRO": 0.1066
+      "hfopenllm_v2/IFEval": 0.1746,
+      "hfopenllm_v2/BBH": 0.3126,
+      "hfopenllm_v2/MATH Level 5": 0.0,
+      "hfopenllm_v2/GPQA": 0.245,
+      "hfopenllm_v2/MUSR": 0.4096,
+      "hfopenllm_v2/MMLU-PRO": 0.1087
     }
   },
   {
@@ -1391,9 +1391,9 @@
     "developer": "AI2",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.7008,
-      "reward-bench/Chat": 0.9385,
-      "reward-bench/Chat Hard": 0.3882,
+      "reward-bench/Score": 0.6924,
+      "reward-bench/Chat": 0.9441,
+      "reward-bench/Chat Hard": 0.3575,
       "reward-bench/Safety": 0.7757
     }
   },
@@ -2036,12 +2036,12 @@
     "developer": "akjindal53244",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.8051,
-      "hfopenllm_v2/BBH": 0.5189,
-      "hfopenllm_v2/MATH Level 5": 0.1722,
-      "hfopenllm_v2/GPQA": 0.3263,
+      "hfopenllm_v2/IFEval": 0.8033,
+      "hfopenllm_v2/BBH": 0.5196,
+      "hfopenllm_v2/MATH Level 5": 0.1624,
+      "hfopenllm_v2/GPQA": 0.3096,
       "hfopenllm_v2/MUSR": 0.4028,
-      "hfopenllm_v2/MMLU-PRO": 0.3803
+      "hfopenllm_v2/MMLU-PRO": 0.3812
     }
   },
   {
@@ -2390,17 +2390,17 @@
     "developer": "allenai",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.9021,
+      "reward-bench/Score": 0.7606,
+      "reward-bench/Chat": 0.9665,
+      "reward-bench/Chat Hard": 0.8355,
+      "reward-bench/Safety": 0.8844,
+      "reward-bench/Reasoning": 0.8969,
+      "reward-bench/Prior Sets (0.5 weight)": 0.0,
       "reward-bench/Factuality": 0.8126,
       "reward-bench/Precise IF": 0.4188,
       "reward-bench/Math": 0.6995,
-      "reward-bench/Safety": 0.9095,
       "reward-bench/Focus": 0.8646,
-      "reward-bench/Ties": 0.8835,
-      "reward-bench/Chat": 0.9665,
-      "reward-bench/Chat Hard": 0.8355,
-      "reward-bench/Reasoning": 0.8969,
-      "reward-bench/Prior Sets (0.5 weight)": 0.0
+      "reward-bench/Ties": 0.8835
     }
   },
   {
@@ -2428,17 +2428,17 @@
     "developer": "allenai",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.8885,
+      "reward-bench/Score": 0.7285,
+      "reward-bench/Chat": 0.9581,
+      "reward-bench/Chat Hard": 0.8158,
+      "reward-bench/Safety": 0.8956,
+      "reward-bench/Reasoning": 0.887,
+      "reward-bench/Prior Sets (0.5 weight)": 0.0,
       "reward-bench/Factuality": 0.7432,
       "reward-bench/Precise IF": 0.4437,
       "reward-bench/Math": 0.6175,
-      "reward-bench/Safety": 0.8932,
       "reward-bench/Focus": 0.9071,
-      "reward-bench/Ties": 0.7638,
-      "reward-bench/Chat": 0.9581,
-      "reward-bench/Chat Hard": 0.8158,
-      "reward-bench/Reasoning": 0.887,
-      "reward-bench/Prior Sets (0.5 weight)": 0.0
+      "reward-bench/Ties": 0.7638
     }
   },
   {
@@ -2447,12 +2447,12 @@
     "developer": "allenai",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.8379,
-      "hfopenllm_v2/BBH": 0.6157,
-      "hfopenllm_v2/MATH Level 5": 0.3829,
+      "hfopenllm_v2/IFEval": 0.8291,
+      "hfopenllm_v2/BBH": 0.6164,
+      "hfopenllm_v2/MATH Level 5": 0.4502,
       "hfopenllm_v2/GPQA": 0.3733,
-      "hfopenllm_v2/MUSR": 0.4988,
-      "hfopenllm_v2/MMLU-PRO": 0.4656
+      "hfopenllm_v2/MUSR": 0.4948,
+      "hfopenllm_v2/MMLU-PRO": 0.4645
     }
   },
   {
@@ -2489,17 +2489,17 @@
     "developer": "allenai",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.722,
-      "reward-bench/Chat": 0.9693,
-      "reward-bench/Chat Hard": 0.8268,
-      "reward-bench/Safety": 0.8689,
-      "reward-bench/Reasoning": 0.8583,
-      "reward-bench/Prior Sets (0.5 weight)": 0.0,
+      "reward-bench/Score": 0.8892,
       "reward-bench/Factuality": 0.8084,
       "reward-bench/Precise IF": 0.3688,
       "reward-bench/Math": 0.6776,
+      "reward-bench/Safety": 0.9027,
       "reward-bench/Focus": 0.7778,
-      "reward-bench/Ties": 0.8308
+      "reward-bench/Ties": 0.8308,
+      "reward-bench/Chat": 0.9693,
+      "reward-bench/Chat Hard": 0.8268,
+      "reward-bench/Reasoning": 0.8583,
+      "reward-bench/Prior Sets (0.5 weight)": 0.0
     }
   },
   {
@@ -2536,17 +2536,17 @@
     "developer": "allenai",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.687,
-      "reward-bench/Chat": 0.9553,
-      "reward-bench/Chat Hard": 0.761,
-      "reward-bench/Safety": 0.86,
-      "reward-bench/Reasoning": 0.7898,
-      "reward-bench/Prior Sets (0.5 weight)": 0.0,
+      "reward-bench/Score": 0.8431,
       "reward-bench/Factuality": 0.7516,
       "reward-bench/Precise IF": 0.3875,
       "reward-bench/Math": 0.6284,
+      "reward-bench/Safety": 0.8662,
       "reward-bench/Focus": 0.8545,
-      "reward-bench/Ties": 0.6397
+      "reward-bench/Ties": 0.6397,
+      "reward-bench/Chat": 0.9553,
+      "reward-bench/Chat Hard": 0.761,
+      "reward-bench/Reasoning": 0.7898,
+      "reward-bench/Prior Sets (0.5 weight)": 0.0
     }
   },
   {
@@ -7446,12 +7446,12 @@
     "developer": "Anthropic",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "appworld_test_normal/appworld/test_normal": 0.68,
+      "appworld_test_normal/appworld/test_normal": 0.7,
       "browsecompplus/browsecompplus": 0.61,
-      "swe-bench/swe-bench": 0.65,
-      "tau-bench-2_airline/tau-bench-2/airline": 0.72,
+      "swe-bench/swe-bench": 0.6061,
+      "tau-bench-2_airline/tau-bench-2/airline": 0.66,
       "tau-bench-2_retail/tau-bench-2/retail": 0.78,
-      "tau-bench-2_telecom/tau-bench-2/telecom": 0.76
+      "tau-bench-2_telecom/tau-bench-2/telecom": 0.84
     }
   },
   {
@@ -7460,7 +7460,7 @@
     "developer": "Anthropic",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 38.0
+      "terminal-bench-2.0/terminal-bench-2.0": 35.1
     }
   },
   {
@@ -7469,7 +7469,7 @@
     "developer": "Anthropic",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 59.1
+      "terminal-bench-2.0/terminal-bench-2.0": 52.1
     }
   },
   {
@@ -7478,7 +7478,7 @@
     "developer": "Anthropic",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 58.0
+      "terminal-bench-2.0/terminal-bench-2.0": 62.9
     }
   },
   {
@@ -7552,7 +7552,7 @@
     "developer": "Anthropic",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 43.1
+      "terminal-bench-2.0/terminal-bench-2.0": 42.6
     }
   },
   {
@@ -7596,8 +7596,6 @@
     "developer": "Anthropic",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "ace/Overall Score": 0.478,
-      "ace/Gaming Score": 0.391,
       "apex-agents/Overall Pass@1": 0.184,
       "apex-agents/Overall Pass@8": 0.34,
       "apex-agents/Overall Mean Score": 0.348,
@@ -7605,6 +7603,8 @@
       "apex-agents/Management Consulting Pass@1": 0.132,
       "apex-agents/Corporate Law Pass@1": 0.202,
       "apex-agents/Corporate Lawyer Mean Score": 0.471,
+      "ace/Overall Score": 0.478,
+      "ace/Gaming Score": 0.391,
       "apex-v1/Medicine (MD) Score": 0.65
     }
   },
@@ -12127,12 +12127,12 @@
     "developer": "cognitivecomputations",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.4124,
-      "hfopenllm_v2/BBH": 0.6383,
-      "hfopenllm_v2/MATH Level 5": 0.182,
-      "hfopenllm_v2/GPQA": 0.3289,
-      "hfopenllm_v2/MUSR": 0.4349,
-      "hfopenllm_v2/MMLU-PRO": 0.4525
+      "hfopenllm_v2/IFEval": 0.3613,
+      "hfopenllm_v2/BBH": 0.6123,
+      "hfopenllm_v2/MATH Level 5": 0.1239,
+      "hfopenllm_v2/GPQA": 0.328,
+      "hfopenllm_v2/MUSR": 0.4112,
+      "hfopenllm_v2/MMLU-PRO": 0.4494
     }
   },
   {
@@ -12852,12 +12852,12 @@
     "developer": "Columbia-NLP",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.3278,
-      "hfopenllm_v2/BBH": 0.392,
-      "hfopenllm_v2/MATH Level 5": 0.0431,
-      "hfopenllm_v2/GPQA": 0.2492,
-      "hfopenllm_v2/MUSR": 0.412,
-      "hfopenllm_v2/MMLU-PRO": 0.1666
+      "hfopenllm_v2/IFEval": 0.3102,
+      "hfopenllm_v2/BBH": 0.3881,
+      "hfopenllm_v2/MATH Level 5": 0.0536,
+      "hfopenllm_v2/GPQA": 0.2534,
+      "hfopenllm_v2/MUSR": 0.4081,
+      "hfopenllm_v2/MMLU-PRO": 0.1665
     }
   },
   {
@@ -13400,12 +13400,12 @@
     "developer": "cpayne1303",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.1916,
-      "hfopenllm_v2/BBH": 0.2977,
-      "hfopenllm_v2/MATH Level 5": 0.0,
+      "hfopenllm_v2/IFEval": 0.1949,
+      "hfopenllm_v2/BBH": 0.2965,
+      "hfopenllm_v2/MATH Level 5": 0.0045,
       "hfopenllm_v2/GPQA": 0.2685,
-      "hfopenllm_v2/MUSR": 0.3872,
-      "hfopenllm_v2/MMLU-PRO": 0.1132
+      "hfopenllm_v2/MUSR": 0.3885,
+      "hfopenllm_v2/MMLU-PRO": 0.1111
     }
   },
   {
@@ -14338,12 +14338,12 @@
     "developer": "Daemontatox",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.3745,
-      "hfopenllm_v2/BBH": 0.6668,
-      "hfopenllm_v2/MATH Level 5": 0.4758,
-      "hfopenllm_v2/GPQA": 0.3943,
-      "hfopenllm_v2/MUSR": 0.4858,
-      "hfopenllm_v2/MMLU-PRO": 0.5593
+      "hfopenllm_v2/IFEval": 0.4855,
+      "hfopenllm_v2/BBH": 0.6627,
+      "hfopenllm_v2/MATH Level 5": 0.4841,
+      "hfopenllm_v2/GPQA": 0.3096,
+      "hfopenllm_v2/MUSR": 0.4256,
+      "hfopenllm_v2/MMLU-PRO": 0.5542
     }
   },
   {
@@ -15729,12 +15729,12 @@
     "developer": "DeepMount00",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.7917,
-      "hfopenllm_v2/BBH": 0.5109,
-      "hfopenllm_v2/MATH Level 5": 0.1088,
-      "hfopenllm_v2/GPQA": 0.2878,
-      "hfopenllm_v2/MUSR": 0.4136,
-      "hfopenllm_v2/MMLU-PRO": 0.3876
+      "hfopenllm_v2/IFEval": 0.5365,
+      "hfopenllm_v2/BBH": 0.517,
+      "hfopenllm_v2/MATH Level 5": 0.1707,
+      "hfopenllm_v2/GPQA": 0.3062,
+      "hfopenllm_v2/MUSR": 0.4487,
+      "hfopenllm_v2/MMLU-PRO": 0.396
     }
   },
   {
@@ -16376,12 +16376,12 @@
     "developer": "dfurman",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.2835,
-      "hfopenllm_v2/BBH": 0.3842,
-      "hfopenllm_v2/MATH Level 5": 0.0521,
-      "hfopenllm_v2/GPQA": 0.2609,
-      "hfopenllm_v2/MUSR": 0.3566,
-      "hfopenllm_v2/MMLU-PRO": 0.2298
+      "hfopenllm_v2/IFEval": 0.3,
+      "hfopenllm_v2/BBH": 0.3853,
+      "hfopenllm_v2/MATH Level 5": 0.0415,
+      "hfopenllm_v2/GPQA": 0.2617,
+      "hfopenllm_v2/MUSR": 0.3579,
+      "hfopenllm_v2/MMLU-PRO": 0.2281
     }
   },
   {
@@ -17020,12 +17020,12 @@
     "developer": "DoppelReflEx",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.451,
-      "hfopenllm_v2/BBH": 0.4944,
-      "hfopenllm_v2/MATH Level 5": 0.1156,
-      "hfopenllm_v2/GPQA": 0.3196,
-      "hfopenllm_v2/MUSR": 0.3896,
-      "hfopenllm_v2/MMLU-PRO": 0.3256
+      "hfopenllm_v2/IFEval": 0.436,
+      "hfopenllm_v2/BBH": 0.4956,
+      "hfopenllm_v2/MATH Level 5": 0.0589,
+      "hfopenllm_v2/GPQA": 0.3205,
+      "hfopenllm_v2/MUSR": 0.3843,
+      "hfopenllm_v2/MMLU-PRO": 0.3237
     }
   },
   {
@@ -23519,6 +23519,7 @@
     "developer": "Google",
     "evaluator_relationship": null,
     "benchmark_scores": {
+      "ace/Gaming Score": 0.415,
       "apex-agents/Overall Pass@1": 0.24,
       "apex-agents/Overall Pass@8": 0.367,
       "apex-agents/Overall Mean Score": 0.395,
@@ -23526,7 +23527,6 @@
       "apex-agents/Management Consulting Pass@1": 0.193,
       "apex-agents/Corporate Law Pass@1": 0.259,
       "apex-agents/Corporate Lawyer Mean Score": 0.524,
-      "ace/Gaming Score": 0.415,
       "apex-v1/Overall Score": 0.64,
       "apex-v1/Consulting Score": 0.64
     }
@@ -23537,6 +23537,8 @@
     "developer": "Google",
     "evaluator_relationship": null,
     "benchmark_scores": {
+      "ace/Overall Score": 0.47,
+      "ace/Gaming Score": 0.509,
       "apex-agents/Overall Pass@1": 0.184,
       "apex-agents/Overall Pass@8": 0.373,
       "apex-agents/Overall Mean Score": 0.341,
@@ -23544,8 +23546,6 @@
       "apex-agents/Management Consulting Pass@1": 0.124,
       "apex-agents/Corporate Law Pass@1": 0.239,
       "apex-agents/Corporate Lawyer Mean Score": 0.487,
-      "ace/Overall Score": 0.47,
-      "ace/Gaming Score": 0.509,
       "apex-v1/Overall Score": 0.643,
       "apex-v1/Consulting Score": 0.64,
       "apex-v1/Investment Banking Score": 0.63
@@ -24103,7 +24103,7 @@
       "reward-bench/Safety": 0.909,
       "reward-bench/Focus": 0.841,
       "reward-bench/Ties": 0.809,
-      "terminal-bench-2.0/terminal-bench-2.0": 17.1
+      "terminal-bench-2.0/terminal-bench-2.0": 16.9
     }
   },
   {
@@ -24241,7 +24241,7 @@
     "developer": "Google",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 51.0
+      "terminal-bench-2.0/terminal-bench-2.0": 47.4
     }
   },
   {
@@ -24259,8 +24259,8 @@
     "developer": "Google",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "appworld_test_normal/appworld/test_normal": 0.13,
-      "browsecompplus/browsecompplus": 0.48,
+      "appworld_test_normal/appworld/test_normal": 0.55,
+      "browsecompplus/browsecompplus": 0.3333,
       "global-mmlu-lite/Global MMLU Lite": 0.9453,
       "global-mmlu-lite/Culturally Sensitive": 0.9397,
       "global-mmlu-lite/Culturally Agnostic": 0.9509,
@@ -24280,8 +24280,8 @@
       "global-mmlu-lite/Yoruba": 0.9425,
       "global-mmlu-lite/Chinese": 0.9475,
       "global-mmlu-lite/Burmese": 0.9425,
-      "swe-bench/swe-bench": 0.7234,
-      "tau-bench-2_airline/tau-bench-2/airline": 0.7,
+      "swe-bench/swe-bench": 0.71,
+      "tau-bench-2_airline/tau-bench-2/airline": 0.68,
       "tau-bench-2_retail/tau-bench-2/retail": 0.73,
       "tau-bench-2_telecom/tau-bench-2/telecom": 0.73
     }
@@ -24408,12 +24408,12 @@
     "developer": "Google",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.2018,
-      "hfopenllm_v2/BBH": 0.3709,
-      "hfopenllm_v2/MATH Level 5": 0.0302,
+      "hfopenllm_v2/IFEval": 0.1993,
+      "hfopenllm_v2/BBH": 0.3656,
+      "hfopenllm_v2/MATH Level 5": 0.0287,
       "hfopenllm_v2/GPQA": 0.2626,
-      "hfopenllm_v2/MUSR": 0.4219,
-      "hfopenllm_v2/MMLU-PRO": 0.2217
+      "hfopenllm_v2/MUSR": 0.4232,
+      "hfopenllm_v2/MMLU-PRO": 0.218
     }
   },
   {
@@ -24436,12 +24436,12 @@
     "developer": "Google",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.5078,
-      "hfopenllm_v2/BBH": 0.4226,
-      "hfopenllm_v2/MATH Level 5": 0.0347,
-      "hfopenllm_v2/GPQA": 0.2852,
-      "hfopenllm_v2/MUSR": 0.3964,
-      "hfopenllm_v2/MMLU-PRO": 0.2578
+      "hfopenllm_v2/IFEval": 0.5288,
+      "hfopenllm_v2/BBH": 0.4178,
+      "hfopenllm_v2/MATH Level 5": 0.0476,
+      "hfopenllm_v2/GPQA": 0.2752,
+      "hfopenllm_v2/MUSR": 0.3728,
+      "hfopenllm_v2/MMLU-PRO": 0.2467
     }
   },
   {
@@ -26786,12 +26786,12 @@
     "developer": "HuggingFaceTB",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.0593,
-      "hfopenllm_v2/BBH": 0.3135,
-      "hfopenllm_v2/MATH Level 5": 0.0144,
-      "hfopenllm_v2/GPQA": 0.2341,
-      "hfopenllm_v2/MUSR": 0.3871,
-      "hfopenllm_v2/MMLU-PRO": 0.1092
+      "hfopenllm_v2/IFEval": 0.2883,
+      "hfopenllm_v2/BBH": 0.3124,
+      "hfopenllm_v2/MATH Level 5": 0.003,
+      "hfopenllm_v2/GPQA": 0.2357,
+      "hfopenllm_v2/MUSR": 0.3662,
+      "hfopenllm_v2/MMLU-PRO": 0.1115
     }
   },
   {
@@ -28507,16 +28507,16 @@
     "developer": "infly",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.7648,
-      "reward-bench/Chat": 0.9665,
-      "reward-bench/Chat Hard": 0.9101,
-      "reward-bench/Safety": 0.9644,
-      "reward-bench/Reasoning": 0.9912,
+      "reward-bench/Score": 0.9511,
       "reward-bench/Factuality": 0.7411,
       "reward-bench/Precise IF": 0.4188,
       "reward-bench/Math": 0.6995,
+      "reward-bench/Safety": 0.9365,
       "reward-bench/Focus": 0.903,
-      "reward-bench/Ties": 0.8622
+      "reward-bench/Ties": 0.8622,
+      "reward-bench/Chat": 0.9665,
+      "reward-bench/Chat Hard": 0.9101,
+      "reward-bench/Reasoning": 0.9912
     }
   },
   {
@@ -28701,16 +28701,16 @@
     "developer": "internlm",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.8759,
+      "reward-bench/Score": 0.5335,
+      "reward-bench/Chat": 0.9916,
+      "reward-bench/Chat Hard": 0.6952,
+      "reward-bench/Safety": 0.5956,
+      "reward-bench/Reasoning": 0.9453,
       "reward-bench/Factuality": 0.4211,
       "reward-bench/Precise IF": 0.4,
       "reward-bench/Math": 0.5628,
-      "reward-bench/Safety": 0.8716,
       "reward-bench/Focus": 0.7051,
-      "reward-bench/Ties": 0.5164,
-      "reward-bench/Chat": 0.9916,
-      "reward-bench/Chat Hard": 0.6952,
-      "reward-bench/Reasoning": 0.9453
+      "reward-bench/Ties": 0.5164
     }
   },
   {
@@ -30623,12 +30623,12 @@
     "developer": "jaspionjader",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.4345,
-      "hfopenllm_v2/BBH": 0.5419,
-      "hfopenllm_v2/MATH Level 5": 0.1292,
-      "hfopenllm_v2/GPQA": 0.3087,
+      "hfopenllm_v2/IFEval": 0.4418,
+      "hfopenllm_v2/BBH": 0.5406,
+      "hfopenllm_v2/MATH Level 5": 0.1352,
+      "hfopenllm_v2/GPQA": 0.3062,
       "hfopenllm_v2/MUSR": 0.4277,
-      "hfopenllm_v2/MMLU-PRO": 0.3854
+      "hfopenllm_v2/MMLU-PRO": 0.386
     }
   },
   {
@@ -38012,12 +38012,12 @@
     "developer": "LeroyDyer",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.3579,
-      "hfopenllm_v2/BBH": 0.4477,
-      "hfopenllm_v2/MATH Level 5": 0.0423,
-      "hfopenllm_v2/GPQA": 0.3096,
-      "hfopenllm_v2/MUSR": 0.4134,
-      "hfopenllm_v2/MMLU-PRO": 0.2376
+      "hfopenllm_v2/IFEval": 0.3798,
+      "hfopenllm_v2/BBH": 0.4483,
+      "hfopenllm_v2/MATH Level 5": 0.04,
+      "hfopenllm_v2/GPQA": 0.3129,
+      "hfopenllm_v2/MUSR": 0.4148,
+      "hfopenllm_v2/MMLU-PRO": 0.2389
     }
   },
   {
@@ -38936,12 +38936,12 @@
     "developer": "llmat",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.364,
-      "hfopenllm_v2/BBH": 0.4005,
-      "hfopenllm_v2/MATH Level 5": 0.0015,
-      "hfopenllm_v2/GPQA": 0.2693,
-      "hfopenllm_v2/MUSR": 0.3529,
-      "hfopenllm_v2/MMLU-PRO": 0.2301
+      "hfopenllm_v2/IFEval": 0.377,
+      "hfopenllm_v2/BBH": 0.3978,
+      "hfopenllm_v2/MATH Level 5": 0.0242,
+      "hfopenllm_v2/GPQA": 0.2668,
+      "hfopenllm_v2/MUSR": 0.3555,
+      "hfopenllm_v2/MMLU-PRO": 0.2278
     }
   },
   {
@@ -39569,16 +39569,16 @@
     "developer": "LxzGordon",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.9294,
+      "reward-bench/Score": 0.7394,
+      "reward-bench/Chat": 0.9553,
+      "reward-bench/Chat Hard": 0.8816,
+      "reward-bench/Safety": 0.9178,
+      "reward-bench/Reasoning": 0.9698,
       "reward-bench/Factuality": 0.6884,
       "reward-bench/Precise IF": 0.45,
       "reward-bench/Math": 0.6393,
-      "reward-bench/Safety": 0.9108,
       "reward-bench/Focus": 0.9758,
-      "reward-bench/Ties": 0.7653,
-      "reward-bench/Chat": 0.9553,
-      "reward-bench/Chat Hard": 0.8816,
-      "reward-bench/Reasoning": 0.9698
+      "reward-bench/Ties": 0.7653
     }
   },
   {
@@ -42623,6 +42623,16 @@
       "helm_capabilities/IFEval": 0.743,
       "helm_capabilities/WildBench": 0.686,
       "helm_capabilities/Omni-MATH": 0.137,
+      "helm_lite/Mean win rate": 0.303,
+      "helm_lite/NarrativeQA": 0.756,
+      "helm_lite/NaturalQuestions (closed-book)": 0.209,
+      "helm_lite/OpenbookQA": 0.74,
+      "helm_lite/MMLU": 0.5,
+      "helm_lite/MATH": 0.703,
+      "helm_lite/GSM8K": 0.798,
+      "helm_lite/LegalBench": 0.342,
+      "helm_lite/MedQA": 0.245,
+      "helm_lite/WMT 2014": 0.181,
       "helm_mmlu/MMLU All Subjects": 0.561,
       "helm_mmlu/Abstract Algebra": 0.26,
       "helm_mmlu/Anatomy": 0.459,
@@ -42658,17 +42668,7 @@
       "helm_mmlu/Sociology": 0.701,
       "helm_mmlu/Virology": 0.446,
       "helm_mmlu/World Religions": 0.789,
-      "helm_mmlu/Mean win rate": 0.475,
-      "helm_lite/Mean win rate": 0.303,
-      "helm_lite/NarrativeQA": 0.756,
-      "helm_lite/NaturalQuestions (closed-book)": 0.209,
-      "helm_lite/OpenbookQA": 0.74,
-      "helm_lite/MMLU": 0.5,
-      "helm_lite/MATH": 0.703,
-      "helm_lite/GSM8K": 0.798,
-      "helm_lite/LegalBench": 0.342,
-      "helm_lite/MedQA": 0.245,
-      "helm_lite/WMT 2014": 0.181
+      "helm_mmlu/Mean win rate": 0.475
     }
   },
   {
@@ -42731,6 +42731,16 @@
     "developer": "Meta",
     "evaluator_relationship": null,
     "benchmark_scores": {
+      "helm_lite/Mean win rate": 0.819,
+      "helm_lite/NarrativeQA": 0.777,
+      "helm_lite/NaturalQuestions (closed-book)": 0.457,
+      "helm_lite/OpenbookQA": 0.942,
+      "helm_lite/MMLU": 0.703,
+      "helm_lite/MATH": 0.791,
+      "helm_lite/GSM8K": 0.936,
+      "helm_lite/LegalBench": 0.68,
+      "helm_lite/MedQA": 0.769,
+      "helm_lite/WMT 2014": 0.224,
       "helm_mmlu/MMLU All Subjects": 0.803,
       "helm_mmlu/Abstract Algebra": 0.52,
       "helm_mmlu/Anatomy": 0.8,
@@ -42766,17 +42776,7 @@
       "helm_mmlu/Sociology": 0.92,
       "helm_mmlu/Virology": 0.584,
       "helm_mmlu/World Religions": 0.901,
-      "helm_mmlu/Mean win rate": 0.773,
-      "helm_lite/Mean win rate": 0.819,
-      "helm_lite/NarrativeQA": 0.777,
-      "helm_lite/NaturalQuestions (closed-book)": 0.457,
-      "helm_lite/OpenbookQA": 0.942,
-      "helm_lite/MMLU": 0.703,
-      "helm_lite/MATH": 0.791,
-      "helm_lite/GSM8K": 0.936,
-      "helm_lite/LegalBench": 0.68,
-      "helm_lite/MedQA": 0.769,
-      "helm_lite/WMT 2014": 0.224
+      "helm_mmlu/Mean win rate": 0.773
     }
   },
   {
@@ -43667,7 +43667,7 @@
     "developer": "MiniMax",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 29.2
+      "terminal-bench-2.0/terminal-bench-2.0": 36.6
     }
   },
   {
@@ -43803,6 +43803,16 @@
       "helm_capabilities/IFEval": 0.567,
       "helm_capabilities/WildBench": 0.66,
       "helm_capabilities/Omni-MATH": 0.072,
+      "helm_lite/Mean win rate": 0.196,
+      "helm_lite/NarrativeQA": 0.716,
+      "helm_lite/NaturalQuestions (closed-book)": 0.253,
+      "helm_lite/OpenbookQA": 0.79,
+      "helm_lite/MMLU": 0.51,
+      "helm_lite/MATH": 0.289,
+      "helm_lite/GSM8K": 0.538,
+      "helm_lite/LegalBench": 0.331,
+      "helm_lite/MedQA": 0.517,
+      "helm_lite/WMT 2014": 0.142,
       "helm_mmlu/MMLU All Subjects": 0.599,
       "helm_mmlu/Abstract Algebra": 0.27,
       "helm_mmlu/Anatomy": 0.585,
@@ -43839,16 +43849,6 @@
       "helm_mmlu/Virology": 0.47,
       "helm_mmlu/World Religions": 0.825,
       "helm_mmlu/Mean win rate": 0.509,
-      "helm_lite/Mean win rate": 0.196,
-      "helm_lite/NarrativeQA": 0.716,
-      "helm_lite/NaturalQuestions (closed-book)": 0.253,
-      "helm_lite/OpenbookQA": 0.79,
-      "helm_lite/MMLU": 0.51,
-      "helm_lite/MATH": 0.289,
-      "helm_lite/GSM8K": 0.538,
-      "helm_lite/LegalBench": 0.331,
-      "helm_lite/MedQA": 0.517,
-      "helm_lite/WMT 2014": 0.142,
       "hfopenllm_v2/IFEval": 0.5465,
       "hfopenllm_v2/BBH": 0.4722,
       "hfopenllm_v2/MATH Level 5": 0.0385,
@@ -44452,12 +44452,12 @@
     "developer": "mistralai",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.2326,
-      "hfopenllm_v2/BBH": 0.5098,
-      "hfopenllm_v2/MATH Level 5": 0.0937,
-      "hfopenllm_v2/GPQA": 0.3205,
-      "hfopenllm_v2/MUSR": 0.4413,
-      "hfopenllm_v2/MMLU-PRO": 0.3871
+      "hfopenllm_v2/IFEval": 0.2415,
+      "hfopenllm_v2/BBH": 0.5087,
+      "hfopenllm_v2/MATH Level 5": 0.102,
+      "hfopenllm_v2/GPQA": 0.3138,
+      "hfopenllm_v2/MUSR": 0.4321,
+      "hfopenllm_v2/MMLU-PRO": 0.385
     }
   },
   {
@@ -44758,12 +44758,12 @@
     "developer": "mlabonne",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.4162,
-      "hfopenllm_v2/BBH": 0.5124,
-      "hfopenllm_v2/MATH Level 5": 0.0853,
-      "hfopenllm_v2/GPQA": 0.3029,
-      "hfopenllm_v2/MUSR": 0.415,
-      "hfopenllm_v2/MMLU-PRO": 0.3802
+      "hfopenllm_v2/IFEval": 0.7561,
+      "hfopenllm_v2/BBH": 0.5111,
+      "hfopenllm_v2/MATH Level 5": 0.0906,
+      "hfopenllm_v2/GPQA": 0.3062,
+      "hfopenllm_v2/MUSR": 0.4019,
+      "hfopenllm_v2/MMLU-PRO": 0.3841
     }
   },
   {
@@ -44996,7 +44996,7 @@
     "developer": "Moonshot AI",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 26.7
+      "terminal-bench-2.0/terminal-bench-2.0": 27.8
     }
   },
   {
@@ -45317,7 +45317,7 @@
     "developer": "Multiple",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 71.0
+      "terminal-bench-2.0/terminal-bench-2.0": 72.4
     }
   },
   {
@@ -45633,12 +45633,12 @@
     "developer": "nazimali",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.4964,
-      "hfopenllm_v2/BBH": 0.4699,
-      "hfopenllm_v2/MATH Level 5": 0.0045,
-      "hfopenllm_v2/GPQA": 0.2827,
-      "hfopenllm_v2/MUSR": 0.3979,
-      "hfopenllm_v2/MMLU-PRO": 0.3063
+      "hfopenllm_v2/IFEval": 0.486,
+      "hfopenllm_v2/BBH": 0.4721,
+      "hfopenllm_v2/MATH Level 5": 0.0846,
+      "hfopenllm_v2/GPQA": 0.2844,
+      "hfopenllm_v2/MUSR": 0.4006,
+      "hfopenllm_v2/MMLU-PRO": 0.3087
     }
   },
   {
@@ -48273,16 +48273,16 @@
     "developer": "nicolinho",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.9444,
+      "reward-bench/Score": 0.7667,
+      "reward-bench/Chat": 0.9665,
+      "reward-bench/Chat Hard": 0.9013,
+      "reward-bench/Safety": 0.9578,
+      "reward-bench/Reasoning": 0.9826,
       "reward-bench/Factuality": 0.7853,
       "reward-bench/Precise IF": 0.3719,
       "reward-bench/Math": 0.6995,
-      "reward-bench/Safety": 0.927,
       "reward-bench/Focus": 0.9535,
-      "reward-bench/Ties": 0.8321,
-      "reward-bench/Chat": 0.9665,
-      "reward-bench/Chat Hard": 0.9013,
-      "reward-bench/Reasoning": 0.9826
+      "reward-bench/Ties": 0.8321
     }
   },
   {
@@ -48317,16 +48317,16 @@
     "developer": "nicolinho",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.9314,
+      "reward-bench/Score": 0.7074,
+      "reward-bench/Chat": 0.9637,
+      "reward-bench/Chat Hard": 0.8684,
+      "reward-bench/Safety": 0.9467,
+      "reward-bench/Reasoning": 0.9677,
       "reward-bench/Factuality": 0.6653,
       "reward-bench/Precise IF": 0.4062,
       "reward-bench/Math": 0.612,
-      "reward-bench/Safety": 0.9257,
       "reward-bench/Focus": 0.8909,
-      "reward-bench/Ties": 0.7234,
-      "reward-bench/Chat": 0.9637,
-      "reward-bench/Chat Hard": 0.8684,
-      "reward-bench/Reasoning": 0.9677
+      "reward-bench/Ties": 0.7234
     }
   },
   {
@@ -48447,12 +48447,12 @@
     "developer": "nisten",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.3914,
-      "hfopenllm_v2/BBH": 0.6591,
-      "hfopenllm_v2/MATH Level 5": 0.3044,
-      "hfopenllm_v2/GPQA": 0.3591,
-      "hfopenllm_v2/MUSR": 0.4681,
-      "hfopenllm_v2/MMLU-PRO": 0.5611
+      "hfopenllm_v2/IFEval": 0.3799,
+      "hfopenllm_v2/BBH": 0.6647,
+      "hfopenllm_v2/MATH Level 5": 0.3406,
+      "hfopenllm_v2/GPQA": 0.4035,
+      "hfopenllm_v2/MUSR": 0.494,
+      "hfopenllm_v2/MMLU-PRO": 0.5731
     }
   },
   {
@@ -49326,20 +49326,6 @@
       "hfopenllm_v2/MMLU-PRO": 0.232
     }
   },
-  {
-    "id": "NousResearch/Yarn-Llama-2-7b-128k",
-    "name": "Yarn-Llama-2-7b-128k",
-    "developer": "NousResearch",
-    "evaluator_relationship": null,
-    "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.1485,
-      "hfopenllm_v2/BBH": 0.3248,
-      "hfopenllm_v2/MATH Level 5": 0.0151,
-      "hfopenllm_v2/GPQA": 0.2601,
-      "hfopenllm_v2/MUSR": 0.3967,
-      "hfopenllm_v2/MMLU-PRO": 0.1791
-    }
-  },
   {
     "id": "NousResearch/Yarn-Llama-2-7b-64k",
     "name": "Yarn-Llama-2-7b-64k",
@@ -50085,12 +50071,12 @@
     "developer": "Omkar1102",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.2254,
-      "hfopenllm_v2/BBH": 0.275,
+      "hfopenllm_v2/IFEval": 0.2148,
+      "hfopenllm_v2/BBH": 0.276,
       "hfopenllm_v2/MATH Level 5": 0.0,
-      "hfopenllm_v2/GPQA": 0.2576,
-      "hfopenllm_v2/MUSR": 0.3762,
-      "hfopenllm_v2/MMLU-PRO": 0.1123
+      "hfopenllm_v2/GPQA": 0.2508,
+      "hfopenllm_v2/MUSR": 0.3802,
+      "hfopenllm_v2/MMLU-PRO": 0.1126
     }
   },
   {
@@ -51011,16 +50997,16 @@
     "developer": "OpenAI",
     "evaluator_relationship": null,
     "benchmark_scores": {
+      "ace/Overall Score": 0.515,
+      "ace/Food Score": 0.65,
+      "ace/Gaming Score": 0.578,
       "apex-agents/Overall Pass@1": 0.23,
       "apex-agents/Overall Pass@8": 0.4,
       "apex-agents/Overall Mean Score": 0.387,
       "apex-agents/Investment Banking Pass@1": 0.273,
       "apex-agents/Management Consulting Pass@1": 0.227,
       "apex-agents/Corporate Law Pass@1": 0.189,
-      "apex-agents/Corporate Lawyer Mean Score": 0.443,
-      "ace/Overall Score": 0.515,
-      "ace/Food Score": 0.65,
-      "ace/Gaming Score": 0.578
+      "apex-agents/Corporate Lawyer Mean Score": 0.443
     }
   },
   {
@@ -51148,13 +51134,6 @@
     "developer": "OpenAI",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "helm_instruct/Mean win rate": 0.689,
-      "helm_instruct/Anthropic RLHF dataset": 4.964,
-      "helm_instruct/Best ChatGPT Prompts": 4.986,
-      "helm_instruct/Koala test dataset": 4.987,
-      "helm_instruct/Open Assistant": 4.987,
-      "helm_instruct/Self Instruct": 4.99,
-      "helm_instruct/Vicuna": 4.992,
       "helm_classic/Mean win rate": 0.783,
       "helm_classic/MMLU": 0.391,
       "helm_classic/BoolQ": 0.87,
@@ -51170,6 +51149,13 @@
       "helm_classic/IMDB": 0.943,
       "helm_classic/CivilComments": 0.696,
       "helm_classic/RAFT": 0.748,
+      "helm_instruct/Mean win rate": 0.689,
+      "helm_instruct/Anthropic RLHF dataset": 4.964,
+      "helm_instruct/Best ChatGPT Prompts": 4.986,
+      "helm_instruct/Koala test dataset": 4.987,
+      "helm_instruct/Open Assistant": 4.987,
+      "helm_instruct/Self Instruct": 4.99,
+      "helm_instruct/Vicuna": 4.992,
       "helm_lite/Mean win rate": 0.358,
       "helm_lite/NarrativeQA": 0.655,
       "helm_lite/NaturalQuestions (closed-book)": 0.335,
@@ -51253,6 +51239,16 @@
     "developer": "OpenAI",
     "evaluator_relationship": null,
     "benchmark_scores": {
+      "helm_lite/Mean win rate": 0.867,
+      "helm_lite/NarrativeQA": 0.768,
+      "helm_lite/NaturalQuestions (closed-book)": 0.457,
+      "helm_lite/OpenbookQA": 0.96,
+      "helm_lite/MMLU": 0.735,
+      "helm_lite/MATH": 0.802,
+      "helm_lite/GSM8K": 0.932,
+      "helm_lite/LegalBench": 0.713,
+      "helm_lite/MedQA": 0.815,
+      "helm_lite/WMT 2014": 0.211,
       "helm_mmlu/MMLU All Subjects": 0.824,
       "helm_mmlu/Abstract Algebra": 0.63,
       "helm_mmlu/Anatomy": 0.8,
@@ -51288,17 +51284,7 @@
       "helm_mmlu/Sociology": 0.93,
       "helm_mmlu/Virology": 0.596,
       "helm_mmlu/World Religions": 0.877,
-      "helm_mmlu/Mean win rate": 0.517,
-      "helm_lite/Mean win rate": 0.867,
-      "helm_lite/NarrativeQA": 0.768,
-      "helm_lite/NaturalQuestions (closed-book)": 0.457,
-      "helm_lite/OpenbookQA": 0.96,
-      "helm_lite/MMLU": 0.735,
-      "helm_lite/MATH": 0.802,
-      "helm_lite/GSM8K": 0.932,
-      "helm_lite/LegalBench": 0.713,
-      "helm_lite/MedQA": 0.815,
-      "helm_lite/WMT 2014": 0.211
+      "helm_mmlu/Mean win rate": 0.517
     }
   },
   {
@@ -51361,6 +51347,16 @@
     "developer": "OpenAI",
     "evaluator_relationship": null,
     "benchmark_scores": {
+      "helm_lite/Mean win rate": 0.864,
+      "helm_lite/NarrativeQA": 0.761,
+      "helm_lite/NaturalQuestions (closed-book)": 0.482,
+      "helm_lite/OpenbookQA": 0.97,
+      "helm_lite/MMLU": 0.711,
+      "helm_lite/MATH": 0.833,
+      "helm_lite/GSM8K": 0.824,
+      "helm_lite/LegalBench": 0.727,
+      "helm_lite/MedQA": 0.783,
+      "helm_lite/WMT 2014": 0.218,
       "helm_mmlu/MMLU All Subjects": 0.813,
       "helm_mmlu/Abstract Algebra": 0.56,
       "helm_mmlu/Anatomy": 0.822,
@@ -51397,16 +51393,6 @@
       "helm_mmlu/Virology": 0.602,
       "helm_mmlu/World Religions": 0.848,
       "helm_mmlu/Mean win rate": 0.351,
-      "helm_lite/Mean win rate": 0.864,
-      "helm_lite/NarrativeQA": 0.761,
-      "helm_lite/NaturalQuestions (closed-book)": 0.482,
-      "helm_lite/OpenbookQA": 0.97,
-      "helm_lite/MMLU": 0.711,
-      "helm_lite/MATH": 0.833,
-      "helm_lite/GSM8K": 0.824,
-      "helm_lite/LegalBench": 0.727,
-      "helm_lite/MedQA": 0.783,
-      "helm_lite/WMT 2014": 0.218,
       "reward-bench/Score": 0.8395,
       "reward-bench/Chat": 0.9525,
       "reward-bench/Chat Hard": 0.7544,
@@ -51620,16 +51606,16 @@
       "helm_mmlu/Virology": 0.578,
       "helm_mmlu/World Religions": 0.883,
       "helm_mmlu/Mean win rate": 0.52,
-      "reward-bench/Score": 0.6493,
-      "reward-bench/Chat": 0.9609,
-      "reward-bench/Chat Hard": 0.761,
-      "reward-bench/Safety": 0.8619,
-      "reward-bench/Reasoning": 0.8661,
+      "reward-bench/Score": 0.8673,
       "reward-bench/Factuality": 0.5684,
       "reward-bench/Precise IF": 0.3312,
       "reward-bench/Math": 0.623,
+      "reward-bench/Safety": 0.8811,
       "reward-bench/Focus": 0.7293,
-      "reward-bench/Ties": 0.7819
+      "reward-bench/Ties": 0.7819,
+      "reward-bench/Chat": 0.9609,
+      "reward-bench/Chat Hard": 0.761,
+      "reward-bench/Reasoning": 0.8661
     }
   },
   {
@@ -51725,7 +51711,7 @@
     "developer": "OpenAI",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 35.2
+      "terminal-bench-2.0/terminal-bench-2.0": 49.6
     }
   },
   {
@@ -51759,9 +51745,9 @@
       "helm_capabilities/IFEval": 0.875,
       "helm_capabilities/WildBench": 0.857,
       "helm_capabilities/Omni-MATH": 0.647,
-      "livecodebenchpro/Hard Problems": 0.0423,
-      "livecodebenchpro/Medium Problems": 0.4085,
-      "livecodebenchpro/Easy Problems": 0.9014
+      "livecodebenchpro/Hard Problems": 0.04225352112676056,
+      "livecodebenchpro/Medium Problems": 0.4084507042253521,
+      "livecodebenchpro/Easy Problems": 0.8873239436619719
     }
   },
   {
@@ -51770,7 +51756,7 @@
     "developer": "OpenAI",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 44.3
+      "terminal-bench-2.0/terminal-bench-2.0": 43.4
     }
   },
   {
@@ -51779,7 +51765,7 @@
     "developer": "OpenAI",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 34.8
+      "terminal-bench-2.0/terminal-bench-2.0": 24.0
     }
   },
   {
@@ -51802,7 +51788,7 @@
     "developer": "OpenAI",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 9.9
+      "terminal-bench-2.0/terminal-bench-2.0": 11.5
     }
   },
   {
@@ -51834,7 +51820,7 @@
     "developer": "OpenAI",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 53.5
+      "terminal-bench-2.0/terminal-bench-2.0": 57.8
     }
   },
   {
@@ -51861,7 +51847,7 @@
     "developer": "OpenAI",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 60.7
+      "terminal-bench-2.0/terminal-bench-2.0": 62.9
     }
   },
   {
@@ -51871,14 +51857,14 @@
     "evaluator_relationship": null,
     "benchmark_scores": {
       "appworld_test_normal/appworld/test_normal": 0.0,
-      "browsecompplus/browsecompplus": 0.48,
+      "browsecompplus/browsecompplus": 0.43,
       "livecodebenchpro/Hard Problems": 0.1594,
       "livecodebenchpro/Medium Problems": 0.5211,
       "livecodebenchpro/Easy Problems": 0.9014,
       "swe-bench/swe-bench": 0.5455,
       "tau-bench-2_airline/tau-bench-2/airline": 0.6,
-      "tau-bench-2_retail/tau-bench-2/retail": 0.68,
-      "tau-bench-2_telecom/tau-bench-2/telecom": 0.5354
+      "tau-bench-2_retail/tau-bench-2/retail": 0.73,
+      "tau-bench-2_telecom/tau-bench-2/telecom": 0.71
     }
   },
   {
@@ -51896,7 +51882,7 @@
     "developer": "OpenAI",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 64.7
+      "terminal-bench-2.0/terminal-bench-2.0": 77.3
     }
   },
   {
@@ -51960,7 +51946,7 @@
       "livecodebenchpro/Hard Problems": 0.0,
       "livecodebenchpro/Medium Problems": 0.11267605633802817,
       "livecodebenchpro/Easy Problems": 0.6619718309859155,
-      "terminal-bench-2.0/terminal-bench-2.0": 14.2
+      "terminal-bench-2.0/terminal-bench-2.0": 18.7
     }
   },
   {
@@ -51978,7 +51964,7 @@
       "livecodebenchpro/Hard Problems": 0.0,
       "livecodebenchpro/Medium Problems": 0.056338028169014086,
       "livecodebenchpro/Easy Problems": 0.5070422535211268,
-      "terminal-bench-2.0/terminal-bench-2.0": 3.1
+      "terminal-bench-2.0/terminal-bench-2.0": 3.4
     }
   },
   {
@@ -52227,17 +52213,17 @@
     "developer": "OpenAssistant",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.615,
+      "reward-bench/Score": 0.2653,
+      "reward-bench/Chat": 0.9246,
+      "reward-bench/Chat Hard": 0.3728,
+      "reward-bench/Safety": 0.3289,
+      "reward-bench/Reasoning": 0.5855,
+      "reward-bench/Prior Sets (0.5 weight)": 0.6801,
       "reward-bench/Factuality": 0.3979,
       "reward-bench/Precise IF": 0.2875,
       "reward-bench/Math": 0.377,
-      "reward-bench/Safety": 0.5446,
       "reward-bench/Focus": 0.1535,
-      "reward-bench/Ties": 0.047,
-      "reward-bench/Chat": 0.9246,
-      "reward-bench/Chat Hard": 0.3728,
-      "reward-bench/Reasoning": 0.5855,
-      "reward-bench/Prior Sets (0.5 weight)": 0.6801
+      "reward-bench/Ties": 0.047
     }
   },
   {
@@ -52246,17 +52232,17 @@
     "developer": "OpenAssistant",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.2648,
-      "reward-bench/Chat": 0.8855,
-      "reward-bench/Chat Hard": 0.4868,
-      "reward-bench/Safety": 0.3244,
-      "reward-bench/Reasoning": 0.7752,
-      "reward-bench/Prior Sets (0.5 weight)": 0.6533,
+      "reward-bench/Score": 0.6901,
       "reward-bench/Factuality": 0.3179,
       "reward-bench/Precise IF": 0.2625,
       "reward-bench/Math": 0.3934,
+      "reward-bench/Safety": 0.6311,
       "reward-bench/Focus": 0.2707,
-      "reward-bench/Ties": 0.0198
+      "reward-bench/Ties": 0.0198,
+      "reward-bench/Chat": 0.8855,
+      "reward-bench/Chat Hard": 0.4868,
+      "reward-bench/Reasoning": 0.7752,
+      "reward-bench/Prior Sets (0.5 weight)": 0.6533
     }
   },
   {
@@ -52312,17 +52298,17 @@
     "developer": "openbmb",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.5806,
-      "reward-bench/Chat": 0.9804,
-      "reward-bench/Chat Hard": 0.6557,
-      "reward-bench/Safety": 0.6267,
-      "reward-bench/Reasoning": 0.8633,
-      "reward-bench/Prior Sets (0.5 weight)": 0.7172,
+      "reward-bench/Score": 0.8159,
       "reward-bench/Factuality": 0.6,
       "reward-bench/Precise IF": 0.3438,
       "reward-bench/Math": 0.5683,
+      "reward-bench/Safety": 0.8135,
       "reward-bench/Focus": 0.7475,
-      "reward-bench/Ties": 0.5972
+      "reward-bench/Ties": 0.5972,
+      "reward-bench/Chat": 0.9804,
+      "reward-bench/Chat Hard": 0.6557,
+      "reward-bench/Reasoning": 0.8633,
+      "reward-bench/Prior Sets (0.5 weight)": 0.7172
     }
   },
   {
@@ -53830,17 +53816,17 @@
     "developer": "PKU-Alignment",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.3332,
-      "reward-bench/Chat": 0.6173,
-      "reward-bench/Chat Hard": 0.4232,
-      "reward-bench/Safety": 0.7589,
-      "reward-bench/Reasoning": 0.5482,
-      "reward-bench/Prior Sets (0.5 weight)": 0.57,
+      "reward-bench/Score": 0.5798,
       "reward-bench/Factuality": 0.3263,
       "reward-bench/Precise IF": 0.2313,
       "reward-bench/Math": 0.3989,
+      "reward-bench/Safety": 0.7351,
       "reward-bench/Focus": 0.2939,
-      "reward-bench/Ties": -0.01
+      "reward-bench/Ties": -0.01,
+      "reward-bench/Chat": 0.6173,
+      "reward-bench/Chat Hard": 0.4232,
+      "reward-bench/Reasoning": 0.5482,
+      "reward-bench/Prior Sets (0.5 weight)": 0.57
     }
   },
   {
@@ -53849,17 +53835,17 @@
     "developer": "PKU-Alignment",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.4727,
+      "reward-bench/Score": 0.1606,
+      "reward-bench/Chat": 0.8184,
+      "reward-bench/Chat Hard": 0.2873,
+      "reward-bench/Safety": 0.1422,
+      "reward-bench/Reasoning": 0.346,
+      "reward-bench/Prior Sets (0.5 weight)": 0.5993,
       "reward-bench/Factuality": 0.2105,
       "reward-bench/Precise IF": 0.2938,
       "reward-bench/Math": 0.2623,
-      "reward-bench/Safety": 0.3757,
       "reward-bench/Focus": 0.0646,
-      "reward-bench/Ties": -0.01,
-      "reward-bench/Chat": 0.8184,
-      "reward-bench/Chat Hard": 0.2873,
-      "reward-bench/Reasoning": 0.346,
-      "reward-bench/Prior Sets (0.5 weight)": 0.5993
+      "reward-bench/Ties": -0.01
     }
   },
   {
@@ -53887,17 +53873,17 @@
     "developer": "PKU-Alignment",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.6366,
+      "reward-bench/Score": 0.2544,
+      "reward-bench/Chat": 0.8994,
+      "reward-bench/Chat Hard": 0.364,
+      "reward-bench/Safety": 0.3156,
+      "reward-bench/Reasoning": 0.6887,
+      "reward-bench/Prior Sets (0.5 weight)": 0.6171,
       "reward-bench/Factuality": 0.2168,
       "reward-bench/Precise IF": 0.2562,
       "reward-bench/Math": 0.3825,
-      "reward-bench/Safety": 0.6041,
       "reward-bench/Focus": 0.2606,
-      "reward-bench/Ties": 0.0944,
-      "reward-bench/Chat": 0.8994,
-      "reward-bench/Chat Hard": 0.364,
-      "reward-bench/Reasoning": 0.6887,
-      "reward-bench/Prior Sets (0.5 weight)": 0.6171
+      "reward-bench/Ties": 0.0944
     }
   },
   {
@@ -54172,11 +54158,11 @@
     "evaluator_relationship": null,
     "benchmark_scores": {
       "hfopenllm_v2/IFEval": 0.1757,
-      "hfopenllm_v2/BBH": 0.276,
+      "hfopenllm_v2/BBH": 0.274,
       "hfopenllm_v2/MATH Level 5": 0.0,
-      "hfopenllm_v2/GPQA": 0.2534,
-      "hfopenllm_v2/MUSR": 0.3339,
-      "hfopenllm_v2/MMLU-PRO": 0.1123
+      "hfopenllm_v2/GPQA": 0.25,
+      "hfopenllm_v2/MUSR": 0.3753,
+      "hfopenllm_v2/MMLU-PRO": 0.112
     }
   },
   {
@@ -54255,12 +54241,12 @@
     "developer": "princeton-nlp",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.5508,
-      "hfopenllm_v2/BBH": 0.5028,
-      "hfopenllm_v2/MATH Level 5": 0.0529,
-      "hfopenllm_v2/GPQA": 0.2861,
-      "hfopenllm_v2/MUSR": 0.4266,
-      "hfopenllm_v2/MMLU-PRO": 0.3231
+      "hfopenllm_v2/IFEval": 0.3978,
+      "hfopenllm_v2/BBH": 0.4983,
+      "hfopenllm_v2/MATH Level 5": 0.0582,
+      "hfopenllm_v2/GPQA": 0.281,
+      "hfopenllm_v2/MUSR": 0.425,
+      "hfopenllm_v2/MMLU-PRO": 0.3246
     }
   },
   {
@@ -57543,12 +57529,12 @@
     "developer": "Quazim0t0",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.7016,
-      "hfopenllm_v2/BBH": 0.6942,
-      "hfopenllm_v2/MATH Level 5": 0.4116,
-      "hfopenllm_v2/GPQA": 0.3624,
-      "hfopenllm_v2/MUSR": 0.4571,
-      "hfopenllm_v2/MMLU-PRO": 0.5411
+      "hfopenllm_v2/IFEval": 0.2922,
+      "hfopenllm_v2/BBH": 0.6559,
+      "hfopenllm_v2/MATH Level 5": 0.2545,
+      "hfopenllm_v2/GPQA": 0.2659,
+      "hfopenllm_v2/MUSR": 0.3929,
+      "hfopenllm_v2/MMLU-PRO": 0.5207
     }
   },
   {
@@ -58647,12 +58633,12 @@
     "developer": "Qwen",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.3153,
-      "hfopenllm_v2/BBH": 0.3322,
-      "hfopenllm_v2/MATH Level 5": 0.1035,
-      "hfopenllm_v2/GPQA": 0.2592,
-      "hfopenllm_v2/MUSR": 0.3342,
-      "hfopenllm_v2/MMLU-PRO": 0.172
+      "hfopenllm_v2/IFEval": 0.3071,
+      "hfopenllm_v2/BBH": 0.3341,
+      "hfopenllm_v2/MATH Level 5": 0.0,
+      "hfopenllm_v2/GPQA": 0.2576,
+      "hfopenllm_v2/MUSR": 0.3329,
+      "hfopenllm_v2/MMLU-PRO": 0.1697
     }
   },
   {
@@ -58778,7 +58764,8 @@
       "hfopenllm_v2/MATH Level 5": 0.3678,
       "hfopenllm_v2/GPQA": 0.2727,
       "hfopenllm_v2/MUSR": 0.3968,
-      "hfopenllm_v2/MMLU-PRO": 0.3255
+      "hfopenllm_v2/MMLU-PRO": 0.3255,
+      "theory_of_mind/accuracy on theory_of_mind for scorer model_graded_fact": 0.78
     }
   },
   {
@@ -59048,12 +59035,12 @@
     "developer": "Qwen",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.6147,
-      "hfopenllm_v2/BBH": 0.4999,
-      "hfopenllm_v2/MATH Level 5": 0.031,
-      "hfopenllm_v2/GPQA": 0.2936,
-      "hfopenllm_v2/MUSR": 0.4099,
-      "hfopenllm_v2/MMLU-PRO": 0.3354
+      "hfopenllm_v2/IFEval": 0.6101,
+      "hfopenllm_v2/BBH": 0.5008,
+      "hfopenllm_v2/MATH Level 5": 0.3716,
+      "hfopenllm_v2/GPQA": 0.2919,
+      "hfopenllm_v2/MUSR": 0.4073,
+      "hfopenllm_v2/MMLU-PRO": 0.3352
     }
   },
   {
@@ -59381,16 +59368,16 @@
     "developer": "Ray2333",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.8839,
+      "reward-bench/Score": 0.5966,
+      "reward-bench/Chat": 0.9302,
+      "reward-bench/Chat Hard": 0.7719,
+      "reward-bench/Safety": 0.9222,
+      "reward-bench/Reasoning": 0.912,
       "reward-bench/Factuality": 0.5305,
       "reward-bench/Precise IF": 0.3125,
       "reward-bench/Math": 0.5902,
-      "reward-bench/Safety": 0.9216,
       "reward-bench/Focus": 0.7455,
-      "reward-bench/Ties": 0.4788,
-      "reward-bench/Chat": 0.9302,
-      "reward-bench/Chat Hard": 0.7719,
-      "reward-bench/Reasoning": 0.912
+      "reward-bench/Ties": 0.4788
     }
   },
   {
@@ -59436,17 +59423,17 @@
     "developer": "Ray2333",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.6089,
-      "reward-bench/Chat": 0.986,
-      "reward-bench/Chat Hard": 0.6776,
-      "reward-bench/Safety": 0.7867,
-      "reward-bench/Reasoning": 0.9229,
-      "reward-bench/Prior Sets (0.5 weight)": 0.7309,
+      "reward-bench/Score": 0.8542,
       "reward-bench/Factuality": 0.6189,
       "reward-bench/Precise IF": 0.3875,
       "reward-bench/Math": 0.5792,
+      "reward-bench/Safety": 0.8919,
       "reward-bench/Focus": 0.6828,
-      "reward-bench/Ties": 0.5981
+      "reward-bench/Ties": 0.5981,
+      "reward-bench/Chat": 0.986,
+      "reward-bench/Chat Hard": 0.6776,
+      "reward-bench/Reasoning": 0.9229,
+      "reward-bench/Prior Sets (0.5 weight)": 0.7309
     }
   },
   {
@@ -59510,12 +59497,12 @@
     "developer": "recoilme",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.7649,
-      "hfopenllm_v2/BBH": 0.5974,
-      "hfopenllm_v2/MATH Level 5": 0.0174,
-      "hfopenllm_v2/GPQA": 0.3305,
-      "hfopenllm_v2/MUSR": 0.4245,
-      "hfopenllm_v2/MMLU-PRO": 0.4207
+      "hfopenllm_v2/IFEval": 0.2854,
+      "hfopenllm_v2/BBH": 0.5984,
+      "hfopenllm_v2/MATH Level 5": 0.1005,
+      "hfopenllm_v2/GPQA": 0.3297,
+      "hfopenllm_v2/MUSR": 0.4607,
+      "hfopenllm_v2/MMLU-PRO": 0.4162
     }
   },
   {
@@ -59538,12 +59525,12 @@
     "developer": "recoilme",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.2747,
-      "hfopenllm_v2/BBH": 0.6031,
-      "hfopenllm_v2/MATH Level 5": 0.0831,
-      "hfopenllm_v2/GPQA": 0.3305,
-      "hfopenllm_v2/MUSR": 0.4686,
-      "hfopenllm_v2/MMLU-PRO": 0.4122
+      "hfopenllm_v2/IFEval": 0.7592,
+      "hfopenllm_v2/BBH": 0.6026,
+      "hfopenllm_v2/MATH Level 5": 0.0529,
+      "hfopenllm_v2/GPQA": 0.3289,
+      "hfopenllm_v2/MUSR": 0.4099,
+      "hfopenllm_v2/MMLU-PRO": 0.4163
     }
   },
   {
@@ -59552,12 +59539,12 @@
     "developer": "recoilme",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.7439,
-      "hfopenllm_v2/BBH": 0.5993,
-      "hfopenllm_v2/MATH Level 5": 0.0876,
-      "hfopenllm_v2/GPQA": 0.3238,
-      "hfopenllm_v2/MUSR": 0.4204,
-      "hfopenllm_v2/MMLU-PRO": 0.4072
+      "hfopenllm_v2/IFEval": 0.5761,
+      "hfopenllm_v2/BBH": 0.602,
+      "hfopenllm_v2/MATH Level 5": 0.1888,
+      "hfopenllm_v2/GPQA": 0.3372,
+      "hfopenllm_v2/MUSR": 0.4632,
+      "hfopenllm_v2/MMLU-PRO": 0.4039
     }
   },
   {
@@ -59720,12 +59707,12 @@
     "developer": "Replete-AI",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.0932,
-      "hfopenllm_v2/BBH": 0.2977,
+      "hfopenllm_v2/IFEval": 0.0905,
+      "hfopenllm_v2/BBH": 0.2985,
       "hfopenllm_v2/MATH Level 5": 0.0,
-      "hfopenllm_v2/GPQA": 0.2475,
-      "hfopenllm_v2/MUSR": 0.3941,
-      "hfopenllm_v2/MMLU-PRO": 0.1157
+      "hfopenllm_v2/GPQA": 0.2534,
+      "hfopenllm_v2/MUSR": 0.3848,
+      "hfopenllm_v2/MMLU-PRO": 0.1158
     }
   },
   {
@@ -62491,17 +62478,17 @@
     "developer": "sfairXC",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.6292,
-      "reward-bench/Chat": 0.9944,
-      "reward-bench/Chat Hard": 0.6513,
-      "reward-bench/Safety": 0.7667,
-      "reward-bench/Reasoning": 0.8644,
-      "reward-bench/Prior Sets (0.5 weight)": 0.7492,
+      "reward-bench/Score": 0.8338,
       "reward-bench/Factuality": 0.5916,
       "reward-bench/Precise IF": 0.4188,
       "reward-bench/Math": 0.6284,
+      "reward-bench/Safety": 0.8676,
       "reward-bench/Focus": 0.7051,
-      "reward-bench/Ties": 0.6647
+      "reward-bench/Ties": 0.6647,
+      "reward-bench/Chat": 0.9944,
+      "reward-bench/Chat Hard": 0.6513,
+      "reward-bench/Reasoning": 0.8644,
+      "reward-bench/Prior Sets (0.5 weight)": 0.7492
     }
   },
   {
@@ -63282,16 +63269,16 @@
     "developer": "Skywork",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.938,
+      "reward-bench/Score": 0.7576,
+      "reward-bench/Chat": 0.9581,
+      "reward-bench/Chat Hard": 0.9145,
+      "reward-bench/Safety": 0.9422,
+      "reward-bench/Reasoning": 0.9606,
       "reward-bench/Factuality": 0.7368,
       "reward-bench/Precise IF": 0.4031,
       "reward-bench/Math": 0.7049,
-      "reward-bench/Safety": 0.9189,
       "reward-bench/Focus": 0.9323,
-      "reward-bench/Ties": 0.8261,
-      "reward-bench/Chat": 0.9581,
-      "reward-bench/Chat Hard": 0.9145,
-      "reward-bench/Reasoning": 0.9606
+      "reward-bench/Ties": 0.8261
     }
   },
   {
@@ -63306,16 +63293,16 @@
       "hfopenllm_v2/GPQA": 0.344,
       "hfopenllm_v2/MUSR": 0.4231,
       "hfopenllm_v2/MMLU-PRO": 0.4103,
-      "reward-bench/Score": 0.7531,
-      "reward-bench/Chat": 0.9609,
-      "reward-bench/Chat Hard": 0.8991,
-      "reward-bench/Safety": 0.9689,
-      "reward-bench/Reasoning": 0.9807,
+      "reward-bench/Score": 0.9426,
       "reward-bench/Factuality": 0.7674,
       "reward-bench/Precise IF": 0.375,
       "reward-bench/Math": 0.6721,
+      "reward-bench/Safety": 0.9297,
       "reward-bench/Focus": 0.9172,
-      "reward-bench/Ties": 0.8182
+      "reward-bench/Ties": 0.8182,
+      "reward-bench/Chat": 0.9609,
+      "reward-bench/Chat Hard": 0.8991,
+      "reward-bench/Reasoning": 0.9807
     }
   },
   {
@@ -63324,16 +63311,16 @@
     "developer": "Skywork",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.7314,
-      "reward-bench/Chat": 0.9581,
-      "reward-bench/Chat Hard": 0.8728,
-      "reward-bench/Safety": 0.9333,
-      "reward-bench/Reasoning": 0.962,
+      "reward-bench/Score": 0.9252,
       "reward-bench/Factuality": 0.6989,
       "reward-bench/Precise IF": 0.425,
       "reward-bench/Math": 0.6284,
+      "reward-bench/Safety": 0.9081,
       "reward-bench/Focus": 0.9616,
-      "reward-bench/Ties": 0.741
+      "reward-bench/Ties": 0.741,
+      "reward-bench/Chat": 0.9581,
+      "reward-bench/Chat Hard": 0.8728,
+      "reward-bench/Reasoning": 0.962
     }
   },
   {
@@ -63465,16 +63452,16 @@
     "developer": "Skywork",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.9007,
+      "reward-bench/Score": 0.6885,
+      "reward-bench/Chat": 0.8994,
+      "reward-bench/Chat Hard": 0.875,
+      "reward-bench/Safety": 0.8911,
+      "reward-bench/Reasoning": 0.9176,
       "reward-bench/Factuality": 0.6063,
       "reward-bench/Precise IF": 0.35,
       "reward-bench/Math": 0.6339,
-      "reward-bench/Safety": 0.9108,
       "reward-bench/Focus": 0.8909,
-      "reward-bench/Ties": 0.7586,
-      "reward-bench/Chat": 0.8994,
-      "reward-bench/Chat Hard": 0.875,
-      "reward-bench/Reasoning": 0.9176
+      "reward-bench/Ties": 0.7586
     }
   },
   {
@@ -63483,6 +63470,16 @@
     "developer": "snowflake",
     "evaluator_relationship": null,
     "benchmark_scores": {
+      "helm_lite/Mean win rate": 0.338,
+      "helm_lite/NarrativeQA": 0.654,
+      "helm_lite/NaturalQuestions (closed-book)": 0.39,
+      "helm_lite/OpenbookQA": 0.828,
+      "helm_lite/MMLU": 0.575,
+      "helm_lite/MATH": 0.519,
+      "helm_lite/GSM8K": 0.768,
+      "helm_lite/LegalBench": 0.588,
+      "helm_lite/MedQA": 0.581,
+      "helm_lite/WMT 2014": 0.172,
       "helm_mmlu/MMLU All Subjects": 0.677,
       "helm_mmlu/Abstract Algebra": 0.35,
       "helm_mmlu/Anatomy": 0.652,
@@ -63518,17 +63515,7 @@
       "helm_mmlu/Sociology": 0.891,
       "helm_mmlu/Virology": 0.536,
       "helm_mmlu/World Religions": 0.854,
-      "helm_mmlu/Mean win rate": 0.565,
-      "helm_lite/Mean win rate": 0.338,
-      "helm_lite/NarrativeQA": 0.654,
-      "helm_lite/NaturalQuestions (closed-book)": 0.39,
-      "helm_lite/OpenbookQA": 0.828,
-      "helm_lite/MMLU": 0.575,
-      "helm_lite/MATH": 0.519,
-      "helm_lite/GSM8K": 0.768,
-      "helm_lite/LegalBench": 0.588,
-      "helm_lite/MedQA": 0.581,
-      "helm_lite/WMT 2014": 0.172
+      "helm_mmlu/Mean win rate": 0.565
     }
   },
   {
@@ -70093,12 +70080,12 @@
     "developer": "UCLA-AGI",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.6834,
-      "hfopenllm_v2/BBH": 0.508,
-      "hfopenllm_v2/MATH Level 5": 0.0959,
+      "hfopenllm_v2/IFEval": 0.6703,
+      "hfopenllm_v2/BBH": 0.5076,
+      "hfopenllm_v2/MATH Level 5": 0.0718,
       "hfopenllm_v2/GPQA": 0.2651,
-      "hfopenllm_v2/MUSR": 0.3661,
-      "hfopenllm_v2/MMLU-PRO": 0.3644
+      "hfopenllm_v2/MUSR": 0.3647,
+      "hfopenllm_v2/MMLU-PRO": 0.3658
     }
   },
   {
@@ -71041,12 +71028,12 @@
     "developer": "ValiantLabs",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.5328,
-      "hfopenllm_v2/BBH": 0.4613,
-      "hfopenllm_v2/MATH Level 5": 0.0876,
-      "hfopenllm_v2/GPQA": 0.2894,
-      "hfopenllm_v2/MUSR": 0.3367,
-      "hfopenllm_v2/MMLU-PRO": 0.2424
+      "hfopenllm_v2/IFEval": 0.5483,
+      "hfopenllm_v2/BBH": 0.461,
+      "hfopenllm_v2/MATH Level 5": 0.0582,
+      "hfopenllm_v2/GPQA": 0.2886,
+      "hfopenllm_v2/MUSR": 0.3433,
+      "hfopenllm_v2/MMLU-PRO": 0.2407
     }
   },
   {
@@ -71055,12 +71042,12 @@
     "developer": "ValiantLabs",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.6496,
-      "hfopenllm_v2/BBH": 0.4774,
-      "hfopenllm_v2/MATH Level 5": 0.0566,
-      "hfopenllm_v2/GPQA": 0.3104,
-      "hfopenllm_v2/MUSR": 0.3909,
-      "hfopenllm_v2/MMLU-PRO": 0.3382
+      "hfopenllm_v2/IFEval": 0.2678,
+      "hfopenllm_v2/BBH": 0.4429,
+      "hfopenllm_v2/MATH Level 5": 0.0521,
+      "hfopenllm_v2/GPQA": 0.302,
+      "hfopenllm_v2/MUSR": 0.3959,
+      "hfopenllm_v2/MMLU-PRO": 0.2927
     }
   },
   {
@@ -71377,12 +71364,12 @@
     "developer": "VIRNECT",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.5021,
-      "hfopenllm_v2/BBH": 0.4918,
-      "hfopenllm_v2/MATH Level 5": 0.108,
+      "hfopenllm_v2/IFEval": 0.5058,
+      "hfopenllm_v2/BBH": 0.4908,
+      "hfopenllm_v2/MATH Level 5": 0.0929,
       "hfopenllm_v2/GPQA": 0.271,
-      "hfopenllm_v2/MUSR": 0.3648,
-      "hfopenllm_v2/MMLU-PRO": 0.3536
+      "hfopenllm_v2/MUSR": 0.3662,
+      "hfopenllm_v2/MMLU-PRO": 0.3539
     }
   },
   {
@@ -71685,17 +71672,17 @@
     "developer": "weqweasdas",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.2498,
-      "reward-bench/Chat": 0.8184,
-      "reward-bench/Chat Hard": 0.3728,
-      "reward-bench/Safety": 0.24,
-      "reward-bench/Reasoning": 0.3281,
-      "reward-bench/Prior Sets (0.5 weight)": 0.6564,
+      "reward-bench/Score": 0.5027,
       "reward-bench/Factuality": 0.3642,
       "reward-bench/Precise IF": 0.275,
       "reward-bench/Math": 0.3497,
+      "reward-bench/Safety": 0.4149,
       "reward-bench/Focus": 0.2384,
-      "reward-bench/Ties": 0.0315
+      "reward-bench/Ties": 0.0315,
+      "reward-bench/Chat": 0.8184,
+      "reward-bench/Chat Hard": 0.3728,
+      "reward-bench/Reasoning": 0.3281,
+      "reward-bench/Prior Sets (0.5 weight)": 0.6564
     }
   },
   {
@@ -71704,17 +71691,17 @@
     "developer": "weqweasdas",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.3057,
-      "reward-bench/Chat": 0.9441,
-      "reward-bench/Chat Hard": 0.4079,
-      "reward-bench/Safety": 0.3311,
-      "reward-bench/Reasoning": 0.7637,
-      "reward-bench/Prior Sets (0.5 weight)": 0.6652,
+      "reward-bench/Score": 0.6549,
       "reward-bench/Factuality": 0.3705,
       "reward-bench/Precise IF": 0.2812,
       "reward-bench/Math": 0.4317,
+      "reward-bench/Safety": 0.4986,
       "reward-bench/Focus": 0.2343,
-      "reward-bench/Ties": 0.1851
+      "reward-bench/Ties": 0.1851,
+      "reward-bench/Chat": 0.9441,
+      "reward-bench/Chat Hard": 0.4079,
+      "reward-bench/Reasoning": 0.7637,
+      "reward-bench/Prior Sets (0.5 weight)": 0.6652
     }
   },
   {
@@ -71756,17 +71743,17 @@
     "developer": "weqweasdas",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "reward-bench/Score": 0.596,
-      "reward-bench/Chat": 0.9665,
-      "reward-bench/Chat Hard": 0.6053,
-      "reward-bench/Safety": 0.6911,
-      "reward-bench/Reasoning": 0.7736,
-      "reward-bench/Prior Sets (0.5 weight)": 0.753,
+      "reward-bench/Score": 0.7982,
       "reward-bench/Factuality": 0.5937,
       "reward-bench/Precise IF": 0.3438,
       "reward-bench/Math": 0.5956,
+      "reward-bench/Safety": 0.8703,
       "reward-bench/Focus": 0.7293,
-      "reward-bench/Ties": 0.6226
+      "reward-bench/Ties": 0.6226,
+      "reward-bench/Chat": 0.9665,
+      "reward-bench/Chat Hard": 0.6053,
+      "reward-bench/Reasoning": 0.7736,
+      "reward-bench/Prior Sets (0.5 weight)": 0.753
     }
   },
   {
@@ -72393,7 +72380,7 @@
     "developer": "xAI",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 25.4
+      "terminal-bench-2.0/terminal-bench-2.0": 23.1
     }
   },
   {
@@ -72435,7 +72422,7 @@
     "developer": "xAI",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "terminal-bench-2.0/terminal-bench-2.0": 25.8
+      "terminal-bench-2.0/terminal-bench-2.0": 14.2
     }
   },
   {
@@ -73139,12 +73126,12 @@
     "developer": "ycros",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.6262,
-      "hfopenllm_v2/BBH": 0.5142,
-      "hfopenllm_v2/MATH Level 5": 0.0937,
-      "hfopenllm_v2/GPQA": 0.3079,
-      "hfopenllm_v2/MUSR": 0.4138,
-      "hfopenllm_v2/MMLU-PRO": 0.3481
+      "hfopenllm_v2/IFEval": 0.5994,
+      "hfopenllm_v2/BBH": 0.5159,
+      "hfopenllm_v2/MATH Level 5": 0.0785,
+      "hfopenllm_v2/GPQA": 0.3045,
+      "hfopenllm_v2/MUSR": 0.4203,
+      "hfopenllm_v2/MMLU-PRO": 0.3473
     }
   },
   {
@@ -73825,12 +73812,12 @@
     "developer": "YOYO-AI",
     "evaluator_relationship": null,
     "benchmark_scores": {
-      "hfopenllm_v2/IFEval": 0.5899,
-      "hfopenllm_v2/BBH": 0.654,
-      "hfopenllm_v2/MATH Level 5": 0.4509,
-      "hfopenllm_v2/GPQA": 0.3834,
-      "hfopenllm_v2/MUSR": 0.4744,
-      "hfopenllm_v2/MMLU-PRO": 0.5376
+      "hfopenllm_v2/IFEval": 0.7905,
+      "hfopenllm_v2/BBH": 0.6406,
+      "hfopenllm_v2/MATH Level 5": 0.0,
+      "hfopenllm_v2/GPQA": 0.3163,
+      "hfopenllm_v2/MUSR": 0.4181,
+      "hfopenllm_v2/MMLU-PRO": 0.4944
     }
   },
   {
diff --git a/data/models/adriszmar_qaimath-qwen2.5-7b-ties.json b/data/models/adriszmar_qaimath-qwen2.5-7b-ties.json
index a8a69697e53329e56da6b7741229b84b4f2eeb85..776a7c662b5afb276bf059bb90896cee5cd968ec 100644
--- a/data/models/adriszmar_qaimath-qwen2.5-7b-ties.json
+++ b/data/models/adriszmar_qaimath-qwen2.5-7b-ties.json
@@ -5,7 +5,7 @@
     "developer": "adriszmar",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "Qwen2ForCausalLM",
       "params_billions": "7.616"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1746
+            "score": 0.1685
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3126
+            "score": 0.3124
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0
+            "score": 0.0015
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.245
+            "score": 0.2492
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4096
+            "score": 0.3963
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1087
+            "score": 0.1066
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1685
+            "score": 0.1746
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3124
+            "score": 0.3126
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0015
+            "score": 0.0
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2492
+            "score": 0.245
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3963
+            "score": 0.4096
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1066
+            "score": 0.1087
           }
         }
       ],
diff --git a/data/models/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check....json b/data/models/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check....json
index fe877c718ccc5b06d1fe6c011897398e18bfe6e7..f109864a284814ea86e7b02eb2f140eae70857b8 100644
--- a/data/models/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check....json
+++ b/data/models/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check....json
@@ -326,7 +326,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7058
+            "score": 0.7008
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -344,7 +344,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9525
+            "score": 0.9385
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -362,7 +362,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3947
+            "score": 0.3882
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -380,7 +380,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7703
+            "score": 0.7757
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -422,7 +422,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6808
+            "score": 0.6895
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -440,7 +440,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9302
+            "score": 0.9385
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -458,7 +458,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3596
+            "score": 0.3706
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -476,7 +476,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7527
+            "score": 0.7595
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -518,7 +518,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6924
+            "score": 0.6808
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -536,7 +536,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9441
+            "score": 0.9302
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -554,7 +554,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3575
+            "score": 0.3596
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -572,7 +572,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7757
+            "score": 0.7527
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -710,7 +710,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6895
+            "score": 0.7058
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -728,7 +728,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9385
+            "score": 0.9525
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -746,7 +746,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3706
+            "score": 0.3947
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -764,7 +764,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7595
+            "score": 0.7703
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -806,7 +806,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7008
+            "score": 0.6924
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -824,7 +824,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9385
+            "score": 0.9441
           },
           "source_data": {
             "dataset_name": "RewardBench",
@@ -842,7 +842,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3882
+            "score": 0.3575
           },
           "source_data": {
             "dataset_name": "RewardBench",
diff --git a/data/models/akjindal53244_llama-3.1-storm-8b.json b/data/models/akjindal53244_llama-3.1-storm-8b.json
index 10080988b150f205c06f956e608326efa2dd3fb0..cd06cbbbbe2ccc306c529154ea621a16fb9ebcf6 100644
--- a/data/models/akjindal53244_llama-3.1-storm-8b.json
+++ b/data/models/akjindal53244_llama-3.1-storm-8b.json
@@ -5,7 +5,7 @@
     "developer": "akjindal53244",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "bfloat16",
+      "precision": "float16",
       "architecture": "LlamaForCausalLM",
       "params_billions": "8.03"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8033
+            "score": 0.8051
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5196
+            "score": 0.5189
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1624
+            "score": 0.1722
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3096
+            "score": 0.3263
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3812
+            "score": 0.3803
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8051
+            "score": 0.8033
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5189
+            "score": 0.5196
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1722
+            "score": 0.1624
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3263
+            "score": 0.3096
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3803
+            "score": 0.3812
           }
         }
       ],
diff --git a/data/models/alibaba_qwen-3-coder-480b.json b/data/models/alibaba_qwen-3-coder-480b.json
index 76736ecf57c8a847549ed4d201c4f8542bcb4b8e..4f06f3ef146c1e8e47df3955820bbce15d9ad09d 100644
--- a/data/models/alibaba_qwen-3-coder-480b.json
+++ b/data/models/alibaba_qwen-3-coder-480b.json
@@ -4,13 +4,13 @@
     "id": "alibaba/qwen-3-coder-480b",
     "developer": "Alibaba",
     "additional_details": {
-      "agent_name": "Terminus 2",
-      "agent_organization": "Terminal Bench"
+      "agent_name": "OpenHands",
+      "agent_organization": "OpenHands"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__qwen-3-coder-480b/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/openhands__qwen-3-coder-480b/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-01",
+          "evaluation_timestamp": "2025-11-02",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,17 +43,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 23.9,
+            "score": 25.4,
             "uncertainty": {
               "standard_error": {
-                "value": 2.8
+                "value": 2.6
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Qwen 3 Coder 480B\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Qwen 3 Coder 480B\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +70,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Qwen 3 Coder 480B\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Qwen 3 Coder 480B\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -84,7 +84,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/openhands__qwen-3-coder-480b/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__qwen-3-coder-480b/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +108,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-02",
+          "evaluation_timestamp": "2025-11-01",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,17 +117,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 25.4,
+            "score": 23.9,
             "uncertainty": {
               "standard_error": {
-                "value": 2.6
+                "value": 2.8
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Qwen 3 Coder 480B\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Qwen 3 Coder 480B\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -144,7 +144,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Qwen 3 Coder 480B\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Qwen 3 Coder 480B\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/alibaba_qwen3-235b-a22b-instruct-2507.json b/data/models/alibaba_qwen3-235b-a22b-instruct-2507.json
index 18e9257ee4a295645c5d70b31ea729844e529c44..73411ab4fcdf3bfd9a55ece3006bf091682be221 100644
--- a/data/models/alibaba_qwen3-235b-a22b-instruct-2507.json
+++ b/data/models/alibaba_qwen3-235b-a22b-instruct-2507.json
@@ -10,8 +10,8 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "global-mmlu-lite/alibaba_qwen3-235b-a22b-instruct-2507/1773936496.366405",
-      "retrieved_timestamp": "1773936496.366405",
+      "evaluation_id": "global-mmlu-lite/alibaba_qwen3-235b-a22b-instruct-2507/1773936583.743359",
+      "retrieved_timestamp": "1773936583.743359",
       "source_metadata": {
         "source_name": "Global MMLU Lite Leaderboard",
         "source_type": "documentation",
@@ -525,8 +525,8 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "global-mmlu-lite/alibaba_qwen3-235b-a22b-instruct-2507/1773936583.743359",
-      "retrieved_timestamp": "1773936583.743359",
+      "evaluation_id": "global-mmlu-lite/alibaba_qwen3-235b-a22b-instruct-2507/1773936496.366405",
+      "retrieved_timestamp": "1773936496.366405",
       "source_metadata": {
         "source_name": "Global MMLU Lite Leaderboard",
         "source_type": "documentation",
diff --git a/data/models/allenai_llama-3.1-70b-instruct-rm-rb2.json b/data/models/allenai_llama-3.1-70b-instruct-rm-rb2.json
index 6b82bed55df1bdb62067f41e1f2f588c7fd5e772..3edb4e861c9c885f1d8b857f90568f09968d9fbc 100644
--- a/data/models/allenai_llama-3.1-70b-instruct-rm-rb2.json
+++ b/data/models/allenai_llama-3.1-70b-instruct-rm-rb2.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench-2/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816",
+      "evaluation_id": "reward-bench/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,127 +31,109 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.7606
-          },
-          "source_data": {
-            "dataset_name": "RewardBench 2",
-            "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
-          }
-        },
-        {
-          "evaluation_name": "Factuality",
-          "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8126
+            "score": 0.9021
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4188
+            "score": 0.9665
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6995
+            "score": 0.8355
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8844
+            "score": 0.9095
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8646
+            "score": 0.8969
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Prior Sets (0.5 weight)",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8835
+            "score": 0.0
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
@@ -159,10 +141,10 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "reward-bench/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816",
+      "evaluation_id": "reward-bench-2/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -181,109 +163,127 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9021
+            "score": 0.7606
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9665
+            "score": 0.8126
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8355
+            "score": 0.4188
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
+          }
+        },
+        {
+          "evaluation_name": "Math",
+          "metric_config": {
+            "evaluation_description": "Math score - measures mathematical reasoning",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.6995
+          },
+          "source_data": {
+            "dataset_name": "RewardBench 2",
+            "source_type": "hf_dataset",
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9095
+            "score": 0.8844
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8969
+            "score": 0.8646
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Prior Sets (0.5 weight)",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0
+            "score": 0.8835
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
diff --git a/data/models/allenai_llama-3.1-8b-instruct-rm-rb2.json b/data/models/allenai_llama-3.1-8b-instruct-rm-rb2.json
index 60d204122c692e04be6211b26f2d79d76e83f431..2b424dce10a93f404c9945de99ddd768647b6184 100644
--- a/data/models/allenai_llama-3.1-8b-instruct-rm-rb2.json
+++ b/data/models/allenai_llama-3.1-8b-instruct-rm-rb2.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench-2/allenai_Llama-3.1-8B-Instruct-RM-RB2/1766412838.146816",
+      "evaluation_id": "reward-bench/allenai_Llama-3.1-8B-Instruct-RM-RB2/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,127 +31,109 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.7285
-          },
-          "source_data": {
-            "dataset_name": "RewardBench 2",
-            "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
-          }
-        },
-        {
-          "evaluation_name": "Factuality",
-          "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7432
+            "score": 0.8885
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4437
+            "score": 0.9581
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6175
+            "score": 0.8158
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8956
+            "score": 0.8932
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9071
+            "score": 0.887
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Prior Sets (0.5 weight)",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7638
+            "score": 0.0
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
@@ -159,10 +141,10 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "reward-bench/allenai_Llama-3.1-8B-Instruct-RM-RB2/1766412838.146816",
+      "evaluation_id": "reward-bench-2/allenai_Llama-3.1-8B-Instruct-RM-RB2/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -181,109 +163,127 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8885
+            "score": 0.7285
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9581
+            "score": 0.7432
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8158
+            "score": 0.4437
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
+          }
+        },
+        {
+          "evaluation_name": "Math",
+          "metric_config": {
+            "evaluation_description": "Math score - measures mathematical reasoning",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.6175
+          },
+          "source_data": {
+            "dataset_name": "RewardBench 2",
+            "source_type": "hf_dataset",
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8932
+            "score": 0.8956
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.887
+            "score": 0.9071
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Prior Sets (0.5 weight)",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0
+            "score": 0.7638
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
diff --git a/data/models/allenai_llama-3.1-tulu-3-70b-sft-rm-rb2.json b/data/models/allenai_llama-3.1-tulu-3-70b-sft-rm-rb2.json
index 3c470e6abd70f5f2bc9a8be57cc36f939ca7753c..56c7eedcfadd2447274047e1824ef99e14cf4e28 100644
--- a/data/models/allenai_llama-3.1-tulu-3-70b-sft-rm-rb2.json
+++ b/data/models/allenai_llama-3.1-tulu-3-70b-sft-rm-rb2.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2/1766412838.146816",
+      "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,109 +31,127 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8892
+            "score": 0.722
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9693
+            "score": 0.8084
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8268
+            "score": 0.3688
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
+          }
+        },
+        {
+          "evaluation_name": "Math",
+          "metric_config": {
+            "evaluation_description": "Math score - measures mathematical reasoning",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.6776
+          },
+          "source_data": {
+            "dataset_name": "RewardBench 2",
+            "source_type": "hf_dataset",
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9027
+            "score": 0.8689
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8583
+            "score": 0.7778
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Prior Sets (0.5 weight)",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0
+            "score": 0.8308
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
@@ -141,10 +159,10 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2/1766412838.146816",
+      "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -163,127 +181,109 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.722
-          },
-          "source_data": {
-            "dataset_name": "RewardBench 2",
-            "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
-          }
-        },
-        {
-          "evaluation_name": "Factuality",
-          "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8084
+            "score": 0.8892
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3688
+            "score": 0.9693
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6776
+            "score": 0.8268
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8689
+            "score": 0.9027
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7778
+            "score": 0.8583
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Prior Sets (0.5 weight)",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8308
+            "score": 0.0
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
diff --git a/data/models/allenai_llama-3.1-tulu-3-70b.json b/data/models/allenai_llama-3.1-tulu-3-70b.json
index 1db7ede127ee803b7ffd5960ed83ddaa8fe5c58c..dedd220cd13dda3e92225bd87465795dab977b4c 100644
--- a/data/models/allenai_llama-3.1-tulu-3-70b.json
+++ b/data/models/allenai_llama-3.1-tulu-3-70b.json
@@ -5,7 +5,7 @@
     "developer": "allenai",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "bfloat16",
+      "precision": "float16",
       "architecture": "LlamaForCausalLM",
       "params_billions": "70.554"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8291
+            "score": 0.8379
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6164
+            "score": 0.6157
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4502
+            "score": 0.3829
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4948
+            "score": 0.4988
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4645
+            "score": 0.4656
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8379
+            "score": 0.8291
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6157
+            "score": 0.6164
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3829
+            "score": 0.4502
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4988
+            "score": 0.4948
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4656
+            "score": 0.4645
           }
         }
       ],
diff --git a/data/models/allenai_llama-3.1-tulu-3-8b-dpo-rm-rb2.json b/data/models/allenai_llama-3.1-tulu-3-8b-dpo-rm-rb2.json
index 408cdcea425aaae910c976c1f57587d29fcf9c63..52eed39924ec31b815aba5be343333e3d61411ae 100644
--- a/data/models/allenai_llama-3.1-tulu-3-8b-dpo-rm-rb2.json
+++ b/data/models/allenai_llama-3.1-tulu-3-8b-dpo-rm-rb2.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816",
+      "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,109 +31,127 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8431
+            "score": 0.687
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9553
+            "score": 0.7516
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.761
+            "score": 0.3875
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
+          }
+        },
+        {
+          "evaluation_name": "Math",
+          "metric_config": {
+            "evaluation_description": "Math score - measures mathematical reasoning",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.6284
+          },
+          "source_data": {
+            "dataset_name": "RewardBench 2",
+            "source_type": "hf_dataset",
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8662
+            "score": 0.86
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7898
+            "score": 0.8545
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Prior Sets (0.5 weight)",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0
+            "score": 0.6397
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
@@ -141,10 +159,10 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816",
+      "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -163,127 +181,109 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.687
-          },
-          "source_data": {
-            "dataset_name": "RewardBench 2",
-            "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
-          }
-        },
-        {
-          "evaluation_name": "Factuality",
-          "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7516
+            "score": 0.8431
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3875
+            "score": 0.9553
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6284
+            "score": 0.761
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.86
+            "score": 0.8662
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8545
+            "score": 0.7898
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Prior Sets (0.5 weight)",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6397
+            "score": 0.0
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
diff --git a/data/models/anthropic_claude-haiku-4.5.json b/data/models/anthropic_claude-haiku-4.5.json
index f3552167d1969c3e644bdda5d3cf5d022313285c..a51d1ecfb03be58e1a56ba7e5711a9bf62822a49 100644
--- a/data/models/anthropic_claude-haiku-4.5.json
+++ b/data/models/anthropic_claude-haiku-4.5.json
@@ -4,13 +4,13 @@
     "id": "anthropic/claude-haiku-4.5",
     "developer": "Anthropic",
     "additional_details": {
-      "agent_name": "Claude Code",
-      "agent_organization": "Anthropic"
+      "agent_name": "Mini-SWE-Agent",
+      "agent_organization": "Princeton"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/claude-code__claude-haiku-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__claude-haiku-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-04",
+          "evaluation_timestamp": "2025-11-03",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,17 +43,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 27.5,
+            "score": 29.8,
             "uncertainty": {
               "standard_error": {
-                "value": 2.8
+                "value": 2.5
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Haiku 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Haiku 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +70,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Haiku 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Haiku 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -84,7 +84,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/goose__claude-haiku-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__claude-haiku-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +108,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-11",
+          "evaluation_timestamp": "2025-10-31",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,7 +117,7 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 35.5,
+            "score": 28.3,
             "uncertainty": {
               "standard_error": {
                 "value": 2.9
@@ -127,7 +127,7 @@
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Haiku 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Haiku 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -144,7 +144,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Haiku 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Haiku 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -158,7 +158,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__claude-haiku-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/claude-code__claude-haiku-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -182,7 +182,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-03",
+          "evaluation_timestamp": "2025-11-04",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -191,17 +191,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 29.8,
+            "score": 27.5,
             "uncertainty": {
               "standard_error": {
-                "value": 2.5
+                "value": 2.8
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Haiku 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Haiku 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -218,7 +218,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Haiku 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Haiku 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -232,7 +232,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__claude-haiku-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/goose__claude-haiku-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -256,7 +256,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-10-31",
+          "evaluation_timestamp": "2025-12-11",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -265,7 +265,7 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 28.3,
+            "score": 35.5,
             "uncertainty": {
               "standard_error": {
                 "value": 2.9
@@ -275,7 +275,7 @@
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Haiku 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Haiku 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -292,7 +292,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Haiku 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Haiku 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/anthropic_claude-opus-4-1-20250805.json b/data/models/anthropic_claude-opus-4-1-20250805.json
index 97af67f7a9590f836aeb06cb6e8ac5e127131091..d8d76da387da094a25675bc9e4c49b25e5dc839b 100644
--- a/data/models/anthropic_claude-opus-4-1-20250805.json
+++ b/data/models/anthropic_claude-opus-4-1-20250805.json
@@ -10,8 +10,8 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "global-mmlu-lite/anthropic_claude-opus-4-1-20250805/1773936496.366405",
-      "retrieved_timestamp": "1773936496.366405",
+      "evaluation_id": "global-mmlu-lite/anthropic_claude-opus-4-1-20250805/1773936583.743359",
+      "retrieved_timestamp": "1773936583.743359",
       "source_metadata": {
         "source_name": "Global MMLU Lite Leaderboard",
         "source_type": "documentation",
@@ -525,8 +525,8 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "global-mmlu-lite/anthropic_claude-opus-4-1-20250805/1773936583.743359",
-      "retrieved_timestamp": "1773936583.743359",
+      "evaluation_id": "global-mmlu-lite/anthropic_claude-opus-4-1-20250805/1773936496.366405",
+      "retrieved_timestamp": "1773936496.366405",
       "source_metadata": {
         "source_name": "Global MMLU Lite Leaderboard",
         "source_type": "documentation",
diff --git a/data/models/anthropic_claude-opus-4-5.json b/data/models/anthropic_claude-opus-4-5.json
index b56ddf35bf238ee9affa206d56038693a0cc3a91..1de3c90261aa34fd7e316493f0ec72792c7a6a7a 100644
--- a/data/models/anthropic_claude-opus-4-5.json
+++ b/data/models/anthropic_claude-opus-4-5.json
@@ -4,13 +4,13 @@
     "id": "anthropic/claude-opus-4-5",
     "developer": "Anthropic",
     "additional_details": {
-      "agent_name": "SmolAgents Code",
-      "agent_framework": "smolagents_code"
+      "agent_name": "OpenAI Solo",
+      "agent_framework": "openai_solo"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "appworld/test_normal/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "appworld/test_normal/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -42,23 +42,23 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7,
+            "score": 0.68,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "5.59",
-              "total_run_cost": "558.51",
-              "average_steps": "41.07",
-              "percent_finished": "0.82"
+              "average_agent_cost": "22.76",
+              "total_run_cost": "2276.48",
+              "average_steps": "47.65",
+              "percent_finished": "0.77"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "SmolAgents Code",
-                  "agent_framework": "smolagents_code"
+                  "agent_name": "OpenAI Solo",
+                  "agent_framework": "openai_solo"
                 }
               }
             }
@@ -70,15 +70,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "SmolAgents Code",
-              "agent_framework": "smolagents_code"
+              "agent_name": "OpenAI Solo",
+              "agent_framework": "openai_solo"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "appworld/test_normal/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "browsecompplus/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -91,42 +91,42 @@
         "name": "exgentic",
         "version": "0.1.0"
       },
-      "benchmark": "appworld_test_normal",
+      "benchmark": "browsecompplus",
       "evaluation_results": [
         {
-          "evaluation_name": "appworld/test_normal",
+          "evaluation_name": "browsecompplus",
           "source_data": {
-            "dataset_name": "appworld/test_normal",
+            "dataset_name": "browsecompplus",
             "source_type": "url",
             "url": [
               "https://github.com/Exgentic/exgentic"
             ]
           },
           "metric_config": {
-            "evaluation_description": "AppWorld benchmark evaluation (test_normal subset)",
+            "evaluation_description": "BrowseCompPlus benchmark evaluation",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.66,
+            "score": 0.61,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "13.08",
-              "total_run_cost": "1308.38",
-              "average_steps": "49.69",
-              "percent_finished": "0.74"
+              "average_agent_cost": "7.59",
+              "total_run_cost": "759.44",
+              "average_steps": "27.18",
+              "percent_finished": "1.0"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "Claude Code CLI",
-                  "agent_framework": "claude_code"
+                  "agent_name": "OpenAI Solo",
+                  "agent_framework": "openai_solo"
                 }
               }
             }
@@ -138,15 +138,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "Claude Code CLI",
-              "agent_framework": "claude_code"
+              "agent_name": "OpenAI Solo",
+              "agent_framework": "openai_solo"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "appworld/test_normal/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "browsecompplus/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -159,42 +159,42 @@
         "name": "exgentic",
         "version": "0.1.0"
       },
-      "benchmark": "appworld_test_normal",
+      "benchmark": "browsecompplus",
       "evaluation_results": [
         {
-          "evaluation_name": "appworld/test_normal",
+          "evaluation_name": "browsecompplus",
           "source_data": {
-            "dataset_name": "appworld/test_normal",
+            "dataset_name": "browsecompplus",
             "source_type": "url",
             "url": [
               "https://github.com/Exgentic/exgentic"
             ]
           },
           "metric_config": {
-            "evaluation_description": "AppWorld benchmark evaluation (test_normal subset)",
+            "evaluation_description": "BrowseCompPlus benchmark evaluation",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.64,
+            "score": 0.49,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "3.43",
-              "total_run_cost": "343.32",
-              "average_steps": "20.06",
-              "percent_finished": "0.82"
+              "average_agent_cost": "7.09",
+              "total_run_cost": "709.54",
+              "average_steps": "21.66",
+              "percent_finished": "0.93"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
-                  "agent_framework": "tool_calling_with_shortlisting"
+                  "agent_name": "LiteLLM Tool Calling",
+                  "agent_framework": "tool_calling"
                 }
               }
             }
@@ -206,8 +206,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling with Shortlisting",
-              "agent_framework": "tool_calling_with_shortlisting"
+              "agent_name": "LiteLLM Tool Calling",
+              "agent_framework": "tool_calling"
             }
           }
         }
@@ -282,7 +282,7 @@
       }
     },
     {
-      "evaluation_id": "appworld/test_normal/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "appworld/test_normal/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -314,23 +314,23 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.68,
+            "score": 0.66,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "22.76",
-              "total_run_cost": "2276.48",
-              "average_steps": "47.65",
-              "percent_finished": "0.77"
+              "average_agent_cost": "13.08",
+              "total_run_cost": "1308.38",
+              "average_steps": "49.69",
+              "percent_finished": "0.74"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "OpenAI Solo",
-                  "agent_framework": "openai_solo"
+                  "agent_name": "Claude Code CLI",
+                  "agent_framework": "claude_code"
                 }
               }
             }
@@ -342,15 +342,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "OpenAI Solo",
-              "agent_framework": "openai_solo"
+              "agent_name": "Claude Code CLI",
+              "agent_framework": "claude_code"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "browsecompplus/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "appworld/test_normal/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -363,42 +363,42 @@
         "name": "exgentic",
         "version": "0.1.0"
       },
-      "benchmark": "browsecompplus",
+      "benchmark": "appworld_test_normal",
       "evaluation_results": [
         {
-          "evaluation_name": "browsecompplus",
+          "evaluation_name": "appworld/test_normal",
           "source_data": {
-            "dataset_name": "browsecompplus",
+            "dataset_name": "appworld/test_normal",
             "source_type": "url",
             "url": [
               "https://github.com/Exgentic/exgentic"
             ]
           },
           "metric_config": {
-            "evaluation_description": "BrowseCompPlus benchmark evaluation",
+            "evaluation_description": "AppWorld benchmark evaluation (test_normal subset)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.49,
+            "score": 0.64,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "7.09",
-              "total_run_cost": "709.54",
-              "average_steps": "21.66",
-              "percent_finished": "0.93"
+              "average_agent_cost": "3.43",
+              "total_run_cost": "343.32",
+              "average_steps": "20.06",
+              "percent_finished": "0.82"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling",
-                  "agent_framework": "tool_calling"
+                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
+                  "agent_framework": "tool_calling_with_shortlisting"
                 }
               }
             }
@@ -410,8 +410,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling",
-              "agent_framework": "tool_calling"
+              "agent_name": "LiteLLM Tool Calling with Shortlisting",
+              "agent_framework": "tool_calling_with_shortlisting"
             }
           }
         }
@@ -486,7 +486,7 @@
       }
     },
     {
-      "evaluation_id": "browsecompplus/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "appworld/test_normal/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -499,42 +499,42 @@
         "name": "exgentic",
         "version": "0.1.0"
       },
-      "benchmark": "browsecompplus",
+      "benchmark": "appworld_test_normal",
       "evaluation_results": [
         {
-          "evaluation_name": "browsecompplus",
+          "evaluation_name": "appworld/test_normal",
           "source_data": {
-            "dataset_name": "browsecompplus",
+            "dataset_name": "appworld/test_normal",
             "source_type": "url",
             "url": [
               "https://github.com/Exgentic/exgentic"
             ]
           },
           "metric_config": {
-            "evaluation_description": "BrowseCompPlus benchmark evaluation",
+            "evaluation_description": "AppWorld benchmark evaluation (test_normal subset)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.61,
+            "score": 0.7,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "7.59",
-              "total_run_cost": "759.44",
-              "average_steps": "27.18",
-              "percent_finished": "1.0"
+              "average_agent_cost": "5.59",
+              "total_run_cost": "558.51",
+              "average_steps": "41.07",
+              "percent_finished": "0.82"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "OpenAI Solo",
-                  "agent_framework": "openai_solo"
+                  "agent_name": "SmolAgents Code",
+                  "agent_framework": "smolagents_code"
                 }
               }
             }
@@ -546,8 +546,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "OpenAI Solo",
-              "agent_framework": "openai_solo"
+              "agent_name": "SmolAgents Code",
+              "agent_framework": "smolagents_code"
             }
           }
         }
@@ -690,7 +690,7 @@
       }
     },
     {
-      "evaluation_id": "swe-bench/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "swe-bench/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -722,14 +722,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8072,
+            "score": 0.7423,
             "uncertainty": {
-              "num_samples": 83
+              "num_samples": 97
             },
             "details": {
-              "average_agent_cost": "2.96",
-              "total_run_cost": "245.78",
-              "average_steps": "34.1",
+              "average_agent_cost": "5.6",
+              "total_run_cost": "543.62",
+              "average_steps": "31.76",
               "percent_finished": "1.0"
             }
           },
@@ -737,8 +737,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "OpenAI Solo",
-                  "agent_framework": "openai_solo"
+                  "agent_name": "Claude Code CLI",
+                  "agent_framework": "claude_code"
                 }
               }
             }
@@ -750,15 +750,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "OpenAI Solo",
-              "agent_framework": "openai_solo"
+              "agent_name": "Claude Code CLI",
+              "agent_framework": "claude_code"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "swe-bench/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "swe-bench/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -805,8 +805,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
-                  "agent_framework": "tool_calling_with_shortlisting"
+                  "agent_name": "LiteLLM Tool Calling",
+                  "agent_framework": "tool_calling"
                 }
               }
             }
@@ -818,15 +818,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling with Shortlisting",
-              "agent_framework": "tool_calling_with_shortlisting"
+              "agent_name": "LiteLLM Tool Calling",
+              "agent_framework": "tool_calling"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "swe-bench/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "swe-bench/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -858,14 +858,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6061,
+            "score": 0.8072,
             "uncertainty": {
-              "num_samples": 99
+              "num_samples": 83
             },
             "details": {
-              "average_agent_cost": "3.97",
-              "total_run_cost": "393.16",
-              "average_steps": "43.44",
+              "average_agent_cost": "2.96",
+              "total_run_cost": "245.78",
+              "average_steps": "34.1",
               "percent_finished": "1.0"
             }
           },
@@ -873,8 +873,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling",
-                  "agent_framework": "tool_calling"
+                  "agent_name": "OpenAI Solo",
+                  "agent_framework": "openai_solo"
                 }
               }
             }
@@ -886,15 +886,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling",
-              "agent_framework": "tool_calling"
+              "agent_name": "OpenAI Solo",
+              "agent_framework": "openai_solo"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "swe-bench/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "swe-bench/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -926,14 +926,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7423,
+            "score": 0.65,
             "uncertainty": {
-              "num_samples": 97
+              "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "5.6",
-              "total_run_cost": "543.62",
-              "average_steps": "31.76",
+              "average_agent_cost": "4.85",
+              "total_run_cost": "485.22",
+              "average_steps": "39.13",
               "percent_finished": "1.0"
             }
           },
@@ -941,8 +941,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "Claude Code CLI",
-                  "agent_framework": "claude_code"
+                  "agent_name": "SmolAgents Code",
+                  "agent_framework": "smolagents_code"
                 }
               }
             }
@@ -954,15 +954,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "Claude Code CLI",
-              "agent_framework": "claude_code"
+              "agent_name": "SmolAgents Code",
+              "agent_framework": "smolagents_code"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "swe-bench/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "swe-bench/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -994,14 +994,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.65,
+            "score": 0.6061,
             "uncertainty": {
-              "num_samples": 100
+              "num_samples": 99
             },
             "details": {
-              "average_agent_cost": "4.85",
-              "total_run_cost": "485.22",
-              "average_steps": "39.13",
+              "average_agent_cost": "3.97",
+              "total_run_cost": "393.16",
+              "average_steps": "43.44",
               "percent_finished": "1.0"
             }
           },
@@ -1009,8 +1009,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "SmolAgents Code",
-                  "agent_framework": "smolagents_code"
+                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
+                  "agent_framework": "tool_calling_with_shortlisting"
                 }
               }
             }
@@ -1022,15 +1022,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "SmolAgents Code",
-              "agent_framework": "smolagents_code"
+              "agent_name": "LiteLLM Tool Calling with Shortlisting",
+              "agent_framework": "tool_calling_with_shortlisting"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/airline/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/airline/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1062,14 +1062,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.66,
+            "score": 0.72,
             "uncertainty": {
               "num_samples": 50
             },
             "details": {
-              "average_agent_cost": "0.47",
-              "total_run_cost": "24.23",
-              "average_steps": "10.0",
+              "average_agent_cost": "0.78",
+              "total_run_cost": "39.67",
+              "average_steps": "11.88",
               "percent_finished": "1.0"
             }
           },
@@ -1077,8 +1077,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
-                  "agent_framework": "tool_calling_with_shortlisting"
+                  "agent_name": "SmolAgents Code",
+                  "agent_framework": "smolagents_code"
                 }
               }
             }
@@ -1090,8 +1090,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling with Shortlisting",
-              "agent_framework": "tool_calling_with_shortlisting"
+              "agent_name": "SmolAgents Code",
+              "agent_framework": "smolagents_code"
             }
           }
         }
@@ -1302,7 +1302,7 @@
       }
     },
     {
-      "evaluation_id": "tau-bench-2/airline/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/retail/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1315,33 +1315,33 @@
         "name": "exgentic",
         "version": "0.1.0"
       },
-      "benchmark": "tau-bench-2_airline",
+      "benchmark": "tau-bench-2_retail",
       "evaluation_results": [
         {
-          "evaluation_name": "tau-bench-2/airline",
+          "evaluation_name": "tau-bench-2/retail",
           "source_data": {
-            "dataset_name": "tau-bench-2/airline",
+            "dataset_name": "tau-bench-2/retail",
             "source_type": "url",
             "url": [
               "https://github.com/Exgentic/exgentic"
             ]
           },
           "metric_config": {
-            "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)",
+            "evaluation_description": "Tau Bench 2 benchmark evaluation (retail subset)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.72,
+            "score": 0.78,
             "uncertainty": {
-              "num_samples": 50
+              "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.78",
-              "total_run_cost": "39.67",
-              "average_steps": "11.88",
+              "average_agent_cost": "0.47",
+              "total_run_cost": "48.01",
+              "average_steps": "11.33",
               "percent_finished": "1.0"
             }
           },
@@ -1349,8 +1349,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "SmolAgents Code",
-                  "agent_framework": "smolagents_code"
+                  "agent_name": "LiteLLM Tool Calling",
+                  "agent_framework": "tool_calling"
                 }
               }
             }
@@ -1362,15 +1362,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "SmolAgents Code",
-              "agent_framework": "smolagents_code"
+              "agent_name": "LiteLLM Tool Calling",
+              "agent_framework": "tool_calling"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/retail/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/retail/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1402,14 +1402,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.78,
+            "score": 0.83,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.47",
-              "total_run_cost": "48.01",
-              "average_steps": "11.33",
+              "average_agent_cost": "1.6",
+              "total_run_cost": "161.14",
+              "average_steps": "12.54",
               "percent_finished": "1.0"
             }
           },
@@ -1417,8 +1417,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling",
-                  "agent_framework": "tool_calling"
+                  "agent_name": "Claude Code CLI",
+                  "agent_framework": "claude_code"
                 }
               }
             }
@@ -1430,15 +1430,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling",
-              "agent_framework": "tool_calling"
+              "agent_name": "Claude Code CLI",
+              "agent_framework": "claude_code"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/retail/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/retail/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1470,14 +1470,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.83,
+            "score": 0.78,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "1.6",
-              "total_run_cost": "161.14",
-              "average_steps": "12.54",
+              "average_agent_cost": "0.67",
+              "total_run_cost": "68.24",
+              "average_steps": "11.71",
               "percent_finished": "1.0"
             }
           },
@@ -1485,8 +1485,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "Claude Code CLI",
-                  "agent_framework": "claude_code"
+                  "agent_name": "SmolAgents Code",
+                  "agent_framework": "smolagents_code"
                 }
               }
             }
@@ -1498,15 +1498,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "Claude Code CLI",
-              "agent_framework": "claude_code"
+              "agent_name": "SmolAgents Code",
+              "agent_framework": "smolagents_code"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/retail/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/airline/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1519,33 +1519,33 @@
         "name": "exgentic",
         "version": "0.1.0"
       },
-      "benchmark": "tau-bench-2_retail",
+      "benchmark": "tau-bench-2_airline",
       "evaluation_results": [
         {
-          "evaluation_name": "tau-bench-2/retail",
+          "evaluation_name": "tau-bench-2/airline",
           "source_data": {
-            "dataset_name": "tau-bench-2/retail",
+            "dataset_name": "tau-bench-2/airline",
             "source_type": "url",
             "url": [
               "https://github.com/Exgentic/exgentic"
             ]
           },
           "metric_config": {
-            "evaluation_description": "Tau Bench 2 benchmark evaluation (retail subset)",
+            "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.85,
+            "score": 0.66,
             "uncertainty": {
-              "num_samples": 100
+              "num_samples": 50
             },
             "details": {
-              "average_agent_cost": "0.55",
-              "total_run_cost": "56.18",
-              "average_steps": "12.54",
+              "average_agent_cost": "0.47",
+              "total_run_cost": "24.23",
+              "average_steps": "10.0",
               "percent_finished": "1.0"
             }
           },
@@ -1553,8 +1553,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "OpenAI Solo",
-                  "agent_framework": "openai_solo"
+                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
+                  "agent_framework": "tool_calling_with_shortlisting"
                 }
               }
             }
@@ -1566,15 +1566,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "OpenAI Solo",
-              "agent_framework": "openai_solo"
+              "agent_name": "LiteLLM Tool Calling with Shortlisting",
+              "agent_framework": "tool_calling_with_shortlisting"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/retail/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/retail/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1606,14 +1606,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.78,
+            "score": 0.85,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.47",
-              "total_run_cost": "48.01",
-              "average_steps": "11.33",
+              "average_agent_cost": "0.55",
+              "total_run_cost": "56.18",
+              "average_steps": "12.54",
               "percent_finished": "1.0"
             }
           },
@@ -1621,8 +1621,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
-                  "agent_framework": "tool_calling_with_shortlisting"
+                  "agent_name": "OpenAI Solo",
+                  "agent_framework": "openai_solo"
                 }
               }
             }
@@ -1634,15 +1634,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling with Shortlisting",
-              "agent_framework": "tool_calling_with_shortlisting"
+              "agent_name": "OpenAI Solo",
+              "agent_framework": "openai_solo"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/retail/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/retail/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1679,9 +1679,9 @@
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.67",
-              "total_run_cost": "68.24",
-              "average_steps": "11.71",
+              "average_agent_cost": "0.47",
+              "total_run_cost": "48.01",
+              "average_steps": "11.33",
               "percent_finished": "1.0"
             }
           },
@@ -1689,8 +1689,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "SmolAgents Code",
-                  "agent_framework": "smolagents_code"
+                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
+                  "agent_framework": "tool_calling_with_shortlisting"
                 }
               }
             }
@@ -1702,15 +1702,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "SmolAgents Code",
-              "agent_framework": "smolagents_code"
+              "agent_name": "LiteLLM Tool Calling with Shortlisting",
+              "agent_framework": "tool_calling_with_shortlisting"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/telecom/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1747,9 +1747,9 @@
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "2.45",
-              "total_run_cost": "255.97",
-              "average_steps": "18.71",
+              "average_agent_cost": "0.92",
+              "total_run_cost": "102.01",
+              "average_steps": "17.22",
               "percent_finished": "1.0"
             }
           },
@@ -1757,8 +1757,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "Claude Code CLI",
-                  "agent_framework": "claude_code"
+                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
+                  "agent_framework": "tool_calling_with_shortlisting"
                 }
               }
             }
@@ -1770,8 +1770,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "Claude Code CLI",
-              "agent_framework": "claude_code"
+              "agent_name": "LiteLLM Tool Calling with Shortlisting",
+              "agent_framework": "tool_calling_with_shortlisting"
             }
           }
         }
@@ -1846,7 +1846,7 @@
       }
     },
     {
-      "evaluation_id": "tau-bench-2/telecom/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/telecom/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1878,14 +1878,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.84,
+            "score": 0.76,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "1.25",
-              "total_run_cost": "136.84",
-              "average_steps": "17.15",
+              "average_agent_cost": "2.45",
+              "total_run_cost": "255.97",
+              "average_steps": "18.71",
               "percent_finished": "1.0"
             }
           },
@@ -1893,8 +1893,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "OpenAI Solo",
-                  "agent_framework": "openai_solo"
+                  "agent_name": "Claude Code CLI",
+                  "agent_framework": "claude_code"
                 }
               }
             }
@@ -1906,15 +1906,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "OpenAI Solo",
-              "agent_framework": "openai_solo"
+              "agent_name": "Claude Code CLI",
+              "agent_framework": "claude_code"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1961,8 +1961,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
-                  "agent_framework": "tool_calling_with_shortlisting"
+                  "agent_name": "LiteLLM Tool Calling",
+                  "agent_framework": "tool_calling"
                 }
               }
             }
@@ -1974,15 +1974,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling with Shortlisting",
-              "agent_framework": "tool_calling_with_shortlisting"
+              "agent_name": "LiteLLM Tool Calling",
+              "agent_framework": "tool_calling"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/telecom/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -2014,14 +2014,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.76,
+            "score": 0.84,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.92",
-              "total_run_cost": "102.01",
-              "average_steps": "17.22",
+              "average_agent_cost": "1.25",
+              "total_run_cost": "136.84",
+              "average_steps": "17.15",
               "percent_finished": "1.0"
             }
           },
@@ -2029,8 +2029,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling",
-                  "agent_framework": "tool_calling"
+                  "agent_name": "OpenAI Solo",
+                  "agent_framework": "openai_solo"
                 }
               }
             }
@@ -2042,8 +2042,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling",
-              "agent_framework": "tool_calling"
+              "agent_name": "OpenAI Solo",
+              "agent_framework": "openai_solo"
             }
           }
         }
diff --git a/data/models/anthropic_claude-opus-4.1.json b/data/models/anthropic_claude-opus-4.1.json
index e9a568bd310f72259de9371fc544155b5b7255f5..b51256b4d5a95f9160a82aa03d45fc55ec0aba09 100644
--- a/data/models/anthropic_claude-opus-4.1.json
+++ b/data/models/anthropic_claude-opus-4.1.json
@@ -4,13 +4,13 @@
     "id": "anthropic/claude-opus-4.1",
     "developer": "Anthropic",
     "additional_details": {
-      "agent_name": "Claude Code",
-      "agent_organization": "Anthropic"
+      "agent_name": "OpenHands",
+      "agent_organization": "OpenHands"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.1/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/openhands__claude-opus-4.1/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-04",
+          "evaluation_timestamp": "2025-11-02",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,17 +43,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 34.8,
+            "score": 36.9,
             "uncertainty": {
               "standard_error": {
-                "value": 2.9
+                "value": 2.7
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.1\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Opus 4.1\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +70,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.1\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Opus 4.1\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -84,7 +84,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__claude-opus-4.1/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__claude-opus-4.1/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +108,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-03",
+          "evaluation_timestamp": "2025-10-31",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,17 +117,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 35.1,
+            "score": 38.0,
             "uncertainty": {
               "standard_error": {
-                "value": 2.5
+                "value": 2.6
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Opus 4.1\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.1\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -144,7 +144,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Opus 4.1\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.1\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -158,7 +158,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/openhands__claude-opus-4.1/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.1/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -182,7 +182,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-02",
+          "evaluation_timestamp": "2025-11-04",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -191,17 +191,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 36.9,
+            "score": 34.8,
             "uncertainty": {
               "standard_error": {
-                "value": 2.7
+                "value": 2.9
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Opus 4.1\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.1\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -218,7 +218,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Opus 4.1\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.1\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -232,7 +232,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__claude-opus-4.1/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__claude-opus-4.1/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -256,7 +256,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-10-31",
+          "evaluation_timestamp": "2025-11-03",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -265,17 +265,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 38.0,
+            "score": 35.1,
             "uncertainty": {
               "standard_error": {
-                "value": 2.6
+                "value": 2.5
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.1\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Opus 4.1\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -292,7 +292,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.1\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Opus 4.1\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/anthropic_claude-opus-4.5.json b/data/models/anthropic_claude-opus-4.5.json
index 67b02f7c77893feb7214e9d8713a329f346b20ae..14b1dc3ce90d80ab30d23bbb2e060321876d554a 100644
--- a/data/models/anthropic_claude-opus-4.5.json
+++ b/data/models/anthropic_claude-opus-4.5.json
@@ -4,13 +4,13 @@
     "id": "anthropic/claude-opus-4.5",
     "developer": "Anthropic",
     "additional_details": {
-      "agent_name": "Terminus 2",
-      "agent_organization": "Terminal Bench"
+      "agent_name": "OpenCode",
+      "agent_organization": "Anomaly Innovations"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__claude-opus-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/opencode__claude-opus-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-22",
+          "evaluation_timestamp": "2026-01-12",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,17 +43,11 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 57.8,
-            "uncertainty": {
-              "standard_error": {
-                "value": 2.5
-              },
-              "num_samples": 435
-            }
+            "score": 51.7
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenCode\" -m \"Claude Opus 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +64,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenCode\" -m \"Claude Opus 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -84,7 +78,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/mux__claude-opus-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__claude-opus-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +102,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-01-17",
+          "evaluation_timestamp": "2025-11-22",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,11 +111,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 58.4
+            "score": 57.8,
+            "uncertainty": {
+              "standard_error": {
+                "value": 2.5
+              },
+              "num_samples": 435
+            }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -138,7 +138,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -152,7 +152,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/opencode__claude-opus-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/goose__claude-opus-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -176,7 +176,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-01-12",
+          "evaluation_timestamp": "2025-12-11",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -185,11 +185,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 51.7
+            "score": 54.3,
+            "uncertainty": {
+              "standard_error": {
+                "value": 2.6
+              },
+              "num_samples": 435
+            }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenCode\" -m \"Claude Opus 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Opus 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -206,7 +212,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenCode\" -m \"Claude Opus 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Opus 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -294,7 +300,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/goose__claude-opus-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/mux__claude-opus-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -318,7 +324,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-11",
+          "evaluation_timestamp": "2026-01-17",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -327,17 +333,11 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 54.3,
-            "uncertainty": {
-              "standard_error": {
-                "value": 2.6
-              },
-              "num_samples": 435
-            }
+            "score": 58.4
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Opus 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -354,7 +354,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Opus 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -442,7 +442,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/letta-code__claude-opus-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -466,7 +466,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-18",
+          "evaluation_timestamp": "2025-12-17",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -475,17 +475,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 52.1,
+            "score": 59.1,
             "uncertainty": {
               "standard_error": {
-                "value": 2.5
+                "value": 2.4
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"Claude Opus 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -502,7 +502,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"Claude Opus 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -516,7 +516,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/letta-code__claude-opus-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -540,7 +540,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-17",
+          "evaluation_timestamp": "2025-12-18",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -549,17 +549,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 59.1,
+            "score": 52.1,
             "uncertainty": {
               "standard_error": {
-                "value": 2.4
+                "value": 2.5
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"Claude Opus 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -576,7 +576,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"Claude Opus 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/anthropic_claude-opus-4.6.json b/data/models/anthropic_claude-opus-4.6.json
index bc72df0490b3d006ed2917d9f7d68277e00092b2..048dae2de3da40230747c994f11a1351dfec5a14 100644
--- a/data/models/anthropic_claude-opus-4.6.json
+++ b/data/models/anthropic_claude-opus-4.6.json
@@ -4,13 +4,13 @@
     "id": "anthropic/claude-opus-4.6",
     "developer": "Anthropic",
     "additional_details": {
-      "agent_name": "Mux",
-      "agent_organization": "Coder"
+      "agent_name": "Terminus-KIRA",
+      "agent_organization": "KRAFTON AI"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/mux__claude-opus-4.6/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-kira__claude-opus-4.6/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-02-13",
+          "evaluation_timestamp": "2026-02-22",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,17 +43,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 66.5,
+            "score": 74.7,
             "uncertainty": {
               "standard_error": {
-                "value": 2.5
+                "value": 2.6
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.6\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus-KIRA\" -m \"Claude Opus 4.6\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +70,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.6\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus-KIRA\" -m \"Claude Opus 4.6\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -84,7 +84,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-kira__claude-opus-4.6/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/droid__claude-opus-4.6/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +108,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-02-22",
+          "evaluation_timestamp": "2026-02-05",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,17 +117,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 74.7,
+            "score": 69.9,
             "uncertainty": {
               "standard_error": {
-                "value": 2.6
+                "value": 2.5
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus-KIRA\" -m \"Claude Opus 4.6\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Claude Opus 4.6\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -144,7 +144,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus-KIRA\" -m \"Claude Opus 4.6\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Claude Opus 4.6\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -158,7 +158,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/crux__claude-opus-4.6/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/mux__claude-opus-4.6/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -182,7 +182,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-02-23",
+          "evaluation_timestamp": "2026-02-13",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -191,11 +191,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 66.9
+            "score": 66.5,
+            "uncertainty": {
+              "standard_error": {
+                "value": 2.5
+              },
+              "num_samples": 435
+            }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"Claude Opus 4.6\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.6\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -212,7 +218,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"Claude Opus 4.6\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.6\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -226,7 +232,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/droid__claude-opus-4.6/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/crux__claude-opus-4.6/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -250,7 +256,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-02-05",
+          "evaluation_timestamp": "2026-02-23",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -259,17 +265,11 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 69.9,
-            "uncertainty": {
-              "standard_error": {
-                "value": 2.5
-              },
-              "num_samples": 435
-            }
+            "score": 66.9
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Claude Opus 4.6\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"Claude Opus 4.6\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -286,7 +286,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Claude Opus 4.6\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"Claude Opus 4.6\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -300,7 +300,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/tongagents__claude-opus-4.6/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.6/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -324,7 +324,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-02-22",
+          "evaluation_timestamp": "2026-02-07",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -333,17 +333,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 71.9,
+            "score": 58.0,
             "uncertainty": {
               "standard_error": {
-                "value": 2.7
+                "value": 2.9
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"TongAgents\" -m \"Claude Opus 4.6\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.6\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -360,7 +360,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"TongAgents\" -m \"Claude Opus 4.6\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.6\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -374,7 +374,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__claude-opus-4.6/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/tongagents__claude-opus-4.6/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -398,7 +398,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-02-06",
+          "evaluation_timestamp": "2026-02-22",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -407,7 +407,7 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 62.9,
+            "score": 71.9,
             "uncertainty": {
               "standard_error": {
                 "value": 2.7
@@ -417,7 +417,7 @@
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.6\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"TongAgents\" -m \"Claude Opus 4.6\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -434,7 +434,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.6\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"TongAgents\" -m \"Claude Opus 4.6\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -448,7 +448,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.6/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__claude-opus-4.6/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -472,7 +472,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-02-07",
+          "evaluation_timestamp": "2026-02-06",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -481,17 +481,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 58.0,
+            "score": 62.9,
             "uncertainty": {
               "standard_error": {
-                "value": 2.9
+                "value": 2.7
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.6\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.6\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -508,7 +508,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.6\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.6\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/anthropic_claude-sonnet-4-20250514.json b/data/models/anthropic_claude-sonnet-4-20250514.json
index 17485295f0940239338e7d9ae5edbc1db77fa9eb..a43572d10d77d034b8cc0b4e9e80cb595c19f907 100644
--- a/data/models/anthropic_claude-sonnet-4-20250514.json
+++ b/data/models/anthropic_claude-sonnet-4-20250514.json
@@ -10,8 +10,8 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "global-mmlu-lite/anthropic_claude-sonnet-4-20250514/1773936583.743359",
-      "retrieved_timestamp": "1773936583.743359",
+      "evaluation_id": "global-mmlu-lite/anthropic_claude-sonnet-4-20250514/1773936496.366405",
+      "retrieved_timestamp": "1773936496.366405",
       "source_metadata": {
         "source_name": "Global MMLU Lite Leaderboard",
         "source_type": "documentation",
@@ -525,8 +525,8 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "global-mmlu-lite/anthropic_claude-sonnet-4-20250514/1773936496.366405",
-      "retrieved_timestamp": "1773936496.366405",
+      "evaluation_id": "global-mmlu-lite/anthropic_claude-sonnet-4-20250514/1773936583.743359",
+      "retrieved_timestamp": "1773936583.743359",
       "source_metadata": {
         "source_name": "Global MMLU Lite Leaderboard",
         "source_type": "documentation",
diff --git a/data/models/anthropic_claude-sonnet-4.5.json b/data/models/anthropic_claude-sonnet-4.5.json
index 9a3fab7ce187c0a2db0a367672e1fd3359d83a6f..2721b6a2b6511249d3161ee660ebb342753596b5 100644
--- a/data/models/anthropic_claude-sonnet-4.5.json
+++ b/data/models/anthropic_claude-sonnet-4.5.json
@@ -4,13 +4,13 @@
     "id": "anthropic/claude-sonnet-4.5",
     "developer": "Anthropic",
     "additional_details": {
-      "agent_name": "CAMEL-AI",
-      "agent_organization": "CAMEL-AI"
+      "agent_name": "Goose",
+      "agent_organization": "Block"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/camel-ai__claude-sonnet-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/goose__claude-sonnet-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-24",
+          "evaluation_timestamp": "2025-12-11",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,17 +43,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 46.5,
+            "score": 43.1,
             "uncertainty": {
               "standard_error": {
-                "value": 2.4
+                "value": 2.6
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"CAMEL-AI\" -m \"Claude Sonnet 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Sonnet 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +70,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"CAMEL-AI\" -m \"Claude Sonnet 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Sonnet 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -84,7 +84,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/openhands__claude-sonnet-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__claude-sonnet-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +108,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-02",
+          "evaluation_timestamp": "2025-10-31",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,7 +117,7 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 42.6,
+            "score": 42.8,
             "uncertainty": {
               "standard_error": {
                 "value": 2.8
@@ -127,7 +127,7 @@
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Sonnet 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Sonnet 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -144,7 +144,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Sonnet 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Sonnet 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -158,7 +158,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__claude-sonnet-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/claude-code__claude-sonnet-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -182,7 +182,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-10-31",
+          "evaluation_timestamp": "2025-11-04",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -191,17 +191,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 42.8,
+            "score": 40.1,
             "uncertainty": {
               "standard_error": {
-                "value": 2.8
+                "value": 2.9
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Sonnet 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Sonnet 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -218,7 +218,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Sonnet 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Sonnet 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -232,7 +232,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__claude-sonnet-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/maya__claude-sonnet-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -256,7 +256,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-03",
+          "evaluation_timestamp": "2026-01-04",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -265,17 +265,11 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 42.5,
-            "uncertainty": {
-              "standard_error": {
-                "value": 2.8
-              },
-              "num_samples": 435
-            }
+            "score": 42.7
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Sonnet 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"MAYA\" -m \"Claude Sonnet 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -292,7 +286,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Sonnet 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"MAYA\" -m \"Claude Sonnet 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -306,7 +300,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/claude-code__claude-sonnet-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__claude-sonnet-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -330,7 +324,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-04",
+          "evaluation_timestamp": "2025-11-03",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -339,17 +333,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 40.1,
+            "score": 42.5,
             "uncertainty": {
               "standard_error": {
-                "value": 2.9
+                "value": 2.8
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Sonnet 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Sonnet 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -366,7 +360,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Sonnet 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Sonnet 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -380,7 +374,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/maya__claude-sonnet-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/camel-ai__claude-sonnet-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -404,7 +398,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-01-04",
+          "evaluation_timestamp": "2025-12-24",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -413,11 +407,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 42.7
+            "score": 46.5,
+            "uncertainty": {
+              "standard_error": {
+                "value": 2.4
+              },
+              "num_samples": 435
+            }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"MAYA\" -m \"Claude Sonnet 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"CAMEL-AI\" -m \"Claude Sonnet 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -434,7 +434,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"MAYA\" -m \"Claude Sonnet 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"CAMEL-AI\" -m \"Claude Sonnet 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -448,7 +448,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/goose__claude-sonnet-4.5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/openhands__claude-sonnet-4.5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -472,7 +472,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-11",
+          "evaluation_timestamp": "2025-11-02",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -481,17 +481,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 43.1,
+            "score": 42.6,
             "uncertainty": {
               "standard_error": {
-                "value": 2.6
+                "value": 2.8
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Sonnet 4.5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Sonnet 4.5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -508,7 +508,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Sonnet 4.5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Sonnet 4.5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/anthropic_opus_4.5.json b/data/models/anthropic_opus_4.5.json
index 0952f13ca0e6f15c8598404fd4db3674326651d7..04269bfd22dff1677abd1f4b2e4e3d614c500f72 100644
--- a/data/models/anthropic_opus_4.5.json
+++ b/data/models/anthropic_opus_4.5.json
@@ -6,76 +6,6 @@
     "inference_platform": "unknown"
   },
   "evaluations": [
-    {
-      "evaluation_id": "ace/anthropic_opus-4.5/1773260200",
-      "retrieved_timestamp": "1773260200",
-      "source_metadata": {
-        "source_name": "Mercor ACE Leaderboard",
-        "source_type": "evaluation_run",
-        "source_organization_name": "Mercor",
-        "source_organization_url": "https://www.mercor.com",
-        "evaluator_relationship": "first_party"
-      },
-      "eval_library": {
-        "name": "archipelago",
-        "version": "1.0.0"
-      },
-      "benchmark": "ace",
-      "evaluation_results": [
-        {
-          "evaluation_name": "Overall Score",
-          "source_data": {
-            "dataset_name": "ace",
-            "source_type": "hf_dataset",
-            "hf_repo": "Mercor/ACE"
-          },
-          "metric_config": {
-            "evaluation_description": "Overall ACE score (paper snapshot).",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0,
-            "max_score": 1
-          },
-          "score_details": {
-            "score": 0.478
-          },
-          "generation_config": {
-            "additional_details": {
-              "run_setting": "On"
-            }
-          }
-        },
-        {
-          "evaluation_name": "Gaming Score",
-          "source_data": {
-            "dataset_name": "ace",
-            "source_type": "hf_dataset",
-            "hf_repo": "Mercor/ACE"
-          },
-          "metric_config": {
-            "evaluation_description": "Gaming domain score.",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0,
-            "max_score": 1
-          },
-          "score_details": {
-            "score": 0.391
-          },
-          "generation_config": {
-            "additional_details": {
-              "run_setting": "On"
-            }
-          }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": {
-        "additional_details": {
-          "run_setting": "On"
-        }
-      }
-    },
     {
       "evaluation_id": "apex-agents/anthropic_opus-4.5/1773260200",
       "retrieved_timestamp": "1773260200",
@@ -275,6 +205,76 @@
         }
       }
     },
+    {
+      "evaluation_id": "ace/anthropic_opus-4.5/1773260200",
+      "retrieved_timestamp": "1773260200",
+      "source_metadata": {
+        "source_name": "Mercor ACE Leaderboard",
+        "source_type": "evaluation_run",
+        "source_organization_name": "Mercor",
+        "source_organization_url": "https://www.mercor.com",
+        "evaluator_relationship": "first_party"
+      },
+      "eval_library": {
+        "name": "archipelago",
+        "version": "1.0.0"
+      },
+      "benchmark": "ace",
+      "evaluation_results": [
+        {
+          "evaluation_name": "Overall Score",
+          "source_data": {
+            "dataset_name": "ace",
+            "source_type": "hf_dataset",
+            "hf_repo": "Mercor/ACE"
+          },
+          "metric_config": {
+            "evaluation_description": "Overall ACE score (paper snapshot).",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0,
+            "max_score": 1
+          },
+          "score_details": {
+            "score": 0.478
+          },
+          "generation_config": {
+            "additional_details": {
+              "run_setting": "On"
+            }
+          }
+        },
+        {
+          "evaluation_name": "Gaming Score",
+          "source_data": {
+            "dataset_name": "ace",
+            "source_type": "hf_dataset",
+            "hf_repo": "Mercor/ACE"
+          },
+          "metric_config": {
+            "evaluation_description": "Gaming domain score.",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0,
+            "max_score": 1
+          },
+          "score_details": {
+            "score": 0.391
+          },
+          "generation_config": {
+            "additional_details": {
+              "run_setting": "On"
+            }
+          }
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": {
+        "additional_details": {
+          "run_setting": "On"
+        }
+      }
+    },
     {
       "evaluation_id": "apex-v1/anthropic_opus-4.5/1773260200",
       "retrieved_timestamp": "1773260200",
diff --git a/data/models/cognitivecomputations_dolphin-2.9.2-phi-3-medium-abliterated.json b/data/models/cognitivecomputations_dolphin-2.9.2-phi-3-medium-abliterated.json
index 9b7cc35f188c98dba3f0427de09e77d4b26cfc40..62d09c3db567da9932caf7141d7a1389c1aafea9 100644
--- a/data/models/cognitivecomputations_dolphin-2.9.2-phi-3-medium-abliterated.json
+++ b/data/models/cognitivecomputations_dolphin-2.9.2-phi-3-medium-abliterated.json
@@ -5,7 +5,7 @@
     "developer": "cognitivecomputations",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "MistralForCausalLM",
       "params_billions": "13.96"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3613
+            "score": 0.4124
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6123
+            "score": 0.6383
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1239
+            "score": 0.182
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.328
+            "score": 0.3289
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4112
+            "score": 0.4349
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4494
+            "score": 0.4525
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4124
+            "score": 0.3613
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6383
+            "score": 0.6123
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.182
+            "score": 0.1239
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3289
+            "score": 0.328
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4349
+            "score": 0.4112
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4525
+            "score": 0.4494
           }
         }
       ],
diff --git a/data/models/columbia-nlp_lion-gemma-2b-dpo-v1.0.json b/data/models/columbia-nlp_lion-gemma-2b-dpo-v1.0.json
index fcba7390760a9757eb868fa82619eef91d551f86..e1500a121f56ad89301900472e223d7937764cb0 100644
--- a/data/models/columbia-nlp_lion-gemma-2b-dpo-v1.0.json
+++ b/data/models/columbia-nlp_lion-gemma-2b-dpo-v1.0.json
@@ -5,7 +5,7 @@
     "developer": "Columbia-NLP",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "bfloat16",
+      "precision": "float16",
       "architecture": "GemmaForCausalLM",
       "params_billions": "2.506"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3102
+            "score": 0.3278
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3881
+            "score": 0.392
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0536
+            "score": 0.0431
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2534
+            "score": 0.2492
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4081
+            "score": 0.412
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1665
+            "score": 0.1666
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3278
+            "score": 0.3102
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.392
+            "score": 0.3881
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0431
+            "score": 0.0536
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2492
+            "score": 0.2534
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.412
+            "score": 0.4081
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1666
+            "score": 0.1665
           }
         }
       ],
diff --git a/data/models/cpayne1303_llama-43m-beta.json b/data/models/cpayne1303_llama-43m-beta.json
index 0f0d3430f35b22a1aef8ac071591532d594d6d4d..8e62d1be9edc1ccaeda7702d1a793fd5db4d4150 100644
--- a/data/models/cpayne1303_llama-43m-beta.json
+++ b/data/models/cpayne1303_llama-43m-beta.json
@@ -5,7 +5,7 @@
     "developer": "cpayne1303",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "LlamaForCausalLM",
       "params_billions": "0.043"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1949
+            "score": 0.1916
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2965
+            "score": 0.2977
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0045
+            "score": 0.0
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3885
+            "score": 0.3872
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1111
+            "score": 0.1132
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1916
+            "score": 0.1949
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2977
+            "score": 0.2965
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0
+            "score": 0.0045
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3872
+            "score": 0.3885
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1132
+            "score": 0.1111
           }
         }
       ],
diff --git a/data/models/daemontatox_pathfinderai.json b/data/models/daemontatox_pathfinderai.json
index 7a5f7d25c7278e2df08548a48abfe0b0ee9b4f2a..8b13e2aabf79ff36b82f8d8bd4c0bcdf2b41e385 100644
--- a/data/models/daemontatox_pathfinderai.json
+++ b/data/models/daemontatox_pathfinderai.json
@@ -5,7 +5,7 @@
     "developer": "Daemontatox",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "bfloat16",
+      "precision": "float16",
       "architecture": "Qwen2ForCausalLM",
       "params_billions": "32.764"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4855
+            "score": 0.3745
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6627
+            "score": 0.6668
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4841
+            "score": 0.4758
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3096
+            "score": 0.3943
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4256
+            "score": 0.4858
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5542
+            "score": 0.5593
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3745
+            "score": 0.4855
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6668
+            "score": 0.6627
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4758
+            "score": 0.4841
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3943
+            "score": 0.3096
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4858
+            "score": 0.4256
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5593
+            "score": 0.5542
           }
         }
       ],
diff --git a/data/models/deepmount00_llama-3.1-8b-ita.json b/data/models/deepmount00_llama-3.1-8b-ita.json
index be94466036e753700b8de72e15b468fc6edebdee..1fef7ca0b3d471692e7379016093b45312ecc7df 100644
--- a/data/models/deepmount00_llama-3.1-8b-ita.json
+++ b/data/models/deepmount00_llama-3.1-8b-ita.json
@@ -6,8 +6,8 @@
     "inference_platform": "unknown",
     "additional_details": {
       "precision": "bfloat16",
-      "architecture": "Unknown",
-      "params_billions": "0.0",
+      "architecture": "LlamaForCausalLM",
+      "params_billions": "8.03",
       "model_id_aliases": [
         "DeepMount00/Llama-3.1-8b-Ita"
       ]
@@ -15,7 +15,7 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3.1-8b-Ita/1773936498.240187",
+      "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3.1-8b-ITA/1773936498.240187",
       "retrieved_timestamp": "1773936498.240187",
       "source_metadata": {
         "source_name": "HF Open LLM v2",
@@ -47,7 +47,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5365
+            "score": 0.7917
           }
         },
         {
@@ -65,7 +65,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.517
+            "score": 0.5109
           }
         },
         {
@@ -83,7 +83,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1707
+            "score": 0.1088
           }
         },
         {
@@ -101,7 +101,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3062
+            "score": 0.2878
           }
         },
         {
@@ -119,7 +119,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4487
+            "score": 0.4136
           }
         },
         {
@@ -137,7 +137,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.396
+            "score": 0.3876
           }
         }
       ],
@@ -145,7 +145,7 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3.1-8b-ITA/1773936498.240187",
+      "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3.1-8b-Ita/1773936498.240187",
       "retrieved_timestamp": "1773936498.240187",
       "source_metadata": {
         "source_name": "HF Open LLM v2",
@@ -177,7 +177,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7917
+            "score": 0.5365
           }
         },
         {
@@ -195,7 +195,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5109
+            "score": 0.517
           }
         },
         {
@@ -213,7 +213,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1088
+            "score": 0.1707
           }
         },
         {
@@ -231,7 +231,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2878
+            "score": 0.3062
           }
         },
         {
@@ -249,7 +249,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4136
+            "score": 0.4487
           }
         },
         {
@@ -267,7 +267,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3876
+            "score": 0.396
           }
         }
       ],
diff --git a/data/models/dfurman_llama-3-8b-orpo-v0.1.json b/data/models/dfurman_llama-3-8b-orpo-v0.1.json
index 987b3112f03b34626e0421787b1c9cb2e2b55a46..a2dcf54fc8dfd61d5182710e882025f5ed46cdab 100644
--- a/data/models/dfurman_llama-3-8b-orpo-v0.1.json
+++ b/data/models/dfurman_llama-3-8b-orpo-v0.1.json
@@ -5,8 +5,8 @@
     "developer": "dfurman",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "LlamaForCausalLM",
+      "precision": "float16",
+      "architecture": "?",
       "params_billions": "8.03"
     }
   },
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3
+            "score": 0.2835
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3853
+            "score": 0.3842
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0415
+            "score": 0.0521
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2617
+            "score": 0.2609
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3579
+            "score": 0.3566
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2281
+            "score": 0.2298
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2835
+            "score": 0.3
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3842
+            "score": 0.3853
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0521
+            "score": 0.0415
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2609
+            "score": 0.2617
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3566
+            "score": 0.3579
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2298
+            "score": 0.2281
           }
         }
       ],
diff --git a/data/models/doppelreflex_mn-12b-lilithframe.json b/data/models/doppelreflex_mn-12b-lilithframe.json
index 2532c65fe82e1d06f8d68b438dbffc94e3e4e4bb..720fbe0846130d5605f8d9f0e7d4c1731ec8e47c 100644
--- a/data/models/doppelreflex_mn-12b-lilithframe.json
+++ b/data/models/doppelreflex_mn-12b-lilithframe.json
@@ -5,7 +5,7 @@
     "developer": "DoppelReflEx",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "bfloat16",
+      "precision": "float16",
       "architecture": "MistralForCausalLM",
       "params_billions": "12.248"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.436
+            "score": 0.451
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4956
+            "score": 0.4944
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0589
+            "score": 0.1156
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3205
+            "score": 0.3196
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3843
+            "score": 0.3896
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3237
+            "score": 0.3256
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.451
+            "score": 0.436
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4944
+            "score": 0.4956
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1156
+            "score": 0.0589
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3196
+            "score": 0.3205
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3896
+            "score": 0.3843
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3256
+            "score": 0.3237
           }
         }
       ],
diff --git a/data/models/google_gemini-2.5-flash.json b/data/models/google_gemini-2.5-flash.json
index fed61d14fcef2477959a8fb49c1907eead45e035..a2e65a21393c95ddfc5f4fe71d47a11fac033db8 100644
--- a/data/models/google_gemini-2.5-flash.json
+++ b/data/models/google_gemini-2.5-flash.json
@@ -1269,7 +1269,7 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-2.5-flash/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gemini-2.5-flash/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -1293,7 +1293,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-10-31",
+          "evaluation_timestamp": "2025-11-03",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -1302,17 +1302,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 16.9,
+            "score": 17.1,
             "uncertainty": {
               "standard_error": {
-                "value": 2.4
+                "value": 2.5
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 2.5 Flash\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Gemini 2.5 Flash\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -1329,7 +1329,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 2.5 Flash\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Gemini 2.5 Flash\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -1343,7 +1343,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-2.5-flash/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/openhands__gemini-2.5-flash/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -1367,7 +1367,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-04",
+          "evaluation_timestamp": "2025-11-02",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -1376,17 +1376,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 15.4,
+            "score": 16.4,
             "uncertainty": {
               "standard_error": {
-                "value": 2.3
+                "value": 2.4
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Flash\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Gemini 2.5 Flash\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -1403,7 +1403,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Flash\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Gemini 2.5 Flash\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -1417,7 +1417,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/openhands__gemini-2.5-flash/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-2.5-flash/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -1441,7 +1441,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-02",
+          "evaluation_timestamp": "2025-11-04",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -1450,17 +1450,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 16.4,
+            "score": 15.4,
             "uncertainty": {
               "standard_error": {
-                "value": 2.4
+                "value": 2.3
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Gemini 2.5 Flash\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Flash\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -1477,7 +1477,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Gemini 2.5 Flash\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Flash\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -1491,7 +1491,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gemini-2.5-flash/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-2.5-flash/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -1515,7 +1515,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-03",
+          "evaluation_timestamp": "2025-10-31",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -1524,17 +1524,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 17.1,
+            "score": 16.9,
             "uncertainty": {
               "standard_error": {
-                "value": 2.5
+                "value": 2.4
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Gemini 2.5 Flash\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 2.5 Flash\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -1551,7 +1551,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Gemini 2.5 Flash\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 2.5 Flash\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/google_gemini-2.5-pro.json b/data/models/google_gemini-2.5-pro.json
index 9276c148869675ec0a23b9d18706b87bdb718f59..64bf8e5b0bbd8066e28305bcfb70c0755706abd7 100644
--- a/data/models/google_gemini-2.5-pro.json
+++ b/data/models/google_gemini-2.5-pro.json
@@ -1343,7 +1343,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-2.5-pro/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-2.5-pro/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -1367,7 +1367,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-10-31",
+          "evaluation_timestamp": "2025-11-04",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -1376,17 +1376,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 32.6,
+            "score": 19.6,
             "uncertainty": {
               "standard_error": {
-                "value": 3.0
+                "value": 2.9
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 2.5 Pro\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Pro\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -1403,7 +1403,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 2.5 Pro\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Pro\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -1417,7 +1417,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-2.5-pro/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-2.5-pro/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -1441,7 +1441,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-04",
+          "evaluation_timestamp": "2025-10-31",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -1450,17 +1450,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 19.6,
+            "score": 32.6,
             "uncertainty": {
               "standard_error": {
-                "value": 2.9
+                "value": 3.0
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Pro\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 2.5 Pro\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -1477,7 +1477,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Pro\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 2.5 Pro\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/google_gemini-3-flash.json b/data/models/google_gemini-3-flash.json
index 4ea354a43eede049751e0c22c3ef39159edfde83..96d81804f9403e5af725c031c3f055444b4c7604 100644
--- a/data/models/google_gemini-3-flash.json
+++ b/data/models/google_gemini-3-flash.json
@@ -4,13 +4,13 @@
     "id": "google/gemini-3-flash",
     "developer": "Google",
     "additional_details": {
-      "agent_name": "Junie CLI",
-      "agent_organization": "JetBrains"
+      "agent_name": "Gemini CLI",
+      "agent_organization": "Google"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/junie-cli__gemini-3-flash/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-3-flash/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -43,17 +43,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 64.3,
+            "score": 51.0,
             "uncertainty": {
               "standard_error": {
-                "value": 2.8
+                "value": 3.0
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Gemini 3 Flash\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 3 Flash\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +70,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Gemini 3 Flash\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 3 Flash\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -84,7 +84,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-3-flash/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-3-flash/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +108,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-03-06",
+          "evaluation_timestamp": "2026-01-07",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,17 +117,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 47.4,
+            "score": 51.7,
             "uncertainty": {
               "standard_error": {
-                "value": 3.0
+                "value": 3.1
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 3 Flash\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Flash\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -144,7 +144,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 3 Flash\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Flash\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -158,7 +158,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-3-flash/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/junie-cli__gemini-3-flash/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -182,7 +182,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-01-07",
+          "evaluation_timestamp": "2025-12-23",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -191,17 +191,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 51.7,
+            "score": 64.3,
             "uncertainty": {
               "standard_error": {
-                "value": 3.1
+                "value": 2.8
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Flash\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Gemini 3 Flash\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -218,7 +218,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Flash\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Gemini 3 Flash\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -256,7 +256,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-23",
+          "evaluation_timestamp": "2026-03-06",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -265,7 +265,7 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 51.0,
+            "score": 47.4,
             "uncertainty": {
               "standard_error": {
                 "value": 3.0
diff --git a/data/models/google_gemini-3-pro-preview.json b/data/models/google_gemini-3-pro-preview.json
index f42cfe1c80eb9fdb58bb474a8d6a4066dfd5628d..7edffa00c381bc6c2a68a0a203fbe30120c52254 100644
--- a/data/models/google_gemini-3-pro-preview.json
+++ b/data/models/google_gemini-3-pro-preview.json
@@ -4,13 +4,13 @@
     "id": "google/gemini-3-pro-preview",
     "developer": "Google",
     "additional_details": {
-      "agent_name": "LiteLLM Tool Calling with Shortlisting",
-      "agent_framework": "tool_calling_with_shortlisting"
+      "agent_name": "OpenAI Solo",
+      "agent_framework": "openai_solo"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "appworld/test_normal/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "appworld/test_normal/openai-solo__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -42,23 +42,23 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.55,
+            "score": 0.582,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "1.3",
-              "total_run_cost": "130.49",
-              "average_steps": "22.59",
-              "percent_finished": "1.0"
+              "average_agent_cost": "8.7",
+              "total_run_cost": "869.55",
+              "average_steps": "33.49",
+              "percent_finished": "0.98"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
-                  "agent_framework": "tool_calling_with_shortlisting"
+                  "agent_name": "OpenAI Solo",
+                  "agent_framework": "openai_solo"
                 }
               }
             }
@@ -70,15 +70,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling with Shortlisting",
-              "agent_framework": "tool_calling_with_shortlisting"
+              "agent_name": "OpenAI Solo",
+              "agent_framework": "openai_solo"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "appworld/test_normal/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "browsecompplus/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -91,42 +91,42 @@
         "name": "exgentic",
         "version": "0.1.0"
       },
-      "benchmark": "appworld_test_normal",
+      "benchmark": "browsecompplus",
       "evaluation_results": [
         {
-          "evaluation_name": "appworld/test_normal",
+          "evaluation_name": "browsecompplus",
           "source_data": {
-            "dataset_name": "appworld/test_normal",
+            "dataset_name": "browsecompplus",
             "source_type": "url",
             "url": [
               "https://github.com/Exgentic/exgentic"
             ]
           },
           "metric_config": {
-            "evaluation_description": "AppWorld benchmark evaluation (test_normal subset)",
+            "evaluation_description": "BrowseCompPlus benchmark evaluation",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.505,
+            "score": 0.51,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "1.88",
-              "total_run_cost": "188.19",
-              "average_steps": "21.76",
-              "percent_finished": "0.99"
+              "average_agent_cost": "2.85",
+              "total_run_cost": "284.68",
+              "average_steps": "22.88",
+              "percent_finished": "0.7"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling",
-                  "agent_framework": "tool_calling"
+                  "agent_name": "Claude Code CLI",
+                  "agent_framework": "claude_code"
                 }
               }
             }
@@ -138,15 +138,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling",
-              "agent_framework": "tool_calling"
+              "agent_name": "Claude Code CLI",
+              "agent_framework": "claude_code"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "browsecompplus/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "appworld/test_normal/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -159,42 +159,42 @@
         "name": "exgentic",
         "version": "0.1.0"
       },
-      "benchmark": "browsecompplus",
+      "benchmark": "appworld_test_normal",
       "evaluation_results": [
         {
-          "evaluation_name": "browsecompplus",
+          "evaluation_name": "appworld/test_normal",
           "source_data": {
-            "dataset_name": "browsecompplus",
+            "dataset_name": "appworld/test_normal",
             "source_type": "url",
             "url": [
               "https://github.com/Exgentic/exgentic"
             ]
           },
           "metric_config": {
-            "evaluation_description": "BrowseCompPlus benchmark evaluation",
+            "evaluation_description": "AppWorld benchmark evaluation (test_normal subset)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.57,
+            "score": 0.505,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "2.39",
-              "total_run_cost": "239.0",
-              "average_steps": "29.63",
-              "percent_finished": "0.69"
+              "average_agent_cost": "1.88",
+              "total_run_cost": "188.19",
+              "average_steps": "21.76",
+              "percent_finished": "0.99"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "SmolAgents Code",
-                  "agent_framework": "smolagents_code"
+                  "agent_name": "LiteLLM Tool Calling",
+                  "agent_framework": "tool_calling"
                 }
               }
             }
@@ -206,15 +206,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "SmolAgents Code",
-              "agent_framework": "smolagents_code"
+              "agent_name": "LiteLLM Tool Calling",
+              "agent_framework": "tool_calling"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "appworld/test_normal/openai-solo__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "appworld/test_normal/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -246,23 +246,23 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.582,
+            "score": 0.36,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "8.7",
-              "total_run_cost": "869.55",
-              "average_steps": "33.49",
-              "percent_finished": "0.98"
+              "average_agent_cost": "3.11",
+              "total_run_cost": "310.55",
+              "average_steps": "38.01",
+              "percent_finished": "0.86"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "OpenAI Solo",
-                  "agent_framework": "openai_solo"
+                  "agent_name": "Claude Code CLI",
+                  "agent_framework": "claude_code"
                 }
               }
             }
@@ -274,15 +274,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "OpenAI Solo",
-              "agent_framework": "openai_solo"
+              "agent_name": "Claude Code CLI",
+              "agent_framework": "claude_code"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "appworld/test_normal/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "appworld/test_normal/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -314,23 +314,23 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.36,
+            "score": 0.13,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "3.11",
-              "total_run_cost": "310.55",
-              "average_steps": "38.01",
-              "percent_finished": "0.86"
+              "average_agent_cost": "2.54",
+              "total_run_cost": "254.25",
+              "average_steps": "49.13",
+              "percent_finished": "0.71"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "Claude Code CLI",
-                  "agent_framework": "claude_code"
+                  "agent_name": "SmolAgents Code",
+                  "agent_framework": "smolagents_code"
                 }
               }
             }
@@ -342,15 +342,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "Claude Code CLI",
-              "agent_framework": "claude_code"
+              "agent_name": "SmolAgents Code",
+              "agent_framework": "smolagents_code"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "appworld/test_normal/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "appworld/test_normal/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -382,23 +382,23 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.13,
+            "score": 0.55,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "2.54",
-              "total_run_cost": "254.25",
-              "average_steps": "49.13",
-              "percent_finished": "0.71"
+              "average_agent_cost": "1.3",
+              "total_run_cost": "130.49",
+              "average_steps": "22.59",
+              "percent_finished": "1.0"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "SmolAgents Code",
-                  "agent_framework": "smolagents_code"
+                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
+                  "agent_framework": "tool_calling_with_shortlisting"
                 }
               }
             }
@@ -410,15 +410,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "SmolAgents Code",
-              "agent_framework": "smolagents_code"
+              "agent_name": "LiteLLM Tool Calling with Shortlisting",
+              "agent_framework": "tool_calling_with_shortlisting"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "browsecompplus/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "browsecompplus/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -450,23 +450,23 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.51,
+            "score": 0.57,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "2.85",
-              "total_run_cost": "284.68",
-              "average_steps": "22.88",
-              "percent_finished": "0.7"
+              "average_agent_cost": "2.39",
+              "total_run_cost": "239.0",
+              "average_steps": "29.63",
+              "percent_finished": "0.69"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "Claude Code CLI",
-                  "agent_framework": "claude_code"
+                  "agent_name": "SmolAgents Code",
+                  "agent_framework": "smolagents_code"
                 }
               }
             }
@@ -478,15 +478,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "Claude Code CLI",
-              "agent_framework": "claude_code"
+              "agent_name": "SmolAgents Code",
+              "agent_framework": "smolagents_code"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "browsecompplus/openai-solo__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "browsecompplus/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -518,23 +518,23 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3333,
+            "score": 0.48,
             "uncertainty": {
-              "num_samples": 99
+              "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.64",
-              "total_run_cost": "63.79",
-              "average_steps": "8.45",
-              "percent_finished": "0.6061"
+              "average_agent_cost": "0.44",
+              "total_run_cost": "44.18",
+              "average_steps": "7.85",
+              "percent_finished": "0.99"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "OpenAI Solo",
-                  "agent_framework": "openai_solo"
+                  "agent_name": "LiteLLM Tool Calling",
+                  "agent_framework": "tool_calling"
                 }
               }
             }
@@ -546,8 +546,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "OpenAI Solo",
-              "agent_framework": "openai_solo"
+              "agent_name": "LiteLLM Tool Calling",
+              "agent_framework": "tool_calling"
             }
           }
         }
@@ -622,7 +622,7 @@
       }
     },
     {
-      "evaluation_id": "browsecompplus/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "browsecompplus/openai-solo__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -654,23 +654,23 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.48,
+            "score": 0.3333,
             "uncertainty": {
-              "num_samples": 100
+              "num_samples": 99
             },
             "details": {
-              "average_agent_cost": "0.44",
-              "total_run_cost": "44.18",
-              "average_steps": "7.85",
-              "percent_finished": "0.99"
+              "average_agent_cost": "0.64",
+              "total_run_cost": "63.79",
+              "average_steps": "8.45",
+              "percent_finished": "0.6061"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling",
-                  "agent_framework": "tool_calling"
+                  "agent_name": "OpenAI Solo",
+                  "agent_framework": "openai_solo"
                 }
               }
             }
@@ -682,8 +682,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling",
-              "agent_framework": "tool_calling"
+              "agent_name": "OpenAI Solo",
+              "agent_framework": "openai_solo"
             }
           }
         }
@@ -1720,7 +1720,7 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "swe-bench/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "swe-bench/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1767,8 +1767,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
-                  "agent_framework": "tool_calling_with_shortlisting"
+                  "agent_name": "LiteLLM Tool Calling",
+                  "agent_framework": "tool_calling"
                 }
               }
             }
@@ -1780,15 +1780,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling with Shortlisting",
-              "agent_framework": "tool_calling_with_shortlisting"
+              "agent_name": "LiteLLM Tool Calling",
+              "agent_framework": "tool_calling"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "swe-bench/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "swe-bench/openai-solo__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1820,14 +1820,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.67,
+            "score": 0.7234,
             "uncertainty": {
-              "num_samples": 100
+              "num_samples": 94
             },
             "details": {
-              "average_agent_cost": "3.68",
-              "total_run_cost": "367.97",
-              "average_steps": "43.72",
+              "average_agent_cost": "1.58",
+              "total_run_cost": "148.44",
+              "average_steps": "32.36",
               "percent_finished": "1.0"
             }
           },
@@ -1835,8 +1835,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "Claude Code CLI",
-                  "agent_framework": "claude_code"
+                  "agent_name": "OpenAI Solo",
+                  "agent_framework": "openai_solo"
                 }
               }
             }
@@ -1848,8 +1848,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "Claude Code CLI",
-              "agent_framework": "claude_code"
+              "agent_name": "OpenAI Solo",
+              "agent_framework": "openai_solo"
             }
           }
         }
@@ -1924,7 +1924,7 @@
       }
     },
     {
-      "evaluation_id": "swe-bench/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "swe-bench/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1956,14 +1956,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.71,
+            "score": 0.67,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.7",
-              "total_run_cost": "69.56",
-              "average_steps": "32.55",
+              "average_agent_cost": "3.68",
+              "total_run_cost": "367.97",
+              "average_steps": "43.72",
               "percent_finished": "1.0"
             }
           },
@@ -1971,8 +1971,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling",
-                  "agent_framework": "tool_calling"
+                  "agent_name": "Claude Code CLI",
+                  "agent_framework": "claude_code"
                 }
               }
             }
@@ -1984,15 +1984,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling",
-              "agent_framework": "tool_calling"
+              "agent_name": "Claude Code CLI",
+              "agent_framework": "claude_code"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "swe-bench/openai-solo__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "swe-bench/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -2024,14 +2024,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7234,
+            "score": 0.71,
             "uncertainty": {
-              "num_samples": 94
+              "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "1.58",
-              "total_run_cost": "148.44",
-              "average_steps": "32.36",
+              "average_agent_cost": "0.7",
+              "total_run_cost": "69.56",
+              "average_steps": "32.55",
               "percent_finished": "1.0"
             }
           },
@@ -2039,8 +2039,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "OpenAI Solo",
-                  "agent_framework": "openai_solo"
+                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
+                  "agent_framework": "tool_calling_with_shortlisting"
                 }
               }
             }
@@ -2052,8 +2052,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "OpenAI Solo",
-              "agent_framework": "openai_solo"
+              "agent_name": "LiteLLM Tool Calling with Shortlisting",
+              "agent_framework": "tool_calling_with_shortlisting"
             }
           }
         }
@@ -2127,74 +2127,6 @@
         }
       }
     },
-    {
-      "evaluation_id": "tau-bench-2/airline/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504",
-      "retrieved_timestamp": "1774263615.0201504",
-      "source_metadata": {
-        "source_name": "Exgentic Open Agent Leaderboard",
-        "source_type": "evaluation_run",
-        "source_organization_name": "Exgentic",
-        "source_organization_url": "https://github.com/Exgentic",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "exgentic",
-        "version": "0.1.0"
-      },
-      "benchmark": "tau-bench-2_airline",
-      "evaluation_results": [
-        {
-          "evaluation_name": "tau-bench-2/airline",
-          "source_data": {
-            "dataset_name": "tau-bench-2/airline",
-            "source_type": "url",
-            "url": [
-              "https://github.com/Exgentic/exgentic"
-            ]
-          },
-          "metric_config": {
-            "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.7,
-            "uncertainty": {
-              "num_samples": 50
-            },
-            "details": {
-              "average_agent_cost": "0.16",
-              "total_run_cost": "8.48",
-              "average_steps": "10.14",
-              "percent_finished": "1.0"
-            }
-          },
-          "generation_config": {
-            "generation_args": {
-              "agentic_eval_config": {
-                "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling",
-                  "agent_framework": "tool_calling"
-                }
-              }
-            }
-          }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": {
-        "generation_args": {
-          "agentic_eval_config": {
-            "additional_details": {
-              "agent_name": "LiteLLM Tool Calling",
-              "agent_framework": "tool_calling"
-            }
-          }
-        }
-      }
-    },
     {
       "evaluation_id": "tau-bench-2/airline/openai-solo__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
@@ -2264,7 +2196,7 @@
       }
     },
     {
-      "evaluation_id": "tau-bench-2/airline/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/airline/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -2296,14 +2228,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.68,
+            "score": 0.7,
             "uncertainty": {
               "num_samples": 50
             },
             "details": {
-              "average_agent_cost": "0.2",
-              "total_run_cost": "10.29",
-              "average_steps": "12.28",
+              "average_agent_cost": "0.16",
+              "total_run_cost": "8.48",
+              "average_steps": "10.14",
               "percent_finished": "1.0"
             }
           },
@@ -2311,8 +2243,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "SmolAgents Code",
-                  "agent_framework": "smolagents_code"
+                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
+                  "agent_framework": "tool_calling_with_shortlisting"
                 }
               }
             }
@@ -2324,15 +2256,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "SmolAgents Code",
-              "agent_framework": "smolagents_code"
+              "agent_name": "LiteLLM Tool Calling with Shortlisting",
+              "agent_framework": "tool_calling_with_shortlisting"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/airline/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/airline/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -2379,8 +2311,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
-                  "agent_framework": "tool_calling_with_shortlisting"
+                  "agent_name": "LiteLLM Tool Calling",
+                  "agent_framework": "tool_calling"
                 }
               }
             }
@@ -2392,15 +2324,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling with Shortlisting",
-              "agent_framework": "tool_calling_with_shortlisting"
+              "agent_name": "LiteLLM Tool Calling",
+              "agent_framework": "tool_calling"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/retail/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/retail/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -2432,14 +2364,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7805,
+            "score": 0.7576,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.19",
-              "total_run_cost": "19.38",
-              "average_steps": "11.18",
+              "average_agent_cost": "0.21",
+              "total_run_cost": "21.43",
+              "average_steps": "11.3",
               "percent_finished": "1.0"
             }
           },
@@ -2447,8 +2379,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "Claude Code CLI",
-                  "agent_framework": "claude_code"
+                  "agent_name": "SmolAgents Code",
+                  "agent_framework": "smolagents_code"
                 }
               }
             }
@@ -2460,8 +2392,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "Claude Code CLI",
-              "agent_framework": "claude_code"
+              "agent_name": "SmolAgents Code",
+              "agent_framework": "smolagents_code"
             }
           }
         }
@@ -2536,7 +2468,7 @@
       }
     },
     {
-      "evaluation_id": "tau-bench-2/retail/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/retail/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -2568,14 +2500,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7576,
+            "score": 0.7805,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.21",
-              "total_run_cost": "21.43",
-              "average_steps": "11.3",
+              "average_agent_cost": "0.19",
+              "total_run_cost": "19.38",
+              "average_steps": "11.18",
               "percent_finished": "1.0"
             }
           },
@@ -2583,8 +2515,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "SmolAgents Code",
-                  "agent_framework": "smolagents_code"
+                  "agent_name": "Claude Code CLI",
+                  "agent_framework": "claude_code"
                 }
               }
             }
@@ -2596,8 +2528,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "SmolAgents Code",
-              "agent_framework": "smolagents_code"
+              "agent_name": "Claude Code CLI",
+              "agent_framework": "claude_code"
             }
           }
         }
@@ -2740,7 +2672,7 @@
       }
     },
     {
-      "evaluation_id": "tau-bench-2/telecom/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/airline/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -2753,33 +2685,33 @@
         "name": "exgentic",
         "version": "0.1.0"
       },
-      "benchmark": "tau-bench-2_telecom",
+      "benchmark": "tau-bench-2_airline",
       "evaluation_results": [
         {
-          "evaluation_name": "tau-bench-2/telecom",
+          "evaluation_name": "tau-bench-2/airline",
           "source_data": {
-            "dataset_name": "tau-bench-2/telecom",
+            "dataset_name": "tau-bench-2/airline",
             "source_type": "url",
             "url": [
               "https://github.com/Exgentic/exgentic"
             ]
           },
           "metric_config": {
-            "evaluation_description": "Tau Bench 2 benchmark evaluation (telecom subset)",
+            "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.88,
+            "score": 0.68,
             "uncertainty": {
-              "num_samples": 100
+              "num_samples": 50
             },
             "details": {
-              "average_agent_cost": "0.35",
-              "total_run_cost": "40.25",
-              "average_steps": "12.71",
+              "average_agent_cost": "0.2",
+              "total_run_cost": "10.29",
+              "average_steps": "12.28",
               "percent_finished": "1.0"
             }
           },
@@ -2876,7 +2808,7 @@
       }
     },
     {
-      "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/telecom/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -2908,14 +2840,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.73,
+            "score": 0.88,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.3",
-              "total_run_cost": "36.75",
-              "average_steps": "14.84",
+              "average_agent_cost": "0.35",
+              "total_run_cost": "40.25",
+              "average_steps": "12.71",
               "percent_finished": "1.0"
             }
           },
@@ -2923,8 +2855,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling",
-                  "agent_framework": "tool_calling"
+                  "agent_name": "SmolAgents Code",
+                  "agent_framework": "smolagents_code"
                 }
               }
             }
@@ -2936,8 +2868,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling",
-              "agent_framework": "tool_calling"
+              "agent_name": "SmolAgents Code",
+              "agent_framework": "smolagents_code"
             }
           }
         }
@@ -3078,6 +3010,74 @@
           }
         }
       }
+    },
+    {
+      "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504",
+      "retrieved_timestamp": "1774263615.0201504",
+      "source_metadata": {
+        "source_name": "Exgentic Open Agent Leaderboard",
+        "source_type": "evaluation_run",
+        "source_organization_name": "Exgentic",
+        "source_organization_url": "https://github.com/Exgentic",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "exgentic",
+        "version": "0.1.0"
+      },
+      "benchmark": "tau-bench-2_telecom",
+      "evaluation_results": [
+        {
+          "evaluation_name": "tau-bench-2/telecom",
+          "source_data": {
+            "dataset_name": "tau-bench-2/telecom",
+            "source_type": "url",
+            "url": [
+              "https://github.com/Exgentic/exgentic"
+            ]
+          },
+          "metric_config": {
+            "evaluation_description": "Tau Bench 2 benchmark evaluation (telecom subset)",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.73,
+            "uncertainty": {
+              "num_samples": 100
+            },
+            "details": {
+              "average_agent_cost": "0.3",
+              "total_run_cost": "36.75",
+              "average_steps": "14.84",
+              "percent_finished": "1.0"
+            }
+          },
+          "generation_config": {
+            "generation_args": {
+              "agentic_eval_config": {
+                "additional_details": {
+                  "agent_name": "LiteLLM Tool Calling",
+                  "agent_framework": "tool_calling"
+                }
+              }
+            }
+          }
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": {
+        "generation_args": {
+          "agentic_eval_config": {
+            "additional_details": {
+              "agent_name": "LiteLLM Tool Calling",
+              "agent_framework": "tool_calling"
+            }
+          }
+        }
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/models/google_gemini-3-pro.json b/data/models/google_gemini-3-pro.json
index 459b730e51fcbb9b4826bc74bd506760f8199202..01e84795b0cee23dd2bc66c9e2c84da542f9aae9 100644
--- a/data/models/google_gemini-3-pro.json
+++ b/data/models/google_gemini-3-pro.json
@@ -4,13 +4,13 @@
     "id": "google/gemini-3-pro",
     "developer": "Google",
     "additional_details": {
-      "agent_name": "Droid",
-      "agent_organization": "Factory"
+      "agent_name": "SageAgent",
+      "agent_organization": "OpenSage"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/droid__gemini-3-pro/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/sageagent__gemini-3-pro/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-24",
+          "evaluation_timestamp": "2026-02-23",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,17 +43,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 61.1,
+            "score": 65.2,
             "uncertainty": {
               "standard_error": {
-                "value": 2.8
+                "value": 2.1
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Gemini 3 Pro\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"SageAgent\" -m \"Gemini 3 Pro\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +70,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Gemini 3 Pro\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"SageAgent\" -m \"Gemini 3 Pro\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -84,7 +84,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/sageagent__gemini-3-pro/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-3-pro/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +108,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-02-23",
+          "evaluation_timestamp": "2025-11-21",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,17 +117,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 65.2,
+            "score": 56.9,
             "uncertainty": {
               "standard_error": {
-                "value": 2.1
+                "value": 2.5
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"SageAgent\" -m \"Gemini 3 Pro\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Pro\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -144,7 +144,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"SageAgent\" -m \"Gemini 3 Pro\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Pro\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -232,7 +232,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/ante__gemini-3-pro/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/ii-agent__gemini-3-pro/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -256,7 +256,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-01-06",
+          "evaluation_timestamp": "2025-12-23",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -265,17 +265,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 69.4,
+            "score": 61.8,
             "uncertainty": {
               "standard_error": {
-                "value": 2.1
+                "value": 2.8
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Ante\" -m \"Gemini 3 Pro\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"II-Agent\" -m \"Gemini 3 Pro\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -292,7 +292,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Ante\" -m \"Gemini 3 Pro\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"II-Agent\" -m \"Gemini 3 Pro\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -306,7 +306,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/ii-agent__gemini-3-pro/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/ante__gemini-3-pro/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -330,7 +330,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-23",
+          "evaluation_timestamp": "2026-01-06",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -339,17 +339,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 61.8,
+            "score": 69.4,
             "uncertainty": {
               "standard_error": {
-                "value": 2.8
+                "value": 2.1
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"II-Agent\" -m \"Gemini 3 Pro\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Ante\" -m \"Gemini 3 Pro\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -366,7 +366,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"II-Agent\" -m \"Gemini 3 Pro\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Ante\" -m \"Gemini 3 Pro\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -380,7 +380,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-3-pro/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/droid__gemini-3-pro/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -404,7 +404,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-21",
+          "evaluation_timestamp": "2025-12-24",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -413,17 +413,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 56.9,
+            "score": 61.1,
             "uncertainty": {
               "standard_error": {
-                "value": 2.5
+                "value": 2.8
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Pro\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Gemini 3 Pro\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -440,7 +440,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Pro\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Gemini 3 Pro\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/google_gemini_3_flash.json b/data/models/google_gemini_3_flash.json
index 6beb736773081d366f65edf05d1bf54bd7e07ce2..3a911939d887cdc9433884fdde5ee11c97c277a4 100644
--- a/data/models/google_gemini_3_flash.json
+++ b/data/models/google_gemini_3_flash.json
@@ -6,6 +6,53 @@
     "inference_platform": "unknown"
   },
   "evaluations": [
+    {
+      "evaluation_id": "ace/google_gemini-3-flash/1773260200",
+      "retrieved_timestamp": "1773260200",
+      "source_metadata": {
+        "source_name": "Mercor ACE Leaderboard",
+        "source_type": "evaluation_run",
+        "source_organization_name": "Mercor",
+        "source_organization_url": "https://www.mercor.com",
+        "evaluator_relationship": "first_party"
+      },
+      "eval_library": {
+        "name": "archipelago",
+        "version": "1.0.0"
+      },
+      "benchmark": "ace",
+      "evaluation_results": [
+        {
+          "evaluation_name": "Gaming Score",
+          "source_data": {
+            "dataset_name": "ace",
+            "source_type": "hf_dataset",
+            "hf_repo": "Mercor/ACE"
+          },
+          "metric_config": {
+            "evaluation_description": "Gaming domain score.",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0,
+            "max_score": 1
+          },
+          "score_details": {
+            "score": 0.415
+          },
+          "generation_config": {
+            "additional_details": {
+              "run_setting": "High"
+            }
+          }
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": {
+        "additional_details": {
+          "run_setting": "High"
+        }
+      }
+    },
     {
       "evaluation_id": "apex-agents/google_gemini-3-flash/1773260200",
       "retrieved_timestamp": "1773260200",
@@ -205,53 +252,6 @@
         }
       }
     },
-    {
-      "evaluation_id": "ace/google_gemini-3-flash/1773260200",
-      "retrieved_timestamp": "1773260200",
-      "source_metadata": {
-        "source_name": "Mercor ACE Leaderboard",
-        "source_type": "evaluation_run",
-        "source_organization_name": "Mercor",
-        "source_organization_url": "https://www.mercor.com",
-        "evaluator_relationship": "first_party"
-      },
-      "eval_library": {
-        "name": "archipelago",
-        "version": "1.0.0"
-      },
-      "benchmark": "ace",
-      "evaluation_results": [
-        {
-          "evaluation_name": "Gaming Score",
-          "source_data": {
-            "dataset_name": "ace",
-            "source_type": "hf_dataset",
-            "hf_repo": "Mercor/ACE"
-          },
-          "metric_config": {
-            "evaluation_description": "Gaming domain score.",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0,
-            "max_score": 1
-          },
-          "score_details": {
-            "score": 0.415
-          },
-          "generation_config": {
-            "additional_details": {
-              "run_setting": "High"
-            }
-          }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": {
-        "additional_details": {
-          "run_setting": "High"
-        }
-      }
-    },
     {
       "evaluation_id": "apex-v1/google_gemini-3-flash/1773260200",
       "retrieved_timestamp": "1773260200",
diff --git a/data/models/google_gemini_3_pro.json b/data/models/google_gemini_3_pro.json
index 104ce8e17e340df7c6fe6dc4bc1f7866f9c1cc71..8f4f634348fdc10ff492053d2a6241085b80ddd7 100644
--- a/data/models/google_gemini_3_pro.json
+++ b/data/models/google_gemini_3_pro.json
@@ -6,6 +6,78 @@
     "inference_platform": "unknown"
   },
   "evaluations": [
+    {
+      "evaluation_id": "ace/google_gemini-3-pro/1773260200",
+      "retrieved_timestamp": "1773260200",
+      "source_metadata": {
+        "source_name": "Mercor ACE Leaderboard",
+        "source_type": "evaluation_run",
+        "source_organization_name": "Mercor",
+        "source_organization_url": "https://www.mercor.com",
+        "evaluator_relationship": "first_party"
+      },
+      "eval_library": {
+        "name": "archipelago",
+        "version": "1.0.0"
+      },
+      "benchmark": "ace",
+      "evaluation_results": [
+        {
+          "evaluation_name": "Overall Score",
+          "source_data": {
+            "dataset_name": "ace",
+            "source_type": "hf_dataset",
+            "hf_repo": "Mercor/ACE"
+          },
+          "metric_config": {
+            "evaluation_description": "Overall ACE score (paper snapshot, approximate).",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0,
+            "max_score": 1
+          },
+          "score_details": {
+            "score": 0.47
+          },
+          "generation_config": {
+            "additional_details": {
+              "run_setting": "High",
+              "value_quality": "approximate"
+            }
+          }
+        },
+        {
+          "evaluation_name": "Gaming Score",
+          "source_data": {
+            "dataset_name": "ace",
+            "source_type": "hf_dataset",
+            "hf_repo": "Mercor/ACE"
+          },
+          "metric_config": {
+            "evaluation_description": "Gaming domain score.",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0,
+            "max_score": 1
+          },
+          "score_details": {
+            "score": 0.509
+          },
+          "generation_config": {
+            "additional_details": {
+              "run_setting": "High"
+            }
+          }
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": {
+        "additional_details": {
+          "run_setting": "High",
+          "value_quality": "approximate"
+        }
+      }
+    },
     {
       "evaluation_id": "apex-agents/google_gemini-3-pro/1773260200",
       "retrieved_timestamp": "1773260200",
@@ -205,78 +277,6 @@
         }
       }
     },
-    {
-      "evaluation_id": "ace/google_gemini-3-pro/1773260200",
-      "retrieved_timestamp": "1773260200",
-      "source_metadata": {
-        "source_name": "Mercor ACE Leaderboard",
-        "source_type": "evaluation_run",
-        "source_organization_name": "Mercor",
-        "source_organization_url": "https://www.mercor.com",
-        "evaluator_relationship": "first_party"
-      },
-      "eval_library": {
-        "name": "archipelago",
-        "version": "1.0.0"
-      },
-      "benchmark": "ace",
-      "evaluation_results": [
-        {
-          "evaluation_name": "Overall Score",
-          "source_data": {
-            "dataset_name": "ace",
-            "source_type": "hf_dataset",
-            "hf_repo": "Mercor/ACE"
-          },
-          "metric_config": {
-            "evaluation_description": "Overall ACE score (paper snapshot, approximate).",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0,
-            "max_score": 1
-          },
-          "score_details": {
-            "score": 0.47
-          },
-          "generation_config": {
-            "additional_details": {
-              "run_setting": "High",
-              "value_quality": "approximate"
-            }
-          }
-        },
-        {
-          "evaluation_name": "Gaming Score",
-          "source_data": {
-            "dataset_name": "ace",
-            "source_type": "hf_dataset",
-            "hf_repo": "Mercor/ACE"
-          },
-          "metric_config": {
-            "evaluation_description": "Gaming domain score.",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0,
-            "max_score": 1
-          },
-          "score_details": {
-            "score": 0.509
-          },
-          "generation_config": {
-            "additional_details": {
-              "run_setting": "High"
-            }
-          }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": {
-        "additional_details": {
-          "run_setting": "High",
-          "value_quality": "approximate"
-        }
-      }
-    },
     {
       "evaluation_id": "apex-v1/google_gemini-3-pro/1773260200",
       "retrieved_timestamp": "1773260200",
diff --git a/data/models/google_gemma-2-2b-jpn-it.json b/data/models/google_gemma-2-2b-jpn-it.json
index ad8eb46cb13cde4ab2a8a3521d4c0b618fb71ee6..208075f00df888593e36a118e84f64272df933ff 100644
--- a/data/models/google_gemma-2-2b-jpn-it.json
+++ b/data/models/google_gemma-2-2b-jpn-it.json
@@ -5,7 +5,7 @@
     "developer": "Google",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "bfloat16",
+      "precision": "float16",
       "architecture": "Gemma2ForCausalLM",
       "params_billions": "2.614"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5288
+            "score": 0.5078
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4178
+            "score": 0.4226
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0476
+            "score": 0.0347
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2752
+            "score": 0.2852
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3728
+            "score": 0.3964
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2467
+            "score": 0.2578
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5078
+            "score": 0.5288
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4226
+            "score": 0.4178
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0347
+            "score": 0.0476
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2852
+            "score": 0.2752
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3964
+            "score": 0.3728
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2578
+            "score": 0.2467
           }
         }
       ],
diff --git a/data/models/google_gemma-2-2b.json b/data/models/google_gemma-2-2b.json
index 853dbf9408f0d97b5b090784ef420b0479e50847..3875c2305ff98601b733940bab48bde956384743 100644
--- a/data/models/google_gemma-2-2b.json
+++ b/data/models/google_gemma-2-2b.json
@@ -5,7 +5,7 @@
     "developer": "Google",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "bfloat16",
+      "precision": "float16",
       "architecture": "InternLM2ForCausalLM",
       "params_billions": "2.614"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1993
+            "score": 0.2018
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3656
+            "score": 0.3709
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0287
+            "score": 0.0302
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4232
+            "score": 0.4219
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.218
+            "score": 0.2217
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2018
+            "score": 0.1993
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3709
+            "score": 0.3656
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0302
+            "score": 0.0287
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4219
+            "score": 0.4232
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2217
+            "score": 0.218
           }
         }
       ],
diff --git a/data/models/google_gemma-3-27b-it.json b/data/models/google_gemma-3-27b-it.json
index 31e90c4548397bec1dec70a558a4830dc0c4f7c9..0d22aa7a55f613493f23d93430a44590b7aa715d 100644
--- a/data/models/google_gemma-3-27b-it.json
+++ b/data/models/google_gemma-3-27b-it.json
@@ -10,8 +10,8 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "global-mmlu-lite/google_gemma-3-27b-it/1773936496.366405",
-      "retrieved_timestamp": "1773936496.366405",
+      "evaluation_id": "global-mmlu-lite/google_gemma-3-27b-it/1773936583.743359",
+      "retrieved_timestamp": "1773936583.743359",
       "source_metadata": {
         "source_name": "Global MMLU Lite Leaderboard",
         "source_type": "documentation",
@@ -525,8 +525,8 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "global-mmlu-lite/google_gemma-3-27b-it/1773936583.743359",
-      "retrieved_timestamp": "1773936583.743359",
+      "evaluation_id": "global-mmlu-lite/google_gemma-3-27b-it/1773936496.366405",
+      "retrieved_timestamp": "1773936496.366405",
       "source_metadata": {
         "source_name": "Global MMLU Lite Leaderboard",
         "source_type": "documentation",
diff --git a/data/models/huggingfacetb_smollm2-135m-instruct.json b/data/models/huggingfacetb_smollm2-135m-instruct.json
index 4930fd119b327af5dcaff7489655e3b5a735f796..7413ccc13689271f5c637f0314f401eea3acbcec 100644
--- a/data/models/huggingfacetb_smollm2-135m-instruct.json
+++ b/data/models/huggingfacetb_smollm2-135m-instruct.json
@@ -5,7 +5,7 @@
     "developer": "HuggingFaceTB",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "bfloat16",
+      "precision": "float16",
       "architecture": "LlamaForCausalLM",
       "params_billions": "0.135"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2883
+            "score": 0.0593
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3124
+            "score": 0.3135
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.003
+            "score": 0.0144
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2357
+            "score": 0.2341
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3662
+            "score": 0.3871
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1115
+            "score": 0.1092
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0593
+            "score": 0.2883
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3135
+            "score": 0.3124
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0144
+            "score": 0.003
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2341
+            "score": 0.2357
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3871
+            "score": 0.3662
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1092
+            "score": 0.1115
           }
         }
       ],
diff --git a/data/models/infly_inf-orm-llama3.1-70b.json b/data/models/infly_inf-orm-llama3.1-70b.json
index 82e76ad6cd43b3a105b801967d5f616a3924844a..e7947ee940015eb0652da9a52891a9ab47739595 100644
--- a/data/models/infly_inf-orm-llama3.1-70b.json
+++ b/data/models/infly_inf-orm-llama3.1-70b.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench/infly_INF-ORM-Llama3.1-70B/1766412838.146816",
+      "evaluation_id": "reward-bench-2/infly_INF-ORM-Llama3.1-70B/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,128 +31,104 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9511
+            "score": 0.7648
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9665
+            "score": 0.7411
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9101
+            "score": 0.4188
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Math",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Math score - measures mathematical reasoning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9365
+            "score": 0.6995
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9912
+            "score": 0.9644
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": null
-    },
-    {
-      "evaluation_id": "reward-bench-2/infly_INF-ORM-Llama3.1-70B/1766412838.146816",
-      "retrieved_timestamp": "1766412838.146816",
-      "source_metadata": {
-        "source_name": "RewardBench 2",
-        "source_type": "documentation",
-        "source_organization_name": "Allen Institute for AI",
-        "source_organization_url": "https://allenai.org",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "rewardbench",
-        "version": "0.1.3",
-        "additional_details": {
-          "subsets": "Chat, Chat Hard, Safety, Reasoning",
-          "hf_space": "allenai/reward-bench"
-        }
-      },
-      "benchmark": "reward-bench",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Score",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7648
+            "score": 0.903
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
@@ -161,111 +137,135 @@
           }
         },
         {
-          "evaluation_name": "Factuality",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7411
+            "score": 0.8622
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
             "hf_repo": "allenai/reward-bench-2-results"
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": null
+    },
+    {
+      "evaluation_id": "reward-bench/infly_INF-ORM-Llama3.1-70B/1766412838.146816",
+      "retrieved_timestamp": "1766412838.146816",
+      "source_metadata": {
+        "source_name": "RewardBench",
+        "source_type": "documentation",
+        "source_organization_name": "Allen Institute for AI",
+        "source_organization_url": "https://allenai.org",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "rewardbench",
+        "version": "0.1.3",
+        "additional_details": {
+          "subsets": "Chat, Chat Hard, Safety, Reasoning",
+          "hf_space": "allenai/reward-bench"
+        }
+      },
+      "benchmark": "reward-bench",
+      "evaluation_results": [
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4188
+            "score": 0.9511
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6995
+            "score": 0.9665
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9644
+            "score": 0.9101
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.903
+            "score": 0.9365
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8622
+            "score": 0.9912
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
diff --git a/data/models/internlm_internlm2-7b-reward.json b/data/models/internlm_internlm2-7b-reward.json
index 850f56827380f314c355f2c532b41bbbb69062e8..5907aad77cdf2d42a7b25e1a1a35520112be4497 100644
--- a/data/models/internlm_internlm2-7b-reward.json
+++ b/data/models/internlm_internlm2-7b-reward.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench-2/internlm_internlm2-7b-reward/1766412838.146816",
+      "evaluation_id": "reward-bench/internlm_internlm2-7b-reward/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,104 +31,128 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5335
+            "score": 0.8759
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Factuality",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4211
+            "score": 0.9916
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4
+            "score": 0.6952
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5628
+            "score": 0.8716
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5956
+            "score": 0.9453
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": null
+    },
+    {
+      "evaluation_id": "reward-bench-2/internlm_internlm2-7b-reward/1766412838.146816",
+      "retrieved_timestamp": "1766412838.146816",
+      "source_metadata": {
+        "source_name": "RewardBench 2",
+        "source_type": "documentation",
+        "source_organization_name": "Allen Institute for AI",
+        "source_organization_url": "https://allenai.org",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "rewardbench",
+        "version": "0.1.3",
+        "additional_details": {
+          "subsets": "Chat, Chat Hard, Safety, Reasoning",
+          "hf_space": "allenai/reward-bench"
+        }
+      },
+      "benchmark": "reward-bench",
+      "evaluation_results": [
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7051
+            "score": 0.5335
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
@@ -137,135 +161,111 @@
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5164
+            "score": 0.4211
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
             "hf_repo": "allenai/reward-bench-2-results"
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": null
-    },
-    {
-      "evaluation_id": "reward-bench/internlm_internlm2-7b-reward/1766412838.146816",
-      "retrieved_timestamp": "1766412838.146816",
-      "source_metadata": {
-        "source_name": "RewardBench",
-        "source_type": "documentation",
-        "source_organization_name": "Allen Institute for AI",
-        "source_organization_url": "https://allenai.org",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "rewardbench",
-        "version": "0.1.3",
-        "additional_details": {
-          "subsets": "Chat, Chat Hard, Safety, Reasoning",
-          "hf_space": "allenai/reward-bench"
-        }
-      },
-      "benchmark": "reward-bench",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Score",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8759
+            "score": 0.4
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Math",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Math score - measures mathematical reasoning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9916
+            "score": 0.5628
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6952
+            "score": 0.5956
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8716
+            "score": 0.7051
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9453
+            "score": 0.5164
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
diff --git a/data/models/jaspionjader_kosmos-evaa-fusion-8b.json b/data/models/jaspionjader_kosmos-evaa-fusion-8b.json
index 88cf241ec28671fc887627a1e7bb1af09c15f9cf..1160e36435338576bea29bef2eb39f295798e22d 100644
--- a/data/models/jaspionjader_kosmos-evaa-fusion-8b.json
+++ b/data/models/jaspionjader_kosmos-evaa-fusion-8b.json
@@ -5,7 +5,7 @@
     "developer": "jaspionjader",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "LlamaForCausalLM",
       "params_billions": "8.03"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4418
+            "score": 0.4345
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5406
+            "score": 0.5419
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1352
+            "score": 0.1292
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3062
+            "score": 0.3087
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.386
+            "score": 0.3854
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4345
+            "score": 0.4418
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5419
+            "score": 0.5406
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1292
+            "score": 0.1352
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3087
+            "score": 0.3062
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3854
+            "score": 0.386
           }
         }
       ],
diff --git a/data/models/leroydyer_spydazweb_ai_humanai_012_instruct_xa.json b/data/models/leroydyer_spydazweb_ai_humanai_012_instruct_xa.json
index 199a2574ee197c6be6eb6b38849e4c63ab43b085..90656456ac84eb015314278cafdaa288005eb1ba 100644
--- a/data/models/leroydyer_spydazweb_ai_humanai_012_instruct_xa.json
+++ b/data/models/leroydyer_spydazweb_ai_humanai_012_instruct_xa.json
@@ -5,7 +5,7 @@
     "developer": "LeroyDyer",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "MistralForCausalLM",
       "params_billions": "7.242"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3798
+            "score": 0.3579
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4483
+            "score": 0.4477
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.04
+            "score": 0.0423
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3129
+            "score": 0.3096
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4148
+            "score": 0.4134
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2389
+            "score": 0.2376
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3579
+            "score": 0.3798
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4477
+            "score": 0.4483
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0423
+            "score": 0.04
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3096
+            "score": 0.3129
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4134
+            "score": 0.4148
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2376
+            "score": 0.2389
           }
         }
       ],
diff --git a/data/models/llmat_mistral-v0.3-7b-orpo.json b/data/models/llmat_mistral-v0.3-7b-orpo.json
index c2c9120f6bd010e2ff566effb35617377de554ee..3a1b947d84c5d76bb2423237a20cbf150415592e 100644
--- a/data/models/llmat_mistral-v0.3-7b-orpo.json
+++ b/data/models/llmat_mistral-v0.3-7b-orpo.json
@@ -5,7 +5,7 @@
     "developer": "llmat",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "bfloat16",
+      "precision": "float16",
       "architecture": "MistralForCausalLM",
       "params_billions": "7.248"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.377
+            "score": 0.364
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3978
+            "score": 0.4005
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0242
+            "score": 0.0015
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2668
+            "score": 0.2693
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3555
+            "score": 0.3529
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2278
+            "score": 0.2301
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.364
+            "score": 0.377
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4005
+            "score": 0.3978
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0015
+            "score": 0.0242
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2693
+            "score": 0.2668
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3529
+            "score": 0.3555
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2301
+            "score": 0.2278
           }
         }
       ],
diff --git a/data/models/lxzgordon_urm-llama-3.1-8b.json b/data/models/lxzgordon_urm-llama-3.1-8b.json
index 7f03035f2bd809fb14be27130e313f5b86a26f9a..2ce56c90ce0d5fb5788660f5b3f1f1e179701786 100644
--- a/data/models/lxzgordon_urm-llama-3.1-8b.json
+++ b/data/models/lxzgordon_urm-llama-3.1-8b.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench-2/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816",
+      "evaluation_id": "reward-bench/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,104 +31,128 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7394
+            "score": 0.9294
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Factuality",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6884
+            "score": 0.9553
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.45
+            "score": 0.8816
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6393
+            "score": 0.9108
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9178
+            "score": 0.9698
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": null
+    },
+    {
+      "evaluation_id": "reward-bench-2/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816",
+      "retrieved_timestamp": "1766412838.146816",
+      "source_metadata": {
+        "source_name": "RewardBench 2",
+        "source_type": "documentation",
+        "source_organization_name": "Allen Institute for AI",
+        "source_organization_url": "https://allenai.org",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "rewardbench",
+        "version": "0.1.3",
+        "additional_details": {
+          "subsets": "Chat, Chat Hard, Safety, Reasoning",
+          "hf_space": "allenai/reward-bench"
+        }
+      },
+      "benchmark": "reward-bench",
+      "evaluation_results": [
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9758
+            "score": 0.7394
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
@@ -137,135 +161,111 @@
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7653
+            "score": 0.6884
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
             "hf_repo": "allenai/reward-bench-2-results"
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": null
-    },
-    {
-      "evaluation_id": "reward-bench/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816",
-      "retrieved_timestamp": "1766412838.146816",
-      "source_metadata": {
-        "source_name": "RewardBench",
-        "source_type": "documentation",
-        "source_organization_name": "Allen Institute for AI",
-        "source_organization_url": "https://allenai.org",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "rewardbench",
-        "version": "0.1.3",
-        "additional_details": {
-          "subsets": "Chat, Chat Hard, Safety, Reasoning",
-          "hf_space": "allenai/reward-bench"
-        }
-      },
-      "benchmark": "reward-bench",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Score",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9294
+            "score": 0.45
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Math",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Math score - measures mathematical reasoning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9553
+            "score": 0.6393
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8816
+            "score": 0.9178
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9108
+            "score": 0.9758
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9698
+            "score": 0.7653
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
diff --git a/data/models/meta_llama-3.1-8b-instruct-turbo.json b/data/models/meta_llama-3.1-8b-instruct-turbo.json
index db79a2433c0e2503859f110928ebb7ec1a96138c..511e540608c036dc4147fb2f225109769aa099a1 100644
--- a/data/models/meta_llama-3.1-8b-instruct-turbo.json
+++ b/data/models/meta_llama-3.1-8b-instruct-turbo.json
@@ -231,10 +231,10 @@
       }
     },
     {
-      "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1774096312.00548",
-      "retrieved_timestamp": "1774096312.00548",
+      "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1774096306.427425",
+      "retrieved_timestamp": "1774096306.427425",
       "source_metadata": {
-        "source_name": "helm_mmlu",
+        "source_name": "helm_lite",
         "source_type": "documentation",
         "source_organization_name": "crfm",
         "evaluator_relationship": "third_party"
@@ -243,438 +243,382 @@
         "name": "helm",
         "version": "unknown"
       },
-      "benchmark": "helm_mmlu",
+      "benchmark": "helm_lite",
       "evaluation_results": [
         {
-          "evaluation_name": "MMLU All Subjects",
+          "evaluation_name": "Mean win rate",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "helm_lite",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MMLU All Subjects",
+            "evaluation_description": "How many models this model outperforms on average (over columns).",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.561,
+            "score": 0.303,
             "details": {
-              "description": "min=0.26, mean=0.561, max=0.865, sum=63.912 (114)",
+              "description": "",
               "tab": "Accuracy",
-              "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.202, mean=0.56, max=1.485, sum=63.854 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.5601251981506405\"}",
-              "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}",
-              "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)\", \"tab\": \"General information\", \"score\": \"614.6193817308517\"}",
-              "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.5896504369538077\"}",
+              "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Abstract Algebra",
+          "evaluation_name": "NarrativeQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "NarrativeQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Abstract Algebra",
+            "evaluation_description": "F1 on NarrativeQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.26,
+            "score": 0.756,
             "details": {
-              "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)",
+              "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)",
               "tab": "Accuracy",
-              "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.284, mean=0.284, max=0.284, sum=0.568 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.28381933450698854\"}",
-              "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Abstract Algebra - # prompt tokens": "{\"description\": \"min=373.43, mean=373.43, max=373.43, sum=746.86 (2)\", \"tab\": \"General information\", \"score\": \"373.43\"}",
-              "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.581, mean=0.581, max=0.581, sum=0.581 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5813529316808136\"}",
+              "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}",
+              "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NarrativeQA - # prompt tokens": "{\"description\": \"min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)\", \"tab\": \"General information\", \"score\": \"3484.2676056338028\"}",
+              "NarrativeQA - # output tokens": "{\"description\": \"min=7.287, mean=7.287, max=7.287, sum=7.287 (1)\", \"tab\": \"General information\", \"score\": \"7.2873239436619714\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "\"abstract_algebra\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_abstract_algebra\""
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Anatomy",
+          "evaluation_name": "NaturalQuestions (closed-book)",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "NaturalQuestions (closed-book)",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Anatomy",
+            "evaluation_description": "F1 on NaturalQuestions (closed-book)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.459,
+            "score": 0.209,
             "details": {
-              "description": "min=0.459, mean=0.459, max=0.459, sum=0.919 (2)",
+              "description": "min=0.209, mean=0.209, max=0.209, sum=0.209 (1)",
               "tab": "Accuracy",
-              "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.323, mean=0.323, max=0.323, sum=0.646 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3231998196354619\"}",
-              "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}",
-              "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Anatomy - # prompt tokens": "{\"description\": \"min=353.874, mean=353.874, max=353.874, sum=707.748 (2)\", \"tab\": \"General information\", \"score\": \"353.8740740740741\"}",
-              "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.544, mean=0.544, max=0.544, sum=0.544 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5441543731689453\"}",
+              "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.752, mean=0.752, max=0.752, sum=0.752 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.751717613697052\"}",
+              "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "NaturalQuestions (open-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1716.78, mean=1716.78, max=1716.78, sum=1716.78 (1)\", \"tab\": \"General information\", \"score\": \"1716.78\"}",
+              "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=8.736, mean=8.736, max=8.736, sum=8.736 (1)\", \"tab\": \"General information\", \"score\": \"8.736\"}",
+              "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=129.12, mean=129.12, max=129.12, sum=129.12 (1)\", \"tab\": \"General information\", \"score\": \"129.12\"}",
+              "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=11.732, mean=11.732, max=11.732, sum=11.732 (1)\", \"tab\": \"General information\", \"score\": \"11.732\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"anatomy\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_anatomy\""
+              "mode": "\"closedbook\""
             }
           }
         },
         {
-          "evaluation_name": "College Physics",
+          "evaluation_name": "OpenbookQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "OpenbookQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on College Physics",
+            "evaluation_description": "EM on OpenbookQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.363,
+            "score": 0.74,
             "details": {
-              "description": "min=0.363, mean=0.363, max=0.363, sum=0.725 (2)",
+              "description": "min=0.74, mean=0.74, max=0.74, sum=0.74 (1)",
               "tab": "Accuracy",
-              "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.431, mean=0.431, max=0.431, sum=0.862 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43078258752822873\"}",
-              "College Biology - Observed inference time (s)": "{\"description\": \"min=0.426, mean=0.426, max=0.426, sum=0.853 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42637243535783553\"}",
-              "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.562, mean=0.562, max=0.562, sum=1.125 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5623248195648194\"}",
-              "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.371, mean=0.371, max=0.371, sum=0.742 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3709776735305786\"}",
-              "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.395, mean=0.395, max=0.395, sum=0.79 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3948341918129452\"}",
-              "College Physics - Observed inference time (s)": "{\"description\": \"min=0.395, mean=0.395, max=0.395, sum=0.789 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39474552051693784\"}",
-              "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Chemistry - # prompt tokens": "{\"description\": \"min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)\", \"tab\": \"General information\", \"score\": \"549.28\"}",
-              "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}",
-              "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Biology - # prompt tokens": "{\"description\": \"min=473.875, mean=473.875, max=473.875, sum=947.75 (2)\", \"tab\": \"General information\", \"score\": \"473.875\"}",
-              "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Computer Science - # prompt tokens": "{\"description\": \"min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)\", \"tab\": \"General information\", \"score\": \"828.29\"}",
-              "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Mathematics - # prompt tokens": "{\"description\": \"min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)\", \"tab\": \"General information\", \"score\": \"594.51\"}",
-              "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}",
-              "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Medicine - # prompt tokens": "{\"description\": \"min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)\", \"tab\": \"General information\", \"score\": \"502.70520231213874\"}",
-              "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}",
-              "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Physics - # prompt tokens": "{\"description\": \"min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)\", \"tab\": \"General information\", \"score\": \"503.5686274509804\"}",
-              "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=2.937, mean=2.937, max=2.937, sum=2.937 (1)\", \"tab\": \"Efficiency\", \"score\": \"2.9374450149536133\"}",
+              "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}",
+              "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "OpenbookQA - # prompt tokens": "{\"description\": \"min=249.776, mean=249.776, max=249.776, sum=249.776 (1)\", \"tab\": \"General information\", \"score\": \"249.776\"}",
+              "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"college_physics\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_college_physics\""
+              "dataset": "\"openbookqa\"",
+              "method": "\"multiple_choice_joint\""
             }
           }
         },
         {
-          "evaluation_name": "Computer Security",
+          "evaluation_name": "MMLU",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MMLU",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Computer Security",
+            "evaluation_description": "EM on MMLU",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.71,
+            "score": 0.5,
             "details": {
-              "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)",
+              "description": "min=0.26, mean=0.5, max=0.79, sum=2.501 (5)",
               "tab": "Accuracy",
-              "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.434, mean=0.434, max=0.434, sum=0.867 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43369229555130007\"}",
-              "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Computer Security - # prompt tokens": "{\"description\": \"min=378.51, mean=378.51, max=378.51, sum=757.02 (2)\", \"tab\": \"General information\", \"score\": \"378.51\"}",
-              "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MMLU - Observed inference time (s)": "{\"description\": \"min=0.284, mean=0.417, max=0.567, sum=2.086 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.41729471965421716\"}",
+              "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}",
+              "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MMLU - # prompt tokens": "{\"description\": \"min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)\", \"tab\": \"General information\", \"score\": \"467.6862105263158\"}",
+              "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"computer_security\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_computer_security\""
+              "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]",
+              "method": "\"multiple_choice_joint\""
             }
           }
         },
         {
-          "evaluation_name": "Econometrics",
+          "evaluation_name": "MATH",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MATH",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Econometrics",
+            "evaluation_description": "Equivalent (CoT) on MATH",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.351,
+            "score": 0.703,
             "details": {
-              "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)",
+              "description": "min=0.509, mean=0.703, max=0.849, sum=4.92 (7)",
               "tab": "Accuracy",
-              "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.371, mean=0.371, max=0.371, sum=0.742 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3707838414008157\"}",
-              "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}",
-              "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Econometrics - # prompt tokens": "{\"description\": \"min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)\", \"tab\": \"General information\", \"score\": \"614.421052631579\"}",
-              "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MATH - Observed inference time (s)": "{\"description\": \"min=1.617, mean=1.927, max=2.175, sum=13.492 (7)\", \"tab\": \"Efficiency\", \"score\": \"1.9274194573191807\"}",
+              "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}",
+              "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}",
+              "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MATH - # prompt tokens": "{\"description\": \"min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)\", \"tab\": \"General information\", \"score\": \"1262.9092130545007\"}",
+              "MATH - # output tokens": "{\"description\": \"min=203.384, mean=253.982, max=288.596, sum=1777.872 (7)\", \"tab\": \"General information\", \"score\": \"253.98170179473732\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"econometrics\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_econometrics\""
+              "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]",
+              "level": "\"1\"",
+              "use_official_examples": "\"False\"",
+              "use_chain_of_thought": "\"True\""
             }
           }
         },
         {
-          "evaluation_name": "Global Facts",
+          "evaluation_name": "GSM8K",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "GSM8K",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Global Facts",
+            "evaluation_description": "EM on GSM8K",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.26,
+            "score": 0.798,
             "details": {
-              "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)",
+              "description": "min=0.798, mean=0.798, max=0.798, sum=0.798 (1)",
               "tab": "Accuracy",
-              "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.202, mean=0.202, max=0.202, sum=0.403 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2015515398979187\"}",
-              "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Global Facts - # prompt tokens": "{\"description\": \"min=399.71, mean=399.71, max=399.71, sum=799.42 (2)\", \"tab\": \"General information\", \"score\": \"399.71\"}",
-              "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "GSM8K - Observed inference time (s)": "{\"description\": \"min=2.109, mean=2.109, max=2.109, sum=2.109 (1)\", \"tab\": \"Efficiency\", \"score\": \"2.108796592712402\"}",
+              "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "GSM8K - # prompt tokens": "{\"description\": \"min=959.032, mean=959.032, max=959.032, sum=959.032 (1)\", \"tab\": \"General information\", \"score\": \"959.032\"}",
+              "GSM8K - # output tokens": "{\"description\": \"min=150.02, mean=150.02, max=150.02, sum=150.02 (1)\", \"tab\": \"General information\", \"score\": \"150.02\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"global_facts\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_global_facts\""
+              "stop": "\"none\""
             }
           }
         },
         {
-          "evaluation_name": "Jurisprudence",
+          "evaluation_name": "LegalBench",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "LegalBench",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Jurisprudence",
+            "evaluation_description": "EM on LegalBench",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.731,
+            "score": 0.342,
             "details": {
-              "description": "min=0.731, mean=0.731, max=0.731, sum=1.463 (2)",
+              "description": "min=0, mean=0.342, max=0.8, sum=1.71 (5)",
               "tab": "Accuracy",
-              "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=1.035, mean=1.035, max=1.035, sum=2.07 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.0347525963076838\"}",
-              "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}",
-              "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Jurisprudence - # prompt tokens": "{\"description\": \"min=394.63, mean=394.63, max=394.63, sum=789.259 (2)\", \"tab\": \"General information\", \"score\": \"394.6296296296296\"}",
-              "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.409, mean=0.481, max=0.626, sum=2.407 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.4814103188942614\"}",
+              "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}",
+              "LegalBench - # train": "{\"description\": \"min=4, mean=4.8, max=5, sum=24 (5)\", \"tab\": \"General information\", \"score\": \"4.8\"}",
+              "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "LegalBench - # prompt tokens": "{\"description\": \"min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)\", \"tab\": \"General information\", \"score\": \"1513.8824197238912\"}",
+              "LegalBench - # output tokens": "{\"description\": \"min=2.032, mean=6.824, max=10.886, sum=34.118 (5)\", \"tab\": \"General information\", \"score\": \"6.823557876005701\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"jurisprudence\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_jurisprudence\""
+              "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]"
             }
           }
         },
         {
-          "evaluation_name": "Philosophy",
+          "evaluation_name": "MedQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MedQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Philosophy",
+            "evaluation_description": "EM on MedQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.64,
+            "score": 0.245,
             "details": {
-              "description": "min=0.64, mean=0.64, max=0.64, sum=1.28 (2)",
+              "description": "min=0.245, mean=0.245, max=0.245, sum=0.245 (1)",
               "tab": "Accuracy",
-              "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.681, mean=0.681, max=0.681, sum=1.363 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6814629341628391\"}",
-              "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}",
-              "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Philosophy - # prompt tokens": "{\"description\": \"min=329.084, mean=329.084, max=329.084, sum=658.167 (2)\", \"tab\": \"General information\", \"score\": \"329.08360128617363\"}",
-              "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MedQA - Observed inference time (s)": "{\"description\": \"min=0.743, mean=0.743, max=0.743, sum=0.743 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.742541556803891\"}",
+              "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}",
+              "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MedQA - # prompt tokens": "{\"description\": \"min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)\", \"tab\": \"General information\", \"score\": \"1025.2743538767395\"}",
+              "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "\"philosophy\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_philosophy\""
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Professional Psychology",
+          "evaluation_name": "WMT 2014",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "WMT 2014",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Professional Psychology",
+            "evaluation_description": "BLEU-4 on WMT 2014",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.649,
+            "score": 0.181,
             "details": {
-              "description": "min=0.649, mean=0.649, max=0.649, sum=1.297 (2)",
+              "description": "min=0.132, mean=0.181, max=0.219, sum=0.907 (5)",
               "tab": "Accuracy",
-              "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.546, mean=0.546, max=0.546, sum=1.091 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5456299475010704\"}",
-              "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.538, mean=0.538, max=0.538, sum=1.077 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5383730044601657\"}",
-              "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.881, mean=0.881, max=0.881, sum=1.762 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8808572895368355\"}",
-              "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.694, mean=0.694, max=0.694, sum=1.388 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6941978611977272\"}",
-              "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}",
-              "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Medicine - # prompt tokens": "{\"description\": \"min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)\", \"tab\": \"General information\", \"score\": \"1094.4889705882354\"}",
-              "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}",
-              "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Accounting - # prompt tokens": "{\"description\": \"min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)\", \"tab\": \"General information\", \"score\": \"658.5851063829788\"}",
-              "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}",
-              "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Law - # prompt tokens": "{\"description\": \"min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)\", \"tab\": \"General information\", \"score\": \"1637.6010430247718\"}",
-              "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}",
-              "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Psychology - # prompt tokens": "{\"description\": \"min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)\", \"tab\": \"General information\", \"score\": \"575.0980392156863\"}",
-              "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=0.439, mean=0.565, max=0.727, sum=2.826 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.5651802479746801\"}",
+              "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}",
+              "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "WMT 2014 - # prompt tokens": "{\"description\": \"min=101.139, mean=120.712, max=141.117, sum=603.559 (5)\", \"tab\": \"General information\", \"score\": \"120.71178123566294\"}",
+              "WMT 2014 - # output tokens": "{\"description\": \"min=24.354, mean=25.779, max=26.833, sum=128.893 (5)\", \"tab\": \"General information\", \"score\": \"25.778561802263347\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"professional_psychology\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_professional_psychology\""
+              "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]"
             }
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1774096312.00548",
+      "retrieved_timestamp": "1774096312.00548",
+      "source_metadata": {
+        "source_name": "helm_mmlu",
+        "source_type": "documentation",
+        "source_organization_name": "crfm",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "helm",
+        "version": "unknown"
+      },
+      "benchmark": "helm_mmlu",
+      "evaluation_results": [
         {
-          "evaluation_name": "Us Foreign Policy",
+          "evaluation_name": "MMLU All Subjects",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -683,36 +627,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Us Foreign Policy",
+            "evaluation_description": "EM on MMLU All Subjects",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.79,
+            "score": 0.561,
             "details": {
-              "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)",
+              "description": "min=0.26, mean=0.561, max=0.865, sum=63.912 (114)",
               "tab": "Accuracy",
-              "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.567, mean=0.567, max=0.567, sum=1.135 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5673955392837524\"}",
-              "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=422.79, mean=422.79, max=422.79, sum=845.58 (2)\", \"tab\": \"General information\", \"score\": \"422.79\"}",
-              "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.202, mean=0.56, max=1.485, sum=63.854 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.5601251981506405\"}",
+              "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}",
+              "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)\", \"tab\": \"General information\", \"score\": \"614.6193817308517\"}",
+              "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"us_foreign_policy\"",
+              "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_us_foreign_policy\""
+              "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
             }
           }
         },
         {
-          "evaluation_name": "Astronomy",
+          "evaluation_name": "Abstract Algebra",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -721,36 +665,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Astronomy",
+            "evaluation_description": "EM on Abstract Algebra",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.645,
+            "score": 0.26,
             "details": {
-              "description": "min=0.645, mean=0.645, max=0.645, sum=1.289 (2)",
+              "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)",
               "tab": "Accuracy",
-              "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.317, mean=0.317, max=0.317, sum=0.634 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3168644199245854\"}",
-              "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}",
-              "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Astronomy - # prompt tokens": "{\"description\": \"min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)\", \"tab\": \"General information\", \"score\": \"579.6842105263158\"}",
-              "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.284, mean=0.284, max=0.284, sum=0.568 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.28381933450698854\"}",
+              "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Abstract Algebra - # prompt tokens": "{\"description\": \"min=373.43, mean=373.43, max=373.43, sum=746.86 (2)\", \"tab\": \"General information\", \"score\": \"373.43\"}",
+              "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"astronomy\"",
+              "subject": "\"abstract_algebra\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_astronomy\""
+              "groups": "\"mmlu_abstract_algebra\""
             }
           }
         },
         {
-          "evaluation_name": "Business Ethics",
+          "evaluation_name": "Anatomy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -759,36 +703,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Business Ethics",
+            "evaluation_description": "EM on Anatomy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.65,
+            "score": 0.459,
             "details": {
-              "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)",
+              "description": "min=0.459, mean=0.459, max=0.459, sum=0.919 (2)",
               "tab": "Accuracy",
-              "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.444, mean=0.444, max=0.444, sum=0.888 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44396358251571655\"}",
-              "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Business Ethics - # prompt tokens": "{\"description\": \"min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)\", \"tab\": \"General information\", \"score\": \"569.52\"}",
-              "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.323, mean=0.323, max=0.323, sum=0.646 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3231998196354619\"}",
+              "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}",
+              "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Anatomy - # prompt tokens": "{\"description\": \"min=353.874, mean=353.874, max=353.874, sum=707.748 (2)\", \"tab\": \"General information\", \"score\": \"353.8740740740741\"}",
+              "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"business_ethics\"",
+              "subject": "\"anatomy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_business_ethics\""
+              "groups": "\"mmlu_anatomy\""
             }
           }
         },
         {
-          "evaluation_name": "Clinical Knowledge",
+          "evaluation_name": "College Physics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -797,36 +741,66 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Clinical Knowledge",
+            "evaluation_description": "EM on College Physics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.615,
+            "score": 0.363,
             "details": {
-              "description": "min=0.615, mean=0.615, max=0.615, sum=1.23 (2)",
+              "description": "min=0.363, mean=0.363, max=0.363, sum=0.725 (2)",
               "tab": "Accuracy",
-              "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.369, mean=0.369, max=0.369, sum=0.738 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3692442273193935\"}",
-              "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}",
-              "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=397.928, mean=397.928, max=397.928, sum=795.857 (2)\", \"tab\": \"General information\", \"score\": \"397.92830188679244\"}",
-              "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.431, mean=0.431, max=0.431, sum=0.862 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43078258752822873\"}",
+              "College Biology - Observed inference time (s)": "{\"description\": \"min=0.426, mean=0.426, max=0.426, sum=0.853 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42637243535783553\"}",
+              "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.562, mean=0.562, max=0.562, sum=1.125 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5623248195648194\"}",
+              "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.371, mean=0.371, max=0.371, sum=0.742 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3709776735305786\"}",
+              "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.395, mean=0.395, max=0.395, sum=0.79 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3948341918129452\"}",
+              "College Physics - Observed inference time (s)": "{\"description\": \"min=0.395, mean=0.395, max=0.395, sum=0.789 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39474552051693784\"}",
+              "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Chemistry - # prompt tokens": "{\"description\": \"min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)\", \"tab\": \"General information\", \"score\": \"549.28\"}",
+              "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}",
+              "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Biology - # prompt tokens": "{\"description\": \"min=473.875, mean=473.875, max=473.875, sum=947.75 (2)\", \"tab\": \"General information\", \"score\": \"473.875\"}",
+              "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Computer Science - # prompt tokens": "{\"description\": \"min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)\", \"tab\": \"General information\", \"score\": \"828.29\"}",
+              "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Mathematics - # prompt tokens": "{\"description\": \"min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)\", \"tab\": \"General information\", \"score\": \"594.51\"}",
+              "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}",
+              "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Medicine - # prompt tokens": "{\"description\": \"min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)\", \"tab\": \"General information\", \"score\": \"502.70520231213874\"}",
+              "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}",
+              "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Physics - # prompt tokens": "{\"description\": \"min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)\", \"tab\": \"General information\", \"score\": \"503.5686274509804\"}",
+              "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"clinical_knowledge\"",
+              "subject": "\"college_physics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_clinical_knowledge\""
+              "groups": "\"mmlu_college_physics\""
             }
           }
         },
         {
-          "evaluation_name": "Conceptual Physics",
+          "evaluation_name": "Computer Security",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -835,36 +809,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Conceptual Physics",
+            "evaluation_description": "EM on Computer Security",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.528,
+            "score": 0.71,
             "details": {
-              "description": "min=0.528, mean=0.528, max=0.528, sum=1.055 (2)",
+              "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)",
               "tab": "Accuracy",
-              "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.351, mean=0.351, max=0.351, sum=0.701 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.35051030605397326\"}",
-              "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}",
-              "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Conceptual Physics - # prompt tokens": "{\"description\": \"min=304.834, mean=304.834, max=304.834, sum=609.668 (2)\", \"tab\": \"General information\", \"score\": \"304.83404255319147\"}",
-              "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.434, mean=0.434, max=0.434, sum=0.867 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43369229555130007\"}",
+              "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Computer Security - # prompt tokens": "{\"description\": \"min=378.51, mean=378.51, max=378.51, sum=757.02 (2)\", \"tab\": \"General information\", \"score\": \"378.51\"}",
+              "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"conceptual_physics\"",
+              "subject": "\"computer_security\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_conceptual_physics\""
+              "groups": "\"mmlu_computer_security\""
             }
           }
         },
         {
-          "evaluation_name": "Electrical Engineering",
+          "evaluation_name": "Econometrics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -873,36 +847,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Electrical Engineering",
+            "evaluation_description": "EM on Econometrics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.441,
+            "score": 0.351,
             "details": {
-              "description": "min=0.441, mean=0.441, max=0.441, sum=0.883 (2)",
+              "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)",
               "tab": "Accuracy",
-              "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.35, mean=0.35, max=0.35, sum=0.7 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.34982287637118636\"}",
-              "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}",
-              "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Electrical Engineering - # prompt tokens": "{\"description\": \"min=435.607, mean=435.607, max=435.607, sum=871.214 (2)\", \"tab\": \"General information\", \"score\": \"435.60689655172416\"}",
-              "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.371, mean=0.371, max=0.371, sum=0.742 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3707838414008157\"}",
+              "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}",
+              "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Econometrics - # prompt tokens": "{\"description\": \"min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)\", \"tab\": \"General information\", \"score\": \"614.421052631579\"}",
+              "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"electrical_engineering\"",
+              "subject": "\"econometrics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_electrical_engineering\""
+              "groups": "\"mmlu_econometrics\""
             }
           }
         },
         {
-          "evaluation_name": "Elementary Mathematics",
+          "evaluation_name": "Global Facts",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -911,36 +885,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Elementary Mathematics",
+            "evaluation_description": "EM on Global Facts",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.429,
+            "score": 0.26,
             "details": {
-              "description": "min=0.429, mean=0.429, max=0.429, sum=0.857 (2)",
+              "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)",
               "tab": "Accuracy",
-              "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.4, mean=0.4, max=0.4, sum=0.801 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4003569991500289\"}",
-              "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}",
-              "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)\", \"tab\": \"General information\", \"score\": \"531.8544973544973\"}",
-              "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.202, mean=0.202, max=0.202, sum=0.403 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2015515398979187\"}",
+              "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Global Facts - # prompt tokens": "{\"description\": \"min=399.71, mean=399.71, max=399.71, sum=799.42 (2)\", \"tab\": \"General information\", \"score\": \"399.71\"}",
+              "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"elementary_mathematics\"",
+              "subject": "\"global_facts\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_elementary_mathematics\""
+              "groups": "\"mmlu_global_facts\""
             }
           }
         },
         {
-          "evaluation_name": "Formal Logic",
+          "evaluation_name": "Jurisprudence",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -949,36 +923,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Formal Logic",
+            "evaluation_description": "EM on Jurisprudence",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.444,
+            "score": 0.731,
             "details": {
-              "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)",
+              "description": "min=0.731, mean=0.731, max=0.731, sum=1.463 (2)",
               "tab": "Accuracy",
-              "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.357, mean=0.357, max=0.357, sum=0.714 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.35707327108534553\"}",
-              "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}",
-              "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Formal Logic - # prompt tokens": "{\"description\": \"min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)\", \"tab\": \"General information\", \"score\": \"601.7777777777778\"}",
-              "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=1.035, mean=1.035, max=1.035, sum=2.07 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.0347525963076838\"}",
+              "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}",
+              "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Jurisprudence - # prompt tokens": "{\"description\": \"min=394.63, mean=394.63, max=394.63, sum=789.259 (2)\", \"tab\": \"General information\", \"score\": \"394.6296296296296\"}",
+              "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"formal_logic\"",
+              "subject": "\"jurisprudence\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_formal_logic\""
+              "groups": "\"mmlu_jurisprudence\""
             }
           }
         },
         {
-          "evaluation_name": "High School World History",
+          "evaluation_name": "Philosophy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -987,114 +961,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on High School World History",
+            "evaluation_description": "EM on Philosophy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.515,
+            "score": 0.64,
             "details": {
-              "description": "min=0.515, mean=0.515, max=0.515, sum=1.03 (2)",
+              "description": "min=0.64, mean=0.64, max=0.64, sum=1.28 (2)",
               "tab": "Accuracy",
-              "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.211, mean=0.211, max=0.211, sum=0.423 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.21137587870320967\"}",
-              "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.211, mean=0.211, max=0.211, sum=0.423 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2113605567387172\"}",
-              "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.214, mean=0.214, max=0.214, sum=0.428 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2138903546333313\"}",
-              "High School European History - Observed inference time (s)": "{\"description\": \"min=0.332, mean=0.332, max=0.332, sum=0.664 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.33188523668231384\"}",
-              "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.218, mean=0.218, max=0.218, sum=0.435 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.21753037818754561\"}",
-              "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.558, mean=0.558, max=0.558, sum=1.117 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.558492410985917\"}",
-              "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.703, mean=0.703, max=0.703, sum=1.407 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7033225890917656\"}",
-              "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.649, mean=0.649, max=0.649, sum=1.299 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6494572189119127\"}",
-              "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.612, mean=0.612, max=0.612, sum=1.223 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6115654797113242\"}",
-              "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.564, mean=0.564, max=0.564, sum=1.127 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5636763351642533\"}",
-              "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.681, mean=0.681, max=0.681, sum=1.363 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6813242522948378\"}",
-              "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.606, mean=0.606, max=0.606, sum=1.212 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6060926814874014\"}",
-              "High School US History - Observed inference time (s)": "{\"description\": \"min=1.122, mean=1.122, max=1.122, sum=2.244 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.1218917334780973\"}",
-              "High School World History - Observed inference time (s)": "{\"description\": \"min=0.538, mean=0.538, max=0.538, sum=1.076 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5378943324592043\"}",
-              "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}",
-              "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Biology - # prompt tokens": "{\"description\": \"min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)\", \"tab\": \"General information\", \"score\": \"513.6709677419354\"}",
-              "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}",
-              "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Chemistry - # prompt tokens": "{\"description\": \"min=496.704, mean=496.704, max=496.704, sum=993.409 (2)\", \"tab\": \"General information\", \"score\": \"496.70443349753697\"}",
-              "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Computer Science - # prompt tokens": "{\"description\": \"min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)\", \"tab\": \"General information\", \"score\": \"867.78\"}",
-              "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}",
-              "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School European History - # prompt tokens": "{\"description\": \"min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)\", \"tab\": \"General information\", \"score\": \"2797.8848484848486\"}",
-              "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}",
-              "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Geography - # prompt tokens": "{\"description\": \"min=372.035, mean=372.035, max=372.035, sum=744.071 (2)\", \"tab\": \"General information\", \"score\": \"372.0353535353535\"}",
-              "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}",
-              "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Government And Politics - # prompt tokens": "{\"description\": \"min=465.824, mean=465.824, max=465.824, sum=931.648 (2)\", \"tab\": \"General information\", \"score\": \"465.8238341968912\"}",
-              "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}",
-              "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=370.908, mean=370.908, max=370.908, sum=741.815 (2)\", \"tab\": \"General information\", \"score\": \"370.9076923076923\"}",
-              "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}",
-              "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Mathematics - # prompt tokens": "{\"description\": \"min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)\", \"tab\": \"General information\", \"score\": \"532.3555555555556\"}",
-              "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}",
-              "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Microeconomics - # prompt tokens": "{\"description\": \"min=399.013, mean=399.013, max=399.013, sum=798.025 (2)\", \"tab\": \"General information\", \"score\": \"399.0126050420168\"}",
-              "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}",
-              "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Physics - # prompt tokens": "{\"description\": \"min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)\", \"tab\": \"General information\", \"score\": \"560.4569536423841\"}",
-              "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}",
-              "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Psychology - # prompt tokens": "{\"description\": \"min=495.242, mean=495.242, max=495.242, sum=990.484 (2)\", \"tab\": \"General information\", \"score\": \"495.2422018348624\"}",
-              "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}",
-              "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Statistics - # prompt tokens": "{\"description\": \"min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)\", \"tab\": \"General information\", \"score\": \"795.6388888888889\"}",
-              "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}",
-              "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School US History - # prompt tokens": "{\"description\": \"min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)\", \"tab\": \"General information\", \"score\": \"2217.8088235294117\"}",
-              "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}",
-              "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School World History - # prompt tokens": "{\"description\": \"min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)\", \"tab\": \"General information\", \"score\": \"1428.1729957805908\"}",
-              "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.681, mean=0.681, max=0.681, sum=1.363 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6814629341628391\"}",
+              "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}",
+              "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Philosophy - # prompt tokens": "{\"description\": \"min=329.084, mean=329.084, max=329.084, sum=658.167 (2)\", \"tab\": \"General information\", \"score\": \"329.08360128617363\"}",
+              "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"high_school_world_history\"",
+              "subject": "\"philosophy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_high_school_world_history\""
+              "groups": "\"mmlu_philosophy\""
             }
           }
         },
         {
-          "evaluation_name": "Human Sexuality",
+          "evaluation_name": "Professional Psychology",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1103,42 +999,54 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Human Sexuality",
+            "evaluation_description": "EM on Professional Psychology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.733,
+            "score": 0.649,
             "details": {
-              "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)",
+              "description": "min=0.649, mean=0.649, max=0.649, sum=1.297 (2)",
               "tab": "Accuracy",
-              "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.685, mean=0.685, max=0.685, sum=1.369 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6845707412257858\"}",
-              "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=1.227, mean=1.227, max=1.227, sum=2.455 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.2273387745136524\"}",
-              "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}",
-              "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Human Aging - # prompt tokens": "{\"description\": \"min=319.888, mean=319.888, max=319.888, sum=639.776 (2)\", \"tab\": \"General information\", \"score\": \"319.88789237668163\"}",
-              "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}",
-              "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Human Sexuality - # prompt tokens": "{\"description\": \"min=341.168, mean=341.168, max=341.168, sum=682.336 (2)\", \"tab\": \"General information\", \"score\": \"341.1679389312977\"}",
-              "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.546, mean=0.546, max=0.546, sum=1.091 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5456299475010704\"}",
+              "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.538, mean=0.538, max=0.538, sum=1.077 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5383730044601657\"}",
+              "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.881, mean=0.881, max=0.881, sum=1.762 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8808572895368355\"}",
+              "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.694, mean=0.694, max=0.694, sum=1.388 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6941978611977272\"}",
+              "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}",
+              "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Medicine - # prompt tokens": "{\"description\": \"min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)\", \"tab\": \"General information\", \"score\": \"1094.4889705882354\"}",
+              "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}",
+              "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Accounting - # prompt tokens": "{\"description\": \"min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)\", \"tab\": \"General information\", \"score\": \"658.5851063829788\"}",
+              "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}",
+              "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Law - # prompt tokens": "{\"description\": \"min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)\", \"tab\": \"General information\", \"score\": \"1637.6010430247718\"}",
+              "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}",
+              "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Psychology - # prompt tokens": "{\"description\": \"min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)\", \"tab\": \"General information\", \"score\": \"575.0980392156863\"}",
+              "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"human_sexuality\"",
+              "subject": "\"professional_psychology\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_human_sexuality\""
+              "groups": "\"mmlu_professional_psychology\""
             }
           }
         },
         {
-          "evaluation_name": "International Law",
+          "evaluation_name": "Us Foreign Policy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1147,36 +1055,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on International Law",
+            "evaluation_description": "EM on Us Foreign Policy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.694,
+            "score": 0.79,
             "details": {
-              "description": "min=0.694, mean=0.694, max=0.694, sum=1.388 (2)",
+              "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)",
               "tab": "Accuracy",
-              "International Law - Observed inference time (s)": "{\"description\": \"min=0.684, mean=0.684, max=0.684, sum=1.369 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6842782950598346\"}",
-              "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}",
-              "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "International Law - # prompt tokens": "{\"description\": \"min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)\", \"tab\": \"General information\", \"score\": \"639.8181818181819\"}",
-              "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.567, mean=0.567, max=0.567, sum=1.135 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5673955392837524\"}",
+              "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=422.79, mean=422.79, max=422.79, sum=845.58 (2)\", \"tab\": \"General information\", \"score\": \"422.79\"}",
+              "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"international_law\"",
+              "subject": "\"us_foreign_policy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_international_law\""
+              "groups": "\"mmlu_us_foreign_policy\""
             }
           }
         },
         {
-          "evaluation_name": "Logical Fallacies",
+          "evaluation_name": "Astronomy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1185,36 +1093,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Logical Fallacies",
+            "evaluation_description": "EM on Astronomy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.742,
+            "score": 0.645,
             "details": {
-              "description": "min=0.742, mean=0.742, max=0.742, sum=1.485 (2)",
+              "description": "min=0.645, mean=0.645, max=0.645, sum=1.289 (2)",
               "tab": "Accuracy",
-              "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=1.35, mean=1.35, max=1.35, sum=2.7 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.3501118970063566\"}",
-              "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}",
-              "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Logical Fallacies - # prompt tokens": "{\"description\": \"min=449.564, mean=449.564, max=449.564, sum=899.129 (2)\", \"tab\": \"General information\", \"score\": \"449.5644171779141\"}",
-              "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.317, mean=0.317, max=0.317, sum=0.634 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3168644199245854\"}",
+              "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}",
+              "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Astronomy - # prompt tokens": "{\"description\": \"min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)\", \"tab\": \"General information\", \"score\": \"579.6842105263158\"}",
+              "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"logical_fallacies\"",
+              "subject": "\"astronomy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_logical_fallacies\""
+              "groups": "\"mmlu_astronomy\""
             }
           }
         },
         {
-          "evaluation_name": "Machine Learning",
+          "evaluation_name": "Business Ethics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1223,36 +1131,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Machine Learning",
+            "evaluation_description": "EM on Business Ethics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.384,
+            "score": 0.65,
             "details": {
-              "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)",
+              "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)",
               "tab": "Accuracy",
-              "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.46, mean=0.46, max=0.46, sum=0.919 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.45964209735393524\"}",
-              "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}",
-              "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Machine Learning - # prompt tokens": "{\"description\": \"min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)\", \"tab\": \"General information\", \"score\": \"668.0535714285714\"}",
-              "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.444, mean=0.444, max=0.444, sum=0.888 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44396358251571655\"}",
+              "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Business Ethics - # prompt tokens": "{\"description\": \"min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)\", \"tab\": \"General information\", \"score\": \"569.52\"}",
+              "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"machine_learning\"",
+              "subject": "\"business_ethics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_machine_learning\""
+              "groups": "\"mmlu_business_ethics\""
             }
           }
         },
         {
-          "evaluation_name": "Management",
+          "evaluation_name": "Clinical Knowledge",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1261,36 +1169,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Management",
+            "evaluation_description": "EM on Clinical Knowledge",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.709,
+            "score": 0.615,
             "details": {
-              "description": "min=0.709, mean=0.709, max=0.709, sum=1.417 (2)",
+              "description": "min=0.615, mean=0.615, max=0.615, sum=1.23 (2)",
               "tab": "Accuracy",
-              "Management - Observed inference time (s)": "{\"description\": \"min=0.481, mean=0.481, max=0.481, sum=0.963 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.48132226536574874\"}",
-              "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}",
-              "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Management - # prompt tokens": "{\"description\": \"min=283.786, mean=283.786, max=283.786, sum=567.573 (2)\", \"tab\": \"General information\", \"score\": \"283.7864077669903\"}",
-              "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.369, mean=0.369, max=0.369, sum=0.738 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3692442273193935\"}",
+              "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}",
+              "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=397.928, mean=397.928, max=397.928, sum=795.857 (2)\", \"tab\": \"General information\", \"score\": \"397.92830188679244\"}",
+              "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"management\"",
+              "subject": "\"clinical_knowledge\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_management\""
+              "groups": "\"mmlu_clinical_knowledge\""
             }
           }
         },
         {
-          "evaluation_name": "Marketing",
+          "evaluation_name": "Conceptual Physics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1299,36 +1207,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Marketing",
+            "evaluation_description": "EM on Conceptual Physics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.833,
+            "score": 0.528,
             "details": {
-              "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)",
+              "description": "min=0.528, mean=0.528, max=0.528, sum=1.055 (2)",
               "tab": "Accuracy",
-              "Marketing - Observed inference time (s)": "{\"description\": \"min=0.529, mean=0.529, max=0.529, sum=1.059 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5294545297948723\"}",
-              "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}",
-              "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Marketing - # prompt tokens": "{\"description\": \"min=404.218, mean=404.218, max=404.218, sum=808.436 (2)\", \"tab\": \"General information\", \"score\": \"404.21794871794873\"}",
-              "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.351, mean=0.351, max=0.351, sum=0.701 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.35051030605397326\"}",
+              "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}",
+              "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Conceptual Physics - # prompt tokens": "{\"description\": \"min=304.834, mean=304.834, max=304.834, sum=609.668 (2)\", \"tab\": \"General information\", \"score\": \"304.83404255319147\"}",
+              "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"marketing\"",
+              "subject": "\"conceptual_physics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_marketing\""
+              "groups": "\"mmlu_conceptual_physics\""
             }
           }
         },
         {
-          "evaluation_name": "Medical Genetics",
+          "evaluation_name": "Electrical Engineering",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1337,36 +1245,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Medical Genetics",
+            "evaluation_description": "EM on Electrical Engineering",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.66,
+            "score": 0.441,
             "details": {
-              "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)",
+              "description": "min=0.441, mean=0.441, max=0.441, sum=0.883 (2)",
               "tab": "Accuracy",
-              "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.521, mean=0.521, max=0.521, sum=1.041 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.520596706867218\"}",
-              "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Medical Genetics - # prompt tokens": "{\"description\": \"min=340.99, mean=340.99, max=340.99, sum=681.98 (2)\", \"tab\": \"General information\", \"score\": \"340.99\"}",
-              "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.35, mean=0.35, max=0.35, sum=0.7 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.34982287637118636\"}",
+              "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}",
+              "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Electrical Engineering - # prompt tokens": "{\"description\": \"min=435.607, mean=435.607, max=435.607, sum=871.214 (2)\", \"tab\": \"General information\", \"score\": \"435.60689655172416\"}",
+              "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"medical_genetics\"",
+              "subject": "\"electrical_engineering\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_medical_genetics\""
+              "groups": "\"mmlu_electrical_engineering\""
             }
           }
         },
         {
-          "evaluation_name": "Miscellaneous",
+          "evaluation_name": "Elementary Mathematics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1375,36 +1283,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Miscellaneous",
+            "evaluation_description": "EM on Elementary Mathematics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.653,
+            "score": 0.429,
             "details": {
-              "description": "min=0.653, mean=0.653, max=0.653, sum=1.305 (2)",
+              "description": "min=0.429, mean=0.429, max=0.429, sum=0.857 (2)",
               "tab": "Accuracy",
-              "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.803, mean=0.803, max=0.803, sum=1.606 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8030396217282857\"}",
-              "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}",
-              "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Miscellaneous - # prompt tokens": "{\"description\": \"min=299.911, mean=299.911, max=299.911, sum=599.821 (2)\", \"tab\": \"General information\", \"score\": \"299.9106002554278\"}",
-              "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.4, mean=0.4, max=0.4, sum=0.801 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4003569991500289\"}",
+              "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}",
+              "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)\", \"tab\": \"General information\", \"score\": \"531.8544973544973\"}",
+              "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"miscellaneous\"",
+              "subject": "\"elementary_mathematics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_miscellaneous\""
+              "groups": "\"mmlu_elementary_mathematics\""
             }
           }
         },
         {
-          "evaluation_name": "Moral Scenarios",
+          "evaluation_name": "Formal Logic",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1413,42 +1321,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Moral Scenarios",
+            "evaluation_description": "EM on Formal Logic",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.368,
+            "score": 0.444,
             "details": {
-              "description": "min=0.368, mean=0.368, max=0.368, sum=0.735 (2)",
+              "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)",
               "tab": "Accuracy",
-              "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.657, mean=0.657, max=0.657, sum=1.314 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6570079657383737\"}",
-              "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.65, mean=0.65, max=0.65, sum=1.299 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.649639103266114\"}",
-              "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}",
-              "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Moral Disputes - # prompt tokens": "{\"description\": \"min=476.113, mean=476.113, max=476.113, sum=952.225 (2)\", \"tab\": \"General information\", \"score\": \"476.1127167630058\"}",
-              "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}",
-              "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Moral Scenarios - # prompt tokens": "{\"description\": \"min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)\", \"tab\": \"General information\", \"score\": \"656.454748603352\"}",
-              "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.357, mean=0.357, max=0.357, sum=0.714 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.35707327108534553\"}",
+              "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}",
+              "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Formal Logic - # prompt tokens": "{\"description\": \"min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)\", \"tab\": \"General information\", \"score\": \"601.7777777777778\"}",
+              "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"moral_scenarios\"",
+              "subject": "\"formal_logic\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_moral_scenarios\""
+              "groups": "\"mmlu_formal_logic\""
             }
           }
         },
         {
-          "evaluation_name": "Nutrition",
+          "evaluation_name": "High School World History",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1457,36 +1359,114 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Nutrition",
+            "evaluation_description": "EM on High School World History",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.712,
+            "score": 0.515,
             "details": {
-              "description": "min=0.712, mean=0.712, max=0.712, sum=1.425 (2)",
+              "description": "min=0.515, mean=0.515, max=0.515, sum=1.03 (2)",
               "tab": "Accuracy",
-              "Nutrition - Observed inference time (s)": "{\"description\": \"min=1.485, mean=1.485, max=1.485, sum=2.971 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.4853957338270798\"}",
-              "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}",
-              "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Nutrition - # prompt tokens": "{\"description\": \"min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)\", \"tab\": \"General information\", \"score\": \"586.8137254901961\"}",
-              "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.211, mean=0.211, max=0.211, sum=0.423 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.21137587870320967\"}",
+              "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.211, mean=0.211, max=0.211, sum=0.423 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2113605567387172\"}",
+              "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.214, mean=0.214, max=0.214, sum=0.428 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2138903546333313\"}",
+              "High School European History - Observed inference time (s)": "{\"description\": \"min=0.332, mean=0.332, max=0.332, sum=0.664 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.33188523668231384\"}",
+              "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.218, mean=0.218, max=0.218, sum=0.435 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.21753037818754561\"}",
+              "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.558, mean=0.558, max=0.558, sum=1.117 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.558492410985917\"}",
+              "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.703, mean=0.703, max=0.703, sum=1.407 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7033225890917656\"}",
+              "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.649, mean=0.649, max=0.649, sum=1.299 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6494572189119127\"}",
+              "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.612, mean=0.612, max=0.612, sum=1.223 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6115654797113242\"}",
+              "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.564, mean=0.564, max=0.564, sum=1.127 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5636763351642533\"}",
+              "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.681, mean=0.681, max=0.681, sum=1.363 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6813242522948378\"}",
+              "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.606, mean=0.606, max=0.606, sum=1.212 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6060926814874014\"}",
+              "High School US History - Observed inference time (s)": "{\"description\": \"min=1.122, mean=1.122, max=1.122, sum=2.244 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.1218917334780973\"}",
+              "High School World History - Observed inference time (s)": "{\"description\": \"min=0.538, mean=0.538, max=0.538, sum=1.076 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5378943324592043\"}",
+              "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}",
+              "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Biology - # prompt tokens": "{\"description\": \"min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)\", \"tab\": \"General information\", \"score\": \"513.6709677419354\"}",
+              "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}",
+              "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Chemistry - # prompt tokens": "{\"description\": \"min=496.704, mean=496.704, max=496.704, sum=993.409 (2)\", \"tab\": \"General information\", \"score\": \"496.70443349753697\"}",
+              "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Computer Science - # prompt tokens": "{\"description\": \"min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)\", \"tab\": \"General information\", \"score\": \"867.78\"}",
+              "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}",
+              "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School European History - # prompt tokens": "{\"description\": \"min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)\", \"tab\": \"General information\", \"score\": \"2797.8848484848486\"}",
+              "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}",
+              "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Geography - # prompt tokens": "{\"description\": \"min=372.035, mean=372.035, max=372.035, sum=744.071 (2)\", \"tab\": \"General information\", \"score\": \"372.0353535353535\"}",
+              "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}",
+              "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Government And Politics - # prompt tokens": "{\"description\": \"min=465.824, mean=465.824, max=465.824, sum=931.648 (2)\", \"tab\": \"General information\", \"score\": \"465.8238341968912\"}",
+              "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}",
+              "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=370.908, mean=370.908, max=370.908, sum=741.815 (2)\", \"tab\": \"General information\", \"score\": \"370.9076923076923\"}",
+              "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}",
+              "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Mathematics - # prompt tokens": "{\"description\": \"min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)\", \"tab\": \"General information\", \"score\": \"532.3555555555556\"}",
+              "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}",
+              "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Microeconomics - # prompt tokens": "{\"description\": \"min=399.013, mean=399.013, max=399.013, sum=798.025 (2)\", \"tab\": \"General information\", \"score\": \"399.0126050420168\"}",
+              "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}",
+              "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Physics - # prompt tokens": "{\"description\": \"min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)\", \"tab\": \"General information\", \"score\": \"560.4569536423841\"}",
+              "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}",
+              "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Psychology - # prompt tokens": "{\"description\": \"min=495.242, mean=495.242, max=495.242, sum=990.484 (2)\", \"tab\": \"General information\", \"score\": \"495.2422018348624\"}",
+              "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}",
+              "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Statistics - # prompt tokens": "{\"description\": \"min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)\", \"tab\": \"General information\", \"score\": \"795.6388888888889\"}",
+              "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}",
+              "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School US History - # prompt tokens": "{\"description\": \"min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)\", \"tab\": \"General information\", \"score\": \"2217.8088235294117\"}",
+              "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}",
+              "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School World History - # prompt tokens": "{\"description\": \"min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)\", \"tab\": \"General information\", \"score\": \"1428.1729957805908\"}",
+              "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"nutrition\"",
+              "subject": "\"high_school_world_history\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_nutrition\""
+              "groups": "\"mmlu_high_school_world_history\""
             }
           }
         },
         {
-          "evaluation_name": "Prehistory",
+          "evaluation_name": "Human Sexuality",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1495,36 +1475,42 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Prehistory",
+            "evaluation_description": "EM on Human Sexuality",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.728,
+            "score": 0.733,
             "details": {
-              "description": "min=0.728, mean=0.728, max=0.728, sum=1.457 (2)",
+              "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)",
               "tab": "Accuracy",
-              "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.792, mean=0.792, max=0.792, sum=1.584 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7917959955003526\"}",
-              "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}",
-              "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Prehistory - # prompt tokens": "{\"description\": \"min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)\", \"tab\": \"General information\", \"score\": \"514.5277777777778\"}",
-              "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.685, mean=0.685, max=0.685, sum=1.369 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6845707412257858\"}",
+              "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=1.227, mean=1.227, max=1.227, sum=2.455 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.2273387745136524\"}",
+              "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}",
+              "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Human Aging - # prompt tokens": "{\"description\": \"min=319.888, mean=319.888, max=319.888, sum=639.776 (2)\", \"tab\": \"General information\", \"score\": \"319.88789237668163\"}",
+              "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}",
+              "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Human Sexuality - # prompt tokens": "{\"description\": \"min=341.168, mean=341.168, max=341.168, sum=682.336 (2)\", \"tab\": \"General information\", \"score\": \"341.1679389312977\"}",
+              "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"prehistory\"",
+              "subject": "\"human_sexuality\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_prehistory\""
+              "groups": "\"mmlu_human_sexuality\""
             }
           }
         },
         {
-          "evaluation_name": "Public Relations",
+          "evaluation_name": "International Law",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1533,36 +1519,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Public Relations",
+            "evaluation_description": "EM on International Law",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.664,
+            "score": 0.694,
             "details": {
-              "description": "min=0.664, mean=0.664, max=0.664, sum=1.327 (2)",
+              "description": "min=0.694, mean=0.694, max=0.694, sum=1.388 (2)",
               "tab": "Accuracy",
-              "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.493, mean=0.493, max=0.493, sum=0.986 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.49318039634011007\"}",
-              "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}",
-              "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Public Relations - # prompt tokens": "{\"description\": \"min=405.318, mean=405.318, max=405.318, sum=810.636 (2)\", \"tab\": \"General information\", \"score\": \"405.3181818181818\"}",
-              "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "International Law - Observed inference time (s)": "{\"description\": \"min=0.684, mean=0.684, max=0.684, sum=1.369 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6842782950598346\"}",
+              "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}",
+              "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "International Law - # prompt tokens": "{\"description\": \"min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)\", \"tab\": \"General information\", \"score\": \"639.8181818181819\"}",
+              "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"public_relations\"",
+              "subject": "\"international_law\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_public_relations\""
+              "groups": "\"mmlu_international_law\""
             }
           }
         },
         {
-          "evaluation_name": "Security Studies",
+          "evaluation_name": "Logical Fallacies",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1571,36 +1557,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Security Studies",
+            "evaluation_description": "EM on Logical Fallacies",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.576,
+            "score": 0.742,
             "details": {
-              "description": "min=0.576, mean=0.576, max=0.576, sum=1.151 (2)",
+              "description": "min=0.742, mean=0.742, max=0.742, sum=1.485 (2)",
               "tab": "Accuracy",
-              "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.656, mean=0.656, max=0.656, sum=1.312 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6561975401275012\"}",
-              "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}",
-              "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Security Studies - # prompt tokens": "{\"description\": \"min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)\", \"tab\": \"General information\", \"score\": \"1164.4734693877551\"}",
-              "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=1.35, mean=1.35, max=1.35, sum=2.7 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.3501118970063566\"}",
+              "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}",
+              "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Logical Fallacies - # prompt tokens": "{\"description\": \"min=449.564, mean=449.564, max=449.564, sum=899.129 (2)\", \"tab\": \"General information\", \"score\": \"449.5644171779141\"}",
+              "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"security_studies\"",
+              "subject": "\"logical_fallacies\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_security_studies\""
+              "groups": "\"mmlu_logical_fallacies\""
             }
           }
         },
         {
-          "evaluation_name": "Sociology",
+          "evaluation_name": "Machine Learning",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1609,36 +1595,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Sociology",
+            "evaluation_description": "EM on Machine Learning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.701,
+            "score": 0.384,
             "details": {
-              "description": "min=0.701, mean=0.701, max=0.701, sum=1.403 (2)",
+              "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)",
               "tab": "Accuracy",
-              "Sociology - Observed inference time (s)": "{\"description\": \"min=0.517, mean=0.517, max=0.517, sum=1.034 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5170851643405744\"}",
-              "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}",
-              "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Sociology - # prompt tokens": "{\"description\": \"min=445.517, mean=445.517, max=445.517, sum=891.035 (2)\", \"tab\": \"General information\", \"score\": \"445.51741293532336\"}",
-              "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.46, mean=0.46, max=0.46, sum=0.919 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.45964209735393524\"}",
+              "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}",
+              "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Machine Learning - # prompt tokens": "{\"description\": \"min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)\", \"tab\": \"General information\", \"score\": \"668.0535714285714\"}",
+              "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"sociology\"",
+              "subject": "\"machine_learning\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_sociology\""
+              "groups": "\"mmlu_machine_learning\""
             }
           }
         },
         {
-          "evaluation_name": "Virology",
+          "evaluation_name": "Management",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1647,36 +1633,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Virology",
+            "evaluation_description": "EM on Management",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.446,
+            "score": 0.709,
             "details": {
-              "description": "min=0.446, mean=0.446, max=0.446, sum=0.892 (2)",
+              "description": "min=0.709, mean=0.709, max=0.709, sum=1.417 (2)",
               "tab": "Accuracy",
-              "Virology - Observed inference time (s)": "{\"description\": \"min=0.406, mean=0.406, max=0.406, sum=0.813 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40646702553852493\"}",
-              "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}",
-              "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Virology - # prompt tokens": "{\"description\": \"min=343.018, mean=343.018, max=343.018, sum=686.036 (2)\", \"tab\": \"General information\", \"score\": \"343.01807228915663\"}",
-              "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Management - Observed inference time (s)": "{\"description\": \"min=0.481, mean=0.481, max=0.481, sum=0.963 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.48132226536574874\"}",
+              "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}",
+              "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Management - # prompt tokens": "{\"description\": \"min=283.786, mean=283.786, max=283.786, sum=567.573 (2)\", \"tab\": \"General information\", \"score\": \"283.7864077669903\"}",
+              "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"virology\"",
+              "subject": "\"management\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_virology\""
+              "groups": "\"mmlu_management\""
             }
           }
         },
         {
-          "evaluation_name": "World Religions",
+          "evaluation_name": "Marketing",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1685,36 +1671,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on World Religions",
+            "evaluation_description": "EM on Marketing",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.789,
+            "score": 0.833,
             "details": {
-              "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)",
+              "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)",
               "tab": "Accuracy",
-              "World Religions - Observed inference time (s)": "{\"description\": \"min=0.587, mean=0.587, max=0.587, sum=1.173 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5866640882882458\"}",
-              "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}",
-              "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "World Religions - # prompt tokens": "{\"description\": \"min=274.52, mean=274.52, max=274.52, sum=549.041 (2)\", \"tab\": \"General information\", \"score\": \"274.5204678362573\"}",
-              "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Marketing - Observed inference time (s)": "{\"description\": \"min=0.529, mean=0.529, max=0.529, sum=1.059 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5294545297948723\"}",
+              "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}",
+              "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Marketing - # prompt tokens": "{\"description\": \"min=404.218, mean=404.218, max=404.218, sum=808.436 (2)\", \"tab\": \"General information\", \"score\": \"404.21794871794873\"}",
+              "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"world_religions\"",
+              "subject": "\"marketing\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_world_religions\""
+              "groups": "\"mmlu_marketing\""
             }
           }
         },
         {
-          "evaluation_name": "Mean win rate",
+          "evaluation_name": "Medical Genetics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1723,404 +1709,418 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "How many models this model outperforms on average (over columns).",
+            "evaluation_description": "EM on Medical Genetics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.475,
+            "score": 0.66,
             "details": {
-              "description": "",
-              "tab": "Efficiency"
+              "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)",
+              "tab": "Accuracy",
+              "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.521, mean=0.521, max=0.521, sum=1.041 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.520596706867218\"}",
+              "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Medical Genetics - # prompt tokens": "{\"description\": \"min=340.99, mean=340.99, max=340.99, sum=681.98 (2)\", \"tab\": \"General information\", \"score\": \"340.99\"}",
+              "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"medical_genetics\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_medical_genetics\""
+            }
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": {
-        "additional_details": {
-          "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
-          "method": "\"multiple_choice_joint\"",
-          "eval_split": "\"test\"",
-          "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
-        }
-      }
-    },
-    {
-      "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1774096306.427425",
-      "retrieved_timestamp": "1774096306.427425",
-      "source_metadata": {
-        "source_name": "helm_lite",
-        "source_type": "documentation",
-        "source_organization_name": "crfm",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "helm",
-        "version": "unknown"
-      },
-      "benchmark": "helm_lite",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Mean win rate",
+          "evaluation_name": "Miscellaneous",
           "source_data": {
-            "dataset_name": "helm_lite",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "How many models this model outperforms on average (over columns).",
+            "evaluation_description": "EM on Miscellaneous",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.303,
+            "score": 0.653,
             "details": {
-              "description": "",
+              "description": "min=0.653, mean=0.653, max=0.653, sum=1.305 (2)",
               "tab": "Accuracy",
-              "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.5896504369538077\"}",
-              "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}"
+              "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.803, mean=0.803, max=0.803, sum=1.606 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8030396217282857\"}",
+              "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}",
+              "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Miscellaneous - # prompt tokens": "{\"description\": \"min=299.911, mean=299.911, max=299.911, sum=599.821 (2)\", \"tab\": \"General information\", \"score\": \"299.9106002554278\"}",
+              "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"miscellaneous\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_miscellaneous\""
+            }
           }
         },
         {
-          "evaluation_name": "NarrativeQA",
+          "evaluation_name": "Moral Scenarios",
           "source_data": {
-            "dataset_name": "NarrativeQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "F1 on NarrativeQA",
+            "evaluation_description": "EM on Moral Scenarios",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.756,
+            "score": 0.368,
             "details": {
-              "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)",
+              "description": "min=0.368, mean=0.368, max=0.368, sum=0.735 (2)",
               "tab": "Accuracy",
-              "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.581, mean=0.581, max=0.581, sum=0.581 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5813529316808136\"}",
-              "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}",
-              "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NarrativeQA - # prompt tokens": "{\"description\": \"min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)\", \"tab\": \"General information\", \"score\": \"3484.2676056338028\"}",
-              "NarrativeQA - # output tokens": "{\"description\": \"min=7.287, mean=7.287, max=7.287, sum=7.287 (1)\", \"tab\": \"General information\", \"score\": \"7.2873239436619714\"}"
+              "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.657, mean=0.657, max=0.657, sum=1.314 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6570079657383737\"}",
+              "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.65, mean=0.65, max=0.65, sum=1.299 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.649639103266114\"}",
+              "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}",
+              "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Moral Disputes - # prompt tokens": "{\"description\": \"min=476.113, mean=476.113, max=476.113, sum=952.225 (2)\", \"tab\": \"General information\", \"score\": \"476.1127167630058\"}",
+              "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}",
+              "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Moral Scenarios - # prompt tokens": "{\"description\": \"min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)\", \"tab\": \"General information\", \"score\": \"656.454748603352\"}",
+              "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"moral_scenarios\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_moral_scenarios\""
+            }
           }
         },
         {
-          "evaluation_name": "NaturalQuestions (closed-book)",
+          "evaluation_name": "Nutrition",
           "source_data": {
-            "dataset_name": "NaturalQuestions (closed-book)",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "F1 on NaturalQuestions (closed-book)",
+            "evaluation_description": "EM on Nutrition",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.209,
+            "score": 0.712,
             "details": {
-              "description": "min=0.209, mean=0.209, max=0.209, sum=0.209 (1)",
+              "description": "min=0.712, mean=0.712, max=0.712, sum=1.425 (2)",
               "tab": "Accuracy",
-              "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.544, mean=0.544, max=0.544, sum=0.544 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5441543731689453\"}",
-              "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.752, mean=0.752, max=0.752, sum=0.752 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.751717613697052\"}",
-              "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "NaturalQuestions (open-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1716.78, mean=1716.78, max=1716.78, sum=1716.78 (1)\", \"tab\": \"General information\", \"score\": \"1716.78\"}",
-              "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=8.736, mean=8.736, max=8.736, sum=8.736 (1)\", \"tab\": \"General information\", \"score\": \"8.736\"}",
-              "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=129.12, mean=129.12, max=129.12, sum=129.12 (1)\", \"tab\": \"General information\", \"score\": \"129.12\"}",
-              "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=11.732, mean=11.732, max=11.732, sum=11.732 (1)\", \"tab\": \"General information\", \"score\": \"11.732\"}"
+              "Nutrition - Observed inference time (s)": "{\"description\": \"min=1.485, mean=1.485, max=1.485, sum=2.971 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.4853957338270798\"}",
+              "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}",
+              "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Nutrition - # prompt tokens": "{\"description\": \"min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)\", \"tab\": \"General information\", \"score\": \"586.8137254901961\"}",
+              "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "mode": "\"closedbook\""
+              "subject": "\"nutrition\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_nutrition\""
             }
           }
         },
         {
-          "evaluation_name": "OpenbookQA",
+          "evaluation_name": "Prehistory",
           "source_data": {
-            "dataset_name": "OpenbookQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on OpenbookQA",
+            "evaluation_description": "EM on Prehistory",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.74,
+            "score": 0.728,
             "details": {
-              "description": "min=0.74, mean=0.74, max=0.74, sum=0.74 (1)",
+              "description": "min=0.728, mean=0.728, max=0.728, sum=1.457 (2)",
               "tab": "Accuracy",
-              "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=2.937, mean=2.937, max=2.937, sum=2.937 (1)\", \"tab\": \"Efficiency\", \"score\": \"2.9374450149536133\"}",
-              "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}",
-              "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "OpenbookQA - # prompt tokens": "{\"description\": \"min=249.776, mean=249.776, max=249.776, sum=249.776 (1)\", \"tab\": \"General information\", \"score\": \"249.776\"}",
-              "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.792, mean=0.792, max=0.792, sum=1.584 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7917959955003526\"}",
+              "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}",
+              "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Prehistory - # prompt tokens": "{\"description\": \"min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)\", \"tab\": \"General information\", \"score\": \"514.5277777777778\"}",
+              "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "dataset": "\"openbookqa\"",
-              "method": "\"multiple_choice_joint\""
+              "subject": "\"prehistory\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_prehistory\""
             }
           }
         },
         {
-          "evaluation_name": "MMLU",
+          "evaluation_name": "Public Relations",
           "source_data": {
-            "dataset_name": "MMLU",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MMLU",
+            "evaluation_description": "EM on Public Relations",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5,
+            "score": 0.664,
             "details": {
-              "description": "min=0.26, mean=0.5, max=0.79, sum=2.501 (5)",
+              "description": "min=0.664, mean=0.664, max=0.664, sum=1.327 (2)",
               "tab": "Accuracy",
-              "MMLU - Observed inference time (s)": "{\"description\": \"min=0.284, mean=0.417, max=0.567, sum=2.086 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.41729471965421716\"}",
-              "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}",
-              "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MMLU - # prompt tokens": "{\"description\": \"min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)\", \"tab\": \"General information\", \"score\": \"467.6862105263158\"}",
-              "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.493, mean=0.493, max=0.493, sum=0.986 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.49318039634011007\"}",
+              "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}",
+              "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Public Relations - # prompt tokens": "{\"description\": \"min=405.318, mean=405.318, max=405.318, sum=810.636 (2)\", \"tab\": \"General information\", \"score\": \"405.3181818181818\"}",
+              "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]",
-              "method": "\"multiple_choice_joint\""
+              "subject": "\"public_relations\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_public_relations\""
             }
           }
         },
         {
-          "evaluation_name": "MATH",
+          "evaluation_name": "Security Studies",
           "source_data": {
-            "dataset_name": "MATH",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "Equivalent (CoT) on MATH",
+            "evaluation_description": "EM on Security Studies",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.703,
+            "score": 0.576,
             "details": {
-              "description": "min=0.509, mean=0.703, max=0.849, sum=4.92 (7)",
+              "description": "min=0.576, mean=0.576, max=0.576, sum=1.151 (2)",
               "tab": "Accuracy",
-              "MATH - Observed inference time (s)": "{\"description\": \"min=1.617, mean=1.927, max=2.175, sum=13.492 (7)\", \"tab\": \"Efficiency\", \"score\": \"1.9274194573191807\"}",
-              "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}",
-              "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}",
-              "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MATH - # prompt tokens": "{\"description\": \"min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)\", \"tab\": \"General information\", \"score\": \"1262.9092130545007\"}",
-              "MATH - # output tokens": "{\"description\": \"min=203.384, mean=253.982, max=288.596, sum=1777.872 (7)\", \"tab\": \"General information\", \"score\": \"253.98170179473732\"}"
+              "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.656, mean=0.656, max=0.656, sum=1.312 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6561975401275012\"}",
+              "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}",
+              "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Security Studies - # prompt tokens": "{\"description\": \"min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)\", \"tab\": \"General information\", \"score\": \"1164.4734693877551\"}",
+              "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]",
-              "level": "\"1\"",
-              "use_official_examples": "\"False\"",
-              "use_chain_of_thought": "\"True\""
+              "subject": "\"security_studies\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_security_studies\""
             }
           }
         },
         {
-          "evaluation_name": "GSM8K",
+          "evaluation_name": "Sociology",
           "source_data": {
-            "dataset_name": "GSM8K",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on GSM8K",
+            "evaluation_description": "EM on Sociology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.798,
+            "score": 0.701,
             "details": {
-              "description": "min=0.798, mean=0.798, max=0.798, sum=0.798 (1)",
+              "description": "min=0.701, mean=0.701, max=0.701, sum=1.403 (2)",
               "tab": "Accuracy",
-              "GSM8K - Observed inference time (s)": "{\"description\": \"min=2.109, mean=2.109, max=2.109, sum=2.109 (1)\", \"tab\": \"Efficiency\", \"score\": \"2.108796592712402\"}",
-              "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "GSM8K - # prompt tokens": "{\"description\": \"min=959.032, mean=959.032, max=959.032, sum=959.032 (1)\", \"tab\": \"General information\", \"score\": \"959.032\"}",
-              "GSM8K - # output tokens": "{\"description\": \"min=150.02, mean=150.02, max=150.02, sum=150.02 (1)\", \"tab\": \"General information\", \"score\": \"150.02\"}"
+              "Sociology - Observed inference time (s)": "{\"description\": \"min=0.517, mean=0.517, max=0.517, sum=1.034 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5170851643405744\"}",
+              "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}",
+              "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Sociology - # prompt tokens": "{\"description\": \"min=445.517, mean=445.517, max=445.517, sum=891.035 (2)\", \"tab\": \"General information\", \"score\": \"445.51741293532336\"}",
+              "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "stop": "\"none\""
+              "subject": "\"sociology\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_sociology\""
             }
           }
         },
         {
-          "evaluation_name": "LegalBench",
+          "evaluation_name": "Virology",
           "source_data": {
-            "dataset_name": "LegalBench",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on LegalBench",
+            "evaluation_description": "EM on Virology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.342,
+            "score": 0.446,
             "details": {
-              "description": "min=0, mean=0.342, max=0.8, sum=1.71 (5)",
+              "description": "min=0.446, mean=0.446, max=0.446, sum=0.892 (2)",
               "tab": "Accuracy",
-              "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.409, mean=0.481, max=0.626, sum=2.407 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.4814103188942614\"}",
-              "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}",
-              "LegalBench - # train": "{\"description\": \"min=4, mean=4.8, max=5, sum=24 (5)\", \"tab\": \"General information\", \"score\": \"4.8\"}",
-              "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "LegalBench - # prompt tokens": "{\"description\": \"min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)\", \"tab\": \"General information\", \"score\": \"1513.8824197238912\"}",
-              "LegalBench - # output tokens": "{\"description\": \"min=2.032, mean=6.824, max=10.886, sum=34.118 (5)\", \"tab\": \"General information\", \"score\": \"6.823557876005701\"}"
+              "Virology - Observed inference time (s)": "{\"description\": \"min=0.406, mean=0.406, max=0.406, sum=0.813 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40646702553852493\"}",
+              "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}",
+              "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Virology - # prompt tokens": "{\"description\": \"min=343.018, mean=343.018, max=343.018, sum=686.036 (2)\", \"tab\": \"General information\", \"score\": \"343.01807228915663\"}",
+              "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]"
+              "subject": "\"virology\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_virology\""
             }
           }
         },
         {
-          "evaluation_name": "MedQA",
+          "evaluation_name": "World Religions",
           "source_data": {
-            "dataset_name": "MedQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MedQA",
+            "evaluation_description": "EM on World Religions",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.245,
+            "score": 0.789,
             "details": {
-              "description": "min=0.245, mean=0.245, max=0.245, sum=0.245 (1)",
+              "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)",
               "tab": "Accuracy",
-              "MedQA - Observed inference time (s)": "{\"description\": \"min=0.743, mean=0.743, max=0.743, sum=0.743 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.742541556803891\"}",
-              "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}",
-              "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MedQA - # prompt tokens": "{\"description\": \"min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)\", \"tab\": \"General information\", \"score\": \"1025.2743538767395\"}",
-              "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "World Religions - Observed inference time (s)": "{\"description\": \"min=0.587, mean=0.587, max=0.587, sum=1.173 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5866640882882458\"}",
+              "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}",
+              "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "World Religions - # prompt tokens": "{\"description\": \"min=274.52, mean=274.52, max=274.52, sum=549.041 (2)\", \"tab\": \"General information\", \"score\": \"274.5204678362573\"}",
+              "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"world_religions\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_world_religions\""
+            }
           }
         },
         {
-          "evaluation_name": "WMT 2014",
+          "evaluation_name": "Mean win rate",
           "source_data": {
-            "dataset_name": "WMT 2014",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "BLEU-4 on WMT 2014",
+            "evaluation_description": "How many models this model outperforms on average (over columns).",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.181,
+            "score": 0.475,
             "details": {
-              "description": "min=0.132, mean=0.181, max=0.219, sum=0.907 (5)",
-              "tab": "Accuracy",
-              "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=0.439, mean=0.565, max=0.727, sum=2.826 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.5651802479746801\"}",
-              "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}",
-              "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "WMT 2014 - # prompt tokens": "{\"description\": \"min=101.139, mean=120.712, max=141.117, sum=603.559 (5)\", \"tab\": \"General information\", \"score\": \"120.71178123566294\"}",
-              "WMT 2014 - # output tokens": "{\"description\": \"min=24.354, mean=25.779, max=26.833, sum=128.893 (5)\", \"tab\": \"General information\", \"score\": \"25.778561802263347\"}"
+              "description": "",
+              "tab": "Efficiency"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]"
-            }
+            "additional_details": {}
           }
         }
       ],
       "detailed_evaluation_results": null,
       "generation_config": {
-        "additional_details": {}
+        "additional_details": {
+          "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
+          "method": "\"multiple_choice_joint\"",
+          "eval_split": "\"test\"",
+          "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
+        }
       }
     }
   ]
diff --git a/data/models/meta_llama-3.2-90b-vision-instruct-turbo.json b/data/models/meta_llama-3.2-90b-vision-instruct-turbo.json
index 8db40ff310b6015e156a3e086e416f8136fe7cde..d30ec26b3cfffb3a8ce0593911166de1a589585a 100644
--- a/data/models/meta_llama-3.2-90b-vision-instruct-turbo.json
+++ b/data/models/meta_llama-3.2-90b-vision-instruct-turbo.json
@@ -7,10 +7,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1774096312.00548",
-      "retrieved_timestamp": "1774096312.00548",
+      "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1774096306.427425",
+      "retrieved_timestamp": "1774096306.427425",
       "source_metadata": {
-        "source_name": "helm_mmlu",
+        "source_name": "helm_lite",
         "source_type": "documentation",
         "source_organization_name": "crfm",
         "evaluator_relationship": "third_party"
@@ -19,438 +19,382 @@
         "name": "helm",
         "version": "unknown"
       },
-      "benchmark": "helm_mmlu",
+      "benchmark": "helm_lite",
       "evaluation_results": [
         {
-          "evaluation_name": "MMLU All Subjects",
+          "evaluation_name": "Mean win rate",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "helm_lite",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MMLU All Subjects",
+            "evaluation_description": "How many models this model outperforms on average (over columns).",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.803,
+            "score": 0.819,
             "details": {
-              "description": "min=0.407, mean=0.803, max=0.979, sum=91.503 (114)",
+              "description": "",
               "tab": "Accuracy",
-              "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.256, mean=0.374, max=2.612, sum=42.58 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.37350966276831277\"}",
-              "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}",
-              "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)\", \"tab\": \"General information\", \"score\": \"614.6193817308517\"}",
-              "MMLU All Subjects - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.5839825218476904\"}",
+              "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Abstract Algebra",
+          "evaluation_name": "NarrativeQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "NarrativeQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Abstract Algebra",
+            "evaluation_description": "F1 on NarrativeQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.52,
+            "score": 0.777,
             "details": {
-              "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)",
+              "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)",
               "tab": "Accuracy",
-              "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=2.612, mean=2.612, max=2.612, sum=5.224 (2)\", \"tab\": \"Efficiency\", \"score\": \"2.611864836215973\"}",
-              "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Abstract Algebra - # prompt tokens": "{\"description\": \"min=373.43, mean=373.43, max=373.43, sum=746.86 (2)\", \"tab\": \"General information\", \"score\": \"373.43\"}",
-              "Abstract Algebra - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.83, mean=0.83, max=0.83, sum=0.83 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.8297326531208736\"}",
+              "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}",
+              "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NarrativeQA - # prompt tokens": "{\"description\": \"min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)\", \"tab\": \"General information\", \"score\": \"3484.2676056338028\"}",
+              "NarrativeQA - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "\"abstract_algebra\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_abstract_algebra\""
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Anatomy",
+          "evaluation_name": "NaturalQuestions (closed-book)",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "NaturalQuestions (closed-book)",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Anatomy",
+            "evaluation_description": "F1 on NaturalQuestions (closed-book)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8,
+            "score": 0.457,
             "details": {
-              "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)",
+              "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)",
               "tab": "Accuracy",
-              "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.336, mean=0.336, max=0.336, sum=0.672 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3359027315069128\"}",
-              "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}",
-              "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Anatomy - # prompt tokens": "{\"description\": \"min=353.874, mean=353.874, max=353.874, sum=707.748 (2)\", \"tab\": \"General information\", \"score\": \"353.8740740740741\"}",
-              "Anatomy - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=1.111, mean=1.111, max=1.111, sum=1.111 (1)\", \"tab\": \"Efficiency\", \"score\": \"1.110703297138214\"}",
+              "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.422, mean=0.422, max=0.422, sum=0.422 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4218848171234131\"}",
+              "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "NaturalQuestions (open-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1716.785, mean=1716.785, max=1716.785, sum=1716.785 (1)\", \"tab\": \"General information\", \"score\": \"1716.785\"}",
+              "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=129.12, mean=129.12, max=129.12, sum=129.12 (1)\", \"tab\": \"General information\", \"score\": \"129.12\"}",
+              "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"anatomy\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_anatomy\""
+              "mode": "\"closedbook\""
             }
           }
         },
         {
-          "evaluation_name": "College Physics",
+          "evaluation_name": "OpenbookQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "OpenbookQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on College Physics",
+            "evaluation_description": "EM on OpenbookQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.539,
+            "score": 0.942,
             "details": {
-              "description": "min=0.539, mean=0.539, max=0.539, sum=1.078 (2)",
+              "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)",
               "tab": "Accuracy",
-              "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.31, mean=0.31, max=0.31, sum=0.621 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3104448890686035\"}",
-              "College Biology - Observed inference time (s)": "{\"description\": \"min=0.272, mean=0.272, max=0.272, sum=0.544 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2720499005582597\"}",
-              "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.642 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32119542360305786\"}",
-              "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.315, mean=0.315, max=0.315, sum=0.63 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.31477957487106323\"}",
-              "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.283, mean=0.283, max=0.283, sum=0.566 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.28313319255850905\"}",
-              "College Physics - Observed inference time (s)": "{\"description\": \"min=0.317, mean=0.317, max=0.317, sum=0.634 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.31692570097306194\"}",
-              "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Chemistry - # prompt tokens": "{\"description\": \"min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)\", \"tab\": \"General information\", \"score\": \"549.28\"}",
-              "College Chemistry - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}",
-              "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Biology - # prompt tokens": "{\"description\": \"min=473.875, mean=473.875, max=473.875, sum=947.75 (2)\", \"tab\": \"General information\", \"score\": \"473.875\"}",
-              "College Biology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Computer Science - # prompt tokens": "{\"description\": \"min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)\", \"tab\": \"General information\", \"score\": \"828.29\"}",
-              "College Computer Science - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Mathematics - # prompt tokens": "{\"description\": \"min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)\", \"tab\": \"General information\", \"score\": \"594.51\"}",
-              "College Mathematics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}",
-              "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Medicine - # prompt tokens": "{\"description\": \"min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)\", \"tab\": \"General information\", \"score\": \"502.70520231213874\"}",
-              "College Medicine - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}",
-              "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Physics - # prompt tokens": "{\"description\": \"min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)\", \"tab\": \"General information\", \"score\": \"503.5686274509804\"}",
-              "College Physics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.285, mean=0.285, max=0.285, sum=0.285 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.28476666021347047\"}",
+              "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}",
+              "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "OpenbookQA - # prompt tokens": "{\"description\": \"min=249.776, mean=249.776, max=249.776, sum=249.776 (1)\", \"tab\": \"General information\", \"score\": \"249.776\"}",
+              "OpenbookQA - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"college_physics\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_college_physics\""
+              "dataset": "\"openbookqa\"",
+              "method": "\"multiple_choice_joint\""
             }
           }
         },
         {
-          "evaluation_name": "Computer Security",
+          "evaluation_name": "MMLU",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MMLU",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Computer Security",
+            "evaluation_description": "EM on MMLU",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.81,
+            "score": 0.703,
             "details": {
-              "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)",
+              "description": "min=0.52, mean=0.703, max=0.93, sum=3.514 (5)",
               "tab": "Accuracy",
-              "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.266, mean=0.266, max=0.266, sum=0.532 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.26576273441314696\"}",
-              "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Computer Security - # prompt tokens": "{\"description\": \"min=378.51, mean=378.51, max=378.51, sum=757.02 (2)\", \"tab\": \"General information\", \"score\": \"378.51\"}",
-              "Computer Security - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "MMLU - Observed inference time (s)": "{\"description\": \"min=0.266, mean=0.798, max=2.612, sum=3.992 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.7984467656654225\"}",
+              "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}",
+              "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MMLU - # prompt tokens": "{\"description\": \"min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)\", \"tab\": \"General information\", \"score\": \"467.6862105263158\"}",
+              "MMLU - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"computer_security\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_computer_security\""
+              "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]",
+              "method": "\"multiple_choice_joint\""
             }
           }
         },
         {
-          "evaluation_name": "Econometrics",
+          "evaluation_name": "MATH",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MATH",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Econometrics",
+            "evaluation_description": "Equivalent (CoT) on MATH",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.684,
+            "score": 0.791,
             "details": {
-              "description": "min=0.684, mean=0.684, max=0.684, sum=1.368 (2)",
+              "description": "min=0.579, mean=0.791, max=0.978, sum=5.54 (7)",
               "tab": "Accuracy",
-              "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.297, mean=0.297, max=0.297, sum=0.595 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2972530210227297\"}",
-              "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}",
-              "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Econometrics - # prompt tokens": "{\"description\": \"min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)\", \"tab\": \"General information\", \"score\": \"614.421052631579\"}",
-              "Econometrics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "MATH - Observed inference time (s)": "{\"description\": \"min=4.64, mean=5.739, max=6.652, sum=40.174 (7)\", \"tab\": \"Efficiency\", \"score\": \"5.739186799526185\"}",
+              "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}",
+              "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}",
+              "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MATH - # prompt tokens": "{\"description\": \"min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)\", \"tab\": \"General information\", \"score\": \"1262.9092130545007\"}",
+              "MATH - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"econometrics\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_econometrics\""
+              "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]",
+              "level": "\"1\"",
+              "use_official_examples": "\"False\"",
+              "use_chain_of_thought": "\"True\""
             }
           }
         },
         {
-          "evaluation_name": "Global Facts",
+          "evaluation_name": "GSM8K",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "GSM8K",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Global Facts",
+            "evaluation_description": "EM on GSM8K",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6,
+            "score": 0.936,
             "details": {
-              "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)",
+              "description": "min=0.936, mean=0.936, max=0.936, sum=0.936 (1)",
               "tab": "Accuracy",
-              "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.267, mean=0.267, max=0.267, sum=0.533 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2666162133216858\"}",
-              "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Global Facts - # prompt tokens": "{\"description\": \"min=399.71, mean=399.71, max=399.71, sum=799.42 (2)\", \"tab\": \"General information\", \"score\": \"399.71\"}",
-              "Global Facts - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "GSM8K - Observed inference time (s)": "{\"description\": \"min=2.889, mean=2.889, max=2.889, sum=2.889 (1)\", \"tab\": \"Efficiency\", \"score\": \"2.8894128675460817\"}",
+              "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "GSM8K - # prompt tokens": "{\"description\": \"min=959.032, mean=959.032, max=959.032, sum=959.032 (1)\", \"tab\": \"General information\", \"score\": \"959.032\"}",
+              "GSM8K - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"global_facts\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_global_facts\""
+              "stop": "\"none\""
             }
           }
         },
         {
-          "evaluation_name": "Jurisprudence",
+          "evaluation_name": "LegalBench",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "LegalBench",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Jurisprudence",
+            "evaluation_description": "EM on LegalBench",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.88,
+            "score": 0.68,
             "details": {
-              "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)",
+              "description": "min=0.438, mean=0.68, max=0.989, sum=3.398 (5)",
               "tab": "Accuracy",
-              "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.279, mean=0.279, max=0.279, sum=0.558 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.278864703796528\"}",
-              "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}",
-              "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Jurisprudence - # prompt tokens": "{\"description\": \"min=394.63, mean=394.63, max=394.63, sum=789.259 (2)\", \"tab\": \"General information\", \"score\": \"394.6296296296296\"}",
-              "Jurisprudence - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.284, mean=0.478, max=1.152, sum=2.389 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.47773526830658064\"}",
+              "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}",
+              "LegalBench - # train": "{\"description\": \"min=4, mean=4.8, max=5, sum=24 (5)\", \"tab\": \"General information\", \"score\": \"4.8\"}",
+              "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "LegalBench - # prompt tokens": "{\"description\": \"min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)\", \"tab\": \"General information\", \"score\": \"1513.8824197238912\"}",
+              "LegalBench - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"jurisprudence\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_jurisprudence\""
+              "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]"
             }
           }
         },
         {
-          "evaluation_name": "Philosophy",
+          "evaluation_name": "MedQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MedQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Philosophy",
+            "evaluation_description": "EM on MedQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.839,
+            "score": 0.769,
             "details": {
-              "description": "min=0.839, mean=0.839, max=0.839, sum=1.678 (2)",
+              "description": "min=0.769, mean=0.769, max=0.769, sum=0.769 (1)",
               "tab": "Accuracy",
-              "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.297, mean=0.297, max=0.297, sum=0.594 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29689135582117404\"}",
-              "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}",
-              "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Philosophy - # prompt tokens": "{\"description\": \"min=329.084, mean=329.084, max=329.084, sum=658.167 (2)\", \"tab\": \"General information\", \"score\": \"329.08360128617363\"}",
-              "Philosophy - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "MedQA - Observed inference time (s)": "{\"description\": \"min=0.318, mean=0.318, max=0.318, sum=0.318 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.3180293652930743\"}",
+              "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}",
+              "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MedQA - # prompt tokens": "{\"description\": \"min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)\", \"tab\": \"General information\", \"score\": \"1025.2743538767395\"}",
+              "MedQA - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "\"philosophy\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_philosophy\""
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Professional Psychology",
+          "evaluation_name": "WMT 2014",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "WMT 2014",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Professional Psychology",
+            "evaluation_description": "BLEU-4 on WMT 2014",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.843,
+            "score": 0.224,
             "details": {
-              "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)",
+              "description": "min=0.182, mean=0.224, max=0.266, sum=1.121 (5)",
               "tab": "Accuracy",
-              "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.553, mean=0.553, max=0.553, sum=1.106 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5529017465956071\"}",
-              "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.323, mean=0.323, max=0.323, sum=0.647 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32346555189038\"}",
-              "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.372, mean=0.372, max=0.372, sum=0.743 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3715069820859131\"}",
-              "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.315, mean=0.315, max=0.315, sum=0.63 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3151663907992294\"}",
-              "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}",
-              "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Medicine - # prompt tokens": "{\"description\": \"min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)\", \"tab\": \"General information\", \"score\": \"1094.4889705882354\"}",
-              "Professional Medicine - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}",
-              "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Accounting - # prompt tokens": "{\"description\": \"min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)\", \"tab\": \"General information\", \"score\": \"658.5851063829788\"}",
-              "Professional Accounting - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}",
-              "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Law - # prompt tokens": "{\"description\": \"min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)\", \"tab\": \"General information\", \"score\": \"1637.6010430247718\"}",
-              "Professional Law - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}",
-              "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Psychology - # prompt tokens": "{\"description\": \"min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)\", \"tab\": \"General information\", \"score\": \"575.0980392156863\"}",
-              "Professional Psychology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=0.737, mean=0.816, max=0.848, sum=4.078 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.8156762526912515\"}",
+              "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}",
+              "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "WMT 2014 - # prompt tokens": "{\"description\": \"min=101.139, mean=120.868, max=141.33, sum=604.34 (5)\", \"tab\": \"General information\", \"score\": \"120.86804366111025\"}",
+              "WMT 2014 - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"professional_psychology\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_professional_psychology\""
+              "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]"
             }
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1774096312.00548",
+      "retrieved_timestamp": "1774096312.00548",
+      "source_metadata": {
+        "source_name": "helm_mmlu",
+        "source_type": "documentation",
+        "source_organization_name": "crfm",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "helm",
+        "version": "unknown"
+      },
+      "benchmark": "helm_mmlu",
+      "evaluation_results": [
         {
-          "evaluation_name": "Us Foreign Policy",
+          "evaluation_name": "MMLU All Subjects",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -459,36 +403,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Us Foreign Policy",
+            "evaluation_description": "EM on MMLU All Subjects",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.93,
+            "score": 0.803,
             "details": {
-              "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)",
+              "description": "min=0.407, mean=0.803, max=0.979, sum=91.503 (114)",
               "tab": "Accuracy",
-              "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.507, mean=0.507, max=0.507, sum=1.014 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5069083476066589\"}",
-              "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=422.79, mean=422.79, max=422.79, sum=845.58 (2)\", \"tab\": \"General information\", \"score\": \"422.79\"}",
-              "Us Foreign Policy - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.256, mean=0.374, max=2.612, sum=42.58 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.37350966276831277\"}",
+              "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}",
+              "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)\", \"tab\": \"General information\", \"score\": \"614.6193817308517\"}",
+              "MMLU All Subjects - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"us_foreign_policy\"",
+              "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_us_foreign_policy\""
+              "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
             }
           }
         },
         {
-          "evaluation_name": "Astronomy",
+          "evaluation_name": "Abstract Algebra",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -497,36 +441,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Astronomy",
+            "evaluation_description": "EM on Abstract Algebra",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.921,
+            "score": 0.52,
             "details": {
-              "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)",
+              "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)",
               "tab": "Accuracy",
-              "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.332, mean=0.332, max=0.332, sum=0.665 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3323579352152975\"}",
-              "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}",
-              "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Astronomy - # prompt tokens": "{\"description\": \"min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)\", \"tab\": \"General information\", \"score\": \"579.6842105263158\"}",
-              "Astronomy - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=2.612, mean=2.612, max=2.612, sum=5.224 (2)\", \"tab\": \"Efficiency\", \"score\": \"2.611864836215973\"}",
+              "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Abstract Algebra - # prompt tokens": "{\"description\": \"min=373.43, mean=373.43, max=373.43, sum=746.86 (2)\", \"tab\": \"General information\", \"score\": \"373.43\"}",
+              "Abstract Algebra - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"astronomy\"",
+              "subject": "\"abstract_algebra\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_astronomy\""
+              "groups": "\"mmlu_abstract_algebra\""
             }
           }
         },
         {
-          "evaluation_name": "Business Ethics",
+          "evaluation_name": "Anatomy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -535,36 +479,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Business Ethics",
+            "evaluation_description": "EM on Anatomy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.76,
+            "score": 0.8,
             "details": {
-              "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)",
+              "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)",
               "tab": "Accuracy",
-              "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.291, mean=0.291, max=0.291, sum=0.581 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29072295665740966\"}",
-              "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Business Ethics - # prompt tokens": "{\"description\": \"min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)\", \"tab\": \"General information\", \"score\": \"569.52\"}",
-              "Business Ethics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.336, mean=0.336, max=0.336, sum=0.672 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3359027315069128\"}",
+              "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}",
+              "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Anatomy - # prompt tokens": "{\"description\": \"min=353.874, mean=353.874, max=353.874, sum=707.748 (2)\", \"tab\": \"General information\", \"score\": \"353.8740740740741\"}",
+              "Anatomy - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"business_ethics\"",
+              "subject": "\"anatomy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_business_ethics\""
+              "groups": "\"mmlu_anatomy\""
             }
           }
         },
         {
-          "evaluation_name": "Clinical Knowledge",
+          "evaluation_name": "College Physics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -573,36 +517,66 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Clinical Knowledge",
+            "evaluation_description": "EM on College Physics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.845,
+            "score": 0.539,
             "details": {
-              "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)",
+              "description": "min=0.539, mean=0.539, max=0.539, sum=1.078 (2)",
               "tab": "Accuracy",
-              "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.29, mean=0.29, max=0.29, sum=0.579 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2897273891376999\"}",
-              "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}",
-              "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=397.928, mean=397.928, max=397.928, sum=795.857 (2)\", \"tab\": \"General information\", \"score\": \"397.92830188679244\"}",
-              "Clinical Knowledge - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.31, mean=0.31, max=0.31, sum=0.621 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3104448890686035\"}",
+              "College Biology - Observed inference time (s)": "{\"description\": \"min=0.272, mean=0.272, max=0.272, sum=0.544 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2720499005582597\"}",
+              "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.642 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32119542360305786\"}",
+              "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.315, mean=0.315, max=0.315, sum=0.63 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.31477957487106323\"}",
+              "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.283, mean=0.283, max=0.283, sum=0.566 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.28313319255850905\"}",
+              "College Physics - Observed inference time (s)": "{\"description\": \"min=0.317, mean=0.317, max=0.317, sum=0.634 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.31692570097306194\"}",
+              "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Chemistry - # prompt tokens": "{\"description\": \"min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)\", \"tab\": \"General information\", \"score\": \"549.28\"}",
+              "College Chemistry - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}",
+              "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Biology - # prompt tokens": "{\"description\": \"min=473.875, mean=473.875, max=473.875, sum=947.75 (2)\", \"tab\": \"General information\", \"score\": \"473.875\"}",
+              "College Biology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Computer Science - # prompt tokens": "{\"description\": \"min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)\", \"tab\": \"General information\", \"score\": \"828.29\"}",
+              "College Computer Science - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Mathematics - # prompt tokens": "{\"description\": \"min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)\", \"tab\": \"General information\", \"score\": \"594.51\"}",
+              "College Mathematics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}",
+              "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Medicine - # prompt tokens": "{\"description\": \"min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)\", \"tab\": \"General information\", \"score\": \"502.70520231213874\"}",
+              "College Medicine - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}",
+              "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Physics - # prompt tokens": "{\"description\": \"min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)\", \"tab\": \"General information\", \"score\": \"503.5686274509804\"}",
+              "College Physics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"clinical_knowledge\"",
+              "subject": "\"college_physics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_clinical_knowledge\""
+              "groups": "\"mmlu_college_physics\""
             }
           }
         },
         {
-          "evaluation_name": "Conceptual Physics",
+          "evaluation_name": "Computer Security",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -611,36 +585,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Conceptual Physics",
+            "evaluation_description": "EM on Computer Security",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.826,
+            "score": 0.81,
             "details": {
-              "description": "min=0.826, mean=0.826, max=0.826, sum=1.651 (2)",
+              "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)",
               "tab": "Accuracy",
-              "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.279, mean=0.279, max=0.279, sum=0.559 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2794749209221373\"}",
-              "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}",
-              "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Conceptual Physics - # prompt tokens": "{\"description\": \"min=304.834, mean=304.834, max=304.834, sum=609.668 (2)\", \"tab\": \"General information\", \"score\": \"304.83404255319147\"}",
-              "Conceptual Physics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.266, mean=0.266, max=0.266, sum=0.532 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.26576273441314696\"}",
+              "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Computer Security - # prompt tokens": "{\"description\": \"min=378.51, mean=378.51, max=378.51, sum=757.02 (2)\", \"tab\": \"General information\", \"score\": \"378.51\"}",
+              "Computer Security - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"conceptual_physics\"",
+              "subject": "\"computer_security\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_conceptual_physics\""
+              "groups": "\"mmlu_computer_security\""
             }
           }
         },
         {
-          "evaluation_name": "Electrical Engineering",
+          "evaluation_name": "Econometrics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -649,36 +623,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Electrical Engineering",
+            "evaluation_description": "EM on Econometrics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.759,
+            "score": 0.684,
             "details": {
-              "description": "min=0.759, mean=0.759, max=0.759, sum=1.517 (2)",
+              "description": "min=0.684, mean=0.684, max=0.684, sum=1.368 (2)",
               "tab": "Accuracy",
-              "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.256, mean=0.256, max=0.256, sum=0.512 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2558267790695717\"}",
-              "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}",
-              "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Electrical Engineering - # prompt tokens": "{\"description\": \"min=435.607, mean=435.607, max=435.607, sum=871.214 (2)\", \"tab\": \"General information\", \"score\": \"435.60689655172416\"}",
-              "Electrical Engineering - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.297, mean=0.297, max=0.297, sum=0.595 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2972530210227297\"}",
+              "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}",
+              "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Econometrics - # prompt tokens": "{\"description\": \"min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)\", \"tab\": \"General information\", \"score\": \"614.421052631579\"}",
+              "Econometrics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"electrical_engineering\"",
+              "subject": "\"econometrics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_electrical_engineering\""
+              "groups": "\"mmlu_econometrics\""
             }
           }
         },
         {
-          "evaluation_name": "Elementary Mathematics",
+          "evaluation_name": "Global Facts",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -687,36 +661,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Elementary Mathematics",
+            "evaluation_description": "EM on Global Facts",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.688,
+            "score": 0.6,
             "details": {
-              "description": "min=0.688, mean=0.688, max=0.688, sum=1.376 (2)",
+              "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)",
               "tab": "Accuracy",
-              "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.308, mean=0.308, max=0.308, sum=0.617 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.30840403945357714\"}",
-              "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}",
-              "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)\", \"tab\": \"General information\", \"score\": \"531.8544973544973\"}",
-              "Elementary Mathematics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.267, mean=0.267, max=0.267, sum=0.533 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2666162133216858\"}",
+              "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Global Facts - # prompt tokens": "{\"description\": \"min=399.71, mean=399.71, max=399.71, sum=799.42 (2)\", \"tab\": \"General information\", \"score\": \"399.71\"}",
+              "Global Facts - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"elementary_mathematics\"",
+              "subject": "\"global_facts\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_elementary_mathematics\""
+              "groups": "\"mmlu_global_facts\""
             }
           }
         },
         {
-          "evaluation_name": "Formal Logic",
+          "evaluation_name": "Jurisprudence",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -725,36 +699,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Formal Logic",
+            "evaluation_description": "EM on Jurisprudence",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.683,
+            "score": 0.88,
             "details": {
-              "description": "min=0.683, mean=0.683, max=0.683, sum=1.365 (2)",
+              "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)",
               "tab": "Accuracy",
-              "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.304, mean=0.304, max=0.304, sum=0.609 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.30448357074979754\"}",
-              "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}",
-              "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Formal Logic - # prompt tokens": "{\"description\": \"min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)\", \"tab\": \"General information\", \"score\": \"601.7777777777778\"}",
-              "Formal Logic - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.279, mean=0.279, max=0.279, sum=0.558 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.278864703796528\"}",
+              "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}",
+              "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Jurisprudence - # prompt tokens": "{\"description\": \"min=394.63, mean=394.63, max=394.63, sum=789.259 (2)\", \"tab\": \"General information\", \"score\": \"394.6296296296296\"}",
+              "Jurisprudence - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"formal_logic\"",
+              "subject": "\"jurisprudence\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_formal_logic\""
+              "groups": "\"mmlu_jurisprudence\""
             }
           }
         },
         {
-          "evaluation_name": "High School World History",
+          "evaluation_name": "Philosophy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -763,114 +737,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on High School World History",
+            "evaluation_description": "EM on Philosophy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.941,
+            "score": 0.839,
             "details": {
-              "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)",
+              "description": "min=0.839, mean=0.839, max=0.839, sum=1.678 (2)",
               "tab": "Accuracy",
-              "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.309, mean=0.309, max=0.309, sum=0.619 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3094667688492806\"}",
-              "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.294, mean=0.294, max=0.294, sum=0.588 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29394797386207017\"}",
-              "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.301, mean=0.301, max=0.301, sum=0.602 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.30106969356536867\"}",
-              "High School European History - Observed inference time (s)": "{\"description\": \"min=0.48, mean=0.48, max=0.48, sum=0.96 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4799844944115841\"}",
-              "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.297, mean=0.297, max=0.297, sum=0.595 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29747620014229204\"}",
-              "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.291, mean=0.291, max=0.291, sum=0.583 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2914604300662026\"}",
-              "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.279, mean=0.279, max=0.279, sum=0.557 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.27857950650728663\"}",
-              "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.312, mean=0.312, max=0.312, sum=0.625 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3123831342767786\"}",
-              "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.302, mean=0.302, max=0.302, sum=0.603 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.30159517997453195\"}",
-              "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.322, mean=0.322, max=0.322, sum=0.643 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32152655108874995\"}",
-              "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.29, mean=0.29, max=0.29, sum=0.581 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2903494253071076\"}",
-              "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.333, mean=0.333, max=0.333, sum=0.667 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.33328031720938506\"}",
-              "High School US History - Observed inference time (s)": "{\"description\": \"min=0.394, mean=0.394, max=0.394, sum=0.788 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39396579826579375\"}",
-              "High School World History - Observed inference time (s)": "{\"description\": \"min=0.679, mean=0.679, max=0.679, sum=1.359 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6793377369265013\"}",
-              "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}",
-              "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Biology - # prompt tokens": "{\"description\": \"min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)\", \"tab\": \"General information\", \"score\": \"513.6709677419354\"}",
-              "High School Biology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}",
-              "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Chemistry - # prompt tokens": "{\"description\": \"min=496.704, mean=496.704, max=496.704, sum=993.409 (2)\", \"tab\": \"General information\", \"score\": \"496.70443349753697\"}",
-              "High School Chemistry - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Computer Science - # prompt tokens": "{\"description\": \"min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)\", \"tab\": \"General information\", \"score\": \"867.78\"}",
-              "High School Computer Science - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}",
-              "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School European History - # prompt tokens": "{\"description\": \"min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)\", \"tab\": \"General information\", \"score\": \"2797.8848484848486\"}",
-              "High School European History - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}",
-              "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Geography - # prompt tokens": "{\"description\": \"min=372.035, mean=372.035, max=372.035, sum=744.071 (2)\", \"tab\": \"General information\", \"score\": \"372.0353535353535\"}",
-              "High School Geography - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}",
-              "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Government And Politics - # prompt tokens": "{\"description\": \"min=465.824, mean=465.824, max=465.824, sum=931.648 (2)\", \"tab\": \"General information\", \"score\": \"465.8238341968912\"}",
-              "High School Government And Politics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}",
-              "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=370.908, mean=370.908, max=370.908, sum=741.815 (2)\", \"tab\": \"General information\", \"score\": \"370.9076923076923\"}",
-              "High School Macroeconomics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}",
-              "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Mathematics - # prompt tokens": "{\"description\": \"min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)\", \"tab\": \"General information\", \"score\": \"532.3555555555556\"}",
-              "High School Mathematics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}",
-              "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Microeconomics - # prompt tokens": "{\"description\": \"min=399.013, mean=399.013, max=399.013, sum=798.025 (2)\", \"tab\": \"General information\", \"score\": \"399.0126050420168\"}",
-              "High School Microeconomics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}",
-              "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Physics - # prompt tokens": "{\"description\": \"min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)\", \"tab\": \"General information\", \"score\": \"560.4569536423841\"}",
-              "High School Physics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}",
-              "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Psychology - # prompt tokens": "{\"description\": \"min=495.242, mean=495.242, max=495.242, sum=990.484 (2)\", \"tab\": \"General information\", \"score\": \"495.2422018348624\"}",
-              "High School Psychology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}",
-              "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Statistics - # prompt tokens": "{\"description\": \"min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)\", \"tab\": \"General information\", \"score\": \"795.6388888888889\"}",
-              "High School Statistics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}",
-              "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School US History - # prompt tokens": "{\"description\": \"min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)\", \"tab\": \"General information\", \"score\": \"2217.8088235294117\"}",
-              "High School US History - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}",
-              "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School World History - # prompt tokens": "{\"description\": \"min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)\", \"tab\": \"General information\", \"score\": \"1428.1729957805908\"}",
-              "High School World History - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.297, mean=0.297, max=0.297, sum=0.594 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29689135582117404\"}",
+              "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}",
+              "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Philosophy - # prompt tokens": "{\"description\": \"min=329.084, mean=329.084, max=329.084, sum=658.167 (2)\", \"tab\": \"General information\", \"score\": \"329.08360128617363\"}",
+              "Philosophy - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"high_school_world_history\"",
+              "subject": "\"philosophy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_high_school_world_history\""
+              "groups": "\"mmlu_philosophy\""
             }
           }
         },
         {
-          "evaluation_name": "Human Sexuality",
+          "evaluation_name": "Professional Psychology",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -879,42 +775,54 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Human Sexuality",
+            "evaluation_description": "EM on Professional Psychology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.87,
+            "score": 0.843,
             "details": {
-              "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)",
+              "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)",
               "tab": "Accuracy",
-              "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.388, mean=0.388, max=0.388, sum=0.776 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.38789880863754206\"}",
-              "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=0.293, mean=0.293, max=0.293, sum=0.586 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2929920222013051\"}",
-              "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}",
-              "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Human Aging - # prompt tokens": "{\"description\": \"min=319.888, mean=319.888, max=319.888, sum=639.776 (2)\", \"tab\": \"General information\", \"score\": \"319.88789237668163\"}",
-              "Human Aging - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}",
-              "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Human Sexuality - # prompt tokens": "{\"description\": \"min=341.168, mean=341.168, max=341.168, sum=682.336 (2)\", \"tab\": \"General information\", \"score\": \"341.1679389312977\"}",
-              "Human Sexuality - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.553, mean=0.553, max=0.553, sum=1.106 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5529017465956071\"}",
+              "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.323, mean=0.323, max=0.323, sum=0.647 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32346555189038\"}",
+              "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.372, mean=0.372, max=0.372, sum=0.743 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3715069820859131\"}",
+              "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.315, mean=0.315, max=0.315, sum=0.63 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3151663907992294\"}",
+              "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}",
+              "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Medicine - # prompt tokens": "{\"description\": \"min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)\", \"tab\": \"General information\", \"score\": \"1094.4889705882354\"}",
+              "Professional Medicine - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}",
+              "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Accounting - # prompt tokens": "{\"description\": \"min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)\", \"tab\": \"General information\", \"score\": \"658.5851063829788\"}",
+              "Professional Accounting - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}",
+              "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Law - # prompt tokens": "{\"description\": \"min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)\", \"tab\": \"General information\", \"score\": \"1637.6010430247718\"}",
+              "Professional Law - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}",
+              "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Psychology - # prompt tokens": "{\"description\": \"min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)\", \"tab\": \"General information\", \"score\": \"575.0980392156863\"}",
+              "Professional Psychology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"human_sexuality\"",
+              "subject": "\"professional_psychology\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_human_sexuality\""
+              "groups": "\"mmlu_professional_psychology\""
             }
           }
         },
         {
-          "evaluation_name": "International Law",
+          "evaluation_name": "Us Foreign Policy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -923,36 +831,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on International Law",
+            "evaluation_description": "EM on Us Foreign Policy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.934,
+            "score": 0.93,
             "details": {
-              "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)",
+              "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)",
               "tab": "Accuracy",
-              "International Law - Observed inference time (s)": "{\"description\": \"min=0.342, mean=0.342, max=0.342, sum=0.685 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.34241620962284813\"}",
-              "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}",
-              "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "International Law - # prompt tokens": "{\"description\": \"min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)\", \"tab\": \"General information\", \"score\": \"639.8181818181819\"}",
-              "International Law - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.507, mean=0.507, max=0.507, sum=1.014 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5069083476066589\"}",
+              "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=422.79, mean=422.79, max=422.79, sum=845.58 (2)\", \"tab\": \"General information\", \"score\": \"422.79\"}",
+              "Us Foreign Policy - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"international_law\"",
+              "subject": "\"us_foreign_policy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_international_law\""
+              "groups": "\"mmlu_us_foreign_policy\""
             }
           }
         },
         {
-          "evaluation_name": "Logical Fallacies",
+          "evaluation_name": "Astronomy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -961,36 +869,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Logical Fallacies",
+            "evaluation_description": "EM on Astronomy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.834,
+            "score": 0.921,
             "details": {
-              "description": "min=0.834, mean=0.834, max=0.834, sum=1.669 (2)",
+              "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)",
               "tab": "Accuracy",
-              "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.282, mean=0.282, max=0.282, sum=0.565 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.28232605325663745\"}",
-              "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}",
-              "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Logical Fallacies - # prompt tokens": "{\"description\": \"min=449.564, mean=449.564, max=449.564, sum=899.129 (2)\", \"tab\": \"General information\", \"score\": \"449.5644171779141\"}",
-              "Logical Fallacies - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.332, mean=0.332, max=0.332, sum=0.665 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3323579352152975\"}",
+              "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}",
+              "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Astronomy - # prompt tokens": "{\"description\": \"min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)\", \"tab\": \"General information\", \"score\": \"579.6842105263158\"}",
+              "Astronomy - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"logical_fallacies\"",
+              "subject": "\"astronomy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_logical_fallacies\""
+              "groups": "\"mmlu_astronomy\""
             }
           }
         },
         {
-          "evaluation_name": "Machine Learning",
+          "evaluation_name": "Business Ethics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -999,36 +907,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Machine Learning",
+            "evaluation_description": "EM on Business Ethics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.688,
+            "score": 0.76,
             "details": {
-              "description": "min=0.688, mean=0.688, max=0.688, sum=1.375 (2)",
+              "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)",
               "tab": "Accuracy",
-              "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.338, mean=0.338, max=0.338, sum=0.676 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.33782388057027546\"}",
-              "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}",
-              "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Machine Learning - # prompt tokens": "{\"description\": \"min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)\", \"tab\": \"General information\", \"score\": \"668.0535714285714\"}",
-              "Machine Learning - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.291, mean=0.291, max=0.291, sum=0.581 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29072295665740966\"}",
+              "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Business Ethics - # prompt tokens": "{\"description\": \"min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)\", \"tab\": \"General information\", \"score\": \"569.52\"}",
+              "Business Ethics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"machine_learning\"",
+              "subject": "\"business_ethics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_machine_learning\""
+              "groups": "\"mmlu_business_ethics\""
             }
           }
         },
         {
-          "evaluation_name": "Management",
+          "evaluation_name": "Clinical Knowledge",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1037,36 +945,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Management",
+            "evaluation_description": "EM on Clinical Knowledge",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.913,
+            "score": 0.845,
             "details": {
-              "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)",
+              "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)",
               "tab": "Accuracy",
-              "Management - Observed inference time (s)": "{\"description\": \"min=0.285, mean=0.285, max=0.285, sum=0.571 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2853238027072647\"}",
-              "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}",
-              "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Management - # prompt tokens": "{\"description\": \"min=283.786, mean=283.786, max=283.786, sum=567.573 (2)\", \"tab\": \"General information\", \"score\": \"283.7864077669903\"}",
-              "Management - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.29, mean=0.29, max=0.29, sum=0.579 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2897273891376999\"}",
+              "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}",
+              "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=397.928, mean=397.928, max=397.928, sum=795.857 (2)\", \"tab\": \"General information\", \"score\": \"397.92830188679244\"}",
+              "Clinical Knowledge - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"management\"",
+              "subject": "\"clinical_knowledge\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_management\""
+              "groups": "\"mmlu_clinical_knowledge\""
             }
           }
         },
         {
-          "evaluation_name": "Marketing",
+          "evaluation_name": "Conceptual Physics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1075,36 +983,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Marketing",
+            "evaluation_description": "EM on Conceptual Physics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.944,
+            "score": 0.826,
             "details": {
-              "description": "min=0.944, mean=0.944, max=0.944, sum=1.889 (2)",
+              "description": "min=0.826, mean=0.826, max=0.826, sum=1.651 (2)",
               "tab": "Accuracy",
-              "Marketing - Observed inference time (s)": "{\"description\": \"min=0.28, mean=0.28, max=0.28, sum=0.561 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.28032574796269083\"}",
-              "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}",
-              "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Marketing - # prompt tokens": "{\"description\": \"min=404.218, mean=404.218, max=404.218, sum=808.436 (2)\", \"tab\": \"General information\", \"score\": \"404.21794871794873\"}",
-              "Marketing - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.279, mean=0.279, max=0.279, sum=0.559 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2794749209221373\"}",
+              "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}",
+              "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Conceptual Physics - # prompt tokens": "{\"description\": \"min=304.834, mean=304.834, max=304.834, sum=609.668 (2)\", \"tab\": \"General information\", \"score\": \"304.83404255319147\"}",
+              "Conceptual Physics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"marketing\"",
+              "subject": "\"conceptual_physics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_marketing\""
+              "groups": "\"mmlu_conceptual_physics\""
             }
           }
         },
         {
-          "evaluation_name": "Medical Genetics",
+          "evaluation_name": "Electrical Engineering",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1113,36 +1021,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Medical Genetics",
+            "evaluation_description": "EM on Electrical Engineering",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.92,
+            "score": 0.759,
             "details": {
-              "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)",
+              "description": "min=0.759, mean=0.759, max=0.759, sum=1.517 (2)",
               "tab": "Accuracy",
-              "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.296, mean=0.296, max=0.296, sum=0.592 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29611136198043825\"}",
-              "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Medical Genetics - # prompt tokens": "{\"description\": \"min=340.99, mean=340.99, max=340.99, sum=681.98 (2)\", \"tab\": \"General information\", \"score\": \"340.99\"}",
-              "Medical Genetics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.256, mean=0.256, max=0.256, sum=0.512 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2558267790695717\"}",
+              "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}",
+              "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Electrical Engineering - # prompt tokens": "{\"description\": \"min=435.607, mean=435.607, max=435.607, sum=871.214 (2)\", \"tab\": \"General information\", \"score\": \"435.60689655172416\"}",
+              "Electrical Engineering - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"medical_genetics\"",
+              "subject": "\"electrical_engineering\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_medical_genetics\""
+              "groups": "\"mmlu_electrical_engineering\""
             }
           }
         },
         {
-          "evaluation_name": "Miscellaneous",
+          "evaluation_name": "Elementary Mathematics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1151,36 +1059,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Miscellaneous",
+            "evaluation_description": "EM on Elementary Mathematics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.913,
+            "score": 0.688,
             "details": {
-              "description": "min=0.913, mean=0.913, max=0.913, sum=1.826 (2)",
+              "description": "min=0.688, mean=0.688, max=0.688, sum=1.376 (2)",
               "tab": "Accuracy",
-              "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.324, mean=0.324, max=0.324, sum=0.647 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3237126984967735\"}",
-              "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}",
-              "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Miscellaneous - # prompt tokens": "{\"description\": \"min=299.911, mean=299.911, max=299.911, sum=599.821 (2)\", \"tab\": \"General information\", \"score\": \"299.9106002554278\"}",
-              "Miscellaneous - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.308, mean=0.308, max=0.308, sum=0.617 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.30840403945357714\"}",
+              "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}",
+              "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)\", \"tab\": \"General information\", \"score\": \"531.8544973544973\"}",
+              "Elementary Mathematics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"miscellaneous\"",
+              "subject": "\"elementary_mathematics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_miscellaneous\""
+              "groups": "\"mmlu_elementary_mathematics\""
             }
           }
         },
         {
-          "evaluation_name": "Moral Scenarios",
+          "evaluation_name": "Formal Logic",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1189,42 +1097,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Moral Scenarios",
+            "evaluation_description": "EM on Formal Logic",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.841,
+            "score": 0.683,
             "details": {
-              "description": "min=0.841, mean=0.841, max=0.841, sum=1.683 (2)",
+              "description": "min=0.683, mean=0.683, max=0.683, sum=1.365 (2)",
               "tab": "Accuracy",
-              "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.29, mean=0.29, max=0.29, sum=0.58 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2901734975032035\"}",
-              "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.506, mean=0.506, max=0.506, sum=1.012 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5058047955262595\"}",
-              "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}",
-              "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Moral Disputes - # prompt tokens": "{\"description\": \"min=476.113, mean=476.113, max=476.113, sum=952.225 (2)\", \"tab\": \"General information\", \"score\": \"476.1127167630058\"}",
-              "Moral Disputes - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}",
-              "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Moral Scenarios - # prompt tokens": "{\"description\": \"min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)\", \"tab\": \"General information\", \"score\": \"656.454748603352\"}",
-              "Moral Scenarios - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.304, mean=0.304, max=0.304, sum=0.609 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.30448357074979754\"}",
+              "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}",
+              "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Formal Logic - # prompt tokens": "{\"description\": \"min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)\", \"tab\": \"General information\", \"score\": \"601.7777777777778\"}",
+              "Formal Logic - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"moral_scenarios\"",
+              "subject": "\"formal_logic\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_moral_scenarios\""
+              "groups": "\"mmlu_formal_logic\""
             }
           }
         },
         {
-          "evaluation_name": "Nutrition",
+          "evaluation_name": "High School World History",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1233,36 +1135,114 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Nutrition",
+            "evaluation_description": "EM on High School World History",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.889,
+            "score": 0.941,
             "details": {
-              "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)",
+              "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)",
               "tab": "Accuracy",
-              "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.641 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32064209264867444\"}",
-              "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}",
-              "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Nutrition - # prompt tokens": "{\"description\": \"min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)\", \"tab\": \"General information\", \"score\": \"586.8137254901961\"}",
-              "Nutrition - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.309, mean=0.309, max=0.309, sum=0.619 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3094667688492806\"}",
+              "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.294, mean=0.294, max=0.294, sum=0.588 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29394797386207017\"}",
+              "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.301, mean=0.301, max=0.301, sum=0.602 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.30106969356536867\"}",
+              "High School European History - Observed inference time (s)": "{\"description\": \"min=0.48, mean=0.48, max=0.48, sum=0.96 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4799844944115841\"}",
+              "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.297, mean=0.297, max=0.297, sum=0.595 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29747620014229204\"}",
+              "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.291, mean=0.291, max=0.291, sum=0.583 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2914604300662026\"}",
+              "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.279, mean=0.279, max=0.279, sum=0.557 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.27857950650728663\"}",
+              "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.312, mean=0.312, max=0.312, sum=0.625 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3123831342767786\"}",
+              "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.302, mean=0.302, max=0.302, sum=0.603 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.30159517997453195\"}",
+              "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.322, mean=0.322, max=0.322, sum=0.643 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32152655108874995\"}",
+              "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.29, mean=0.29, max=0.29, sum=0.581 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2903494253071076\"}",
+              "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.333, mean=0.333, max=0.333, sum=0.667 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.33328031720938506\"}",
+              "High School US History - Observed inference time (s)": "{\"description\": \"min=0.394, mean=0.394, max=0.394, sum=0.788 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39396579826579375\"}",
+              "High School World History - Observed inference time (s)": "{\"description\": \"min=0.679, mean=0.679, max=0.679, sum=1.359 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6793377369265013\"}",
+              "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}",
+              "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Biology - # prompt tokens": "{\"description\": \"min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)\", \"tab\": \"General information\", \"score\": \"513.6709677419354\"}",
+              "High School Biology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}",
+              "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Chemistry - # prompt tokens": "{\"description\": \"min=496.704, mean=496.704, max=496.704, sum=993.409 (2)\", \"tab\": \"General information\", \"score\": \"496.70443349753697\"}",
+              "High School Chemistry - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Computer Science - # prompt tokens": "{\"description\": \"min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)\", \"tab\": \"General information\", \"score\": \"867.78\"}",
+              "High School Computer Science - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}",
+              "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School European History - # prompt tokens": "{\"description\": \"min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)\", \"tab\": \"General information\", \"score\": \"2797.8848484848486\"}",
+              "High School European History - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}",
+              "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Geography - # prompt tokens": "{\"description\": \"min=372.035, mean=372.035, max=372.035, sum=744.071 (2)\", \"tab\": \"General information\", \"score\": \"372.0353535353535\"}",
+              "High School Geography - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}",
+              "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Government And Politics - # prompt tokens": "{\"description\": \"min=465.824, mean=465.824, max=465.824, sum=931.648 (2)\", \"tab\": \"General information\", \"score\": \"465.8238341968912\"}",
+              "High School Government And Politics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}",
+              "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=370.908, mean=370.908, max=370.908, sum=741.815 (2)\", \"tab\": \"General information\", \"score\": \"370.9076923076923\"}",
+              "High School Macroeconomics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}",
+              "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Mathematics - # prompt tokens": "{\"description\": \"min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)\", \"tab\": \"General information\", \"score\": \"532.3555555555556\"}",
+              "High School Mathematics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}",
+              "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Microeconomics - # prompt tokens": "{\"description\": \"min=399.013, mean=399.013, max=399.013, sum=798.025 (2)\", \"tab\": \"General information\", \"score\": \"399.0126050420168\"}",
+              "High School Microeconomics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}",
+              "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Physics - # prompt tokens": "{\"description\": \"min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)\", \"tab\": \"General information\", \"score\": \"560.4569536423841\"}",
+              "High School Physics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}",
+              "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Psychology - # prompt tokens": "{\"description\": \"min=495.242, mean=495.242, max=495.242, sum=990.484 (2)\", \"tab\": \"General information\", \"score\": \"495.2422018348624\"}",
+              "High School Psychology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}",
+              "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Statistics - # prompt tokens": "{\"description\": \"min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)\", \"tab\": \"General information\", \"score\": \"795.6388888888889\"}",
+              "High School Statistics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}",
+              "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School US History - # prompt tokens": "{\"description\": \"min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)\", \"tab\": \"General information\", \"score\": \"2217.8088235294117\"}",
+              "High School US History - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}",
+              "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School World History - # prompt tokens": "{\"description\": \"min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)\", \"tab\": \"General information\", \"score\": \"1428.1729957805908\"}",
+              "High School World History - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"nutrition\"",
+              "subject": "\"high_school_world_history\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_nutrition\""
+              "groups": "\"mmlu_high_school_world_history\""
             }
           }
         },
         {
-          "evaluation_name": "Prehistory",
+          "evaluation_name": "Human Sexuality",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1271,36 +1251,42 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Prehistory",
+            "evaluation_description": "EM on Human Sexuality",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.886,
+            "score": 0.87,
             "details": {
-              "description": "min=0.886, mean=0.886, max=0.886, sum=1.772 (2)",
+              "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)",
               "tab": "Accuracy",
-              "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.614, mean=0.614, max=0.614, sum=1.227 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6136744522754057\"}",
-              "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}",
-              "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Prehistory - # prompt tokens": "{\"description\": \"min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)\", \"tab\": \"General information\", \"score\": \"514.5277777777778\"}",
-              "Prehistory - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.388, mean=0.388, max=0.388, sum=0.776 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.38789880863754206\"}",
+              "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=0.293, mean=0.293, max=0.293, sum=0.586 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2929920222013051\"}",
+              "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}",
+              "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Human Aging - # prompt tokens": "{\"description\": \"min=319.888, mean=319.888, max=319.888, sum=639.776 (2)\", \"tab\": \"General information\", \"score\": \"319.88789237668163\"}",
+              "Human Aging - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}",
+              "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Human Sexuality - # prompt tokens": "{\"description\": \"min=341.168, mean=341.168, max=341.168, sum=682.336 (2)\", \"tab\": \"General information\", \"score\": \"341.1679389312977\"}",
+              "Human Sexuality - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"prehistory\"",
+              "subject": "\"human_sexuality\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_prehistory\""
+              "groups": "\"mmlu_human_sexuality\""
             }
           }
         },
         {
-          "evaluation_name": "Public Relations",
+          "evaluation_name": "International Law",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1309,36 +1295,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Public Relations",
+            "evaluation_description": "EM on International Law",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.718,
+            "score": 0.934,
             "details": {
-              "description": "min=0.718, mean=0.718, max=0.718, sum=1.436 (2)",
+              "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)",
               "tab": "Accuracy",
-              "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.3, mean=0.3, max=0.3, sum=0.599 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29952496832067316\"}",
-              "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}",
-              "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Public Relations - # prompt tokens": "{\"description\": \"min=405.318, mean=405.318, max=405.318, sum=810.636 (2)\", \"tab\": \"General information\", \"score\": \"405.3181818181818\"}",
-              "Public Relations - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "International Law - Observed inference time (s)": "{\"description\": \"min=0.342, mean=0.342, max=0.342, sum=0.685 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.34241620962284813\"}",
+              "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}",
+              "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "International Law - # prompt tokens": "{\"description\": \"min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)\", \"tab\": \"General information\", \"score\": \"639.8181818181819\"}",
+              "International Law - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"public_relations\"",
+              "subject": "\"international_law\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_public_relations\""
+              "groups": "\"mmlu_international_law\""
             }
           }
         },
         {
-          "evaluation_name": "Security Studies",
+          "evaluation_name": "Logical Fallacies",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1347,36 +1333,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Security Studies",
+            "evaluation_description": "EM on Logical Fallacies",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.853,
+            "score": 0.834,
             "details": {
-              "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)",
+              "description": "min=0.834, mean=0.834, max=0.834, sum=1.669 (2)",
               "tab": "Accuracy",
-              "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.348, mean=0.348, max=0.348, sum=0.697 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.348436891789339\"}",
-              "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}",
-              "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Security Studies - # prompt tokens": "{\"description\": \"min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)\", \"tab\": \"General information\", \"score\": \"1164.4734693877551\"}",
-              "Security Studies - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.282, mean=0.282, max=0.282, sum=0.565 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.28232605325663745\"}",
+              "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}",
+              "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Logical Fallacies - # prompt tokens": "{\"description\": \"min=449.564, mean=449.564, max=449.564, sum=899.129 (2)\", \"tab\": \"General information\", \"score\": \"449.5644171779141\"}",
+              "Logical Fallacies - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"security_studies\"",
+              "subject": "\"logical_fallacies\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_security_studies\""
+              "groups": "\"mmlu_logical_fallacies\""
             }
           }
         },
         {
-          "evaluation_name": "Sociology",
+          "evaluation_name": "Machine Learning",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1385,36 +1371,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Sociology",
+            "evaluation_description": "EM on Machine Learning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.92,
+            "score": 0.688,
             "details": {
-              "description": "min=0.92, mean=0.92, max=0.92, sum=1.841 (2)",
+              "description": "min=0.688, mean=0.688, max=0.688, sum=1.375 (2)",
               "tab": "Accuracy",
-              "Sociology - Observed inference time (s)": "{\"description\": \"min=0.297, mean=0.297, max=0.297, sum=0.595 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29732529915387357\"}",
-              "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}",
-              "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Sociology - # prompt tokens": "{\"description\": \"min=445.517, mean=445.517, max=445.517, sum=891.035 (2)\", \"tab\": \"General information\", \"score\": \"445.51741293532336\"}",
-              "Sociology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.338, mean=0.338, max=0.338, sum=0.676 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.33782388057027546\"}",
+              "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}",
+              "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Machine Learning - # prompt tokens": "{\"description\": \"min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)\", \"tab\": \"General information\", \"score\": \"668.0535714285714\"}",
+              "Machine Learning - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"sociology\"",
+              "subject": "\"machine_learning\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_sociology\""
+              "groups": "\"mmlu_machine_learning\""
             }
           }
         },
         {
-          "evaluation_name": "Virology",
+          "evaluation_name": "Management",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1423,36 +1409,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Virology",
+            "evaluation_description": "EM on Management",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.584,
+            "score": 0.913,
             "details": {
-              "description": "min=0.584, mean=0.584, max=0.584, sum=1.169 (2)",
+              "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)",
               "tab": "Accuracy",
-              "Virology - Observed inference time (s)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.642 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32124968609177923\"}",
-              "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}",
-              "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Virology - # prompt tokens": "{\"description\": \"min=343.018, mean=343.018, max=343.018, sum=686.036 (2)\", \"tab\": \"General information\", \"score\": \"343.01807228915663\"}",
-              "Virology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Management - Observed inference time (s)": "{\"description\": \"min=0.285, mean=0.285, max=0.285, sum=0.571 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2853238027072647\"}",
+              "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}",
+              "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Management - # prompt tokens": "{\"description\": \"min=283.786, mean=283.786, max=283.786, sum=567.573 (2)\", \"tab\": \"General information\", \"score\": \"283.7864077669903\"}",
+              "Management - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"virology\"",
+              "subject": "\"management\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_virology\""
+              "groups": "\"mmlu_management\""
             }
           }
         },
         {
-          "evaluation_name": "World Religions",
+          "evaluation_name": "Marketing",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1461,36 +1447,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on World Religions",
+            "evaluation_description": "EM on Marketing",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.901,
+            "score": 0.944,
             "details": {
-              "description": "min=0.901, mean=0.901, max=0.901, sum=1.801 (2)",
+              "description": "min=0.944, mean=0.944, max=0.944, sum=1.889 (2)",
               "tab": "Accuracy",
-              "World Religions - Observed inference time (s)": "{\"description\": \"min=0.277, mean=0.277, max=0.277, sum=0.554 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.27723441068191973\"}",
-              "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}",
-              "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "World Religions - # prompt tokens": "{\"description\": \"min=274.52, mean=274.52, max=274.52, sum=549.041 (2)\", \"tab\": \"General information\", \"score\": \"274.5204678362573\"}",
-              "World Religions - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Marketing - Observed inference time (s)": "{\"description\": \"min=0.28, mean=0.28, max=0.28, sum=0.561 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.28032574796269083\"}",
+              "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}",
+              "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Marketing - # prompt tokens": "{\"description\": \"min=404.218, mean=404.218, max=404.218, sum=808.436 (2)\", \"tab\": \"General information\", \"score\": \"404.21794871794873\"}",
+              "Marketing - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"world_religions\"",
+              "subject": "\"marketing\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_world_religions\""
+              "groups": "\"mmlu_marketing\""
             }
           }
         },
         {
-          "evaluation_name": "Mean win rate",
+          "evaluation_name": "Medical Genetics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1499,404 +1485,418 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "How many models this model outperforms on average (over columns).",
+            "evaluation_description": "EM on Medical Genetics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.773,
+            "score": 0.92,
             "details": {
-              "description": "",
-              "tab": "Efficiency"
+              "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)",
+              "tab": "Accuracy",
+              "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.296, mean=0.296, max=0.296, sum=0.592 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29611136198043825\"}",
+              "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Medical Genetics - # prompt tokens": "{\"description\": \"min=340.99, mean=340.99, max=340.99, sum=681.98 (2)\", \"tab\": \"General information\", \"score\": \"340.99\"}",
+              "Medical Genetics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"medical_genetics\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_medical_genetics\""
+            }
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": {
-        "additional_details": {
-          "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
-          "method": "\"multiple_choice_joint\"",
-          "eval_split": "\"test\"",
-          "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
-        }
-      }
-    },
-    {
-      "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1774096306.427425",
-      "retrieved_timestamp": "1774096306.427425",
-      "source_metadata": {
-        "source_name": "helm_lite",
-        "source_type": "documentation",
-        "source_organization_name": "crfm",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "helm",
-        "version": "unknown"
-      },
-      "benchmark": "helm_lite",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Mean win rate",
+          "evaluation_name": "Miscellaneous",
           "source_data": {
-            "dataset_name": "helm_lite",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "How many models this model outperforms on average (over columns).",
+            "evaluation_description": "EM on Miscellaneous",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.819,
+            "score": 0.913,
             "details": {
-              "description": "",
+              "description": "min=0.913, mean=0.913, max=0.913, sum=1.826 (2)",
               "tab": "Accuracy",
-              "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.5839825218476904\"}",
-              "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}"
+              "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.324, mean=0.324, max=0.324, sum=0.647 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3237126984967735\"}",
+              "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}",
+              "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Miscellaneous - # prompt tokens": "{\"description\": \"min=299.911, mean=299.911, max=299.911, sum=599.821 (2)\", \"tab\": \"General information\", \"score\": \"299.9106002554278\"}",
+              "Miscellaneous - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"miscellaneous\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_miscellaneous\""
+            }
           }
         },
         {
-          "evaluation_name": "NarrativeQA",
+          "evaluation_name": "Moral Scenarios",
           "source_data": {
-            "dataset_name": "NarrativeQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "F1 on NarrativeQA",
+            "evaluation_description": "EM on Moral Scenarios",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.777,
+            "score": 0.841,
             "details": {
-              "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)",
+              "description": "min=0.841, mean=0.841, max=0.841, sum=1.683 (2)",
               "tab": "Accuracy",
-              "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.83, mean=0.83, max=0.83, sum=0.83 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.8297326531208736\"}",
-              "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}",
-              "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NarrativeQA - # prompt tokens": "{\"description\": \"min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)\", \"tab\": \"General information\", \"score\": \"3484.2676056338028\"}",
-              "NarrativeQA - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.29, mean=0.29, max=0.29, sum=0.58 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2901734975032035\"}",
+              "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.506, mean=0.506, max=0.506, sum=1.012 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5058047955262595\"}",
+              "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}",
+              "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Moral Disputes - # prompt tokens": "{\"description\": \"min=476.113, mean=476.113, max=476.113, sum=952.225 (2)\", \"tab\": \"General information\", \"score\": \"476.1127167630058\"}",
+              "Moral Disputes - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}",
+              "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Moral Scenarios - # prompt tokens": "{\"description\": \"min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)\", \"tab\": \"General information\", \"score\": \"656.454748603352\"}",
+              "Moral Scenarios - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"moral_scenarios\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_moral_scenarios\""
+            }
           }
         },
         {
-          "evaluation_name": "NaturalQuestions (closed-book)",
+          "evaluation_name": "Nutrition",
           "source_data": {
-            "dataset_name": "NaturalQuestions (closed-book)",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "F1 on NaturalQuestions (closed-book)",
+            "evaluation_description": "EM on Nutrition",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.457,
+            "score": 0.889,
             "details": {
-              "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)",
+              "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)",
               "tab": "Accuracy",
-              "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=1.111, mean=1.111, max=1.111, sum=1.111 (1)\", \"tab\": \"Efficiency\", \"score\": \"1.110703297138214\"}",
-              "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.422, mean=0.422, max=0.422, sum=0.422 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4218848171234131\"}",
-              "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "NaturalQuestions (open-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1716.785, mean=1716.785, max=1716.785, sum=1716.785 (1)\", \"tab\": \"General information\", \"score\": \"1716.785\"}",
-              "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=129.12, mean=129.12, max=129.12, sum=129.12 (1)\", \"tab\": \"General information\", \"score\": \"129.12\"}",
-              "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.641 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32064209264867444\"}",
+              "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}",
+              "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Nutrition - # prompt tokens": "{\"description\": \"min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)\", \"tab\": \"General information\", \"score\": \"586.8137254901961\"}",
+              "Nutrition - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "mode": "\"closedbook\""
+              "subject": "\"nutrition\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_nutrition\""
             }
           }
         },
         {
-          "evaluation_name": "OpenbookQA",
+          "evaluation_name": "Prehistory",
           "source_data": {
-            "dataset_name": "OpenbookQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on OpenbookQA",
+            "evaluation_description": "EM on Prehistory",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.942,
+            "score": 0.886,
             "details": {
-              "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)",
+              "description": "min=0.886, mean=0.886, max=0.886, sum=1.772 (2)",
               "tab": "Accuracy",
-              "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.285, mean=0.285, max=0.285, sum=0.285 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.28476666021347047\"}",
-              "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}",
-              "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "OpenbookQA - # prompt tokens": "{\"description\": \"min=249.776, mean=249.776, max=249.776, sum=249.776 (1)\", \"tab\": \"General information\", \"score\": \"249.776\"}",
-              "OpenbookQA - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.614, mean=0.614, max=0.614, sum=1.227 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6136744522754057\"}",
+              "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}",
+              "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Prehistory - # prompt tokens": "{\"description\": \"min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)\", \"tab\": \"General information\", \"score\": \"514.5277777777778\"}",
+              "Prehistory - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "dataset": "\"openbookqa\"",
-              "method": "\"multiple_choice_joint\""
+              "subject": "\"prehistory\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_prehistory\""
             }
           }
         },
         {
-          "evaluation_name": "MMLU",
+          "evaluation_name": "Public Relations",
           "source_data": {
-            "dataset_name": "MMLU",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MMLU",
+            "evaluation_description": "EM on Public Relations",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.703,
+            "score": 0.718,
             "details": {
-              "description": "min=0.52, mean=0.703, max=0.93, sum=3.514 (5)",
+              "description": "min=0.718, mean=0.718, max=0.718, sum=1.436 (2)",
               "tab": "Accuracy",
-              "MMLU - Observed inference time (s)": "{\"description\": \"min=0.266, mean=0.798, max=2.612, sum=3.992 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.7984467656654225\"}",
-              "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}",
-              "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MMLU - # prompt tokens": "{\"description\": \"min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)\", \"tab\": \"General information\", \"score\": \"467.6862105263158\"}",
-              "MMLU - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.3, mean=0.3, max=0.3, sum=0.599 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29952496832067316\"}",
+              "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}",
+              "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Public Relations - # prompt tokens": "{\"description\": \"min=405.318, mean=405.318, max=405.318, sum=810.636 (2)\", \"tab\": \"General information\", \"score\": \"405.3181818181818\"}",
+              "Public Relations - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]",
-              "method": "\"multiple_choice_joint\""
+              "subject": "\"public_relations\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_public_relations\""
             }
           }
         },
         {
-          "evaluation_name": "MATH",
+          "evaluation_name": "Security Studies",
           "source_data": {
-            "dataset_name": "MATH",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "Equivalent (CoT) on MATH",
+            "evaluation_description": "EM on Security Studies",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.791,
+            "score": 0.853,
             "details": {
-              "description": "min=0.579, mean=0.791, max=0.978, sum=5.54 (7)",
+              "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)",
               "tab": "Accuracy",
-              "MATH - Observed inference time (s)": "{\"description\": \"min=4.64, mean=5.739, max=6.652, sum=40.174 (7)\", \"tab\": \"Efficiency\", \"score\": \"5.739186799526185\"}",
-              "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}",
-              "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}",
-              "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MATH - # prompt tokens": "{\"description\": \"min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)\", \"tab\": \"General information\", \"score\": \"1262.9092130545007\"}",
-              "MATH - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.348, mean=0.348, max=0.348, sum=0.697 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.348436891789339\"}",
+              "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}",
+              "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Security Studies - # prompt tokens": "{\"description\": \"min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)\", \"tab\": \"General information\", \"score\": \"1164.4734693877551\"}",
+              "Security Studies - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]",
-              "level": "\"1\"",
-              "use_official_examples": "\"False\"",
-              "use_chain_of_thought": "\"True\""
+              "subject": "\"security_studies\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_security_studies\""
             }
           }
         },
         {
-          "evaluation_name": "GSM8K",
+          "evaluation_name": "Sociology",
           "source_data": {
-            "dataset_name": "GSM8K",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on GSM8K",
+            "evaluation_description": "EM on Sociology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.936,
+            "score": 0.92,
             "details": {
-              "description": "min=0.936, mean=0.936, max=0.936, sum=0.936 (1)",
+              "description": "min=0.92, mean=0.92, max=0.92, sum=1.841 (2)",
               "tab": "Accuracy",
-              "GSM8K - Observed inference time (s)": "{\"description\": \"min=2.889, mean=2.889, max=2.889, sum=2.889 (1)\", \"tab\": \"Efficiency\", \"score\": \"2.8894128675460817\"}",
-              "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "GSM8K - # prompt tokens": "{\"description\": \"min=959.032, mean=959.032, max=959.032, sum=959.032 (1)\", \"tab\": \"General information\", \"score\": \"959.032\"}",
-              "GSM8K - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Sociology - Observed inference time (s)": "{\"description\": \"min=0.297, mean=0.297, max=0.297, sum=0.595 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29732529915387357\"}",
+              "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}",
+              "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Sociology - # prompt tokens": "{\"description\": \"min=445.517, mean=445.517, max=445.517, sum=891.035 (2)\", \"tab\": \"General information\", \"score\": \"445.51741293532336\"}",
+              "Sociology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "stop": "\"none\""
+              "subject": "\"sociology\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_sociology\""
             }
           }
         },
         {
-          "evaluation_name": "LegalBench",
+          "evaluation_name": "Virology",
           "source_data": {
-            "dataset_name": "LegalBench",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on LegalBench",
+            "evaluation_description": "EM on Virology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.68,
+            "score": 0.584,
             "details": {
-              "description": "min=0.438, mean=0.68, max=0.989, sum=3.398 (5)",
+              "description": "min=0.584, mean=0.584, max=0.584, sum=1.169 (2)",
               "tab": "Accuracy",
-              "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.284, mean=0.478, max=1.152, sum=2.389 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.47773526830658064\"}",
-              "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}",
-              "LegalBench - # train": "{\"description\": \"min=4, mean=4.8, max=5, sum=24 (5)\", \"tab\": \"General information\", \"score\": \"4.8\"}",
-              "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "LegalBench - # prompt tokens": "{\"description\": \"min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)\", \"tab\": \"General information\", \"score\": \"1513.8824197238912\"}",
-              "LegalBench - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "Virology - Observed inference time (s)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.642 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32124968609177923\"}",
+              "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}",
+              "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Virology - # prompt tokens": "{\"description\": \"min=343.018, mean=343.018, max=343.018, sum=686.036 (2)\", \"tab\": \"General information\", \"score\": \"343.01807228915663\"}",
+              "Virology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]"
+              "subject": "\"virology\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_virology\""
             }
           }
         },
         {
-          "evaluation_name": "MedQA",
+          "evaluation_name": "World Religions",
           "source_data": {
-            "dataset_name": "MedQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MedQA",
+            "evaluation_description": "EM on World Religions",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.769,
+            "score": 0.901,
             "details": {
-              "description": "min=0.769, mean=0.769, max=0.769, sum=0.769 (1)",
+              "description": "min=0.901, mean=0.901, max=0.901, sum=1.801 (2)",
               "tab": "Accuracy",
-              "MedQA - Observed inference time (s)": "{\"description\": \"min=0.318, mean=0.318, max=0.318, sum=0.318 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.3180293652930743\"}",
-              "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}",
-              "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MedQA - # prompt tokens": "{\"description\": \"min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)\", \"tab\": \"General information\", \"score\": \"1025.2743538767395\"}",
-              "MedQA - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "World Religions - Observed inference time (s)": "{\"description\": \"min=0.277, mean=0.277, max=0.277, sum=0.554 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.27723441068191973\"}",
+              "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}",
+              "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "World Religions - # prompt tokens": "{\"description\": \"min=274.52, mean=274.52, max=274.52, sum=549.041 (2)\", \"tab\": \"General information\", \"score\": \"274.5204678362573\"}",
+              "World Religions - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"world_religions\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_world_religions\""
+            }
           }
         },
         {
-          "evaluation_name": "WMT 2014",
+          "evaluation_name": "Mean win rate",
           "source_data": {
-            "dataset_name": "WMT 2014",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "BLEU-4 on WMT 2014",
+            "evaluation_description": "How many models this model outperforms on average (over columns).",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.224,
+            "score": 0.773,
             "details": {
-              "description": "min=0.182, mean=0.224, max=0.266, sum=1.121 (5)",
-              "tab": "Accuracy",
-              "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=0.737, mean=0.816, max=0.848, sum=4.078 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.8156762526912515\"}",
-              "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}",
-              "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "WMT 2014 - # prompt tokens": "{\"description\": \"min=101.139, mean=120.868, max=141.33, sum=604.34 (5)\", \"tab\": \"General information\", \"score\": \"120.86804366111025\"}",
-              "WMT 2014 - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}"
+              "description": "",
+              "tab": "Efficiency"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]"
-            }
+            "additional_details": {}
           }
         }
       ],
       "detailed_evaluation_results": null,
       "generation_config": {
-        "additional_details": {}
+        "additional_details": {
+          "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
+          "method": "\"multiple_choice_joint\"",
+          "eval_split": "\"test\"",
+          "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
+        }
       }
     }
   ]
diff --git a/data/models/minimax_minimax-m2.1.json b/data/models/minimax_minimax-m2.1.json
index 33905b0ffaa72b65b1d021a73faac651c1ebfecd..a5b957c9841bcbc5016fb37de174c988ce273877 100644
--- a/data/models/minimax_minimax-m2.1.json
+++ b/data/models/minimax_minimax-m2.1.json
@@ -4,13 +4,13 @@
     "id": "minimax/minimax-m2.1",
     "developer": "MiniMax",
     "additional_details": {
-      "agent_name": "Crux",
-      "agent_organization": "Roam"
+      "agent_name": "Terminus 2",
+      "agent_organization": "Terminal Bench"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/crux__minimax-m2.1/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__minimax-m2.1/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-22",
+          "evaluation_timestamp": "2025-12-23",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,7 +43,7 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 36.6,
+            "score": 29.2,
             "uncertainty": {
               "standard_error": {
                 "value": 2.9
@@ -53,7 +53,7 @@
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"MiniMax M2.1\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"MiniMax M2.1\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +70,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"MiniMax M2.1\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"MiniMax M2.1\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -84,7 +84,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__minimax-m2.1/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/crux__minimax-m2.1/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +108,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-23",
+          "evaluation_timestamp": "2025-12-22",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,7 +117,7 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 29.2,
+            "score": 36.6,
             "uncertainty": {
               "standard_error": {
                 "value": 2.9
@@ -127,7 +127,7 @@
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"MiniMax M2.1\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"MiniMax M2.1\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -144,7 +144,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"MiniMax M2.1\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"MiniMax M2.1\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/mistralai_mistral-7b-instruct-v0.3.json b/data/models/mistralai_mistral-7b-instruct-v0.3.json
index d35f23e1653c8321bbb94b0e748858ca165accef..0f7fe68f0b7a1a194dc76677de8b8b0fb7a1f4bb 100644
--- a/data/models/mistralai_mistral-7b-instruct-v0.3.json
+++ b/data/models/mistralai_mistral-7b-instruct-v0.3.json
@@ -236,10 +236,10 @@
       }
     },
     {
-      "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1774096312.00548",
-      "retrieved_timestamp": "1774096312.00548",
+      "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1774096306.427425",
+      "retrieved_timestamp": "1774096306.427425",
       "source_metadata": {
-        "source_name": "helm_mmlu",
+        "source_name": "helm_lite",
         "source_type": "documentation",
         "source_organization_name": "crfm",
         "evaluator_relationship": "third_party"
@@ -248,438 +248,382 @@
         "name": "helm",
         "version": "unknown"
       },
-      "benchmark": "helm_mmlu",
+      "benchmark": "helm_lite",
       "evaluation_results": [
         {
-          "evaluation_name": "MMLU All Subjects",
+          "evaluation_name": "Mean win rate",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "helm_lite",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MMLU All Subjects",
+            "evaluation_description": "How many models this model outperforms on average (over columns).",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.599,
+            "score": 0.196,
             "details": {
-              "description": "min=0.258, mean=0.599, max=0.881, sum=68.3 (114)",
+              "description": "",
               "tab": "Accuracy",
-              "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.212, mean=0.526, max=1.438, sum=59.959 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.525951832745908\"}",
-              "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}",
-              "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=317.924, mean=705.273, max=3098.109, sum=80401.178 (114)\", \"tab\": \"General information\", \"score\": \"705.2734899593811\"}",
-              "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.6493133583021223\"}",
+              "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Abstract Algebra",
+          "evaluation_name": "NarrativeQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "NarrativeQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Abstract Algebra",
+            "evaluation_description": "F1 on NarrativeQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.27,
+            "score": 0.716,
             "details": {
-              "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)",
+              "description": "min=0.716, mean=0.716, max=0.716, sum=0.716 (1)",
               "tab": "Accuracy",
-              "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.642 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32117165088653565\"}",
-              "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Abstract Algebra - # prompt tokens": "{\"description\": \"min=411.44, mean=411.44, max=411.44, sum=822.88 (2)\", \"tab\": \"General information\", \"score\": \"411.44\"}",
-              "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.813, mean=0.813, max=0.813, sum=0.813 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.8132137520212522\"}",
+              "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}",
+              "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NarrativeQA - # prompt tokens": "{\"description\": \"min=3924.33, mean=3924.33, max=3924.33, sum=3924.33 (1)\", \"tab\": \"General information\", \"score\": \"3924.3295774647886\"}",
+              "NarrativeQA - # output tokens": "{\"description\": \"min=7.107, mean=7.107, max=7.107, sum=7.107 (1)\", \"tab\": \"General information\", \"score\": \"7.107042253521127\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "\"abstract_algebra\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_abstract_algebra\""
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Anatomy",
+          "evaluation_name": "NaturalQuestions (closed-book)",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "NaturalQuestions (closed-book)",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Anatomy",
+            "evaluation_description": "F1 on NaturalQuestions (closed-book)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.585,
+            "score": 0.253,
             "details": {
-              "description": "min=0.585, mean=0.585, max=0.585, sum=1.17 (2)",
+              "description": "min=0.253, mean=0.253, max=0.253, sum=0.253 (1)",
               "tab": "Accuracy",
-              "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.246, mean=0.246, max=0.246, sum=0.493 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.24627229902479383\"}",
-              "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}",
-              "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Anatomy - # prompt tokens": "{\"description\": \"min=416.089, mean=416.089, max=416.089, sum=832.178 (2)\", \"tab\": \"General information\", \"score\": \"416.0888888888889\"}",
-              "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.563, mean=0.563, max=0.563, sum=0.563 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5634698050022126\"}",
+              "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.535, mean=0.535, max=0.535, sum=0.535 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5347676448822022\"}",
+              "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "NaturalQuestions (open-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=2498.79, mean=2498.79, max=2498.79, sum=2498.79 (1)\", \"tab\": \"General information\", \"score\": \"2498.79\"}",
+              "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=12.448, mean=12.448, max=12.448, sum=12.448 (1)\", \"tab\": \"General information\", \"score\": \"12.448\"}",
+              "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=172.069, mean=172.069, max=172.069, sum=172.069 (1)\", \"tab\": \"General information\", \"score\": \"172.069\"}",
+              "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=20.461, mean=20.461, max=20.461, sum=20.461 (1)\", \"tab\": \"General information\", \"score\": \"20.461\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"anatomy\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_anatomy\""
+              "mode": "\"closedbook\""
             }
           }
         },
         {
-          "evaluation_name": "College Physics",
+          "evaluation_name": "OpenbookQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "OpenbookQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on College Physics",
+            "evaluation_description": "EM on OpenbookQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.343,
+            "score": 0.79,
             "details": {
-              "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)",
+              "description": "min=0.79, mean=0.79, max=0.79, sum=0.79 (1)",
               "tab": "Accuracy",
-              "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.221, mean=0.221, max=0.221, sum=0.442 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.22099271774291993\"}",
-              "College Biology - Observed inference time (s)": "{\"description\": \"min=0.7, mean=0.7, max=0.7, sum=1.399 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6997380173868604\"}",
-              "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.466, mean=0.466, max=0.466, sum=0.932 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4661028146743774\"}",
-              "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.212, mean=0.212, max=0.212, sum=0.424 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.21210591793060302\"}",
-              "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.387, mean=0.387, max=0.387, sum=0.774 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3871537646806309\"}",
-              "College Physics - Observed inference time (s)": "{\"description\": \"min=0.455, mean=0.455, max=0.455, sum=0.91 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.45503536392660704\"}",
-              "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Chemistry - # prompt tokens": "{\"description\": \"min=636.71, mean=636.71, max=636.71, sum=1273.42 (2)\", \"tab\": \"General information\", \"score\": \"636.71\"}",
-              "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}",
-              "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Biology - # prompt tokens": "{\"description\": \"min=559.799, mean=559.799, max=559.799, sum=1119.597 (2)\", \"tab\": \"General information\", \"score\": \"559.7986111111111\"}",
-              "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Computer Science - # prompt tokens": "{\"description\": \"min=911.17, mean=911.17, max=911.17, sum=1822.34 (2)\", \"tab\": \"General information\", \"score\": \"911.17\"}",
-              "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Mathematics - # prompt tokens": "{\"description\": \"min=667.31, mean=667.31, max=667.31, sum=1334.62 (2)\", \"tab\": \"General information\", \"score\": \"667.31\"}",
-              "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}",
-              "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Medicine - # prompt tokens": "{\"description\": \"min=601.41, mean=601.41, max=601.41, sum=1202.821 (2)\", \"tab\": \"General information\", \"score\": \"601.4104046242775\"}",
-              "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}",
-              "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Physics - # prompt tokens": "{\"description\": \"min=560.029, mean=560.029, max=560.029, sum=1120.059 (2)\", \"tab\": \"General information\", \"score\": \"560.0294117647059\"}",
-              "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.256, mean=0.256, max=0.256, sum=0.256 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.25593132400512697\"}",
+              "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}",
+              "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "OpenbookQA - # prompt tokens": "{\"description\": \"min=289.15, mean=289.15, max=289.15, sum=289.15 (1)\", \"tab\": \"General information\", \"score\": \"289.15\"}",
+              "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"college_physics\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_college_physics\""
+              "dataset": "\"openbookqa\"",
+              "method": "\"multiple_choice_joint\""
             }
           }
         },
         {
-          "evaluation_name": "Computer Security",
+          "evaluation_name": "MMLU",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MMLU",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Computer Security",
+            "evaluation_description": "EM on MMLU",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7,
+            "score": 0.51,
             "details": {
-              "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)",
+              "description": "min=0.27, mean=0.51, max=0.79, sum=2.551 (5)",
               "tab": "Accuracy",
-              "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.426, mean=0.426, max=0.426, sum=0.853 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4263953256607056\"}",
-              "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Computer Security - # prompt tokens": "{\"description\": \"min=433.94, mean=433.94, max=433.94, sum=867.88 (2)\", \"tab\": \"General information\", \"score\": \"433.94\"}",
-              "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MMLU - Observed inference time (s)": "{\"description\": \"min=0.221, mean=0.372, max=0.487, sum=1.862 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.37230395750413864\"}",
+              "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}",
+              "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MMLU - # prompt tokens": "{\"description\": \"min=411.44, mean=532.091, max=696.175, sum=2660.455 (5)\", \"tab\": \"General information\", \"score\": \"532.0910877192983\"}",
+              "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"computer_security\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_computer_security\""
+              "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]",
+              "method": "\"multiple_choice_joint\""
             }
           }
         },
         {
-          "evaluation_name": "Econometrics",
+          "evaluation_name": "MATH",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MATH",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Econometrics",
+            "evaluation_description": "Equivalent (CoT) on MATH",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.421,
+            "score": 0.289,
             "details": {
-              "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)",
+              "description": "min=0.115, mean=0.289, max=0.477, sum=2.02 (7)",
               "tab": "Accuracy",
-              "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.406, mean=0.406, max=0.406, sum=0.813 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.406455958098696\"}",
-              "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}",
-              "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Econometrics - # prompt tokens": "{\"description\": \"min=696.175, mean=696.175, max=696.175, sum=1392.351 (2)\", \"tab\": \"General information\", \"score\": \"696.1754385964912\"}",
-              "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MATH - Observed inference time (s)": "{\"description\": \"min=2.027, mean=2.656, max=3.039, sum=18.593 (7)\", \"tab\": \"Efficiency\", \"score\": \"2.656151831465352\"}",
+              "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}",
+              "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}",
+              "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MATH - # prompt tokens": "{\"description\": \"min=991.615, mean=1455.266, max=2502.962, sum=10186.865 (7)\", \"tab\": \"General information\", \"score\": \"1455.2664139976257\"}",
+              "MATH - # output tokens": "{\"description\": \"min=123.616, mean=149.99, max=172.789, sum=1049.933 (7)\", \"tab\": \"General information\", \"score\": \"149.99043902740354\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"econometrics\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_econometrics\""
+              "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]",
+              "level": "\"1\"",
+              "use_official_examples": "\"False\"",
+              "use_chain_of_thought": "\"True\""
             }
           }
         },
         {
-          "evaluation_name": "Global Facts",
+          "evaluation_name": "GSM8K",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "GSM8K",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Global Facts",
+            "evaluation_description": "EM on GSM8K",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.33,
+            "score": 0.538,
             "details": {
-              "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)",
+              "description": "min=0.538, mean=0.538, max=0.538, sum=0.538 (1)",
               "tab": "Accuracy",
-              "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.299, mean=0.299, max=0.299, sum=0.598 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29881003856658933\"}",
-              "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Global Facts - # prompt tokens": "{\"description\": \"min=492.47, mean=492.47, max=492.47, sum=984.94 (2)\", \"tab\": \"General information\", \"score\": \"492.47\"}",
-              "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "GSM8K - Observed inference time (s)": "{\"description\": \"min=3.95, mean=3.95, max=3.95, sum=3.95 (1)\", \"tab\": \"Efficiency\", \"score\": \"3.949965229511261\"}",
+              "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "GSM8K - # prompt tokens": "{\"description\": \"min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)\", \"tab\": \"General information\", \"score\": \"1187.268\"}",
+              "GSM8K - # output tokens": "{\"description\": \"min=196.611, mean=196.611, max=196.611, sum=196.611 (1)\", \"tab\": \"General information\", \"score\": \"196.611\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"global_facts\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_global_facts\""
+              "stop": "\"none\""
             }
           }
         },
         {
-          "evaluation_name": "Jurisprudence",
+          "evaluation_name": "LegalBench",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "LegalBench",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Jurisprudence",
+            "evaluation_description": "EM on LegalBench",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.713,
+            "score": 0.331,
             "details": {
-              "description": "min=0.713, mean=0.713, max=0.713, sum=1.426 (2)",
+              "description": "min=0.063, mean=0.331, max=0.733, sum=1.655 (5)",
               "tab": "Accuracy",
-              "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.232, mean=0.232, max=0.232, sum=0.465 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.23237781833719323\"}",
-              "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}",
-              "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Jurisprudence - # prompt tokens": "{\"description\": \"min=460.093, mean=460.093, max=460.093, sum=920.185 (2)\", \"tab\": \"General information\", \"score\": \"460.0925925925926\"}",
-              "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.316, mean=0.489, max=0.855, sum=2.444 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.4887186054518059\"}",
+              "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}",
+              "LegalBench - # train": "{\"description\": \"min=4, mean=4.8, max=5, sum=24 (5)\", \"tab\": \"General information\", \"score\": \"4.8\"}",
+              "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "LegalBench - # prompt tokens": "{\"description\": \"min=236.453, mean=1750.748, max=7224.488, sum=8753.741 (5)\", \"tab\": \"General information\", \"score\": \"1750.7482458432962\"}",
+              "LegalBench - # output tokens": "{\"description\": \"min=2, mean=9.174, max=15.242, sum=45.871 (5)\", \"tab\": \"General information\", \"score\": \"9.17419274343898\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"jurisprudence\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_jurisprudence\""
+              "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]"
             }
           }
         },
         {
-          "evaluation_name": "Philosophy",
+          "evaluation_name": "MedQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MedQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Philosophy",
+            "evaluation_description": "EM on MedQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.659,
+            "score": 0.517,
             "details": {
-              "description": "min=0.659, mean=0.659, max=0.659, sum=1.318 (2)",
+              "description": "min=0.517, mean=0.517, max=0.517, sum=0.517 (1)",
               "tab": "Accuracy",
-              "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.899, mean=0.899, max=0.899, sum=1.798 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8987545852109167\"}",
-              "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}",
-              "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Philosophy - # prompt tokens": "{\"description\": \"min=382.82, mean=382.82, max=382.82, sum=765.64 (2)\", \"tab\": \"General information\", \"score\": \"382.81993569131834\"}",
-              "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MedQA - Observed inference time (s)": "{\"description\": \"min=0.418, mean=0.418, max=0.418, sum=0.418 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4182186216767692\"}",
+              "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}",
+              "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MedQA - # prompt tokens": "{\"description\": \"min=1202.093, mean=1202.093, max=1202.093, sum=1202.093 (1)\", \"tab\": \"General information\", \"score\": \"1202.0934393638172\"}",
+              "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "\"philosophy\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_philosophy\""
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Professional Psychology",
+          "evaluation_name": "WMT 2014",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "WMT 2014",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Professional Psychology",
+            "evaluation_description": "BLEU-4 on WMT 2014",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.641,
+            "score": 0.142,
             "details": {
-              "description": "min=0.641, mean=0.641, max=0.641, sum=1.281 (2)",
+              "description": "min=0.047, mean=0.142, max=0.184, sum=0.712 (5)",
               "tab": "Accuracy",
-              "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.615, mean=0.615, max=0.615, sum=1.23 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6148438769228318\"}",
-              "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.825, mean=0.825, max=0.825, sum=1.651 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8254362666015084\"}",
-              "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.682, mean=0.682, max=0.682, sum=1.364 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.68212915414937\"}",
-              "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.506, mean=0.506, max=0.506, sum=1.012 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.505940170459498\"}",
-              "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}",
-              "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Medicine - # prompt tokens": "{\"description\": \"min=1288.143, mean=1288.143, max=1288.143, sum=2576.287 (2)\", \"tab\": \"General information\", \"score\": \"1288.1433823529412\"}",
-              "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}",
-              "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Accounting - # prompt tokens": "{\"description\": \"min=805.496, mean=805.496, max=805.496, sum=1610.993 (2)\", \"tab\": \"General information\", \"score\": \"805.4964539007092\"}",
-              "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}",
-              "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Law - # prompt tokens": "{\"description\": \"min=1858.711, mean=1858.711, max=1858.711, sum=3717.421 (2)\", \"tab\": \"General information\", \"score\": \"1858.7105606258149\"}",
-              "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}",
-              "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Psychology - # prompt tokens": "{\"description\": \"min=654.278, mean=654.278, max=654.278, sum=1308.556 (2)\", \"tab\": \"General information\", \"score\": \"654.2777777777778\"}",
-              "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=0.582, mean=0.775, max=0.872, sum=3.875 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.7750062139801958\"}",
+              "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}",
+              "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "WMT 2014 - # prompt tokens": "{\"description\": \"min=148.306, mean=162.433, max=181.018, sum=812.166 (5)\", \"tab\": \"General information\", \"score\": \"162.43317355482492\"}",
+              "WMT 2014 - # output tokens": "{\"description\": \"min=28.3, mean=30.51, max=31.912, sum=152.552 (5)\", \"tab\": \"General information\", \"score\": \"30.510483732222053\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"professional_psychology\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_professional_psychology\""
+              "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]"
             }
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1774096312.00548",
+      "retrieved_timestamp": "1774096312.00548",
+      "source_metadata": {
+        "source_name": "helm_mmlu",
+        "source_type": "documentation",
+        "source_organization_name": "crfm",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "helm",
+        "version": "unknown"
+      },
+      "benchmark": "helm_mmlu",
+      "evaluation_results": [
         {
-          "evaluation_name": "Us Foreign Policy",
+          "evaluation_name": "MMLU All Subjects",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -688,36 +632,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Us Foreign Policy",
+            "evaluation_description": "EM on MMLU All Subjects",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.79,
+            "score": 0.599,
             "details": {
-              "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)",
+              "description": "min=0.258, mean=0.599, max=0.881, sum=68.3 (114)",
               "tab": "Accuracy",
-              "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.487, mean=0.487, max=0.487, sum=0.973 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.48650413513183594\"}",
-              "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=482.19, mean=482.19, max=482.19, sum=964.38 (2)\", \"tab\": \"General information\", \"score\": \"482.19\"}",
-              "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.212, mean=0.526, max=1.438, sum=59.959 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.525951832745908\"}",
+              "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}",
+              "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=317.924, mean=705.273, max=3098.109, sum=80401.178 (114)\", \"tab\": \"General information\", \"score\": \"705.2734899593811\"}",
+              "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"us_foreign_policy\"",
+              "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_us_foreign_policy\""
+              "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
             }
           }
         },
         {
-          "evaluation_name": "Astronomy",
+          "evaluation_name": "Abstract Algebra",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -726,36 +670,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Astronomy",
+            "evaluation_description": "EM on Abstract Algebra",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.638,
+            "score": 0.27,
             "details": {
-              "description": "min=0.638, mean=0.638, max=0.638, sum=1.276 (2)",
+              "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)",
               "tab": "Accuracy",
-              "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.678, mean=0.678, max=0.678, sum=1.355 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6775346147386652\"}",
-              "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}",
-              "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Astronomy - # prompt tokens": "{\"description\": \"min=674.987, mean=674.987, max=674.987, sum=1349.974 (2)\", \"tab\": \"General information\", \"score\": \"674.9868421052631\"}",
-              "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.642 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32117165088653565\"}",
+              "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Abstract Algebra - # prompt tokens": "{\"description\": \"min=411.44, mean=411.44, max=411.44, sum=822.88 (2)\", \"tab\": \"General information\", \"score\": \"411.44\"}",
+              "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"astronomy\"",
+              "subject": "\"abstract_algebra\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_astronomy\""
+              "groups": "\"mmlu_abstract_algebra\""
             }
           }
         },
         {
-          "evaluation_name": "Business Ethics",
+          "evaluation_name": "Anatomy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -764,36 +708,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Business Ethics",
+            "evaluation_description": "EM on Anatomy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.57,
+            "score": 0.585,
             "details": {
-              "description": "min=0.57, mean=0.57, max=0.57, sum=1.14 (2)",
+              "description": "min=0.585, mean=0.585, max=0.585, sum=1.17 (2)",
               "tab": "Accuracy",
-              "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.645, mean=0.645, max=0.645, sum=1.289 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6446590375900269\"}",
-              "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Business Ethics - # prompt tokens": "{\"description\": \"min=653.6, mean=653.6, max=653.6, sum=1307.2 (2)\", \"tab\": \"General information\", \"score\": \"653.6\"}",
-              "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.246, mean=0.246, max=0.246, sum=0.493 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.24627229902479383\"}",
+              "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}",
+              "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Anatomy - # prompt tokens": "{\"description\": \"min=416.089, mean=416.089, max=416.089, sum=832.178 (2)\", \"tab\": \"General information\", \"score\": \"416.0888888888889\"}",
+              "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"business_ethics\"",
+              "subject": "\"anatomy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_business_ethics\""
+              "groups": "\"mmlu_anatomy\""
             }
           }
         },
         {
-          "evaluation_name": "Clinical Knowledge",
+          "evaluation_name": "College Physics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -802,36 +746,66 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Clinical Knowledge",
+            "evaluation_description": "EM on College Physics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.687,
+            "score": 0.343,
             "details": {
-              "description": "min=0.687, mean=0.687, max=0.687, sum=1.374 (2)",
+              "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)",
               "tab": "Accuracy",
-              "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.844, mean=0.844, max=0.844, sum=1.687 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8436905698956184\"}",
-              "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}",
-              "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=496.174, mean=496.174, max=496.174, sum=992.347 (2)\", \"tab\": \"General information\", \"score\": \"496.1735849056604\"}",
-              "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.221, mean=0.221, max=0.221, sum=0.442 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.22099271774291993\"}",
+              "College Biology - Observed inference time (s)": "{\"description\": \"min=0.7, mean=0.7, max=0.7, sum=1.399 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6997380173868604\"}",
+              "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.466, mean=0.466, max=0.466, sum=0.932 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4661028146743774\"}",
+              "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.212, mean=0.212, max=0.212, sum=0.424 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.21210591793060302\"}",
+              "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.387, mean=0.387, max=0.387, sum=0.774 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3871537646806309\"}",
+              "College Physics - Observed inference time (s)": "{\"description\": \"min=0.455, mean=0.455, max=0.455, sum=0.91 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.45503536392660704\"}",
+              "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Chemistry - # prompt tokens": "{\"description\": \"min=636.71, mean=636.71, max=636.71, sum=1273.42 (2)\", \"tab\": \"General information\", \"score\": \"636.71\"}",
+              "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}",
+              "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Biology - # prompt tokens": "{\"description\": \"min=559.799, mean=559.799, max=559.799, sum=1119.597 (2)\", \"tab\": \"General information\", \"score\": \"559.7986111111111\"}",
+              "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Computer Science - # prompt tokens": "{\"description\": \"min=911.17, mean=911.17, max=911.17, sum=1822.34 (2)\", \"tab\": \"General information\", \"score\": \"911.17\"}",
+              "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Mathematics - # prompt tokens": "{\"description\": \"min=667.31, mean=667.31, max=667.31, sum=1334.62 (2)\", \"tab\": \"General information\", \"score\": \"667.31\"}",
+              "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}",
+              "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Medicine - # prompt tokens": "{\"description\": \"min=601.41, mean=601.41, max=601.41, sum=1202.821 (2)\", \"tab\": \"General information\", \"score\": \"601.4104046242775\"}",
+              "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}",
+              "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Physics - # prompt tokens": "{\"description\": \"min=560.029, mean=560.029, max=560.029, sum=1120.059 (2)\", \"tab\": \"General information\", \"score\": \"560.0294117647059\"}",
+              "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"clinical_knowledge\"",
+              "subject": "\"college_physics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_clinical_knowledge\""
+              "groups": "\"mmlu_college_physics\""
             }
           }
         },
         {
-          "evaluation_name": "Conceptual Physics",
+          "evaluation_name": "Computer Security",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -840,36 +814,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Conceptual Physics",
+            "evaluation_description": "EM on Computer Security",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.549,
+            "score": 0.7,
             "details": {
-              "description": "min=0.549, mean=0.549, max=0.549, sum=1.098 (2)",
+              "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)",
               "tab": "Accuracy",
-              "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.333, mean=0.333, max=0.333, sum=0.666 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.33306963900302317\"}",
-              "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}",
-              "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Conceptual Physics - # prompt tokens": "{\"description\": \"min=343.285, mean=343.285, max=343.285, sum=686.57 (2)\", \"tab\": \"General information\", \"score\": \"343.2851063829787\"}",
-              "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.426, mean=0.426, max=0.426, sum=0.853 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4263953256607056\"}",
+              "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Computer Security - # prompt tokens": "{\"description\": \"min=433.94, mean=433.94, max=433.94, sum=867.88 (2)\", \"tab\": \"General information\", \"score\": \"433.94\"}",
+              "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"conceptual_physics\"",
+              "subject": "\"computer_security\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_conceptual_physics\""
+              "groups": "\"mmlu_computer_security\""
             }
           }
         },
         {
-          "evaluation_name": "Electrical Engineering",
+          "evaluation_name": "Econometrics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -878,36 +852,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Electrical Engineering",
+            "evaluation_description": "EM on Econometrics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.572,
+            "score": 0.421,
             "details": {
-              "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)",
+              "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)",
               "tab": "Accuracy",
-              "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.392, mean=0.392, max=0.392, sum=0.784 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3922290703345989\"}",
-              "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}",
-              "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Electrical Engineering - # prompt tokens": "{\"description\": \"min=510.379, mean=510.379, max=510.379, sum=1020.759 (2)\", \"tab\": \"General information\", \"score\": \"510.37931034482756\"}",
-              "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.406, mean=0.406, max=0.406, sum=0.813 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.406455958098696\"}",
+              "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}",
+              "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Econometrics - # prompt tokens": "{\"description\": \"min=696.175, mean=696.175, max=696.175, sum=1392.351 (2)\", \"tab\": \"General information\", \"score\": \"696.1754385964912\"}",
+              "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"electrical_engineering\"",
+              "subject": "\"econometrics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_electrical_engineering\""
+              "groups": "\"mmlu_econometrics\""
             }
           }
         },
         {
-          "evaluation_name": "Elementary Mathematics",
+          "evaluation_name": "Global Facts",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -916,36 +890,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Elementary Mathematics",
+            "evaluation_description": "EM on Global Facts",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.402,
+            "score": 0.33,
             "details": {
-              "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)",
+              "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)",
               "tab": "Accuracy",
-              "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.676, mean=0.676, max=0.676, sum=1.352 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6761655416438188\"}",
-              "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}",
-              "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=622.386, mean=622.386, max=622.386, sum=1244.772 (2)\", \"tab\": \"General information\", \"score\": \"622.3862433862433\"}",
-              "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.299, mean=0.299, max=0.299, sum=0.598 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29881003856658933\"}",
+              "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Global Facts - # prompt tokens": "{\"description\": \"min=492.47, mean=492.47, max=492.47, sum=984.94 (2)\", \"tab\": \"General information\", \"score\": \"492.47\"}",
+              "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"elementary_mathematics\"",
+              "subject": "\"global_facts\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_elementary_mathematics\""
+              "groups": "\"mmlu_global_facts\""
             }
           }
         },
         {
-          "evaluation_name": "Formal Logic",
+          "evaluation_name": "Jurisprudence",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -954,36 +928,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Formal Logic",
+            "evaluation_description": "EM on Jurisprudence",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.397,
+            "score": 0.713,
             "details": {
-              "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)",
+              "description": "min=0.713, mean=0.713, max=0.713, sum=1.426 (2)",
               "tab": "Accuracy",
-              "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.734, mean=0.734, max=0.734, sum=1.467 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7336057802987477\"}",
-              "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}",
-              "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Formal Logic - # prompt tokens": "{\"description\": \"min=727.984, mean=727.984, max=727.984, sum=1455.968 (2)\", \"tab\": \"General information\", \"score\": \"727.984126984127\"}",
-              "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.232, mean=0.232, max=0.232, sum=0.465 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.23237781833719323\"}",
+              "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}",
+              "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Jurisprudence - # prompt tokens": "{\"description\": \"min=460.093, mean=460.093, max=460.093, sum=920.185 (2)\", \"tab\": \"General information\", \"score\": \"460.0925925925926\"}",
+              "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"formal_logic\"",
+              "subject": "\"jurisprudence\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_formal_logic\""
+              "groups": "\"mmlu_jurisprudence\""
             }
           }
         },
         {
-          "evaluation_name": "High School World History",
+          "evaluation_name": "Philosophy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -992,114 +966,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on High School World History",
+            "evaluation_description": "EM on Philosophy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.759,
+            "score": 0.659,
             "details": {
-              "description": "min=0.759, mean=0.759, max=0.759, sum=1.519 (2)",
+              "description": "min=0.659, mean=0.659, max=0.659, sum=1.318 (2)",
               "tab": "Accuracy",
-              "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.805, mean=0.805, max=0.805, sum=1.61 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8049156188964843\"}",
-              "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.44, mean=0.44, max=0.44, sum=0.881 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44036899529067164\"}",
-              "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.435, mean=0.435, max=0.435, sum=0.869 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4347002100944519\"}",
-              "High School European History - Observed inference time (s)": "{\"description\": \"min=0.445, mean=0.445, max=0.445, sum=0.891 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4453156341205944\"}",
-              "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.331, mean=0.331, max=0.331, sum=0.661 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3305177327358361\"}",
-              "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.545, mean=0.545, max=0.545, sum=1.089 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5445178654527417\"}",
-              "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.53, mean=0.53, max=0.53, sum=1.061 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5302642871172\"}",
-              "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.585, mean=0.585, max=0.585, sum=1.169 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5845282289716932\"}",
-              "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.234, mean=0.234, max=0.234, sum=0.468 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.23408917118521297\"}",
-              "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.384, mean=0.384, max=0.384, sum=0.768 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3838195042894376\"}",
-              "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.274, mean=0.274, max=0.274, sum=0.547 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2735835779697523\"}",
-              "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.654, mean=0.654, max=0.654, sum=1.308 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6539056665367551\"}",
-              "High School US History - Observed inference time (s)": "{\"description\": \"min=0.942, mean=0.942, max=0.942, sum=1.883 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.9417344308366963\"}",
-              "High School World History - Observed inference time (s)": "{\"description\": \"min=0.864, mean=0.864, max=0.864, sum=1.727 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8635432951561006\"}",
-              "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}",
-              "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Biology - # prompt tokens": "{\"description\": \"min=609.561, mean=609.561, max=609.561, sum=1219.123 (2)\", \"tab\": \"General information\", \"score\": \"609.5612903225806\"}",
-              "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}",
-              "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Chemistry - # prompt tokens": "{\"description\": \"min=581.798, mean=581.798, max=581.798, sum=1163.596 (2)\", \"tab\": \"General information\", \"score\": \"581.7980295566502\"}",
-              "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Computer Science - # prompt tokens": "{\"description\": \"min=997.24, mean=997.24, max=997.24, sum=1994.48 (2)\", \"tab\": \"General information\", \"score\": \"997.24\"}",
-              "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}",
-              "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School European History - # prompt tokens": "{\"description\": \"min=3098.109, mean=3098.109, max=3098.109, sum=6196.218 (2)\", \"tab\": \"General information\", \"score\": \"3098.109090909091\"}",
-              "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}",
-              "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Geography - # prompt tokens": "{\"description\": \"min=438.207, mean=438.207, max=438.207, sum=876.414 (2)\", \"tab\": \"General information\", \"score\": \"438.2070707070707\"}",
-              "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}",
-              "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Government And Politics - # prompt tokens": "{\"description\": \"min=523.808, mean=523.808, max=523.808, sum=1047.617 (2)\", \"tab\": \"General information\", \"score\": \"523.8082901554404\"}",
-              "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}",
-              "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=432.815, mean=432.815, max=432.815, sum=865.631 (2)\", \"tab\": \"General information\", \"score\": \"432.81538461538463\"}",
-              "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}",
-              "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Mathematics - # prompt tokens": "{\"description\": \"min=593.13, mean=593.13, max=593.13, sum=1186.259 (2)\", \"tab\": \"General information\", \"score\": \"593.1296296296297\"}",
-              "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}",
-              "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Microeconomics - # prompt tokens": "{\"description\": \"min=452.345, mean=452.345, max=452.345, sum=904.689 (2)\", \"tab\": \"General information\", \"score\": \"452.34453781512605\"}",
-              "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}",
-              "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Physics - # prompt tokens": "{\"description\": \"min=631.775, mean=631.775, max=631.775, sum=1263.55 (2)\", \"tab\": \"General information\", \"score\": \"631.774834437086\"}",
-              "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}",
-              "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Psychology - # prompt tokens": "{\"description\": \"min=567.873, mean=567.873, max=567.873, sum=1135.747 (2)\", \"tab\": \"General information\", \"score\": \"567.8733944954129\"}",
-              "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}",
-              "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Statistics - # prompt tokens": "{\"description\": \"min=922.644, mean=922.644, max=922.644, sum=1845.287 (2)\", \"tab\": \"General information\", \"score\": \"922.6435185185185\"}",
-              "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}",
-              "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School US History - # prompt tokens": "{\"description\": \"min=2486.446, mean=2486.446, max=2486.446, sum=4972.892 (2)\", \"tab\": \"General information\", \"score\": \"2486.4460784313724\"}",
-              "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}",
-              "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School World History - # prompt tokens": "{\"description\": \"min=1594.553, mean=1594.553, max=1594.553, sum=3189.105 (2)\", \"tab\": \"General information\", \"score\": \"1594.5527426160338\"}",
-              "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.899, mean=0.899, max=0.899, sum=1.798 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8987545852109167\"}",
+              "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}",
+              "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Philosophy - # prompt tokens": "{\"description\": \"min=382.82, mean=382.82, max=382.82, sum=765.64 (2)\", \"tab\": \"General information\", \"score\": \"382.81993569131834\"}",
+              "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"high_school_world_history\"",
+              "subject": "\"philosophy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_high_school_world_history\""
+              "groups": "\"mmlu_philosophy\""
             }
           }
         },
         {
-          "evaluation_name": "Human Sexuality",
+          "evaluation_name": "Professional Psychology",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1108,42 +1004,54 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Human Sexuality",
+            "evaluation_description": "EM on Professional Psychology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.702,
+            "score": 0.641,
             "details": {
-              "description": "min=0.702, mean=0.702, max=0.702, sum=1.405 (2)",
+              "description": "min=0.641, mean=0.641, max=0.641, sum=1.281 (2)",
               "tab": "Accuracy",
-              "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.809, mean=0.809, max=0.809, sum=1.618 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8091403518557014\"}",
-              "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=1.438, mean=1.438, max=1.438, sum=2.875 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.437711750278036\"}",
-              "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}",
-              "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Human Aging - # prompt tokens": "{\"description\": \"min=362.152, mean=362.152, max=362.152, sum=724.305 (2)\", \"tab\": \"General information\", \"score\": \"362.15246636771303\"}",
-              "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}",
-              "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Human Sexuality - # prompt tokens": "{\"description\": \"min=403.748, mean=403.748, max=403.748, sum=807.496 (2)\", \"tab\": \"General information\", \"score\": \"403.7480916030534\"}",
-              "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.615, mean=0.615, max=0.615, sum=1.23 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6148438769228318\"}",
+              "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.825, mean=0.825, max=0.825, sum=1.651 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8254362666015084\"}",
+              "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.682, mean=0.682, max=0.682, sum=1.364 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.68212915414937\"}",
+              "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.506, mean=0.506, max=0.506, sum=1.012 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.505940170459498\"}",
+              "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}",
+              "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Medicine - # prompt tokens": "{\"description\": \"min=1288.143, mean=1288.143, max=1288.143, sum=2576.287 (2)\", \"tab\": \"General information\", \"score\": \"1288.1433823529412\"}",
+              "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}",
+              "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Accounting - # prompt tokens": "{\"description\": \"min=805.496, mean=805.496, max=805.496, sum=1610.993 (2)\", \"tab\": \"General information\", \"score\": \"805.4964539007092\"}",
+              "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}",
+              "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Law - # prompt tokens": "{\"description\": \"min=1858.711, mean=1858.711, max=1858.711, sum=3717.421 (2)\", \"tab\": \"General information\", \"score\": \"1858.7105606258149\"}",
+              "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}",
+              "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Psychology - # prompt tokens": "{\"description\": \"min=654.278, mean=654.278, max=654.278, sum=1308.556 (2)\", \"tab\": \"General information\", \"score\": \"654.2777777777778\"}",
+              "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"human_sexuality\"",
+              "subject": "\"professional_psychology\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_human_sexuality\""
+              "groups": "\"mmlu_professional_psychology\""
             }
           }
         },
         {
-          "evaluation_name": "International Law",
+          "evaluation_name": "Us Foreign Policy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1152,36 +1060,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on International Law",
+            "evaluation_description": "EM on Us Foreign Policy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.76,
+            "score": 0.79,
             "details": {
-              "description": "min=0.76, mean=0.76, max=0.76, sum=1.521 (2)",
+              "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)",
               "tab": "Accuracy",
-              "International Law - Observed inference time (s)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.787 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3933255593638775\"}",
-              "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}",
-              "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "International Law - # prompt tokens": "{\"description\": \"min=729.182, mean=729.182, max=729.182, sum=1458.364 (2)\", \"tab\": \"General information\", \"score\": \"729.1818181818181\"}",
-              "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.487, mean=0.487, max=0.487, sum=0.973 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.48650413513183594\"}",
+              "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=482.19, mean=482.19, max=482.19, sum=964.38 (2)\", \"tab\": \"General information\", \"score\": \"482.19\"}",
+              "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"international_law\"",
+              "subject": "\"us_foreign_policy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_international_law\""
+              "groups": "\"mmlu_us_foreign_policy\""
             }
           }
         },
         {
-          "evaluation_name": "Logical Fallacies",
+          "evaluation_name": "Astronomy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1190,36 +1098,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Logical Fallacies",
+            "evaluation_description": "EM on Astronomy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.712,
+            "score": 0.638,
             "details": {
-              "description": "min=0.712, mean=0.712, max=0.712, sum=1.423 (2)",
+              "description": "min=0.638, mean=0.638, max=0.638, sum=1.276 (2)",
               "tab": "Accuracy",
-              "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.848, mean=0.848, max=0.848, sum=1.695 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8476987660296855\"}",
-              "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}",
-              "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Logical Fallacies - # prompt tokens": "{\"description\": \"min=495.779, mean=495.779, max=495.779, sum=991.558 (2)\", \"tab\": \"General information\", \"score\": \"495.77914110429447\"}",
-              "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.678, mean=0.678, max=0.678, sum=1.355 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6775346147386652\"}",
+              "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}",
+              "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Astronomy - # prompt tokens": "{\"description\": \"min=674.987, mean=674.987, max=674.987, sum=1349.974 (2)\", \"tab\": \"General information\", \"score\": \"674.9868421052631\"}",
+              "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"logical_fallacies\"",
+              "subject": "\"astronomy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_logical_fallacies\""
+              "groups": "\"mmlu_astronomy\""
             }
           }
         },
         {
-          "evaluation_name": "Machine Learning",
+          "evaluation_name": "Business Ethics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1228,36 +1136,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Machine Learning",
+            "evaluation_description": "EM on Business Ethics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.455,
+            "score": 0.57,
             "details": {
-              "description": "min=0.455, mean=0.455, max=0.455, sum=0.911 (2)",
+              "description": "min=0.57, mean=0.57, max=0.57, sum=1.14 (2)",
               "tab": "Accuracy",
-              "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.557, mean=0.557, max=0.557, sum=1.113 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5566470899752208\"}",
-              "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}",
-              "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Machine Learning - # prompt tokens": "{\"description\": \"min=743.83, mean=743.83, max=743.83, sum=1487.661 (2)\", \"tab\": \"General information\", \"score\": \"743.8303571428571\"}",
-              "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.645, mean=0.645, max=0.645, sum=1.289 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6446590375900269\"}",
+              "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Business Ethics - # prompt tokens": "{\"description\": \"min=653.6, mean=653.6, max=653.6, sum=1307.2 (2)\", \"tab\": \"General information\", \"score\": \"653.6\"}",
+              "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"machine_learning\"",
+              "subject": "\"business_ethics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_machine_learning\""
+              "groups": "\"mmlu_business_ethics\""
             }
           }
         },
         {
-          "evaluation_name": "Management",
+          "evaluation_name": "Clinical Knowledge",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1266,36 +1174,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Management",
+            "evaluation_description": "EM on Clinical Knowledge",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.767,
+            "score": 0.687,
             "details": {
-              "description": "min=0.767, mean=0.767, max=0.767, sum=1.534 (2)",
+              "description": "min=0.687, mean=0.687, max=0.687, sum=1.374 (2)",
               "tab": "Accuracy",
-              "Management - Observed inference time (s)": "{\"description\": \"min=0.365, mean=0.365, max=0.365, sum=0.73 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.36507687059420985\"}",
-              "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}",
-              "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Management - # prompt tokens": "{\"description\": \"min=324.359, mean=324.359, max=324.359, sum=648.718 (2)\", \"tab\": \"General information\", \"score\": \"324.3592233009709\"}",
-              "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.844, mean=0.844, max=0.844, sum=1.687 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8436905698956184\"}",
+              "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}",
+              "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=496.174, mean=496.174, max=496.174, sum=992.347 (2)\", \"tab\": \"General information\", \"score\": \"496.1735849056604\"}",
+              "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"management\"",
+              "subject": "\"clinical_knowledge\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_management\""
+              "groups": "\"mmlu_clinical_knowledge\""
             }
           }
         },
         {
-          "evaluation_name": "Marketing",
+          "evaluation_name": "Conceptual Physics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1304,36 +1212,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Marketing",
+            "evaluation_description": "EM on Conceptual Physics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.842,
+            "score": 0.549,
             "details": {
-              "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)",
+              "description": "min=0.549, mean=0.549, max=0.549, sum=1.098 (2)",
               "tab": "Accuracy",
-              "Marketing - Observed inference time (s)": "{\"description\": \"min=0.585, mean=0.585, max=0.585, sum=1.17 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.58499161606161\"}",
-              "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}",
-              "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Marketing - # prompt tokens": "{\"description\": \"min=472.423, mean=472.423, max=472.423, sum=944.846 (2)\", \"tab\": \"General information\", \"score\": \"472.4230769230769\"}",
-              "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.333, mean=0.333, max=0.333, sum=0.666 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.33306963900302317\"}",
+              "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}",
+              "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Conceptual Physics - # prompt tokens": "{\"description\": \"min=343.285, mean=343.285, max=343.285, sum=686.57 (2)\", \"tab\": \"General information\", \"score\": \"343.2851063829787\"}",
+              "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"marketing\"",
+              "subject": "\"conceptual_physics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_marketing\""
+              "groups": "\"mmlu_conceptual_physics\""
             }
           }
         },
         {
-          "evaluation_name": "Medical Genetics",
+          "evaluation_name": "Electrical Engineering",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1342,36 +1250,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Medical Genetics",
+            "evaluation_description": "EM on Electrical Engineering",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.75,
+            "score": 0.572,
             "details": {
-              "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)",
+              "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)",
               "tab": "Accuracy",
-              "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.268, mean=0.268, max=0.268, sum=0.535 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2675498366355896\"}",
-              "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Medical Genetics - # prompt tokens": "{\"description\": \"min=414.71, mean=414.71, max=414.71, sum=829.42 (2)\", \"tab\": \"General information\", \"score\": \"414.71\"}",
-              "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.392, mean=0.392, max=0.392, sum=0.784 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3922290703345989\"}",
+              "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}",
+              "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Electrical Engineering - # prompt tokens": "{\"description\": \"min=510.379, mean=510.379, max=510.379, sum=1020.759 (2)\", \"tab\": \"General information\", \"score\": \"510.37931034482756\"}",
+              "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"medical_genetics\"",
+              "subject": "\"electrical_engineering\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_medical_genetics\""
+              "groups": "\"mmlu_electrical_engineering\""
             }
           }
         },
         {
-          "evaluation_name": "Miscellaneous",
+          "evaluation_name": "Elementary Mathematics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1380,36 +1288,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Miscellaneous",
+            "evaluation_description": "EM on Elementary Mathematics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.785,
+            "score": 0.402,
             "details": {
-              "description": "min=0.785, mean=0.785, max=0.785, sum=1.571 (2)",
+              "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)",
               "tab": "Accuracy",
-              "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.504, mean=0.504, max=0.504, sum=1.008 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5038632959850599\"}",
-              "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}",
-              "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Miscellaneous - # prompt tokens": "{\"description\": \"min=357.519, mean=357.519, max=357.519, sum=715.037 (2)\", \"tab\": \"General information\", \"score\": \"357.51851851851853\"}",
-              "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.676, mean=0.676, max=0.676, sum=1.352 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6761655416438188\"}",
+              "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}",
+              "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=622.386, mean=622.386, max=622.386, sum=1244.772 (2)\", \"tab\": \"General information\", \"score\": \"622.3862433862433\"}",
+              "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"miscellaneous\"",
+              "subject": "\"elementary_mathematics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_miscellaneous\""
+              "groups": "\"mmlu_elementary_mathematics\""
             }
           }
         },
         {
-          "evaluation_name": "Moral Scenarios",
+          "evaluation_name": "Formal Logic",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1418,42 +1326,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Moral Scenarios",
+            "evaluation_description": "EM on Formal Logic",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.393,
+            "score": 0.397,
             "details": {
-              "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)",
+              "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)",
               "tab": "Accuracy",
-              "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.777, mean=0.777, max=0.777, sum=1.553 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7765735477381359\"}",
-              "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.493, mean=0.493, max=0.493, sum=0.986 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4927780463042872\"}",
-              "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}",
-              "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Moral Disputes - # prompt tokens": "{\"description\": \"min=549.038, mean=549.038, max=549.038, sum=1098.075 (2)\", \"tab\": \"General information\", \"score\": \"549.0375722543353\"}",
-              "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}",
-              "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Moral Scenarios - # prompt tokens": "{\"description\": \"min=754.516, mean=754.516, max=754.516, sum=1509.032 (2)\", \"tab\": \"General information\", \"score\": \"754.5162011173185\"}",
-              "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.734, mean=0.734, max=0.734, sum=1.467 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7336057802987477\"}",
+              "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}",
+              "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Formal Logic - # prompt tokens": "{\"description\": \"min=727.984, mean=727.984, max=727.984, sum=1455.968 (2)\", \"tab\": \"General information\", \"score\": \"727.984126984127\"}",
+              "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"moral_scenarios\"",
+              "subject": "\"formal_logic\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_moral_scenarios\""
+              "groups": "\"mmlu_formal_logic\""
             }
           }
         },
         {
-          "evaluation_name": "Nutrition",
+          "evaluation_name": "High School World History",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1462,36 +1364,114 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Nutrition",
+            "evaluation_description": "EM on High School World History",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.676,
+            "score": 0.759,
             "details": {
-              "description": "min=0.676, mean=0.676, max=0.676, sum=1.353 (2)",
+              "description": "min=0.759, mean=0.759, max=0.759, sum=1.519 (2)",
               "tab": "Accuracy",
-              "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.236, mean=0.236, max=0.236, sum=0.471 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.23563866054310517\"}",
-              "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}",
-              "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Nutrition - # prompt tokens": "{\"description\": \"min=689.69, mean=689.69, max=689.69, sum=1379.379 (2)\", \"tab\": \"General information\", \"score\": \"689.6895424836601\"}",
-              "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.805, mean=0.805, max=0.805, sum=1.61 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8049156188964843\"}",
+              "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.44, mean=0.44, max=0.44, sum=0.881 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44036899529067164\"}",
+              "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.435, mean=0.435, max=0.435, sum=0.869 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4347002100944519\"}",
+              "High School European History - Observed inference time (s)": "{\"description\": \"min=0.445, mean=0.445, max=0.445, sum=0.891 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4453156341205944\"}",
+              "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.331, mean=0.331, max=0.331, sum=0.661 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3305177327358361\"}",
+              "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.545, mean=0.545, max=0.545, sum=1.089 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5445178654527417\"}",
+              "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.53, mean=0.53, max=0.53, sum=1.061 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5302642871172\"}",
+              "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.585, mean=0.585, max=0.585, sum=1.169 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5845282289716932\"}",
+              "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.234, mean=0.234, max=0.234, sum=0.468 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.23408917118521297\"}",
+              "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.384, mean=0.384, max=0.384, sum=0.768 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3838195042894376\"}",
+              "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.274, mean=0.274, max=0.274, sum=0.547 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2735835779697523\"}",
+              "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.654, mean=0.654, max=0.654, sum=1.308 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6539056665367551\"}",
+              "High School US History - Observed inference time (s)": "{\"description\": \"min=0.942, mean=0.942, max=0.942, sum=1.883 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.9417344308366963\"}",
+              "High School World History - Observed inference time (s)": "{\"description\": \"min=0.864, mean=0.864, max=0.864, sum=1.727 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8635432951561006\"}",
+              "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}",
+              "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Biology - # prompt tokens": "{\"description\": \"min=609.561, mean=609.561, max=609.561, sum=1219.123 (2)\", \"tab\": \"General information\", \"score\": \"609.5612903225806\"}",
+              "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}",
+              "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Chemistry - # prompt tokens": "{\"description\": \"min=581.798, mean=581.798, max=581.798, sum=1163.596 (2)\", \"tab\": \"General information\", \"score\": \"581.7980295566502\"}",
+              "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Computer Science - # prompt tokens": "{\"description\": \"min=997.24, mean=997.24, max=997.24, sum=1994.48 (2)\", \"tab\": \"General information\", \"score\": \"997.24\"}",
+              "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}",
+              "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School European History - # prompt tokens": "{\"description\": \"min=3098.109, mean=3098.109, max=3098.109, sum=6196.218 (2)\", \"tab\": \"General information\", \"score\": \"3098.109090909091\"}",
+              "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}",
+              "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Geography - # prompt tokens": "{\"description\": \"min=438.207, mean=438.207, max=438.207, sum=876.414 (2)\", \"tab\": \"General information\", \"score\": \"438.2070707070707\"}",
+              "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}",
+              "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Government And Politics - # prompt tokens": "{\"description\": \"min=523.808, mean=523.808, max=523.808, sum=1047.617 (2)\", \"tab\": \"General information\", \"score\": \"523.8082901554404\"}",
+              "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}",
+              "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=432.815, mean=432.815, max=432.815, sum=865.631 (2)\", \"tab\": \"General information\", \"score\": \"432.81538461538463\"}",
+              "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}",
+              "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Mathematics - # prompt tokens": "{\"description\": \"min=593.13, mean=593.13, max=593.13, sum=1186.259 (2)\", \"tab\": \"General information\", \"score\": \"593.1296296296297\"}",
+              "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}",
+              "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Microeconomics - # prompt tokens": "{\"description\": \"min=452.345, mean=452.345, max=452.345, sum=904.689 (2)\", \"tab\": \"General information\", \"score\": \"452.34453781512605\"}",
+              "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}",
+              "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Physics - # prompt tokens": "{\"description\": \"min=631.775, mean=631.775, max=631.775, sum=1263.55 (2)\", \"tab\": \"General information\", \"score\": \"631.774834437086\"}",
+              "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}",
+              "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Psychology - # prompt tokens": "{\"description\": \"min=567.873, mean=567.873, max=567.873, sum=1135.747 (2)\", \"tab\": \"General information\", \"score\": \"567.8733944954129\"}",
+              "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}",
+              "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Statistics - # prompt tokens": "{\"description\": \"min=922.644, mean=922.644, max=922.644, sum=1845.287 (2)\", \"tab\": \"General information\", \"score\": \"922.6435185185185\"}",
+              "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}",
+              "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School US History - # prompt tokens": "{\"description\": \"min=2486.446, mean=2486.446, max=2486.446, sum=4972.892 (2)\", \"tab\": \"General information\", \"score\": \"2486.4460784313724\"}",
+              "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}",
+              "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School World History - # prompt tokens": "{\"description\": \"min=1594.553, mean=1594.553, max=1594.553, sum=3189.105 (2)\", \"tab\": \"General information\", \"score\": \"1594.5527426160338\"}",
+              "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"nutrition\"",
+              "subject": "\"high_school_world_history\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_nutrition\""
+              "groups": "\"mmlu_high_school_world_history\""
             }
           }
         },
         {
-          "evaluation_name": "Prehistory",
+          "evaluation_name": "Human Sexuality",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1500,36 +1480,42 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Prehistory",
+            "evaluation_description": "EM on Human Sexuality",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.673,
+            "score": 0.702,
             "details": {
-              "description": "min=0.673, mean=0.673, max=0.673, sum=1.346 (2)",
+              "description": "min=0.702, mean=0.702, max=0.702, sum=1.405 (2)",
               "tab": "Accuracy",
-              "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.345, mean=0.345, max=0.345, sum=0.69 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.34476134880089465\"}",
-              "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}",
-              "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Prehistory - # prompt tokens": "{\"description\": \"min=611.145, mean=611.145, max=611.145, sum=1222.29 (2)\", \"tab\": \"General information\", \"score\": \"611.145061728395\"}",
-              "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.809, mean=0.809, max=0.809, sum=1.618 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8091403518557014\"}",
+              "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=1.438, mean=1.438, max=1.438, sum=2.875 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.437711750278036\"}",
+              "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}",
+              "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Human Aging - # prompt tokens": "{\"description\": \"min=362.152, mean=362.152, max=362.152, sum=724.305 (2)\", \"tab\": \"General information\", \"score\": \"362.15246636771303\"}",
+              "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}",
+              "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Human Sexuality - # prompt tokens": "{\"description\": \"min=403.748, mean=403.748, max=403.748, sum=807.496 (2)\", \"tab\": \"General information\", \"score\": \"403.7480916030534\"}",
+              "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"prehistory\"",
+              "subject": "\"human_sexuality\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_prehistory\""
+              "groups": "\"mmlu_human_sexuality\""
             }
           }
         },
         {
-          "evaluation_name": "Public Relations",
+          "evaluation_name": "International Law",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1538,36 +1524,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Public Relations",
+            "evaluation_description": "EM on International Law",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.636,
+            "score": 0.76,
             "details": {
-              "description": "min=0.636, mean=0.636, max=0.636, sum=1.273 (2)",
+              "description": "min=0.76, mean=0.76, max=0.76, sum=1.521 (2)",
               "tab": "Accuracy",
-              "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.327, mean=0.327, max=0.327, sum=0.654 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3271717678416859\"}",
-              "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}",
-              "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Public Relations - # prompt tokens": "{\"description\": \"min=471.036, mean=471.036, max=471.036, sum=942.073 (2)\", \"tab\": \"General information\", \"score\": \"471.03636363636366\"}",
-              "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "International Law - Observed inference time (s)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.787 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3933255593638775\"}",
+              "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}",
+              "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "International Law - # prompt tokens": "{\"description\": \"min=729.182, mean=729.182, max=729.182, sum=1458.364 (2)\", \"tab\": \"General information\", \"score\": \"729.1818181818181\"}",
+              "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"public_relations\"",
+              "subject": "\"international_law\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_public_relations\""
+              "groups": "\"mmlu_international_law\""
             }
           }
         },
         {
-          "evaluation_name": "Security Studies",
+          "evaluation_name": "Logical Fallacies",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1576,36 +1562,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Security Studies",
+            "evaluation_description": "EM on Logical Fallacies",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.682,
+            "score": 0.712,
             "details": {
-              "description": "min=0.682, mean=0.682, max=0.682, sum=1.363 (2)",
+              "description": "min=0.712, mean=0.712, max=0.712, sum=1.423 (2)",
               "tab": "Accuracy",
-              "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.561, mean=0.561, max=0.561, sum=1.121 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5606838294437954\"}",
-              "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}",
-              "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Security Studies - # prompt tokens": "{\"description\": \"min=1324.865, mean=1324.865, max=1324.865, sum=2649.731 (2)\", \"tab\": \"General information\", \"score\": \"1324.865306122449\"}",
-              "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.848, mean=0.848, max=0.848, sum=1.695 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8476987660296855\"}",
+              "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}",
+              "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Logical Fallacies - # prompt tokens": "{\"description\": \"min=495.779, mean=495.779, max=495.779, sum=991.558 (2)\", \"tab\": \"General information\", \"score\": \"495.77914110429447\"}",
+              "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"security_studies\"",
+              "subject": "\"logical_fallacies\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_security_studies\""
+              "groups": "\"mmlu_logical_fallacies\""
             }
           }
         },
         {
-          "evaluation_name": "Sociology",
+          "evaluation_name": "Machine Learning",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1614,36 +1600,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Sociology",
+            "evaluation_description": "EM on Machine Learning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.806,
+            "score": 0.455,
             "details": {
-              "description": "min=0.806, mean=0.806, max=0.806, sum=1.612 (2)",
+              "description": "min=0.455, mean=0.455, max=0.455, sum=0.911 (2)",
               "tab": "Accuracy",
-              "Sociology - Observed inference time (s)": "{\"description\": \"min=0.413, mean=0.413, max=0.413, sum=0.825 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41272182962787685\"}",
-              "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}",
-              "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Sociology - # prompt tokens": "{\"description\": \"min=496.95, mean=496.95, max=496.95, sum=993.9 (2)\", \"tab\": \"General information\", \"score\": \"496.9502487562189\"}",
-              "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.557, mean=0.557, max=0.557, sum=1.113 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5566470899752208\"}",
+              "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}",
+              "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Machine Learning - # prompt tokens": "{\"description\": \"min=743.83, mean=743.83, max=743.83, sum=1487.661 (2)\", \"tab\": \"General information\", \"score\": \"743.8303571428571\"}",
+              "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"sociology\"",
+              "subject": "\"machine_learning\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_sociology\""
+              "groups": "\"mmlu_machine_learning\""
             }
           }
         },
         {
-          "evaluation_name": "Virology",
+          "evaluation_name": "Management",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1652,36 +1638,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Virology",
+            "evaluation_description": "EM on Management",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.47,
+            "score": 0.767,
             "details": {
-              "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)",
+              "description": "min=0.767, mean=0.767, max=0.767, sum=1.534 (2)",
               "tab": "Accuracy",
-              "Virology - Observed inference time (s)": "{\"description\": \"min=0.644, mean=0.644, max=0.644, sum=1.288 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6437842285776713\"}",
-              "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}",
-              "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Virology - # prompt tokens": "{\"description\": \"min=404.349, mean=404.349, max=404.349, sum=808.699 (2)\", \"tab\": \"General information\", \"score\": \"404.34939759036143\"}",
-              "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Management - Observed inference time (s)": "{\"description\": \"min=0.365, mean=0.365, max=0.365, sum=0.73 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.36507687059420985\"}",
+              "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}",
+              "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Management - # prompt tokens": "{\"description\": \"min=324.359, mean=324.359, max=324.359, sum=648.718 (2)\", \"tab\": \"General information\", \"score\": \"324.3592233009709\"}",
+              "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"virology\"",
+              "subject": "\"management\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_virology\""
+              "groups": "\"mmlu_management\""
             }
           }
         },
         {
-          "evaluation_name": "World Religions",
+          "evaluation_name": "Marketing",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1690,36 +1676,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on World Religions",
+            "evaluation_description": "EM on Marketing",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.825,
+            "score": 0.842,
             "details": {
-              "description": "min=0.825, mean=0.825, max=0.825, sum=1.649 (2)",
+              "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)",
               "tab": "Accuracy",
-              "World Religions - Observed inference time (s)": "{\"description\": \"min=0.266, mean=0.266, max=0.266, sum=0.532 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.26615772330970094\"}",
-              "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}",
-              "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "World Religions - # prompt tokens": "{\"description\": \"min=317.924, mean=317.924, max=317.924, sum=635.848 (2)\", \"tab\": \"General information\", \"score\": \"317.92397660818716\"}",
-              "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Marketing - Observed inference time (s)": "{\"description\": \"min=0.585, mean=0.585, max=0.585, sum=1.17 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.58499161606161\"}",
+              "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}",
+              "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Marketing - # prompt tokens": "{\"description\": \"min=472.423, mean=472.423, max=472.423, sum=944.846 (2)\", \"tab\": \"General information\", \"score\": \"472.4230769230769\"}",
+              "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"world_religions\"",
+              "subject": "\"marketing\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_world_religions\""
+              "groups": "\"mmlu_marketing\""
             }
           }
         },
         {
-          "evaluation_name": "Mean win rate",
+          "evaluation_name": "Medical Genetics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1728,404 +1714,418 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "How many models this model outperforms on average (over columns).",
+            "evaluation_description": "EM on Medical Genetics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.509,
+            "score": 0.75,
             "details": {
-              "description": "",
-              "tab": "Efficiency"
+              "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)",
+              "tab": "Accuracy",
+              "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.268, mean=0.268, max=0.268, sum=0.535 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2675498366355896\"}",
+              "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Medical Genetics - # prompt tokens": "{\"description\": \"min=414.71, mean=414.71, max=414.71, sum=829.42 (2)\", \"tab\": \"General information\", \"score\": \"414.71\"}",
+              "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"medical_genetics\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_medical_genetics\""
+            }
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": {
-        "additional_details": {
-          "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
-          "method": "\"multiple_choice_joint\"",
-          "eval_split": "\"test\"",
-          "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
-        }
-      }
-    },
-    {
-      "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1774096306.427425",
-      "retrieved_timestamp": "1774096306.427425",
-      "source_metadata": {
-        "source_name": "helm_lite",
-        "source_type": "documentation",
-        "source_organization_name": "crfm",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "helm",
-        "version": "unknown"
-      },
-      "benchmark": "helm_lite",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Mean win rate",
+          "evaluation_name": "Miscellaneous",
           "source_data": {
-            "dataset_name": "helm_lite",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "How many models this model outperforms on average (over columns).",
+            "evaluation_description": "EM on Miscellaneous",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.196,
+            "score": 0.785,
             "details": {
-              "description": "",
+              "description": "min=0.785, mean=0.785, max=0.785, sum=1.571 (2)",
               "tab": "Accuracy",
-              "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.6493133583021223\"}",
-              "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}"
+              "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.504, mean=0.504, max=0.504, sum=1.008 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5038632959850599\"}",
+              "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}",
+              "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Miscellaneous - # prompt tokens": "{\"description\": \"min=357.519, mean=357.519, max=357.519, sum=715.037 (2)\", \"tab\": \"General information\", \"score\": \"357.51851851851853\"}",
+              "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"miscellaneous\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_miscellaneous\""
+            }
           }
         },
         {
-          "evaluation_name": "NarrativeQA",
+          "evaluation_name": "Moral Scenarios",
           "source_data": {
-            "dataset_name": "NarrativeQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "F1 on NarrativeQA",
+            "evaluation_description": "EM on Moral Scenarios",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.716,
+            "score": 0.393,
             "details": {
-              "description": "min=0.716, mean=0.716, max=0.716, sum=0.716 (1)",
+              "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)",
               "tab": "Accuracy",
-              "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.813, mean=0.813, max=0.813, sum=0.813 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.8132137520212522\"}",
-              "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}",
-              "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NarrativeQA - # prompt tokens": "{\"description\": \"min=3924.33, mean=3924.33, max=3924.33, sum=3924.33 (1)\", \"tab\": \"General information\", \"score\": \"3924.3295774647886\"}",
-              "NarrativeQA - # output tokens": "{\"description\": \"min=7.107, mean=7.107, max=7.107, sum=7.107 (1)\", \"tab\": \"General information\", \"score\": \"7.107042253521127\"}"
+              "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.777, mean=0.777, max=0.777, sum=1.553 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7765735477381359\"}",
+              "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.493, mean=0.493, max=0.493, sum=0.986 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4927780463042872\"}",
+              "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}",
+              "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Moral Disputes - # prompt tokens": "{\"description\": \"min=549.038, mean=549.038, max=549.038, sum=1098.075 (2)\", \"tab\": \"General information\", \"score\": \"549.0375722543353\"}",
+              "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}",
+              "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Moral Scenarios - # prompt tokens": "{\"description\": \"min=754.516, mean=754.516, max=754.516, sum=1509.032 (2)\", \"tab\": \"General information\", \"score\": \"754.5162011173185\"}",
+              "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"moral_scenarios\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_moral_scenarios\""
+            }
           }
         },
         {
-          "evaluation_name": "NaturalQuestions (closed-book)",
+          "evaluation_name": "Nutrition",
           "source_data": {
-            "dataset_name": "NaturalQuestions (closed-book)",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "F1 on NaturalQuestions (closed-book)",
+            "evaluation_description": "EM on Nutrition",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.253,
+            "score": 0.676,
             "details": {
-              "description": "min=0.253, mean=0.253, max=0.253, sum=0.253 (1)",
+              "description": "min=0.676, mean=0.676, max=0.676, sum=1.353 (2)",
               "tab": "Accuracy",
-              "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.563, mean=0.563, max=0.563, sum=0.563 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5634698050022126\"}",
-              "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.535, mean=0.535, max=0.535, sum=0.535 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5347676448822022\"}",
-              "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "NaturalQuestions (open-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=2498.79, mean=2498.79, max=2498.79, sum=2498.79 (1)\", \"tab\": \"General information\", \"score\": \"2498.79\"}",
-              "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=12.448, mean=12.448, max=12.448, sum=12.448 (1)\", \"tab\": \"General information\", \"score\": \"12.448\"}",
-              "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=172.069, mean=172.069, max=172.069, sum=172.069 (1)\", \"tab\": \"General information\", \"score\": \"172.069\"}",
-              "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=20.461, mean=20.461, max=20.461, sum=20.461 (1)\", \"tab\": \"General information\", \"score\": \"20.461\"}"
+              "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.236, mean=0.236, max=0.236, sum=0.471 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.23563866054310517\"}",
+              "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}",
+              "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Nutrition - # prompt tokens": "{\"description\": \"min=689.69, mean=689.69, max=689.69, sum=1379.379 (2)\", \"tab\": \"General information\", \"score\": \"689.6895424836601\"}",
+              "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "mode": "\"closedbook\""
+              "subject": "\"nutrition\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_nutrition\""
             }
           }
         },
         {
-          "evaluation_name": "OpenbookQA",
+          "evaluation_name": "Prehistory",
           "source_data": {
-            "dataset_name": "OpenbookQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on OpenbookQA",
+            "evaluation_description": "EM on Prehistory",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.79,
+            "score": 0.673,
             "details": {
-              "description": "min=0.79, mean=0.79, max=0.79, sum=0.79 (1)",
+              "description": "min=0.673, mean=0.673, max=0.673, sum=1.346 (2)",
               "tab": "Accuracy",
-              "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.256, mean=0.256, max=0.256, sum=0.256 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.25593132400512697\"}",
-              "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}",
-              "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "OpenbookQA - # prompt tokens": "{\"description\": \"min=289.15, mean=289.15, max=289.15, sum=289.15 (1)\", \"tab\": \"General information\", \"score\": \"289.15\"}",
-              "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.345, mean=0.345, max=0.345, sum=0.69 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.34476134880089465\"}",
+              "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}",
+              "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Prehistory - # prompt tokens": "{\"description\": \"min=611.145, mean=611.145, max=611.145, sum=1222.29 (2)\", \"tab\": \"General information\", \"score\": \"611.145061728395\"}",
+              "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "dataset": "\"openbookqa\"",
-              "method": "\"multiple_choice_joint\""
+              "subject": "\"prehistory\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_prehistory\""
             }
           }
         },
         {
-          "evaluation_name": "MMLU",
+          "evaluation_name": "Public Relations",
           "source_data": {
-            "dataset_name": "MMLU",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MMLU",
+            "evaluation_description": "EM on Public Relations",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.51,
+            "score": 0.636,
             "details": {
-              "description": "min=0.27, mean=0.51, max=0.79, sum=2.551 (5)",
+              "description": "min=0.636, mean=0.636, max=0.636, sum=1.273 (2)",
               "tab": "Accuracy",
-              "MMLU - Observed inference time (s)": "{\"description\": \"min=0.221, mean=0.372, max=0.487, sum=1.862 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.37230395750413864\"}",
-              "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}",
-              "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MMLU - # prompt tokens": "{\"description\": \"min=411.44, mean=532.091, max=696.175, sum=2660.455 (5)\", \"tab\": \"General information\", \"score\": \"532.0910877192983\"}",
-              "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.327, mean=0.327, max=0.327, sum=0.654 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3271717678416859\"}",
+              "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}",
+              "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Public Relations - # prompt tokens": "{\"description\": \"min=471.036, mean=471.036, max=471.036, sum=942.073 (2)\", \"tab\": \"General information\", \"score\": \"471.03636363636366\"}",
+              "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]",
-              "method": "\"multiple_choice_joint\""
+              "subject": "\"public_relations\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_public_relations\""
             }
           }
         },
         {
-          "evaluation_name": "MATH",
+          "evaluation_name": "Security Studies",
           "source_data": {
-            "dataset_name": "MATH",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "Equivalent (CoT) on MATH",
+            "evaluation_description": "EM on Security Studies",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.289,
+            "score": 0.682,
             "details": {
-              "description": "min=0.115, mean=0.289, max=0.477, sum=2.02 (7)",
+              "description": "min=0.682, mean=0.682, max=0.682, sum=1.363 (2)",
               "tab": "Accuracy",
-              "MATH - Observed inference time (s)": "{\"description\": \"min=2.027, mean=2.656, max=3.039, sum=18.593 (7)\", \"tab\": \"Efficiency\", \"score\": \"2.656151831465352\"}",
-              "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}",
-              "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}",
-              "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MATH - # prompt tokens": "{\"description\": \"min=991.615, mean=1455.266, max=2502.962, sum=10186.865 (7)\", \"tab\": \"General information\", \"score\": \"1455.2664139976257\"}",
-              "MATH - # output tokens": "{\"description\": \"min=123.616, mean=149.99, max=172.789, sum=1049.933 (7)\", \"tab\": \"General information\", \"score\": \"149.99043902740354\"}"
+              "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.561, mean=0.561, max=0.561, sum=1.121 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5606838294437954\"}",
+              "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}",
+              "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Security Studies - # prompt tokens": "{\"description\": \"min=1324.865, mean=1324.865, max=1324.865, sum=2649.731 (2)\", \"tab\": \"General information\", \"score\": \"1324.865306122449\"}",
+              "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]",
-              "level": "\"1\"",
-              "use_official_examples": "\"False\"",
-              "use_chain_of_thought": "\"True\""
+              "subject": "\"security_studies\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_security_studies\""
             }
           }
         },
         {
-          "evaluation_name": "GSM8K",
+          "evaluation_name": "Sociology",
           "source_data": {
-            "dataset_name": "GSM8K",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on GSM8K",
+            "evaluation_description": "EM on Sociology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.538,
+            "score": 0.806,
             "details": {
-              "description": "min=0.538, mean=0.538, max=0.538, sum=0.538 (1)",
+              "description": "min=0.806, mean=0.806, max=0.806, sum=1.612 (2)",
               "tab": "Accuracy",
-              "GSM8K - Observed inference time (s)": "{\"description\": \"min=3.95, mean=3.95, max=3.95, sum=3.95 (1)\", \"tab\": \"Efficiency\", \"score\": \"3.949965229511261\"}",
-              "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "GSM8K - # prompt tokens": "{\"description\": \"min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)\", \"tab\": \"General information\", \"score\": \"1187.268\"}",
-              "GSM8K - # output tokens": "{\"description\": \"min=196.611, mean=196.611, max=196.611, sum=196.611 (1)\", \"tab\": \"General information\", \"score\": \"196.611\"}"
+              "Sociology - Observed inference time (s)": "{\"description\": \"min=0.413, mean=0.413, max=0.413, sum=0.825 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41272182962787685\"}",
+              "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}",
+              "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Sociology - # prompt tokens": "{\"description\": \"min=496.95, mean=496.95, max=496.95, sum=993.9 (2)\", \"tab\": \"General information\", \"score\": \"496.9502487562189\"}",
+              "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "stop": "\"none\""
+              "subject": "\"sociology\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_sociology\""
             }
           }
         },
         {
-          "evaluation_name": "LegalBench",
+          "evaluation_name": "Virology",
           "source_data": {
-            "dataset_name": "LegalBench",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on LegalBench",
+            "evaluation_description": "EM on Virology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.331,
+            "score": 0.47,
             "details": {
-              "description": "min=0.063, mean=0.331, max=0.733, sum=1.655 (5)",
+              "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)",
               "tab": "Accuracy",
-              "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.316, mean=0.489, max=0.855, sum=2.444 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.4887186054518059\"}",
-              "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}",
-              "LegalBench - # train": "{\"description\": \"min=4, mean=4.8, max=5, sum=24 (5)\", \"tab\": \"General information\", \"score\": \"4.8\"}",
-              "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "LegalBench - # prompt tokens": "{\"description\": \"min=236.453, mean=1750.748, max=7224.488, sum=8753.741 (5)\", \"tab\": \"General information\", \"score\": \"1750.7482458432962\"}",
-              "LegalBench - # output tokens": "{\"description\": \"min=2, mean=9.174, max=15.242, sum=45.871 (5)\", \"tab\": \"General information\", \"score\": \"9.17419274343898\"}"
+              "Virology - Observed inference time (s)": "{\"description\": \"min=0.644, mean=0.644, max=0.644, sum=1.288 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6437842285776713\"}",
+              "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}",
+              "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Virology - # prompt tokens": "{\"description\": \"min=404.349, mean=404.349, max=404.349, sum=808.699 (2)\", \"tab\": \"General information\", \"score\": \"404.34939759036143\"}",
+              "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]"
+              "subject": "\"virology\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_virology\""
             }
           }
         },
         {
-          "evaluation_name": "MedQA",
+          "evaluation_name": "World Religions",
           "source_data": {
-            "dataset_name": "MedQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MedQA",
+            "evaluation_description": "EM on World Religions",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.517,
+            "score": 0.825,
             "details": {
-              "description": "min=0.517, mean=0.517, max=0.517, sum=0.517 (1)",
+              "description": "min=0.825, mean=0.825, max=0.825, sum=1.649 (2)",
               "tab": "Accuracy",
-              "MedQA - Observed inference time (s)": "{\"description\": \"min=0.418, mean=0.418, max=0.418, sum=0.418 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4182186216767692\"}",
-              "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}",
-              "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MedQA - # prompt tokens": "{\"description\": \"min=1202.093, mean=1202.093, max=1202.093, sum=1202.093 (1)\", \"tab\": \"General information\", \"score\": \"1202.0934393638172\"}",
-              "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "World Religions - Observed inference time (s)": "{\"description\": \"min=0.266, mean=0.266, max=0.266, sum=0.532 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.26615772330970094\"}",
+              "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}",
+              "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "World Religions - # prompt tokens": "{\"description\": \"min=317.924, mean=317.924, max=317.924, sum=635.848 (2)\", \"tab\": \"General information\", \"score\": \"317.92397660818716\"}",
+              "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"world_religions\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_world_religions\""
+            }
           }
         },
         {
-          "evaluation_name": "WMT 2014",
+          "evaluation_name": "Mean win rate",
           "source_data": {
-            "dataset_name": "WMT 2014",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "BLEU-4 on WMT 2014",
+            "evaluation_description": "How many models this model outperforms on average (over columns).",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.142,
+            "score": 0.509,
             "details": {
-              "description": "min=0.047, mean=0.142, max=0.184, sum=0.712 (5)",
-              "tab": "Accuracy",
-              "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=0.582, mean=0.775, max=0.872, sum=3.875 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.7750062139801958\"}",
-              "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}",
-              "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "WMT 2014 - # prompt tokens": "{\"description\": \"min=148.306, mean=162.433, max=181.018, sum=812.166 (5)\", \"tab\": \"General information\", \"score\": \"162.43317355482492\"}",
-              "WMT 2014 - # output tokens": "{\"description\": \"min=28.3, mean=30.51, max=31.912, sum=152.552 (5)\", \"tab\": \"General information\", \"score\": \"30.510483732222053\"}"
+              "description": "",
+              "tab": "Efficiency"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]"
-            }
+            "additional_details": {}
           }
         }
       ],
       "detailed_evaluation_results": null,
       "generation_config": {
-        "additional_details": {}
+        "additional_details": {
+          "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
+          "method": "\"multiple_choice_joint\"",
+          "eval_split": "\"test\"",
+          "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
+        }
       }
     },
     {
diff --git a/data/models/mistralai_mistral-small-2503.json b/data/models/mistralai_mistral-small-2503.json
index be5d73de7278abb3c747dbebd44b60d3fa624503..6df0d972b005ae32b060fdc4673d6092e770670f 100644
--- a/data/models/mistralai_mistral-small-2503.json
+++ b/data/models/mistralai_mistral-small-2503.json
@@ -10,8 +10,8 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "global-mmlu-lite/mistralai_mistral-small-2503/1773936496.366405",
-      "retrieved_timestamp": "1773936496.366405",
+      "evaluation_id": "global-mmlu-lite/mistralai_mistral-small-2503/1773936583.743359",
+      "retrieved_timestamp": "1773936583.743359",
       "source_metadata": {
         "source_name": "Global MMLU Lite Leaderboard",
         "source_type": "documentation",
@@ -525,8 +525,8 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "global-mmlu-lite/mistralai_mistral-small-2503/1773936583.743359",
-      "retrieved_timestamp": "1773936583.743359",
+      "evaluation_id": "global-mmlu-lite/mistralai_mistral-small-2503/1773936496.366405",
+      "retrieved_timestamp": "1773936496.366405",
       "source_metadata": {
         "source_name": "Global MMLU Lite Leaderboard",
         "source_type": "documentation",
diff --git a/data/models/mistralai_mixtral-8x7b-v0.1.json b/data/models/mistralai_mixtral-8x7b-v0.1.json
index 9d997e3527157e47894ae0f49b424a1634297e78..c3ac844f0072de0a86748b754d6c90570298768c 100644
--- a/data/models/mistralai_mixtral-8x7b-v0.1.json
+++ b/data/models/mistralai_mixtral-8x7b-v0.1.json
@@ -5,7 +5,7 @@
     "developer": "mistralai",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "bfloat16",
+      "precision": "float16",
       "architecture": "MixtralForCausalLM",
       "params_billions": "46.703"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2415
+            "score": 0.2326
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5087
+            "score": 0.5098
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.102
+            "score": 0.0937
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3138
+            "score": 0.3205
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4321
+            "score": 0.4413
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.385
+            "score": 0.3871
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2326
+            "score": 0.2415
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5098
+            "score": 0.5087
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0937
+            "score": 0.102
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3205
+            "score": 0.3138
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4413
+            "score": 0.4321
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3871
+            "score": 0.385
           }
         }
       ],
diff --git a/data/models/mlabonne_neuraldaredevil-8b-abliterated.json b/data/models/mlabonne_neuraldaredevil-8b-abliterated.json
index d443de39bb7ed82b00df80190432e583c21fd660..7ef165972eafdeec56d923c82e02fdbbc9479eac 100644
--- a/data/models/mlabonne_neuraldaredevil-8b-abliterated.json
+++ b/data/models/mlabonne_neuraldaredevil-8b-abliterated.json
@@ -5,7 +5,7 @@
     "developer": "mlabonne",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "LlamaForCausalLM",
       "params_billions": "8.03"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7561
+            "score": 0.4162
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5111
+            "score": 0.5124
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0906
+            "score": 0.0853
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3062
+            "score": 0.3029
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4019
+            "score": 0.415
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3841
+            "score": 0.3802
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4162
+            "score": 0.7561
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5124
+            "score": 0.5111
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0853
+            "score": 0.0906
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3029
+            "score": 0.3062
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.415
+            "score": 0.4019
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3802
+            "score": 0.3841
           }
         }
       ],
diff --git a/data/models/moonshot-ai_kimi-k2-instruct.json b/data/models/moonshot-ai_kimi-k2-instruct.json
index 2cfded0145e5c6821159f45b392f6b86e15c7f49..758984500ae56b028445a56fb8562c74a90a3a8f 100644
--- a/data/models/moonshot-ai_kimi-k2-instruct.json
+++ b/data/models/moonshot-ai_kimi-k2-instruct.json
@@ -4,13 +4,13 @@
     "id": "moonshot-ai/kimi-k2-instruct",
     "developer": "Moonshot AI",
     "additional_details": {
-      "agent_name": "Terminus 2",
-      "agent_organization": "Terminal Bench"
+      "agent_name": "OpenHands",
+      "agent_organization": "OpenHands"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__kimi-k2-instruct/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/openhands__kimi-k2-instruct/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-01",
+          "evaluation_timestamp": "2025-11-02",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,17 +43,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 27.8,
+            "score": 26.7,
             "uncertainty": {
               "standard_error": {
-                "value": 2.5
+                "value": 2.7
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Kimi K2 Instruct\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Kimi K2 Instruct\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +70,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Kimi K2 Instruct\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Kimi K2 Instruct\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -84,7 +84,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/openhands__kimi-k2-instruct/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__kimi-k2-instruct/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +108,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-02",
+          "evaluation_timestamp": "2025-11-01",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,17 +117,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 26.7,
+            "score": 27.8,
             "uncertainty": {
               "standard_error": {
-                "value": 2.7
+                "value": 2.5
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Kimi K2 Instruct\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Kimi K2 Instruct\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -144,7 +144,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Kimi K2 Instruct\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Kimi K2 Instruct\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/multiple_multiple.json b/data/models/multiple_multiple.json
index 0a0e35b1e041b4e94e4f88e456a39900b87f0e44..5fae5f56e342c454584c0143a055e41581a00848 100644
--- a/data/models/multiple_multiple.json
+++ b/data/models/multiple_multiple.json
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-11",
+          "evaluation_timestamp": "2025-11-20",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,10 +43,10 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 50.1,
+            "score": 59.1,
             "uncertainty": {
               "standard_error": {
-                "value": 2.7
+                "value": 2.8
               },
               "num_samples": 435
             }
@@ -84,7 +84,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/warp__multiple/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/junie-cli__multiple/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +108,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-20",
+          "evaluation_timestamp": "2026-03-07",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,17 +117,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 59.1,
+            "score": 71.0,
             "uncertainty": {
               "standard_error": {
-                "value": 2.8
+                "value": 2.9
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Multiple\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -144,7 +144,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Multiple\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -158,7 +158,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/warp__multiple/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/abacus-ai-desktop__multiple/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -182,7 +182,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-12",
+          "evaluation_timestamp": "2025-12-11",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -191,17 +191,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 61.2,
+            "score": 58.4,
             "uncertainty": {
               "standard_error": {
-                "value": 3.0
+                "value": 2.8
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Abacus AI Desktop\" -m \"Multiple\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -218,7 +218,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Abacus AI Desktop\" -m \"Multiple\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -232,7 +232,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/abacus-ai-desktop__multiple/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/warp__multiple/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -256,7 +256,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-11",
+          "evaluation_timestamp": "2025-12-12",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -265,17 +265,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 58.4,
+            "score": 61.2,
             "uncertainty": {
               "standard_error": {
-                "value": 2.8
+                "value": 3.0
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Abacus AI Desktop\" -m \"Multiple\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -292,7 +292,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Abacus AI Desktop\" -m \"Multiple\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -306,7 +306,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/ob-1__multiple/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/warp__multiple/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -330,7 +330,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-03-05",
+          "evaluation_timestamp": "2025-11-11",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -339,17 +339,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 72.4,
+            "score": 50.1,
             "uncertainty": {
               "standard_error": {
-                "value": 2.3
+                "value": 2.7
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OB-1\" -m \"Multiple\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -366,7 +366,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OB-1\" -m \"Multiple\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -380,7 +380,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/junie-cli__multiple/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/ob-1__multiple/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -404,7 +404,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-03-07",
+          "evaluation_timestamp": "2026-03-05",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -413,17 +413,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 71.0,
+            "score": 72.4,
             "uncertainty": {
               "standard_error": {
-                "value": 2.9
+                "value": 2.3
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Multiple\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OB-1\" -m \"Multiple\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -440,7 +440,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Multiple\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OB-1\" -m \"Multiple\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/nazimali_mistral-nemo-kurdish-instruct.json b/data/models/nazimali_mistral-nemo-kurdish-instruct.json
index bf12d1ac4ce4431e1cc4657892f1de48dd5df10b..7bcf436e7a29205a95d8b228b8e52bcfb9264e7a 100644
--- a/data/models/nazimali_mistral-nemo-kurdish-instruct.json
+++ b/data/models/nazimali_mistral-nemo-kurdish-instruct.json
@@ -5,7 +5,7 @@
     "developer": "nazimali",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "MistralForCausalLM",
       "params_billions": "12.248"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.486
+            "score": 0.4964
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4721
+            "score": 0.4699
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0846
+            "score": 0.0045
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2844
+            "score": 0.2827
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4006
+            "score": 0.3979
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3087
+            "score": 0.3063
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4964
+            "score": 0.486
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4699
+            "score": 0.4721
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0045
+            "score": 0.0846
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2827
+            "score": 0.2844
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3979
+            "score": 0.4006
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3063
+            "score": 0.3087
           }
         }
       ],
diff --git a/data/models/nicolinho_qrm-gemma-2-27b.json b/data/models/nicolinho_qrm-gemma-2-27b.json
index 98185886d3c230ddcd90456c69a5aeed49795fc5..1dea90f885df9d34139a9ef21e55b3dcce1a25fd 100644
--- a/data/models/nicolinho_qrm-gemma-2-27b.json
+++ b/data/models/nicolinho_qrm-gemma-2-27b.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench-2/nicolinho_QRM-Gemma-2-27B/1766412838.146816",
+      "evaluation_id": "reward-bench/nicolinho_QRM-Gemma-2-27B/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,104 +31,128 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7667
+            "score": 0.9444
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Factuality",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7853
+            "score": 0.9665
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3719
+            "score": 0.9013
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6995
+            "score": 0.927
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9578
+            "score": 0.9826
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": null
+    },
+    {
+      "evaluation_id": "reward-bench-2/nicolinho_QRM-Gemma-2-27B/1766412838.146816",
+      "retrieved_timestamp": "1766412838.146816",
+      "source_metadata": {
+        "source_name": "RewardBench 2",
+        "source_type": "documentation",
+        "source_organization_name": "Allen Institute for AI",
+        "source_organization_url": "https://allenai.org",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "rewardbench",
+        "version": "0.1.3",
+        "additional_details": {
+          "subsets": "Chat, Chat Hard, Safety, Reasoning",
+          "hf_space": "allenai/reward-bench"
+        }
+      },
+      "benchmark": "reward-bench",
+      "evaluation_results": [
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9535
+            "score": 0.7667
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
@@ -137,135 +161,111 @@
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8321
+            "score": 0.7853
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
             "hf_repo": "allenai/reward-bench-2-results"
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": null
-    },
-    {
-      "evaluation_id": "reward-bench/nicolinho_QRM-Gemma-2-27B/1766412838.146816",
-      "retrieved_timestamp": "1766412838.146816",
-      "source_metadata": {
-        "source_name": "RewardBench",
-        "source_type": "documentation",
-        "source_organization_name": "Allen Institute for AI",
-        "source_organization_url": "https://allenai.org",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "rewardbench",
-        "version": "0.1.3",
-        "additional_details": {
-          "subsets": "Chat, Chat Hard, Safety, Reasoning",
-          "hf_space": "allenai/reward-bench"
-        }
-      },
-      "benchmark": "reward-bench",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Score",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9444
+            "score": 0.3719
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Math",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Math score - measures mathematical reasoning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9665
+            "score": 0.6995
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9013
+            "score": 0.9578
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.927
+            "score": 0.9535
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9826
+            "score": 0.8321
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
diff --git a/data/models/nicolinho_qrm-llama3.1-8b-v2.json b/data/models/nicolinho_qrm-llama3.1-8b-v2.json
index 0df8878cad15f33ea391f78f6a5406e147177591..71e586c5d191f366e8b76150e58f0f9807a69f6b 100644
--- a/data/models/nicolinho_qrm-llama3.1-8b-v2.json
+++ b/data/models/nicolinho_qrm-llama3.1-8b-v2.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench-2/nicolinho_QRM-Llama3.1-8B-v2/1766412838.146816",
+      "evaluation_id": "reward-bench/nicolinho_QRM-Llama3.1-8B-v2/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,104 +31,128 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7074
+            "score": 0.9314
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Factuality",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6653
+            "score": 0.9637
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4062
+            "score": 0.8684
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.612
+            "score": 0.9257
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9467
+            "score": 0.9677
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": null
+    },
+    {
+      "evaluation_id": "reward-bench-2/nicolinho_QRM-Llama3.1-8B-v2/1766412838.146816",
+      "retrieved_timestamp": "1766412838.146816",
+      "source_metadata": {
+        "source_name": "RewardBench 2",
+        "source_type": "documentation",
+        "source_organization_name": "Allen Institute for AI",
+        "source_organization_url": "https://allenai.org",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "rewardbench",
+        "version": "0.1.3",
+        "additional_details": {
+          "subsets": "Chat, Chat Hard, Safety, Reasoning",
+          "hf_space": "allenai/reward-bench"
+        }
+      },
+      "benchmark": "reward-bench",
+      "evaluation_results": [
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8909
+            "score": 0.7074
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
@@ -137,135 +161,111 @@
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7234
+            "score": 0.6653
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
             "hf_repo": "allenai/reward-bench-2-results"
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": null
-    },
-    {
-      "evaluation_id": "reward-bench/nicolinho_QRM-Llama3.1-8B-v2/1766412838.146816",
-      "retrieved_timestamp": "1766412838.146816",
-      "source_metadata": {
-        "source_name": "RewardBench",
-        "source_type": "documentation",
-        "source_organization_name": "Allen Institute for AI",
-        "source_organization_url": "https://allenai.org",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "rewardbench",
-        "version": "0.1.3",
-        "additional_details": {
-          "subsets": "Chat, Chat Hard, Safety, Reasoning",
-          "hf_space": "allenai/reward-bench"
-        }
-      },
-      "benchmark": "reward-bench",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Score",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9314
+            "score": 0.4062
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Math",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Math score - measures mathematical reasoning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9637
+            "score": 0.612
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8684
+            "score": 0.9467
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9257
+            "score": 0.8909
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9677
+            "score": 0.7234
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
diff --git a/data/models/nisten_franqwenstein-35b.json b/data/models/nisten_franqwenstein-35b.json
index a3333a53a7553456405aceea78020ff40ff402f8..3a70722384deca25a07bb44eb7942735ee94aa9c 100644
--- a/data/models/nisten_franqwenstein-35b.json
+++ b/data/models/nisten_franqwenstein-35b.json
@@ -5,7 +5,7 @@
     "developer": "nisten",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "Qwen2ForCausalLM",
       "params_billions": "34.714"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3799
+            "score": 0.3914
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6647
+            "score": 0.6591
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3406
+            "score": 0.3044
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4035
+            "score": 0.3591
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.494
+            "score": 0.4681
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5731
+            "score": 0.5611
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3914
+            "score": 0.3799
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6591
+            "score": 0.6647
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3044
+            "score": 0.3406
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3591
+            "score": 0.4035
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4681
+            "score": 0.494
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5611
+            "score": 0.5731
           }
         }
       ],
diff --git a/data/models/nousresearch_yarn-llama-2-7b-128k.json b/data/models/nousresearch_yarn-llama-2-7b-128k.json
deleted file mode 100644
index 031faa4d0ff78244e61988ea0bfc24cefdd224de..0000000000000000000000000000000000000000
--- a/data/models/nousresearch_yarn-llama-2-7b-128k.json
+++ /dev/null
@@ -1,145 +0,0 @@
-{
-  "model_info": {
-    "name": "Yarn-Llama-2-7b-128k",
-    "id": "NousResearch/Yarn-Llama-2-7b-128k",
-    "developer": "NousResearch",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "LlamaForCausalLM",
-      "params_billions": "7.0"
-    }
-  },
-  "evaluations": [
-    {
-      "evaluation_id": "hfopenllm_v2/NousResearch_Yarn-Llama-2-7b-128k/1773936498.240187",
-      "retrieved_timestamp": "1773936498.240187",
-      "source_metadata": {
-        "source_name": "HF Open LLM v2",
-        "source_type": "documentation",
-        "source_organization_name": "Hugging Face",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "lm-evaluation-harness",
-        "version": "0.4.0",
-        "additional_details": {
-          "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess"
-        }
-      },
-      "benchmark": "hfopenllm_v2",
-      "evaluation_results": [
-        {
-          "evaluation_name": "IFEval",
-          "source_data": {
-            "dataset_name": "IFEval",
-            "source_type": "hf_dataset",
-            "hf_repo": "google/IFEval"
-          },
-          "metric_config": {
-            "evaluation_description": "Accuracy on IFEval",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.1485
-          }
-        },
-        {
-          "evaluation_name": "BBH",
-          "source_data": {
-            "dataset_name": "BBH",
-            "source_type": "hf_dataset",
-            "hf_repo": "SaylorTwift/bbh"
-          },
-          "metric_config": {
-            "evaluation_description": "Accuracy on BBH",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.3248
-          }
-        },
-        {
-          "evaluation_name": "MATH Level 5",
-          "source_data": {
-            "dataset_name": "MATH Level 5",
-            "source_type": "hf_dataset",
-            "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-          },
-          "metric_config": {
-            "evaluation_description": "Exact Match on MATH Level 5",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.0151
-          }
-        },
-        {
-          "evaluation_name": "GPQA",
-          "source_data": {
-            "dataset_name": "GPQA",
-            "source_type": "hf_dataset",
-            "hf_repo": "Idavidrein/gpqa"
-          },
-          "metric_config": {
-            "evaluation_description": "Accuracy on GPQA",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.2601
-          }
-        },
-        {
-          "evaluation_name": "MUSR",
-          "source_data": {
-            "dataset_name": "MUSR",
-            "source_type": "hf_dataset",
-            "hf_repo": "TAUR-Lab/MuSR"
-          },
-          "metric_config": {
-            "evaluation_description": "Accuracy on MUSR",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.3967
-          }
-        },
-        {
-          "evaluation_name": "MMLU-PRO",
-          "source_data": {
-            "dataset_name": "MMLU-PRO",
-            "source_type": "hf_dataset",
-            "hf_repo": "TIGER-Lab/MMLU-Pro"
-          },
-          "metric_config": {
-            "evaluation_description": "Accuracy on MMLU-PRO",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.1791
-          }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": null
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/models/omkar1102_code-yi.json b/data/models/omkar1102_code-yi.json
index c43a8e6f44d50964e1b475e97cca0076acc3fcc2..420be452467271270dd617b6ac43e82dcaa608b1 100644
--- a/data/models/omkar1102_code-yi.json
+++ b/data/models/omkar1102_code-yi.json
@@ -5,7 +5,7 @@
     "developer": "Omkar1102",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "LlamaForCausalLM",
       "params_billions": "2.084"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2148
+            "score": 0.2254
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.276
+            "score": 0.275
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2508
+            "score": 0.2576
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3802
+            "score": 0.3762
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1126
+            "score": 0.1123
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2254
+            "score": 0.2148
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.275
+            "score": 0.276
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2576
+            "score": 0.2508
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3762
+            "score": 0.3802
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1123
+            "score": 0.1126
           }
         }
       ],
diff --git a/data/models/openai_gpt-3.5-turbo-0613.json b/data/models/openai_gpt-3.5-turbo-0613.json
index 9d2e47e97b66d9b4bf8d0c5ba01a8d6f70327065..c24676b03a30239bea93de7207f3f13985ab4587 100644
--- a/data/models/openai_gpt-3.5-turbo-0613.json
+++ b/data/models/openai_gpt-3.5-turbo-0613.json
@@ -6,233 +6,6 @@
     "inference_platform": "unknown"
   },
   "evaluations": [
-    {
-      "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1774096309.537868",
-      "retrieved_timestamp": "1774096309.537868",
-      "source_metadata": {
-        "source_name": "helm_instruct",
-        "source_type": "documentation",
-        "source_organization_name": "crfm",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "helm",
-        "version": "unknown"
-      },
-      "benchmark": "helm_instruct",
-      "evaluation_results": [
-        {
-          "evaluation_name": "Mean win rate",
-          "source_data": {
-            "dataset_name": "helm_instruct",
-            "source_type": "url",
-            "url": [
-              "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
-            ]
-          },
-          "metric_config": {
-            "evaluation_description": "How many models this model outperform on average (over columns).",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.689,
-            "details": {
-              "description": "",
-              "tab": "Instruction Following"
-            }
-          },
-          "generation_config": {
-            "additional_details": {}
-          }
-        },
-        {
-          "evaluation_name": "Anthropic RLHF dataset",
-          "source_data": {
-            "dataset_name": "Anthropic RLHF dataset",
-            "source_type": "url",
-            "url": [
-              "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
-            ]
-          },
-          "metric_config": {
-            "evaluation_description": "Harmlessness on Anthropic RLHF dataset",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 5.0
-          },
-          "score_details": {
-            "score": 4.964,
-            "details": {
-              "description": "min=4.915, mean=4.964, max=5, sum=39.715 (8)",
-              "tab": "Instruction Following"
-            }
-          },
-          "generation_config": {
-            "additional_details": {
-              "subset": "[\"hh\", \"hh\", \"hh\", \"hh\", \"red_team\", \"red_team\", \"red_team\", \"red_team\"]",
-              "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\", \"claude\", \"gpt4\", \"mturk\", \"scale\"]"
-            }
-          }
-        },
-        {
-          "evaluation_name": "Best ChatGPT Prompts",
-          "source_data": {
-            "dataset_name": "Best ChatGPT Prompts",
-            "source_type": "url",
-            "url": [
-              "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
-            ]
-          },
-          "metric_config": {
-            "evaluation_description": "Harmlessness on Best ChatGPT Prompts",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 5.0
-          },
-          "score_details": {
-            "score": 4.986,
-            "details": {
-              "description": "min=4.95, mean=4.986, max=5, sum=19.945 (4)",
-              "tab": "Instruction Following"
-            }
-          },
-          "generation_config": {
-            "additional_details": {
-              "path": "\"src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml\"",
-              "tags": "\"\"",
-              "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]"
-            }
-          }
-        },
-        {
-          "evaluation_name": "Koala test dataset",
-          "source_data": {
-            "dataset_name": "Koala test dataset",
-            "source_type": "url",
-            "url": [
-              "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
-            ]
-          },
-          "metric_config": {
-            "evaluation_description": "Harmlessness on Koala test dataset",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 5.0
-          },
-          "score_details": {
-            "score": 4.987,
-            "details": {
-              "description": "min=4.969, mean=4.987, max=5, sum=19.95 (4)",
-              "tab": "Instruction Following"
-            }
-          },
-          "generation_config": {
-            "additional_details": {
-              "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]"
-            }
-          }
-        },
-        {
-          "evaluation_name": "Open Assistant",
-          "source_data": {
-            "dataset_name": "Open Assistant",
-            "source_type": "url",
-            "url": [
-              "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
-            ]
-          },
-          "metric_config": {
-            "evaluation_description": "Harmlessness on Open Assistant",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 5.0
-          },
-          "score_details": {
-            "score": 4.987,
-            "details": {
-              "description": "min=4.96, mean=4.987, max=5, sum=19.95 (4)",
-              "tab": "Instruction Following"
-            }
-          },
-          "generation_config": {
-            "additional_details": {
-              "language": "\"en\"",
-              "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]"
-            }
-          }
-        },
-        {
-          "evaluation_name": "Self Instruct",
-          "source_data": {
-            "dataset_name": "Self Instruct",
-            "source_type": "url",
-            "url": [
-              "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
-            ]
-          },
-          "metric_config": {
-            "evaluation_description": "Harmlessness on Self Instruct",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 5.0
-          },
-          "score_details": {
-            "score": 4.99,
-            "details": {
-              "description": "min=4.97, mean=4.99, max=5, sum=19.96 (4)",
-              "tab": "Instruction Following"
-            }
-          },
-          "generation_config": {
-            "additional_details": {
-              "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]"
-            }
-          }
-        },
-        {
-          "evaluation_name": "Vicuna",
-          "source_data": {
-            "dataset_name": "Vicuna",
-            "source_type": "url",
-            "url": [
-              "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
-            ]
-          },
-          "metric_config": {
-            "evaluation_description": "Harmlessness on Vicuna",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 5.0
-          },
-          "score_details": {
-            "score": 4.992,
-            "details": {
-              "description": "min=4.975, mean=4.992, max=5, sum=19.969 (4)",
-              "tab": "Instruction Following"
-            }
-          },
-          "generation_config": {
-            "additional_details": {
-              "category": "\"all\"",
-              "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]"
-            }
-          }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": {
-        "additional_details": {}
-      }
-    },
     {
       "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0613/1774096308.339228",
       "retrieved_timestamp": "1774096308.339228",
@@ -897,6 +670,233 @@
         "additional_details": {}
       }
     },
+    {
+      "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1774096309.537868",
+      "retrieved_timestamp": "1774096309.537868",
+      "source_metadata": {
+        "source_name": "helm_instruct",
+        "source_type": "documentation",
+        "source_organization_name": "crfm",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "helm",
+        "version": "unknown"
+      },
+      "benchmark": "helm_instruct",
+      "evaluation_results": [
+        {
+          "evaluation_name": "Mean win rate",
+          "source_data": {
+            "dataset_name": "helm_instruct",
+            "source_type": "url",
+            "url": [
+              "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+            ]
+          },
+          "metric_config": {
+            "evaluation_description": "How many models this model outperform on average (over columns).",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.689,
+            "details": {
+              "description": "",
+              "tab": "Instruction Following"
+            }
+          },
+          "generation_config": {
+            "additional_details": {}
+          }
+        },
+        {
+          "evaluation_name": "Anthropic RLHF dataset",
+          "source_data": {
+            "dataset_name": "Anthropic RLHF dataset",
+            "source_type": "url",
+            "url": [
+              "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+            ]
+          },
+          "metric_config": {
+            "evaluation_description": "Harmlessness on Anthropic RLHF dataset",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 5.0
+          },
+          "score_details": {
+            "score": 4.964,
+            "details": {
+              "description": "min=4.915, mean=4.964, max=5, sum=39.715 (8)",
+              "tab": "Instruction Following"
+            }
+          },
+          "generation_config": {
+            "additional_details": {
+              "subset": "[\"hh\", \"hh\", \"hh\", \"hh\", \"red_team\", \"red_team\", \"red_team\", \"red_team\"]",
+              "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\", \"claude\", \"gpt4\", \"mturk\", \"scale\"]"
+            }
+          }
+        },
+        {
+          "evaluation_name": "Best ChatGPT Prompts",
+          "source_data": {
+            "dataset_name": "Best ChatGPT Prompts",
+            "source_type": "url",
+            "url": [
+              "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+            ]
+          },
+          "metric_config": {
+            "evaluation_description": "Harmlessness on Best ChatGPT Prompts",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 5.0
+          },
+          "score_details": {
+            "score": 4.986,
+            "details": {
+              "description": "min=4.95, mean=4.986, max=5, sum=19.945 (4)",
+              "tab": "Instruction Following"
+            }
+          },
+          "generation_config": {
+            "additional_details": {
+              "path": "\"src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml\"",
+              "tags": "\"\"",
+              "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]"
+            }
+          }
+        },
+        {
+          "evaluation_name": "Koala test dataset",
+          "source_data": {
+            "dataset_name": "Koala test dataset",
+            "source_type": "url",
+            "url": [
+              "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+            ]
+          },
+          "metric_config": {
+            "evaluation_description": "Harmlessness on Koala test dataset",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 5.0
+          },
+          "score_details": {
+            "score": 4.987,
+            "details": {
+              "description": "min=4.969, mean=4.987, max=5, sum=19.95 (4)",
+              "tab": "Instruction Following"
+            }
+          },
+          "generation_config": {
+            "additional_details": {
+              "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]"
+            }
+          }
+        },
+        {
+          "evaluation_name": "Open Assistant",
+          "source_data": {
+            "dataset_name": "Open Assistant",
+            "source_type": "url",
+            "url": [
+              "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+            ]
+          },
+          "metric_config": {
+            "evaluation_description": "Harmlessness on Open Assistant",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 5.0
+          },
+          "score_details": {
+            "score": 4.987,
+            "details": {
+              "description": "min=4.96, mean=4.987, max=5, sum=19.95 (4)",
+              "tab": "Instruction Following"
+            }
+          },
+          "generation_config": {
+            "additional_details": {
+              "language": "\"en\"",
+              "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]"
+            }
+          }
+        },
+        {
+          "evaluation_name": "Self Instruct",
+          "source_data": {
+            "dataset_name": "Self Instruct",
+            "source_type": "url",
+            "url": [
+              "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+            ]
+          },
+          "metric_config": {
+            "evaluation_description": "Harmlessness on Self Instruct",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 5.0
+          },
+          "score_details": {
+            "score": 4.99,
+            "details": {
+              "description": "min=4.97, mean=4.99, max=5, sum=19.96 (4)",
+              "tab": "Instruction Following"
+            }
+          },
+          "generation_config": {
+            "additional_details": {
+              "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]"
+            }
+          }
+        },
+        {
+          "evaluation_name": "Vicuna",
+          "source_data": {
+            "dataset_name": "Vicuna",
+            "source_type": "url",
+            "url": [
+              "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json"
+            ]
+          },
+          "metric_config": {
+            "evaluation_description": "Harmlessness on Vicuna",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 5.0
+          },
+          "score_details": {
+            "score": 4.992,
+            "details": {
+              "description": "min=4.975, mean=4.992, max=5, sum=19.969 (4)",
+              "tab": "Instruction Following"
+            }
+          },
+          "generation_config": {
+            "additional_details": {
+              "category": "\"all\"",
+              "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]"
+            }
+          }
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
     {
       "evaluation_id": "helm_lite/openai_gpt-3.5-turbo-0613/1774096306.427425",
       "retrieved_timestamp": "1774096306.427425",
diff --git a/data/models/openai_gpt-4-0613.json b/data/models/openai_gpt-4-0613.json
index 968890d0737b80810e6c39ba99b0e4eb0e407fa8..f9edf8b400bcfd0d8167fb29ff1bda235867f788 100644
--- a/data/models/openai_gpt-4-0613.json
+++ b/data/models/openai_gpt-4-0613.json
@@ -7,10 +7,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1774096312.00548",
-      "retrieved_timestamp": "1774096312.00548",
+      "evaluation_id": "helm_lite/openai_gpt-4-0613/1774096306.427425",
+      "retrieved_timestamp": "1774096306.427425",
       "source_metadata": {
-        "source_name": "helm_mmlu",
+        "source_name": "helm_lite",
         "source_type": "documentation",
         "source_organization_name": "crfm",
         "evaluator_relationship": "third_party"
@@ -19,438 +19,380 @@
         "name": "helm",
         "version": "unknown"
       },
-      "benchmark": "helm_mmlu",
+      "benchmark": "helm_lite",
       "evaluation_results": [
         {
-          "evaluation_name": "MMLU All Subjects",
+          "evaluation_name": "Mean win rate",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "helm_lite",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MMLU All Subjects",
+            "evaluation_description": "How many models this model outperforms on average (over columns).",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.824,
+            "score": 0.867,
             "details": {
-              "description": "min=0.54, mean=0.824, max=0.99, sum=93.978 (114)",
+              "description": "",
               "tab": "Accuracy",
-              "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.364, mean=0.447, max=0.579, sum=51.005 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.4474144183932911\"}",
-              "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}",
-              "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=268.561, mean=607.852, max=2791.073, sum=69295.086 (114)\", \"tab\": \"General information\", \"score\": \"607.851634217556\"}",
-              "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.5158801498127341\"}",
+              "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Abstract Algebra",
+          "evaluation_name": "NarrativeQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "NarrativeQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Abstract Algebra",
+            "evaluation_description": "F1 on NarrativeQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.63,
+            "score": 0.768,
             "details": {
-              "description": "min=0.63, mean=0.63, max=0.63, sum=1.26 (2)",
+              "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)",
               "tab": "Accuracy",
-              "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.787 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39332568168640136\"}",
-              "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Abstract Algebra - # prompt tokens": "{\"description\": \"min=366.44, mean=366.44, max=366.44, sum=732.88 (2)\", \"tab\": \"General information\", \"score\": \"366.44\"}",
-              "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.976, mean=0.976, max=0.976, sum=0.976 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.9758186582108619\"}",
+              "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}",
+              "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NarrativeQA - # prompt tokens": "{\"description\": \"min=3522.67, mean=3522.67, max=3522.67, sum=3522.67 (1)\", \"tab\": \"General information\", \"score\": \"3522.6704225352114\"}",
+              "NarrativeQA - # output tokens": "{\"description\": \"min=8.515, mean=8.515, max=8.515, sum=8.515 (1)\", \"tab\": \"General information\", \"score\": \"8.51549295774648\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "\"abstract_algebra\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_abstract_algebra\""
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Anatomy",
+          "evaluation_name": "NaturalQuestions (closed-book)",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "NaturalQuestions (closed-book)",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Anatomy",
+            "evaluation_description": "F1 on NaturalQuestions (closed-book)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8,
+            "score": 0.457,
             "details": {
-              "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)",
+              "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)",
               "tab": "Accuracy",
-              "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.545, mean=0.545, max=0.545, sum=1.09 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5451150911825674\"}",
-              "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}",
-              "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Anatomy - # prompt tokens": "{\"description\": \"min=346.978, mean=346.978, max=346.978, sum=693.956 (2)\", \"tab\": \"General information\", \"score\": \"346.97777777777776\"}",
-              "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.908, mean=0.908, max=0.908, sum=0.908 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.9083020164966583\"}",
+              "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.512, mean=0.512, max=0.512, sum=0.512 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5116857671737671\"}",
+              "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "NaturalQuestions (open-book) - # train": "{\"description\": \"min=4.964, mean=4.964, max=4.964, sum=4.964 (1)\", \"tab\": \"General information\", \"score\": \"4.964\"}",
+              "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0.007, mean=0.007, max=0.007, sum=0.007 (1)\", \"tab\": \"General information\", \"score\": \"0.007\"}",
+              "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1717.847, mean=1717.847, max=1717.847, sum=1717.847 (1)\", \"tab\": \"General information\", \"score\": \"1717.847\"}",
+              "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=8.055, mean=8.055, max=8.055, sum=8.055 (1)\", \"tab\": \"General information\", \"score\": \"8.055\"}",
+              "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=173.127, mean=173.127, max=173.127, sum=173.127 (1)\", \"tab\": \"General information\", \"score\": \"173.127\"}",
+              "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=3.832, mean=3.832, max=3.832, sum=3.832 (1)\", \"tab\": \"General information\", \"score\": \"3.832\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"anatomy\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_anatomy\""
+              "mode": "\"closedbook\""
             }
           }
         },
         {
-          "evaluation_name": "College Physics",
+          "evaluation_name": "OpenbookQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "OpenbookQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on College Physics",
+            "evaluation_description": "EM on OpenbookQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.627,
+            "score": 0.96,
             "details": {
-              "description": "min=0.627, mean=0.627, max=0.627, sum=1.255 (2)",
+              "description": "min=0.96, mean=0.96, max=0.96, sum=0.96 (1)",
               "tab": "Accuracy",
-              "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.389, mean=0.389, max=0.389, sum=0.778 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3888898015022278\"}",
-              "College Biology - Observed inference time (s)": "{\"description\": \"min=0.433, mean=0.433, max=0.433, sum=0.866 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43280420700709027\"}",
-              "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.492, mean=0.492, max=0.492, sum=0.984 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.49212974786758423\"}",
-              "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.435, mean=0.435, max=0.435, sum=0.871 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4354128074645996\"}",
-              "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.431, mean=0.431, max=0.431, sum=0.861 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4306242893196944\"}",
-              "College Physics - Observed inference time (s)": "{\"description\": \"min=0.415, mean=0.415, max=0.415, sum=0.83 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41519686287524654\"}",
-              "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Chemistry - # prompt tokens": "{\"description\": \"min=542.4, mean=542.4, max=542.4, sum=1084.8 (2)\", \"tab\": \"General information\", \"score\": \"542.4\"}",
-              "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}",
-              "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Biology - # prompt tokens": "{\"description\": \"min=466.917, mean=466.917, max=466.917, sum=933.833 (2)\", \"tab\": \"General information\", \"score\": \"466.9166666666667\"}",
-              "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Computer Science - # prompt tokens": "{\"description\": \"min=821.39, mean=821.39, max=821.39, sum=1642.78 (2)\", \"tab\": \"General information\", \"score\": \"821.39\"}",
-              "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Mathematics - # prompt tokens": "{\"description\": \"min=587.52, mean=587.52, max=587.52, sum=1175.04 (2)\", \"tab\": \"General information\", \"score\": \"587.52\"}",
-              "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}",
-              "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Medicine - # prompt tokens": "{\"description\": \"min=495.728, mean=495.728, max=495.728, sum=991.457 (2)\", \"tab\": \"General information\", \"score\": \"495.728323699422\"}",
-              "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}",
-              "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Physics - # prompt tokens": "{\"description\": \"min=496.608, mean=496.608, max=496.608, sum=993.216 (2)\", \"tab\": \"General information\", \"score\": \"496.6078431372549\"}",
-              "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.401, mean=0.401, max=0.401, sum=0.401 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.40061268854141235\"}",
+              "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}",
+              "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "OpenbookQA - # prompt tokens": "{\"description\": \"min=242.782, mean=242.782, max=242.782, sum=242.782 (1)\", \"tab\": \"General information\", \"score\": \"242.782\"}",
+              "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"college_physics\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_college_physics\""
+              "dataset": "\"openbookqa\"",
+              "method": "\"multiple_choice_joint\""
             }
           }
         },
         {
-          "evaluation_name": "Computer Security",
+          "evaluation_name": "MMLU",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MMLU",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Computer Security",
+            "evaluation_description": "EM on MMLU",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.86,
+            "score": 0.735,
             "details": {
-              "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)",
+              "description": "min=0.55, mean=0.735, max=0.95, sum=3.674 (5)",
               "tab": "Accuracy",
-              "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.373, mean=0.373, max=0.373, sum=0.746 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3729291558265686\"}",
-              "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Computer Security - # prompt tokens": "{\"description\": \"min=371.54, mean=371.54, max=371.54, sum=743.08 (2)\", \"tab\": \"General information\", \"score\": \"371.54\"}",
-              "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MMLU - Observed inference time (s)": "{\"description\": \"min=0.364, mean=0.391, max=0.434, sum=1.954 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.39080846048656265\"}",
+              "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}",
+              "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MMLU - # prompt tokens": "{\"description\": \"min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)\", \"tab\": \"General information\", \"score\": \"460.71996491228066\"}",
+              "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"computer_security\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_computer_security\""
+              "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]",
+              "method": "\"multiple_choice_joint\""
             }
           }
         },
         {
-          "evaluation_name": "Econometrics",
+          "evaluation_name": "MATH",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MATH",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Econometrics",
+            "evaluation_description": "Equivalent (CoT) on MATH",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.684,
+            "score": 0.802,
             "details": {
-              "description": "min=0.684, mean=0.684, max=0.684, sum=1.368 (2)",
+              "description": "min=0.673, mean=0.802, max=0.948, sum=5.617 (7)",
               "tab": "Accuracy",
-              "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.364, mean=0.364, max=0.364, sum=0.729 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.36447873241023016\"}",
-              "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}",
-              "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Econometrics - # prompt tokens": "{\"description\": \"min=607.43, mean=607.43, max=607.43, sum=1214.86 (2)\", \"tab\": \"General information\", \"score\": \"607.4298245614035\"}",
-              "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MATH - Observed inference time (s)": "{\"description\": \"min=2.95, mean=3.472, max=4.247, sum=24.303 (7)\", \"tab\": \"Efficiency\", \"score\": \"3.4718795228507955\"}",
+              "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}",
+              "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}",
+              "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MATH - # prompt tokens": "{\"description\": \"min=942.363, mean=1323.911, max=2258.577, sum=9267.376 (7)\", \"tab\": \"General information\", \"score\": \"1323.910874184069\"}",
+              "MATH - # output tokens": "{\"description\": \"min=59.674, mean=73.257, max=81.1, sum=512.799 (7)\", \"tab\": \"General information\", \"score\": \"73.25695858608955\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"econometrics\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_econometrics\""
+              "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]",
+              "level": "\"1\"",
+              "use_official_examples": "\"False\"",
+              "use_chain_of_thought": "\"True\""
             }
           }
         },
         {
-          "evaluation_name": "Global Facts",
+          "evaluation_name": "GSM8K",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "GSM8K",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Global Facts",
+            "evaluation_description": "EM on GSM8K",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.62,
+            "score": 0.932,
             "details": {
-              "description": "min=0.62, mean=0.62, max=0.62, sum=1.24 (2)",
+              "description": "min=0.932, mean=0.932, max=0.932, sum=0.932 (1)",
               "tab": "Accuracy",
-              "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.476, mean=0.476, max=0.476, sum=0.952 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4758000469207764\"}",
-              "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Global Facts - # prompt tokens": "{\"description\": \"min=392.71, mean=392.71, max=392.71, sum=785.42 (2)\", \"tab\": \"General information\", \"score\": \"392.71\"}",
-              "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "GSM8K - Observed inference time (s)": "{\"description\": \"min=4.948, mean=4.948, max=4.948, sum=4.948 (1)\", \"tab\": \"Efficiency\", \"score\": \"4.947624314308166\"}",
+              "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "GSM8K - # prompt tokens": "{\"description\": \"min=1020.035, mean=1020.035, max=1020.035, sum=1020.035 (1)\", \"tab\": \"General information\", \"score\": \"1020.035\"}",
+              "GSM8K - # output tokens": "{\"description\": \"min=111.209, mean=111.209, max=111.209, sum=111.209 (1)\", \"tab\": \"General information\", \"score\": \"111.209\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "\"global_facts\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_global_facts\""
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Jurisprudence",
+          "evaluation_name": "LegalBench",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "LegalBench",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Jurisprudence",
+            "evaluation_description": "EM on LegalBench",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.889,
+            "score": 0.713,
             "details": {
-              "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)",
+              "description": "min=0.452, mean=0.713, max=0.905, sum=3.564 (5)",
               "tab": "Accuracy",
-              "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.439, mean=0.439, max=0.439, sum=0.878 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43886900389636\"}",
-              "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}",
-              "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Jurisprudence - # prompt tokens": "{\"description\": \"min=387.639, mean=387.639, max=387.639, sum=775.278 (2)\", \"tab\": \"General information\", \"score\": \"387.6388888888889\"}",
-              "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.46, mean=0.558, max=0.886, sum=2.791 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.5582764348578453\"}",
+              "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}",
+              "LegalBench - # train": "{\"description\": \"min=4, mean=4.798, max=5, sum=23.992 (5)\", \"tab\": \"General information\", \"score\": \"4.798367346938775\"}",
+              "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "LegalBench - # prompt tokens": "{\"description\": \"min=253.442, mean=1568.687, max=6350.008, sum=7843.435 (5)\", \"tab\": \"General information\", \"score\": \"1568.6870529886412\"}",
+              "LegalBench - # output tokens": "{\"description\": \"min=1, mean=1.34, max=2.063, sum=6.698 (5)\", \"tab\": \"General information\", \"score\": \"1.3396070557866055\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"jurisprudence\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_jurisprudence\""
+              "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]"
             }
           }
         },
         {
-          "evaluation_name": "Philosophy",
+          "evaluation_name": "MedQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MedQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Philosophy",
+            "evaluation_description": "EM on MedQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.859,
+            "score": 0.815,
             "details": {
-              "description": "min=0.859, mean=0.859, max=0.859, sum=1.717 (2)",
+              "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)",
               "tab": "Accuracy",
-              "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.403, mean=0.403, max=0.403, sum=0.807 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40341131480177117\"}",
-              "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}",
-              "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Philosophy - # prompt tokens": "{\"description\": \"min=322.084, mean=322.084, max=322.084, sum=644.167 (2)\", \"tab\": \"General information\", \"score\": \"322.08360128617363\"}",
-              "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MedQA - Observed inference time (s)": "{\"description\": \"min=0.414, mean=0.414, max=0.414, sum=0.414 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4136932588239787\"}",
+              "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}",
+              "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MedQA - # prompt tokens": "{\"description\": \"min=1020.414, mean=1020.414, max=1020.414, sum=1020.414 (1)\", \"tab\": \"General information\", \"score\": \"1020.4135188866799\"}",
+              "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "\"philosophy\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_philosophy\""
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Professional Psychology",
+          "evaluation_name": "WMT 2014",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "WMT 2014",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Professional Psychology",
+            "evaluation_description": "BLEU-4 on WMT 2014",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.891,
+            "score": 0.211,
             "details": {
-              "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)",
+              "description": "min=0.149, mean=0.211, max=0.256, sum=1.053 (5)",
               "tab": "Accuracy",
-              "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.483, mean=0.483, max=0.483, sum=0.966 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.48306868356816907\"}",
-              "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.444, mean=0.444, max=0.444, sum=0.888 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44407470006469296\"}",
-              "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.578, mean=0.578, max=0.578, sum=1.157 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.578451920053017\"}",
-              "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.469, mean=0.469, max=0.469, sum=0.938 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4690242421393301\"}",
-              "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}",
-              "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Medicine - # prompt tokens": "{\"description\": \"min=1087.585, mean=1087.585, max=1087.585, sum=2175.169 (2)\", \"tab\": \"General information\", \"score\": \"1087.5845588235295\"}",
-              "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}",
-              "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Accounting - # prompt tokens": "{\"description\": \"min=651.592, mean=651.592, max=651.592, sum=1303.184 (2)\", \"tab\": \"General information\", \"score\": \"651.5921985815603\"}",
-              "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}",
-              "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Law - # prompt tokens": "{\"description\": \"min=1630.787, mean=1630.787, max=1630.787, sum=3261.574 (2)\", \"tab\": \"General information\", \"score\": \"1630.7868318122555\"}",
-              "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}",
-              "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Psychology - # prompt tokens": "{\"description\": \"min=568.114, mean=568.114, max=568.114, sum=1136.229 (2)\", \"tab\": \"General information\", \"score\": \"568.1143790849674\"}",
-              "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=1.448, mean=1.58, max=1.724, sum=7.899 (5)\", \"tab\": \"Efficiency\", \"score\": \"1.5797039644192494\"}",
+              "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}",
+              "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "WMT 2014 - # prompt tokens": "{\"description\": \"min=169.901, mean=193.043, max=213.185, sum=965.213 (5)\", \"tab\": \"General information\", \"score\": \"193.04258583116683\"}",
+              "WMT 2014 - # output tokens": "{\"description\": \"min=23.767, mean=25.424, max=26.121, sum=127.122 (5)\", \"tab\": \"General information\", \"score\": \"25.424382072946933\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"professional_psychology\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_professional_psychology\""
+              "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]"
             }
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1774096312.00548",
+      "retrieved_timestamp": "1774096312.00548",
+      "source_metadata": {
+        "source_name": "helm_mmlu",
+        "source_type": "documentation",
+        "source_organization_name": "crfm",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "helm",
+        "version": "unknown"
+      },
+      "benchmark": "helm_mmlu",
+      "evaluation_results": [
         {
-          "evaluation_name": "Us Foreign Policy",
+          "evaluation_name": "MMLU All Subjects",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -459,36 +401,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Us Foreign Policy",
+            "evaluation_description": "EM on MMLU All Subjects",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.95,
+            "score": 0.824,
             "details": {
-              "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)",
+              "description": "min=0.54, mean=0.824, max=0.99, sum=93.978 (114)",
               "tab": "Accuracy",
-              "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.434, mean=0.434, max=0.434, sum=0.869 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43441893100738527\"}",
-              "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=415.79, mean=415.79, max=415.79, sum=831.58 (2)\", \"tab\": \"General information\", \"score\": \"415.79\"}",
-              "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.364, mean=0.447, max=0.579, sum=51.005 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.4474144183932911\"}",
+              "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}",
+              "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=268.561, mean=607.852, max=2791.073, sum=69295.086 (114)\", \"tab\": \"General information\", \"score\": \"607.851634217556\"}",
+              "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"us_foreign_policy\"",
+              "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_us_foreign_policy\""
+              "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
             }
           }
         },
         {
-          "evaluation_name": "Astronomy",
+          "evaluation_name": "Abstract Algebra",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -497,36 +439,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Astronomy",
+            "evaluation_description": "EM on Abstract Algebra",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.934,
+            "score": 0.63,
             "details": {
-              "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)",
+              "description": "min=0.63, mean=0.63, max=0.63, sum=1.26 (2)",
               "tab": "Accuracy",
-              "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.472, mean=0.472, max=0.472, sum=0.944 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4718977307018481\"}",
-              "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}",
-              "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Astronomy - # prompt tokens": "{\"description\": \"min=572.691, mean=572.691, max=572.691, sum=1145.382 (2)\", \"tab\": \"General information\", \"score\": \"572.6907894736842\"}",
-              "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.787 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39332568168640136\"}",
+              "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Abstract Algebra - # prompt tokens": "{\"description\": \"min=366.44, mean=366.44, max=366.44, sum=732.88 (2)\", \"tab\": \"General information\", \"score\": \"366.44\"}",
+              "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"astronomy\"",
+              "subject": "\"abstract_algebra\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_astronomy\""
+              "groups": "\"mmlu_abstract_algebra\""
             }
           }
         },
         {
-          "evaluation_name": "Business Ethics",
+          "evaluation_name": "Anatomy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -535,36 +477,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Business Ethics",
+            "evaluation_description": "EM on Anatomy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.79,
+            "score": 0.8,
             "details": {
-              "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)",
+              "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)",
               "tab": "Accuracy",
-              "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.477, mean=0.477, max=0.477, sum=0.953 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4765148901939392\"}",
-              "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Business Ethics - # prompt tokens": "{\"description\": \"min=562.52, mean=562.52, max=562.52, sum=1125.04 (2)\", \"tab\": \"General information\", \"score\": \"562.52\"}",
-              "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.545, mean=0.545, max=0.545, sum=1.09 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5451150911825674\"}",
+              "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}",
+              "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Anatomy - # prompt tokens": "{\"description\": \"min=346.978, mean=346.978, max=346.978, sum=693.956 (2)\", \"tab\": \"General information\", \"score\": \"346.97777777777776\"}",
+              "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"business_ethics\"",
+              "subject": "\"anatomy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_business_ethics\""
+              "groups": "\"mmlu_anatomy\""
             }
           }
         },
         {
-          "evaluation_name": "Clinical Knowledge",
+          "evaluation_name": "College Physics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -573,36 +515,66 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Clinical Knowledge",
+            "evaluation_description": "EM on College Physics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.845,
+            "score": 0.627,
             "details": {
-              "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)",
+              "description": "min=0.627, mean=0.627, max=0.627, sum=1.255 (2)",
               "tab": "Accuracy",
-              "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.415, mean=0.415, max=0.415, sum=0.829 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.414557883424579\"}",
-              "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}",
-              "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=390.947, mean=390.947, max=390.947, sum=781.894 (2)\", \"tab\": \"General information\", \"score\": \"390.94716981132075\"}",
-              "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.389, mean=0.389, max=0.389, sum=0.778 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3888898015022278\"}",
+              "College Biology - Observed inference time (s)": "{\"description\": \"min=0.433, mean=0.433, max=0.433, sum=0.866 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43280420700709027\"}",
+              "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.492, mean=0.492, max=0.492, sum=0.984 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.49212974786758423\"}",
+              "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.435, mean=0.435, max=0.435, sum=0.871 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4354128074645996\"}",
+              "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.431, mean=0.431, max=0.431, sum=0.861 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4306242893196944\"}",
+              "College Physics - Observed inference time (s)": "{\"description\": \"min=0.415, mean=0.415, max=0.415, sum=0.83 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41519686287524654\"}",
+              "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Chemistry - # prompt tokens": "{\"description\": \"min=542.4, mean=542.4, max=542.4, sum=1084.8 (2)\", \"tab\": \"General information\", \"score\": \"542.4\"}",
+              "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}",
+              "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Biology - # prompt tokens": "{\"description\": \"min=466.917, mean=466.917, max=466.917, sum=933.833 (2)\", \"tab\": \"General information\", \"score\": \"466.9166666666667\"}",
+              "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Computer Science - # prompt tokens": "{\"description\": \"min=821.39, mean=821.39, max=821.39, sum=1642.78 (2)\", \"tab\": \"General information\", \"score\": \"821.39\"}",
+              "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Mathematics - # prompt tokens": "{\"description\": \"min=587.52, mean=587.52, max=587.52, sum=1175.04 (2)\", \"tab\": \"General information\", \"score\": \"587.52\"}",
+              "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}",
+              "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Medicine - # prompt tokens": "{\"description\": \"min=495.728, mean=495.728, max=495.728, sum=991.457 (2)\", \"tab\": \"General information\", \"score\": \"495.728323699422\"}",
+              "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}",
+              "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Physics - # prompt tokens": "{\"description\": \"min=496.608, mean=496.608, max=496.608, sum=993.216 (2)\", \"tab\": \"General information\", \"score\": \"496.6078431372549\"}",
+              "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"clinical_knowledge\"",
+              "subject": "\"college_physics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_clinical_knowledge\""
+              "groups": "\"mmlu_college_physics\""
             }
           }
         },
         {
-          "evaluation_name": "Conceptual Physics",
+          "evaluation_name": "Computer Security",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -611,36 +583,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Conceptual Physics",
+            "evaluation_description": "EM on Computer Security",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.868,
+            "score": 0.86,
             "details": {
-              "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)",
+              "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)",
               "tab": "Accuracy",
-              "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.384, mean=0.384, max=0.384, sum=0.767 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3836827186827964\"}",
-              "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}",
-              "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Conceptual Physics - # prompt tokens": "{\"description\": \"min=297.838, mean=297.838, max=297.838, sum=595.677 (2)\", \"tab\": \"General information\", \"score\": \"297.83829787234043\"}",
-              "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.373, mean=0.373, max=0.373, sum=0.746 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3729291558265686\"}",
+              "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Computer Security - # prompt tokens": "{\"description\": \"min=371.54, mean=371.54, max=371.54, sum=743.08 (2)\", \"tab\": \"General information\", \"score\": \"371.54\"}",
+              "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"conceptual_physics\"",
+              "subject": "\"computer_security\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_conceptual_physics\""
+              "groups": "\"mmlu_computer_security\""
             }
           }
         },
         {
-          "evaluation_name": "Electrical Engineering",
+          "evaluation_name": "Econometrics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -649,36 +621,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Electrical Engineering",
+            "evaluation_description": "EM on Econometrics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.786,
+            "score": 0.684,
             "details": {
-              "description": "min=0.786, mean=0.786, max=0.786, sum=1.572 (2)",
-              "tab": "Accuracy",
-              "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.399, mean=0.399, max=0.399, sum=0.798 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39915286919166304\"}",
-              "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}",
-              "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Electrical Engineering - # prompt tokens": "{\"description\": \"min=433.641, mean=433.641, max=433.641, sum=867.283 (2)\", \"tab\": \"General information\", \"score\": \"433.6413793103448\"}",
-              "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "description": "min=0.684, mean=0.684, max=0.684, sum=1.368 (2)",
+              "tab": "Accuracy",
+              "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.364, mean=0.364, max=0.364, sum=0.729 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.36447873241023016\"}",
+              "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}",
+              "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Econometrics - # prompt tokens": "{\"description\": \"min=607.43, mean=607.43, max=607.43, sum=1214.86 (2)\", \"tab\": \"General information\", \"score\": \"607.4298245614035\"}",
+              "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"electrical_engineering\"",
+              "subject": "\"econometrics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_electrical_engineering\""
+              "groups": "\"mmlu_econometrics\""
             }
           }
         },
         {
-          "evaluation_name": "Elementary Mathematics",
+          "evaluation_name": "Global Facts",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -687,36 +659,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Elementary Mathematics",
+            "evaluation_description": "EM on Global Facts",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.807,
+            "score": 0.62,
             "details": {
-              "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)",
+              "description": "min=0.62, mean=0.62, max=0.62, sum=1.24 (2)",
               "tab": "Accuracy",
-              "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.423, mean=0.423, max=0.423, sum=0.845 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4225258120784053\"}",
-              "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}",
-              "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=524.862, mean=524.862, max=524.862, sum=1049.725 (2)\", \"tab\": \"General information\", \"score\": \"524.8624338624338\"}",
-              "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.476, mean=0.476, max=0.476, sum=0.952 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4758000469207764\"}",
+              "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Global Facts - # prompt tokens": "{\"description\": \"min=392.71, mean=392.71, max=392.71, sum=785.42 (2)\", \"tab\": \"General information\", \"score\": \"392.71\"}",
+              "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"elementary_mathematics\"",
+              "subject": "\"global_facts\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_elementary_mathematics\""
+              "groups": "\"mmlu_global_facts\""
             }
           }
         },
         {
-          "evaluation_name": "Formal Logic",
+          "evaluation_name": "Jurisprudence",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -725,36 +697,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Formal Logic",
+            "evaluation_description": "EM on Jurisprudence",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.643,
+            "score": 0.889,
             "details": {
-              "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)",
+              "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)",
               "tab": "Accuracy",
-              "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.486, mean=0.486, max=0.486, sum=0.973 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.48647683007376535\"}",
-              "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}",
-              "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Formal Logic - # prompt tokens": "{\"description\": \"min=599.762, mean=599.762, max=599.762, sum=1199.524 (2)\", \"tab\": \"General information\", \"score\": \"599.7619047619048\"}",
-              "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.439, mean=0.439, max=0.439, sum=0.878 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43886900389636\"}",
+              "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}",
+              "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Jurisprudence - # prompt tokens": "{\"description\": \"min=387.639, mean=387.639, max=387.639, sum=775.278 (2)\", \"tab\": \"General information\", \"score\": \"387.6388888888889\"}",
+              "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"formal_logic\"",
+              "subject": "\"jurisprudence\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_formal_logic\""
+              "groups": "\"mmlu_jurisprudence\""
             }
           }
         },
         {
-          "evaluation_name": "High School World History",
+          "evaluation_name": "Philosophy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -763,114 +735,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on High School World History",
+            "evaluation_description": "EM on Philosophy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.945,
+            "score": 0.859,
             "details": {
-              "description": "min=0.945, mean=0.945, max=0.945, sum=1.89 (2)",
+              "description": "min=0.859, mean=0.859, max=0.859, sum=1.717 (2)",
               "tab": "Accuracy",
-              "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.436, mean=0.436, max=0.436, sum=0.872 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4360047817230225\"}",
-              "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.413, mean=0.413, max=0.413, sum=0.827 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41338158710836775\"}",
-              "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.5, mean=0.5, max=0.5, sum=1.001 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5002665758132935\"}",
-              "High School European History - Observed inference time (s)": "{\"description\": \"min=0.579, mean=0.579, max=0.579, sum=1.158 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.578774525902488\"}",
-              "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.414, mean=0.414, max=0.414, sum=0.829 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4142996747084338\"}",
-              "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.43, mean=0.43, max=0.43, sum=0.86 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43005221001224814\"}",
-              "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.416, mean=0.416, max=0.416, sum=0.832 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4160928750649477\"}",
-              "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.423, mean=0.423, max=0.423, sum=0.846 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4231933620240953\"}",
-              "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.474, mean=0.474, max=0.474, sum=0.948 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4740273321376127\"}",
-              "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.462, mean=0.462, max=0.462, sum=0.924 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4620048778736039\"}",
-              "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.407, mean=0.407, max=0.407, sum=0.813 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40661886022725235\"}",
-              "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.463, mean=0.463, max=0.463, sum=0.926 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.46296725780875597\"}",
-              "High School US History - Observed inference time (s)": "{\"description\": \"min=0.546, mean=0.546, max=0.546, sum=1.091 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5456923538563299\"}",
-              "High School World History - Observed inference time (s)": "{\"description\": \"min=0.517, mean=0.517, max=0.517, sum=1.033 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5166646488608188\"}",
-              "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}",
-              "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Biology - # prompt tokens": "{\"description\": \"min=506.677, mean=506.677, max=506.677, sum=1013.355 (2)\", \"tab\": \"General information\", \"score\": \"506.6774193548387\"}",
-              "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}",
-              "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Chemistry - # prompt tokens": "{\"description\": \"min=489.714, mean=489.714, max=489.714, sum=979.429 (2)\", \"tab\": \"General information\", \"score\": \"489.7142857142857\"}",
-              "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Computer Science - # prompt tokens": "{\"description\": \"min=860.78, mean=860.78, max=860.78, sum=1721.56 (2)\", \"tab\": \"General information\", \"score\": \"860.78\"}",
-              "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}",
-              "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School European History - # prompt tokens": "{\"description\": \"min=2791.073, mean=2791.073, max=2791.073, sum=5582.145 (2)\", \"tab\": \"General information\", \"score\": \"2791.072727272727\"}",
-              "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}",
-              "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Geography - # prompt tokens": "{\"description\": \"min=365.045, mean=365.045, max=365.045, sum=730.091 (2)\", \"tab\": \"General information\", \"score\": \"365.04545454545456\"}",
-              "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}",
-              "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Government And Politics - # prompt tokens": "{\"description\": \"min=458.824, mean=458.824, max=458.824, sum=917.648 (2)\", \"tab\": \"General information\", \"score\": \"458.8238341968912\"}",
-              "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}",
-              "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=364.562, mean=364.562, max=364.562, sum=729.123 (2)\", \"tab\": \"General information\", \"score\": \"364.5615384615385\"}",
-              "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}",
-              "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Mathematics - # prompt tokens": "{\"description\": \"min=525.374, mean=525.374, max=525.374, sum=1050.748 (2)\", \"tab\": \"General information\", \"score\": \"525.3740740740741\"}",
-              "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}",
-              "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Microeconomics - # prompt tokens": "{\"description\": \"min=392.025, mean=392.025, max=392.025, sum=784.05 (2)\", \"tab\": \"General information\", \"score\": \"392.02521008403363\"}",
-              "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}",
-              "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Physics - # prompt tokens": "{\"description\": \"min=553.464, mean=553.464, max=553.464, sum=1106.927 (2)\", \"tab\": \"General information\", \"score\": \"553.4635761589404\"}",
-              "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}",
-              "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Psychology - # prompt tokens": "{\"description\": \"min=488.246, mean=488.246, max=488.246, sum=976.492 (2)\", \"tab\": \"General information\", \"score\": \"488.24587155963303\"}",
-              "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}",
-              "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Statistics - # prompt tokens": "{\"description\": \"min=788.699, mean=788.699, max=788.699, sum=1577.398 (2)\", \"tab\": \"General information\", \"score\": \"788.699074074074\"}",
-              "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}",
-              "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School US History - # prompt tokens": "{\"description\": \"min=2210.809, mean=2210.809, max=2210.809, sum=4421.618 (2)\", \"tab\": \"General information\", \"score\": \"2210.8088235294117\"}",
-              "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}",
-              "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School World History - # prompt tokens": "{\"description\": \"min=1421.27, mean=1421.27, max=1421.27, sum=2842.54 (2)\", \"tab\": \"General information\", \"score\": \"1421.2700421940929\"}",
-              "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.403, mean=0.403, max=0.403, sum=0.807 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40341131480177117\"}",
+              "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}",
+              "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Philosophy - # prompt tokens": "{\"description\": \"min=322.084, mean=322.084, max=322.084, sum=644.167 (2)\", \"tab\": \"General information\", \"score\": \"322.08360128617363\"}",
+              "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"high_school_world_history\"",
+              "subject": "\"philosophy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_high_school_world_history\""
+              "groups": "\"mmlu_philosophy\""
             }
           }
         },
         {
-          "evaluation_name": "Human Sexuality",
+          "evaluation_name": "Professional Psychology",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -879,42 +773,54 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Human Sexuality",
+            "evaluation_description": "EM on Professional Psychology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.908,
+            "score": 0.891,
             "details": {
-              "description": "min=0.908, mean=0.908, max=0.908, sum=1.817 (2)",
+              "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)",
               "tab": "Accuracy",
-              "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.406, mean=0.406, max=0.406, sum=0.812 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4058152218036053\"}",
-              "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=0.466, mean=0.466, max=0.466, sum=0.932 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.46620041541470825\"}",
-              "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}",
-              "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Human Aging - # prompt tokens": "{\"description\": \"min=312.906, mean=312.906, max=312.906, sum=625.812 (2)\", \"tab\": \"General information\", \"score\": \"312.90582959641256\"}",
-              "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}",
-              "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Human Sexuality - # prompt tokens": "{\"description\": \"min=334.183, mean=334.183, max=334.183, sum=668.366 (2)\", \"tab\": \"General information\", \"score\": \"334.1832061068702\"}",
-              "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.483, mean=0.483, max=0.483, sum=0.966 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.48306868356816907\"}",
+              "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.444, mean=0.444, max=0.444, sum=0.888 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44407470006469296\"}",
+              "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.578, mean=0.578, max=0.578, sum=1.157 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.578451920053017\"}",
+              "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.469, mean=0.469, max=0.469, sum=0.938 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4690242421393301\"}",
+              "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}",
+              "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Medicine - # prompt tokens": "{\"description\": \"min=1087.585, mean=1087.585, max=1087.585, sum=2175.169 (2)\", \"tab\": \"General information\", \"score\": \"1087.5845588235295\"}",
+              "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}",
+              "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Accounting - # prompt tokens": "{\"description\": \"min=651.592, mean=651.592, max=651.592, sum=1303.184 (2)\", \"tab\": \"General information\", \"score\": \"651.5921985815603\"}",
+              "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}",
+              "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Law - # prompt tokens": "{\"description\": \"min=1630.787, mean=1630.787, max=1630.787, sum=3261.574 (2)\", \"tab\": \"General information\", \"score\": \"1630.7868318122555\"}",
+              "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}",
+              "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Psychology - # prompt tokens": "{\"description\": \"min=568.114, mean=568.114, max=568.114, sum=1136.229 (2)\", \"tab\": \"General information\", \"score\": \"568.1143790849674\"}",
+              "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"human_sexuality\"",
+              "subject": "\"professional_psychology\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_human_sexuality\""
+              "groups": "\"mmlu_professional_psychology\""
             }
           }
         },
         {
-          "evaluation_name": "International Law",
+          "evaluation_name": "Us Foreign Policy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -923,36 +829,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on International Law",
+            "evaluation_description": "EM on Us Foreign Policy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.917,
+            "score": 0.95,
             "details": {
-              "description": "min=0.917, mean=0.917, max=0.917, sum=1.835 (2)",
+              "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)",
               "tab": "Accuracy",
-              "International Law - Observed inference time (s)": "{\"description\": \"min=0.461, mean=0.461, max=0.461, sum=0.922 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4608367139642889\"}",
-              "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}",
-              "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "International Law - # prompt tokens": "{\"description\": \"min=632.851, mean=632.851, max=632.851, sum=1265.702 (2)\", \"tab\": \"General information\", \"score\": \"632.8512396694215\"}",
-              "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.434, mean=0.434, max=0.434, sum=0.869 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43441893100738527\"}",
+              "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=415.79, mean=415.79, max=415.79, sum=831.58 (2)\", \"tab\": \"General information\", \"score\": \"415.79\"}",
+              "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"international_law\"",
+              "subject": "\"us_foreign_policy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_international_law\""
+              "groups": "\"mmlu_us_foreign_policy\""
             }
           }
         },
         {
-          "evaluation_name": "Logical Fallacies",
+          "evaluation_name": "Astronomy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -961,36 +867,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Logical Fallacies",
+            "evaluation_description": "EM on Astronomy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.871,
+            "score": 0.934,
             "details": {
-              "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)",
+              "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)",
               "tab": "Accuracy",
-              "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.432, mean=0.432, max=0.432, sum=0.864 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4321035870745138\"}",
-              "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}",
-              "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Logical Fallacies - # prompt tokens": "{\"description\": \"min=442.595, mean=442.595, max=442.595, sum=885.19 (2)\", \"tab\": \"General information\", \"score\": \"442.5950920245399\"}",
-              "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.472, mean=0.472, max=0.472, sum=0.944 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4718977307018481\"}",
+              "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}",
+              "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Astronomy - # prompt tokens": "{\"description\": \"min=572.691, mean=572.691, max=572.691, sum=1145.382 (2)\", \"tab\": \"General information\", \"score\": \"572.6907894736842\"}",
+              "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"logical_fallacies\"",
+              "subject": "\"astronomy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_logical_fallacies\""
+              "groups": "\"mmlu_astronomy\""
             }
           }
         },
         {
-          "evaluation_name": "Machine Learning",
+          "evaluation_name": "Business Ethics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -999,36 +905,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Machine Learning",
+            "evaluation_description": "EM on Business Ethics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.759,
+            "score": 0.79,
             "details": {
-              "description": "min=0.759, mean=0.759, max=0.759, sum=1.518 (2)",
+              "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)",
               "tab": "Accuracy",
-              "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.463, mean=0.463, max=0.463, sum=0.926 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.46302694933755056\"}",
-              "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}",
-              "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Machine Learning - # prompt tokens": "{\"description\": \"min=661.054, mean=661.054, max=661.054, sum=1322.107 (2)\", \"tab\": \"General information\", \"score\": \"661.0535714285714\"}",
-              "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.477, mean=0.477, max=0.477, sum=0.953 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4765148901939392\"}",
+              "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Business Ethics - # prompt tokens": "{\"description\": \"min=562.52, mean=562.52, max=562.52, sum=1125.04 (2)\", \"tab\": \"General information\", \"score\": \"562.52\"}",
+              "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"machine_learning\"",
+              "subject": "\"business_ethics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_machine_learning\""
+              "groups": "\"mmlu_business_ethics\""
             }
           }
         },
         {
-          "evaluation_name": "Management",
+          "evaluation_name": "Clinical Knowledge",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1037,36 +943,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Management",
+            "evaluation_description": "EM on Clinical Knowledge",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.932,
+            "score": 0.845,
             "details": {
-              "description": "min=0.932, mean=0.932, max=0.932, sum=1.864 (2)",
+              "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)",
               "tab": "Accuracy",
-              "Management - Observed inference time (s)": "{\"description\": \"min=0.446, mean=0.446, max=0.446, sum=0.891 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4455798760201167\"}",
-              "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}",
-              "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Management - # prompt tokens": "{\"description\": \"min=276.796, mean=276.796, max=276.796, sum=553.592 (2)\", \"tab\": \"General information\", \"score\": \"276.79611650485435\"}",
-              "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.415, mean=0.415, max=0.415, sum=0.829 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.414557883424579\"}",
+              "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}",
+              "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=390.947, mean=390.947, max=390.947, sum=781.894 (2)\", \"tab\": \"General information\", \"score\": \"390.94716981132075\"}",
+              "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"management\"",
+              "subject": "\"clinical_knowledge\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_management\""
+              "groups": "\"mmlu_clinical_knowledge\""
             }
           }
         },
         {
-          "evaluation_name": "Marketing",
+          "evaluation_name": "Conceptual Physics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1075,36 +981,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Marketing",
+            "evaluation_description": "EM on Conceptual Physics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.962,
+            "score": 0.868,
             "details": {
-              "description": "min=0.962, mean=0.962, max=0.962, sum=1.923 (2)",
+              "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)",
               "tab": "Accuracy",
-              "Marketing - Observed inference time (s)": "{\"description\": \"min=0.421, mean=0.421, max=0.421, sum=0.843 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4213859372668796\"}",
-              "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}",
-              "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Marketing - # prompt tokens": "{\"description\": \"min=397.218, mean=397.218, max=397.218, sum=794.436 (2)\", \"tab\": \"General information\", \"score\": \"397.21794871794873\"}",
-              "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.384, mean=0.384, max=0.384, sum=0.767 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3836827186827964\"}",
+              "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}",
+              "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Conceptual Physics - # prompt tokens": "{\"description\": \"min=297.838, mean=297.838, max=297.838, sum=595.677 (2)\", \"tab\": \"General information\", \"score\": \"297.83829787234043\"}",
+              "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"marketing\"",
+              "subject": "\"conceptual_physics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_marketing\""
+              "groups": "\"mmlu_conceptual_physics\""
             }
           }
         },
         {
-          "evaluation_name": "Medical Genetics",
+          "evaluation_name": "Electrical Engineering",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1113,36 +1019,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Medical Genetics",
+            "evaluation_description": "EM on Electrical Engineering",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.94,
+            "score": 0.786,
             "details": {
-              "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)",
+              "description": "min=0.786, mean=0.786, max=0.786, sum=1.572 (2)",
               "tab": "Accuracy",
-              "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.411, mean=0.411, max=0.411, sum=0.823 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41135803937911986\"}",
-              "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Medical Genetics - # prompt tokens": "{\"description\": \"min=334, mean=334, max=334, sum=668 (2)\", \"tab\": \"General information\", \"score\": \"334.0\"}",
-              "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.399, mean=0.399, max=0.399, sum=0.798 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39915286919166304\"}",
+              "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}",
+              "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Electrical Engineering - # prompt tokens": "{\"description\": \"min=433.641, mean=433.641, max=433.641, sum=867.283 (2)\", \"tab\": \"General information\", \"score\": \"433.6413793103448\"}",
+              "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"medical_genetics\"",
+              "subject": "\"electrical_engineering\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_medical_genetics\""
+              "groups": "\"mmlu_electrical_engineering\""
             }
           }
         },
         {
-          "evaluation_name": "Miscellaneous",
+          "evaluation_name": "Elementary Mathematics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1151,36 +1057,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Miscellaneous",
+            "evaluation_description": "EM on Elementary Mathematics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.949,
+            "score": 0.807,
             "details": {
-              "description": "min=0.949, mean=0.949, max=0.949, sum=1.898 (2)",
+              "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)",
               "tab": "Accuracy",
-              "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.451, mean=0.451, max=0.451, sum=0.901 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4505587230088001\"}",
-              "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}",
-              "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Miscellaneous - # prompt tokens": "{\"description\": \"min=292.925, mean=292.925, max=292.925, sum=585.849 (2)\", \"tab\": \"General information\", \"score\": \"292.92464878671774\"}",
-              "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.423, mean=0.423, max=0.423, sum=0.845 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4225258120784053\"}",
+              "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}",
+              "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=524.862, mean=524.862, max=524.862, sum=1049.725 (2)\", \"tab\": \"General information\", \"score\": \"524.8624338624338\"}",
+              "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"miscellaneous\"",
+              "subject": "\"elementary_mathematics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_miscellaneous\""
+              "groups": "\"mmlu_elementary_mathematics\""
             }
           }
         },
         {
-          "evaluation_name": "Moral Scenarios",
+          "evaluation_name": "Formal Logic",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1189,42 +1095,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Moral Scenarios",
+            "evaluation_description": "EM on Formal Logic",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.902,
+            "score": 0.643,
             "details": {
-              "description": "min=0.902, mean=0.902, max=0.902, sum=1.803 (2)",
+              "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)",
               "tab": "Accuracy",
-              "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.428, mean=0.428, max=0.428, sum=0.856 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4281756044123214\"}",
-              "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.445, mean=0.445, max=0.445, sum=0.89 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44513606945229645\"}",
-              "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}",
-              "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Moral Disputes - # prompt tokens": "{\"description\": \"min=469.145, mean=469.145, max=469.145, sum=938.289 (2)\", \"tab\": \"General information\", \"score\": \"469.1445086705202\"}",
-              "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}",
-              "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Moral Scenarios - # prompt tokens": "{\"description\": \"min=649.455, mean=649.455, max=649.455, sum=1298.909 (2)\", \"tab\": \"General information\", \"score\": \"649.454748603352\"}",
-              "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.486, mean=0.486, max=0.486, sum=0.973 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.48647683007376535\"}",
+              "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}",
+              "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Formal Logic - # prompt tokens": "{\"description\": \"min=599.762, mean=599.762, max=599.762, sum=1199.524 (2)\", \"tab\": \"General information\", \"score\": \"599.7619047619048\"}",
+              "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"moral_scenarios\"",
+              "subject": "\"formal_logic\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_moral_scenarios\""
+              "groups": "\"mmlu_formal_logic\""
             }
           }
         },
         {
-          "evaluation_name": "Nutrition",
+          "evaluation_name": "High School World History",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1233,36 +1133,114 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Nutrition",
+            "evaluation_description": "EM on High School World History",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.892,
+            "score": 0.945,
             "details": {
-              "description": "min=0.892, mean=0.892, max=0.892, sum=1.784 (2)",
+              "description": "min=0.945, mean=0.945, max=0.945, sum=1.89 (2)",
               "tab": "Accuracy",
-              "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.446, mean=0.446, max=0.446, sum=0.892 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4460979816960354\"}",
-              "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}",
-              "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Nutrition - # prompt tokens": "{\"description\": \"min=579.817, mean=579.817, max=579.817, sum=1159.634 (2)\", \"tab\": \"General information\", \"score\": \"579.8169934640523\"}",
-              "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.436, mean=0.436, max=0.436, sum=0.872 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4360047817230225\"}",
+              "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.413, mean=0.413, max=0.413, sum=0.827 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41338158710836775\"}",
+              "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.5, mean=0.5, max=0.5, sum=1.001 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5002665758132935\"}",
+              "High School European History - Observed inference time (s)": "{\"description\": \"min=0.579, mean=0.579, max=0.579, sum=1.158 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.578774525902488\"}",
+              "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.414, mean=0.414, max=0.414, sum=0.829 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4142996747084338\"}",
+              "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.43, mean=0.43, max=0.43, sum=0.86 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43005221001224814\"}",
+              "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.416, mean=0.416, max=0.416, sum=0.832 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4160928750649477\"}",
+              "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.423, mean=0.423, max=0.423, sum=0.846 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4231933620240953\"}",
+              "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.474, mean=0.474, max=0.474, sum=0.948 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4740273321376127\"}",
+              "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.462, mean=0.462, max=0.462, sum=0.924 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4620048778736039\"}",
+              "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.407, mean=0.407, max=0.407, sum=0.813 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40661886022725235\"}",
+              "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.463, mean=0.463, max=0.463, sum=0.926 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.46296725780875597\"}",
+              "High School US History - Observed inference time (s)": "{\"description\": \"min=0.546, mean=0.546, max=0.546, sum=1.091 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5456923538563299\"}",
+              "High School World History - Observed inference time (s)": "{\"description\": \"min=0.517, mean=0.517, max=0.517, sum=1.033 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5166646488608188\"}",
+              "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}",
+              "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Biology - # prompt tokens": "{\"description\": \"min=506.677, mean=506.677, max=506.677, sum=1013.355 (2)\", \"tab\": \"General information\", \"score\": \"506.6774193548387\"}",
+              "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}",
+              "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Chemistry - # prompt tokens": "{\"description\": \"min=489.714, mean=489.714, max=489.714, sum=979.429 (2)\", \"tab\": \"General information\", \"score\": \"489.7142857142857\"}",
+              "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Computer Science - # prompt tokens": "{\"description\": \"min=860.78, mean=860.78, max=860.78, sum=1721.56 (2)\", \"tab\": \"General information\", \"score\": \"860.78\"}",
+              "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}",
+              "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School European History - # prompt tokens": "{\"description\": \"min=2791.073, mean=2791.073, max=2791.073, sum=5582.145 (2)\", \"tab\": \"General information\", \"score\": \"2791.072727272727\"}",
+              "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}",
+              "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Geography - # prompt tokens": "{\"description\": \"min=365.045, mean=365.045, max=365.045, sum=730.091 (2)\", \"tab\": \"General information\", \"score\": \"365.04545454545456\"}",
+              "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}",
+              "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Government And Politics - # prompt tokens": "{\"description\": \"min=458.824, mean=458.824, max=458.824, sum=917.648 (2)\", \"tab\": \"General information\", \"score\": \"458.8238341968912\"}",
+              "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}",
+              "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=364.562, mean=364.562, max=364.562, sum=729.123 (2)\", \"tab\": \"General information\", \"score\": \"364.5615384615385\"}",
+              "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}",
+              "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Mathematics - # prompt tokens": "{\"description\": \"min=525.374, mean=525.374, max=525.374, sum=1050.748 (2)\", \"tab\": \"General information\", \"score\": \"525.3740740740741\"}",
+              "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}",
+              "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Microeconomics - # prompt tokens": "{\"description\": \"min=392.025, mean=392.025, max=392.025, sum=784.05 (2)\", \"tab\": \"General information\", \"score\": \"392.02521008403363\"}",
+              "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}",
+              "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Physics - # prompt tokens": "{\"description\": \"min=553.464, mean=553.464, max=553.464, sum=1106.927 (2)\", \"tab\": \"General information\", \"score\": \"553.4635761589404\"}",
+              "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}",
+              "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Psychology - # prompt tokens": "{\"description\": \"min=488.246, mean=488.246, max=488.246, sum=976.492 (2)\", \"tab\": \"General information\", \"score\": \"488.24587155963303\"}",
+              "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}",
+              "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Statistics - # prompt tokens": "{\"description\": \"min=788.699, mean=788.699, max=788.699, sum=1577.398 (2)\", \"tab\": \"General information\", \"score\": \"788.699074074074\"}",
+              "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}",
+              "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School US History - # prompt tokens": "{\"description\": \"min=2210.809, mean=2210.809, max=2210.809, sum=4421.618 (2)\", \"tab\": \"General information\", \"score\": \"2210.8088235294117\"}",
+              "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}",
+              "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School World History - # prompt tokens": "{\"description\": \"min=1421.27, mean=1421.27, max=1421.27, sum=2842.54 (2)\", \"tab\": \"General information\", \"score\": \"1421.2700421940929\"}",
+              "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"nutrition\"",
+              "subject": "\"high_school_world_history\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_nutrition\""
+              "groups": "\"mmlu_high_school_world_history\""
             }
           }
         },
         {
-          "evaluation_name": "Prehistory",
+          "evaluation_name": "Human Sexuality",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1271,36 +1249,42 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Prehistory",
+            "evaluation_description": "EM on Human Sexuality",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.926,
+            "score": 0.908,
             "details": {
-              "description": "min=0.926, mean=0.926, max=0.926, sum=1.852 (2)",
+              "description": "min=0.908, mean=0.908, max=0.908, sum=1.817 (2)",
               "tab": "Accuracy",
-              "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.426, mean=0.426, max=0.426, sum=0.852 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42610209665180726\"}",
-              "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}",
-              "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Prehistory - # prompt tokens": "{\"description\": \"min=507.559, mean=507.559, max=507.559, sum=1015.117 (2)\", \"tab\": \"General information\", \"score\": \"507.55864197530866\"}",
-              "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.406, mean=0.406, max=0.406, sum=0.812 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4058152218036053\"}",
+              "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=0.466, mean=0.466, max=0.466, sum=0.932 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.46620041541470825\"}",
+              "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}",
+              "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Human Aging - # prompt tokens": "{\"description\": \"min=312.906, mean=312.906, max=312.906, sum=625.812 (2)\", \"tab\": \"General information\", \"score\": \"312.90582959641256\"}",
+              "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}",
+              "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Human Sexuality - # prompt tokens": "{\"description\": \"min=334.183, mean=334.183, max=334.183, sum=668.366 (2)\", \"tab\": \"General information\", \"score\": \"334.1832061068702\"}",
+              "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"prehistory\"",
+              "subject": "\"human_sexuality\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_prehistory\""
+              "groups": "\"mmlu_human_sexuality\""
             }
           }
         },
         {
-          "evaluation_name": "Public Relations",
+          "evaluation_name": "International Law",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1309,36 +1293,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Public Relations",
+            "evaluation_description": "EM on International Law",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.745,
+            "score": 0.917,
             "details": {
-              "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)",
+              "description": "min=0.917, mean=0.917, max=0.917, sum=1.835 (2)",
               "tab": "Accuracy",
-              "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.496, mean=0.496, max=0.496, sum=0.992 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.49601870450106533\"}",
-              "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}",
-              "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Public Relations - # prompt tokens": "{\"description\": \"min=398.318, mean=398.318, max=398.318, sum=796.636 (2)\", \"tab\": \"General information\", \"score\": \"398.3181818181818\"}",
-              "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "International Law - Observed inference time (s)": "{\"description\": \"min=0.461, mean=0.461, max=0.461, sum=0.922 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4608367139642889\"}",
+              "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}",
+              "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "International Law - # prompt tokens": "{\"description\": \"min=632.851, mean=632.851, max=632.851, sum=1265.702 (2)\", \"tab\": \"General information\", \"score\": \"632.8512396694215\"}",
+              "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"public_relations\"",
+              "subject": "\"international_law\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_public_relations\""
+              "groups": "\"mmlu_international_law\""
             }
           }
         },
         {
-          "evaluation_name": "Security Studies",
+          "evaluation_name": "Logical Fallacies",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1347,36 +1331,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Security Studies",
+            "evaluation_description": "EM on Logical Fallacies",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.861,
+            "score": 0.871,
             "details": {
-              "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)",
+              "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)",
               "tab": "Accuracy",
-              "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.471, mean=0.471, max=0.471, sum=0.941 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.47064581306613223\"}",
-              "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}",
-              "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Security Studies - # prompt tokens": "{\"description\": \"min=1157.473, mean=1157.473, max=1157.473, sum=2314.947 (2)\", \"tab\": \"General information\", \"score\": \"1157.4734693877551\"}",
-              "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.432, mean=0.432, max=0.432, sum=0.864 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4321035870745138\"}",
+              "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}",
+              "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Logical Fallacies - # prompt tokens": "{\"description\": \"min=442.595, mean=442.595, max=442.595, sum=885.19 (2)\", \"tab\": \"General information\", \"score\": \"442.5950920245399\"}",
+              "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"security_studies\"",
+              "subject": "\"logical_fallacies\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_security_studies\""
+              "groups": "\"mmlu_logical_fallacies\""
             }
           }
         },
         {
-          "evaluation_name": "Sociology",
+          "evaluation_name": "Machine Learning",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1385,36 +1369,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Sociology",
+            "evaluation_description": "EM on Machine Learning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.93,
+            "score": 0.759,
             "details": {
-              "description": "min=0.93, mean=0.93, max=0.93, sum=1.861 (2)",
+              "description": "min=0.759, mean=0.759, max=0.759, sum=1.518 (2)",
               "tab": "Accuracy",
-              "Sociology - Observed inference time (s)": "{\"description\": \"min=0.43, mean=0.43, max=0.43, sum=0.86 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42976075143956427\"}",
-              "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}",
-              "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Sociology - # prompt tokens": "{\"description\": \"min=438.522, mean=438.522, max=438.522, sum=877.045 (2)\", \"tab\": \"General information\", \"score\": \"438.5223880597015\"}",
-              "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.463, mean=0.463, max=0.463, sum=0.926 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.46302694933755056\"}",
+              "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}",
+              "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Machine Learning - # prompt tokens": "{\"description\": \"min=661.054, mean=661.054, max=661.054, sum=1322.107 (2)\", \"tab\": \"General information\", \"score\": \"661.0535714285714\"}",
+              "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"sociology\"",
+              "subject": "\"machine_learning\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_sociology\""
+              "groups": "\"mmlu_machine_learning\""
             }
           }
         },
         {
-          "evaluation_name": "Virology",
+          "evaluation_name": "Management",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1423,36 +1407,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Virology",
+            "evaluation_description": "EM on Management",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.596,
+            "score": 0.932,
             "details": {
-              "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)",
+              "description": "min=0.932, mean=0.932, max=0.932, sum=1.864 (2)",
               "tab": "Accuracy",
-              "Virology - Observed inference time (s)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.84 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42023470890091125\"}",
-              "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}",
-              "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Virology - # prompt tokens": "{\"description\": \"min=336.09, mean=336.09, max=336.09, sum=672.181 (2)\", \"tab\": \"General information\", \"score\": \"336.0903614457831\"}",
-              "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Management - Observed inference time (s)": "{\"description\": \"min=0.446, mean=0.446, max=0.446, sum=0.891 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4455798760201167\"}",
+              "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}",
+              "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Management - # prompt tokens": "{\"description\": \"min=276.796, mean=276.796, max=276.796, sum=553.592 (2)\", \"tab\": \"General information\", \"score\": \"276.79611650485435\"}",
+              "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"virology\"",
+              "subject": "\"management\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_virology\""
+              "groups": "\"mmlu_management\""
             }
           }
         },
         {
-          "evaluation_name": "World Religions",
+          "evaluation_name": "Marketing",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1461,36 +1445,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on World Religions",
+            "evaluation_description": "EM on Marketing",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.877,
+            "score": 0.962,
             "details": {
-              "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)",
+              "description": "min=0.962, mean=0.962, max=0.962, sum=1.923 (2)",
               "tab": "Accuracy",
-              "World Religions - Observed inference time (s)": "{\"description\": \"min=0.451, mean=0.451, max=0.451, sum=0.901 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4507097779658803\"}",
-              "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}",
-              "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "World Religions - # prompt tokens": "{\"description\": \"min=268.561, mean=268.561, max=268.561, sum=537.123 (2)\", \"tab\": \"General information\", \"score\": \"268.56140350877195\"}",
-              "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Marketing - Observed inference time (s)": "{\"description\": \"min=0.421, mean=0.421, max=0.421, sum=0.843 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4213859372668796\"}",
+              "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}",
+              "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Marketing - # prompt tokens": "{\"description\": \"min=397.218, mean=397.218, max=397.218, sum=794.436 (2)\", \"tab\": \"General information\", \"score\": \"397.21794871794873\"}",
+              "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"world_religions\"",
+              "subject": "\"marketing\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_world_religions\""
+              "groups": "\"mmlu_marketing\""
             }
           }
         },
         {
-          "evaluation_name": "Mean win rate",
+          "evaluation_name": "Medical Genetics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1499,402 +1483,418 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "How many models this model outperforms on average (over columns).",
+            "evaluation_description": "EM on Medical Genetics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.517,
+            "score": 0.94,
             "details": {
-              "description": "",
-              "tab": "Efficiency"
+              "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)",
+              "tab": "Accuracy",
+              "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.411, mean=0.411, max=0.411, sum=0.823 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41135803937911986\"}",
+              "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Medical Genetics - # prompt tokens": "{\"description\": \"min=334, mean=334, max=334, sum=668 (2)\", \"tab\": \"General information\", \"score\": \"334.0\"}",
+              "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"medical_genetics\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_medical_genetics\""
+            }
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": {
-        "additional_details": {
-          "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
-          "method": "\"multiple_choice_joint\"",
-          "eval_split": "\"test\"",
-          "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
-        }
-      }
-    },
-    {
-      "evaluation_id": "helm_lite/openai_gpt-4-0613/1774096306.427425",
-      "retrieved_timestamp": "1774096306.427425",
-      "source_metadata": {
-        "source_name": "helm_lite",
-        "source_type": "documentation",
-        "source_organization_name": "crfm",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "helm",
-        "version": "unknown"
-      },
-      "benchmark": "helm_lite",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Mean win rate",
+          "evaluation_name": "Miscellaneous",
           "source_data": {
-            "dataset_name": "helm_lite",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "How many models this model outperforms on average (over columns).",
+            "evaluation_description": "EM on Miscellaneous",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.867,
+            "score": 0.949,
             "details": {
-              "description": "",
+              "description": "min=0.949, mean=0.949, max=0.949, sum=1.898 (2)",
               "tab": "Accuracy",
-              "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.5158801498127341\"}",
-              "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}"
+              "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.451, mean=0.451, max=0.451, sum=0.901 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4505587230088001\"}",
+              "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}",
+              "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Miscellaneous - # prompt tokens": "{\"description\": \"min=292.925, mean=292.925, max=292.925, sum=585.849 (2)\", \"tab\": \"General information\", \"score\": \"292.92464878671774\"}",
+              "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"miscellaneous\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_miscellaneous\""
+            }
           }
         },
         {
-          "evaluation_name": "NarrativeQA",
+          "evaluation_name": "Moral Scenarios",
           "source_data": {
-            "dataset_name": "NarrativeQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "F1 on NarrativeQA",
+            "evaluation_description": "EM on Moral Scenarios",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.768,
+            "score": 0.902,
             "details": {
-              "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)",
+              "description": "min=0.902, mean=0.902, max=0.902, sum=1.803 (2)",
               "tab": "Accuracy",
-              "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.976, mean=0.976, max=0.976, sum=0.976 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.9758186582108619\"}",
-              "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}",
-              "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NarrativeQA - # prompt tokens": "{\"description\": \"min=3522.67, mean=3522.67, max=3522.67, sum=3522.67 (1)\", \"tab\": \"General information\", \"score\": \"3522.6704225352114\"}",
-              "NarrativeQA - # output tokens": "{\"description\": \"min=8.515, mean=8.515, max=8.515, sum=8.515 (1)\", \"tab\": \"General information\", \"score\": \"8.51549295774648\"}"
+              "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.428, mean=0.428, max=0.428, sum=0.856 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4281756044123214\"}",
+              "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.445, mean=0.445, max=0.445, sum=0.89 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44513606945229645\"}",
+              "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}",
+              "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Moral Disputes - # prompt tokens": "{\"description\": \"min=469.145, mean=469.145, max=469.145, sum=938.289 (2)\", \"tab\": \"General information\", \"score\": \"469.1445086705202\"}",
+              "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}",
+              "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Moral Scenarios - # prompt tokens": "{\"description\": \"min=649.455, mean=649.455, max=649.455, sum=1298.909 (2)\", \"tab\": \"General information\", \"score\": \"649.454748603352\"}",
+              "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"moral_scenarios\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_moral_scenarios\""
+            }
           }
         },
         {
-          "evaluation_name": "NaturalQuestions (closed-book)",
+          "evaluation_name": "Nutrition",
           "source_data": {
-            "dataset_name": "NaturalQuestions (closed-book)",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "F1 on NaturalQuestions (closed-book)",
+            "evaluation_description": "EM on Nutrition",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.457,
+            "score": 0.892,
             "details": {
-              "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)",
+              "description": "min=0.892, mean=0.892, max=0.892, sum=1.784 (2)",
               "tab": "Accuracy",
-              "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.908, mean=0.908, max=0.908, sum=0.908 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.9083020164966583\"}",
-              "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.512, mean=0.512, max=0.512, sum=0.512 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5116857671737671\"}",
-              "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "NaturalQuestions (open-book) - # train": "{\"description\": \"min=4.964, mean=4.964, max=4.964, sum=4.964 (1)\", \"tab\": \"General information\", \"score\": \"4.964\"}",
-              "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0.007, mean=0.007, max=0.007, sum=0.007 (1)\", \"tab\": \"General information\", \"score\": \"0.007\"}",
-              "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1717.847, mean=1717.847, max=1717.847, sum=1717.847 (1)\", \"tab\": \"General information\", \"score\": \"1717.847\"}",
-              "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=8.055, mean=8.055, max=8.055, sum=8.055 (1)\", \"tab\": \"General information\", \"score\": \"8.055\"}",
-              "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=173.127, mean=173.127, max=173.127, sum=173.127 (1)\", \"tab\": \"General information\", \"score\": \"173.127\"}",
-              "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=3.832, mean=3.832, max=3.832, sum=3.832 (1)\", \"tab\": \"General information\", \"score\": \"3.832\"}"
+              "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.446, mean=0.446, max=0.446, sum=0.892 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4460979816960354\"}",
+              "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}",
+              "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Nutrition - # prompt tokens": "{\"description\": \"min=579.817, mean=579.817, max=579.817, sum=1159.634 (2)\", \"tab\": \"General information\", \"score\": \"579.8169934640523\"}",
+              "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "mode": "\"closedbook\""
+              "subject": "\"nutrition\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_nutrition\""
             }
           }
         },
         {
-          "evaluation_name": "OpenbookQA",
+          "evaluation_name": "Prehistory",
           "source_data": {
-            "dataset_name": "OpenbookQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on OpenbookQA",
+            "evaluation_description": "EM on Prehistory",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.96,
+            "score": 0.926,
             "details": {
-              "description": "min=0.96, mean=0.96, max=0.96, sum=0.96 (1)",
+              "description": "min=0.926, mean=0.926, max=0.926, sum=1.852 (2)",
               "tab": "Accuracy",
-              "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.401, mean=0.401, max=0.401, sum=0.401 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.40061268854141235\"}",
-              "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}",
-              "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "OpenbookQA - # prompt tokens": "{\"description\": \"min=242.782, mean=242.782, max=242.782, sum=242.782 (1)\", \"tab\": \"General information\", \"score\": \"242.782\"}",
-              "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.426, mean=0.426, max=0.426, sum=0.852 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42610209665180726\"}",
+              "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}",
+              "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Prehistory - # prompt tokens": "{\"description\": \"min=507.559, mean=507.559, max=507.559, sum=1015.117 (2)\", \"tab\": \"General information\", \"score\": \"507.55864197530866\"}",
+              "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "dataset": "\"openbookqa\"",
-              "method": "\"multiple_choice_joint\""
+              "subject": "\"prehistory\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_prehistory\""
             }
           }
         },
         {
-          "evaluation_name": "MMLU",
+          "evaluation_name": "Public Relations",
           "source_data": {
-            "dataset_name": "MMLU",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MMLU",
+            "evaluation_description": "EM on Public Relations",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.735,
+            "score": 0.745,
             "details": {
-              "description": "min=0.55, mean=0.735, max=0.95, sum=3.674 (5)",
+              "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)",
               "tab": "Accuracy",
-              "MMLU - Observed inference time (s)": "{\"description\": \"min=0.364, mean=0.391, max=0.434, sum=1.954 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.39080846048656265\"}",
-              "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}",
-              "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MMLU - # prompt tokens": "{\"description\": \"min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)\", \"tab\": \"General information\", \"score\": \"460.71996491228066\"}",
-              "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.496, mean=0.496, max=0.496, sum=0.992 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.49601870450106533\"}",
+              "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}",
+              "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Public Relations - # prompt tokens": "{\"description\": \"min=398.318, mean=398.318, max=398.318, sum=796.636 (2)\", \"tab\": \"General information\", \"score\": \"398.3181818181818\"}",
+              "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]",
-              "method": "\"multiple_choice_joint\""
+              "subject": "\"public_relations\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_public_relations\""
             }
           }
         },
         {
-          "evaluation_name": "MATH",
+          "evaluation_name": "Security Studies",
           "source_data": {
-            "dataset_name": "MATH",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "Equivalent (CoT) on MATH",
+            "evaluation_description": "EM on Security Studies",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.802,
+            "score": 0.861,
             "details": {
-              "description": "min=0.673, mean=0.802, max=0.948, sum=5.617 (7)",
+              "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)",
               "tab": "Accuracy",
-              "MATH - Observed inference time (s)": "{\"description\": \"min=2.95, mean=3.472, max=4.247, sum=24.303 (7)\", \"tab\": \"Efficiency\", \"score\": \"3.4718795228507955\"}",
-              "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}",
-              "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}",
-              "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MATH - # prompt tokens": "{\"description\": \"min=942.363, mean=1323.911, max=2258.577, sum=9267.376 (7)\", \"tab\": \"General information\", \"score\": \"1323.910874184069\"}",
-              "MATH - # output tokens": "{\"description\": \"min=59.674, mean=73.257, max=81.1, sum=512.799 (7)\", \"tab\": \"General information\", \"score\": \"73.25695858608955\"}"
+              "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.471, mean=0.471, max=0.471, sum=0.941 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.47064581306613223\"}",
+              "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}",
+              "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Security Studies - # prompt tokens": "{\"description\": \"min=1157.473, mean=1157.473, max=1157.473, sum=2314.947 (2)\", \"tab\": \"General information\", \"score\": \"1157.4734693877551\"}",
+              "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]",
-              "level": "\"1\"",
-              "use_official_examples": "\"False\"",
-              "use_chain_of_thought": "\"True\""
+              "subject": "\"security_studies\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_security_studies\""
             }
           }
         },
         {
-          "evaluation_name": "GSM8K",
+          "evaluation_name": "Sociology",
           "source_data": {
-            "dataset_name": "GSM8K",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on GSM8K",
+            "evaluation_description": "EM on Sociology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.932,
+            "score": 0.93,
             "details": {
-              "description": "min=0.932, mean=0.932, max=0.932, sum=0.932 (1)",
+              "description": "min=0.93, mean=0.93, max=0.93, sum=1.861 (2)",
               "tab": "Accuracy",
-              "GSM8K - Observed inference time (s)": "{\"description\": \"min=4.948, mean=4.948, max=4.948, sum=4.948 (1)\", \"tab\": \"Efficiency\", \"score\": \"4.947624314308166\"}",
-              "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "GSM8K - # prompt tokens": "{\"description\": \"min=1020.035, mean=1020.035, max=1020.035, sum=1020.035 (1)\", \"tab\": \"General information\", \"score\": \"1020.035\"}",
-              "GSM8K - # output tokens": "{\"description\": \"min=111.209, mean=111.209, max=111.209, sum=111.209 (1)\", \"tab\": \"General information\", \"score\": \"111.209\"}"
+              "Sociology - Observed inference time (s)": "{\"description\": \"min=0.43, mean=0.43, max=0.43, sum=0.86 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42976075143956427\"}",
+              "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}",
+              "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Sociology - # prompt tokens": "{\"description\": \"min=438.522, mean=438.522, max=438.522, sum=877.045 (2)\", \"tab\": \"General information\", \"score\": \"438.5223880597015\"}",
+              "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"sociology\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_sociology\""
+            }
           }
         },
         {
-          "evaluation_name": "LegalBench",
+          "evaluation_name": "Virology",
           "source_data": {
-            "dataset_name": "LegalBench",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on LegalBench",
+            "evaluation_description": "EM on Virology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.713,
+            "score": 0.596,
             "details": {
-              "description": "min=0.452, mean=0.713, max=0.905, sum=3.564 (5)",
+              "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)",
               "tab": "Accuracy",
-              "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.46, mean=0.558, max=0.886, sum=2.791 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.5582764348578453\"}",
-              "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}",
-              "LegalBench - # train": "{\"description\": \"min=4, mean=4.798, max=5, sum=23.992 (5)\", \"tab\": \"General information\", \"score\": \"4.798367346938775\"}",
-              "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "LegalBench - # prompt tokens": "{\"description\": \"min=253.442, mean=1568.687, max=6350.008, sum=7843.435 (5)\", \"tab\": \"General information\", \"score\": \"1568.6870529886412\"}",
-              "LegalBench - # output tokens": "{\"description\": \"min=1, mean=1.34, max=2.063, sum=6.698 (5)\", \"tab\": \"General information\", \"score\": \"1.3396070557866055\"}"
+              "Virology - Observed inference time (s)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.84 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42023470890091125\"}",
+              "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}",
+              "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Virology - # prompt tokens": "{\"description\": \"min=336.09, mean=336.09, max=336.09, sum=672.181 (2)\", \"tab\": \"General information\", \"score\": \"336.0903614457831\"}",
+              "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]"
+              "subject": "\"virology\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_virology\""
             }
           }
         },
         {
-          "evaluation_name": "MedQA",
+          "evaluation_name": "World Religions",
           "source_data": {
-            "dataset_name": "MedQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MedQA",
+            "evaluation_description": "EM on World Religions",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.815,
+            "score": 0.877,
             "details": {
-              "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)",
+              "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)",
               "tab": "Accuracy",
-              "MedQA - Observed inference time (s)": "{\"description\": \"min=0.414, mean=0.414, max=0.414, sum=0.414 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4136932588239787\"}",
-              "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}",
-              "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MedQA - # prompt tokens": "{\"description\": \"min=1020.414, mean=1020.414, max=1020.414, sum=1020.414 (1)\", \"tab\": \"General information\", \"score\": \"1020.4135188866799\"}",
-              "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "World Religions - Observed inference time (s)": "{\"description\": \"min=0.451, mean=0.451, max=0.451, sum=0.901 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4507097779658803\"}",
+              "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}",
+              "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "World Religions - # prompt tokens": "{\"description\": \"min=268.561, mean=268.561, max=268.561, sum=537.123 (2)\", \"tab\": \"General information\", \"score\": \"268.56140350877195\"}",
+              "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"world_religions\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_world_religions\""
+            }
           }
         },
         {
-          "evaluation_name": "WMT 2014",
+          "evaluation_name": "Mean win rate",
           "source_data": {
-            "dataset_name": "WMT 2014",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "BLEU-4 on WMT 2014",
+            "evaluation_description": "How many models this model outperforms on average (over columns).",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.211,
+            "score": 0.517,
             "details": {
-              "description": "min=0.149, mean=0.211, max=0.256, sum=1.053 (5)",
-              "tab": "Accuracy",
-              "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=1.448, mean=1.58, max=1.724, sum=7.899 (5)\", \"tab\": \"Efficiency\", \"score\": \"1.5797039644192494\"}",
-              "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}",
-              "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "WMT 2014 - # prompt tokens": "{\"description\": \"min=169.901, mean=193.043, max=213.185, sum=965.213 (5)\", \"tab\": \"General information\", \"score\": \"193.04258583116683\"}",
-              "WMT 2014 - # output tokens": "{\"description\": \"min=23.767, mean=25.424, max=26.121, sum=127.122 (5)\", \"tab\": \"General information\", \"score\": \"25.424382072946933\"}"
+              "description": "",
+              "tab": "Efficiency"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]"
-            }
+            "additional_details": {}
           }
         }
       ],
       "detailed_evaluation_results": null,
       "generation_config": {
-        "additional_details": {}
+        "additional_details": {
+          "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
+          "method": "\"multiple_choice_joint\"",
+          "eval_split": "\"test\"",
+          "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
+        }
       }
     }
   ]
diff --git a/data/models/openai_gpt-4-turbo-2024-04-09.json b/data/models/openai_gpt-4-turbo-2024-04-09.json
index 1fe1d15bc389a507225c8274f004fa48f42d4b5a..e16c126fc0c5b9aa15f8e7eae24f7699fd6de829 100644
--- a/data/models/openai_gpt-4-turbo-2024-04-09.json
+++ b/data/models/openai_gpt-4-turbo-2024-04-09.json
@@ -7,10 +7,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "helm_mmlu/openai_gpt-4-turbo-2024-04-09/1774096312.00548",
-      "retrieved_timestamp": "1774096312.00548",
+      "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1774096306.427425",
+      "retrieved_timestamp": "1774096306.427425",
       "source_metadata": {
-        "source_name": "helm_mmlu",
+        "source_name": "helm_lite",
         "source_type": "documentation",
         "source_organization_name": "crfm",
         "evaluator_relationship": "third_party"
@@ -19,438 +19,382 @@
         "name": "helm",
         "version": "unknown"
       },
-      "benchmark": "helm_mmlu",
+      "benchmark": "helm_lite",
       "evaluation_results": [
         {
-          "evaluation_name": "MMLU All Subjects",
+          "evaluation_name": "Mean win rate",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "helm_lite",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MMLU All Subjects",
+            "evaluation_description": "How many models this model outperforms on average (over columns).",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.813,
+            "score": 0.864,
             "details": {
-              "description": "min=0.515, mean=0.813, max=0.974, sum=92.65 (114)",
+              "description": "",
               "tab": "Accuracy",
-              "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.479, mean=0.617, max=0.934, sum=70.3 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.6166649052297876\"}",
-              "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}",
-              "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=275.561, mean=614.852, max=2798.073, sum=70093.086 (114)\", \"tab\": \"General information\", \"score\": \"614.851634217556\"}",
-              "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.4568414481897628\"}",
+              "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Abstract Algebra",
+          "evaluation_name": "NarrativeQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "NarrativeQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Abstract Algebra",
+            "evaluation_description": "F1 on NarrativeQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.56,
+            "score": 0.761,
             "details": {
-              "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)",
+              "description": "min=0.761, mean=0.761, max=0.761, sum=0.761 (1)",
               "tab": "Accuracy",
-              "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.54, mean=0.54, max=0.54, sum=1.08 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.539907853603363\"}",
-              "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Abstract Algebra - # prompt tokens": "{\"description\": \"min=373.44, mean=373.44, max=373.44, sum=746.88 (2)\", \"tab\": \"General information\", \"score\": \"373.44\"}",
-              "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.804, mean=0.804, max=0.804, sum=0.804 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.8043310716118611\"}",
+              "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}",
+              "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NarrativeQA - # prompt tokens": "{\"description\": \"min=3495.67, mean=3495.67, max=3495.67, sum=3495.67 (1)\", \"tab\": \"General information\", \"score\": \"3495.6704225352114\"}",
+              "NarrativeQA - # output tokens": "{\"description\": \"min=6.037, mean=6.037, max=6.037, sum=6.037 (1)\", \"tab\": \"General information\", \"score\": \"6.0366197183098596\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "\"abstract_algebra\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_abstract_algebra\""
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Anatomy",
+          "evaluation_name": "NaturalQuestions (closed-book)",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "NaturalQuestions (closed-book)",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Anatomy",
+            "evaluation_description": "F1 on NaturalQuestions (closed-book)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.822,
+            "score": 0.482,
             "details": {
-              "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)",
+              "description": "min=0.482, mean=0.482, max=0.482, sum=0.482 (1)",
               "tab": "Accuracy",
-              "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.53, mean=0.53, max=0.53, sum=1.06 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5299274744810881\"}",
-              "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}",
-              "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Anatomy - # prompt tokens": "{\"description\": \"min=353.978, mean=353.978, max=353.978, sum=707.956 (2)\", \"tab\": \"General information\", \"score\": \"353.97777777777776\"}",
-              "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.712, mean=0.712, max=0.712, sum=0.712 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.7120162718296051\"}",
+              "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.605, mean=0.605, max=0.605, sum=0.605 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.6052222681045533\"}",
+              "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "NaturalQuestions (open-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1728.593, mean=1728.593, max=1728.593, sum=1728.593 (1)\", \"tab\": \"General information\", \"score\": \"1728.593\"}",
+              "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=5.902, mean=5.902, max=5.902, sum=5.902 (1)\", \"tab\": \"General information\", \"score\": \"5.902\"}",
+              "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=139.127, mean=139.127, max=139.127, sum=139.127 (1)\", \"tab\": \"General information\", \"score\": \"139.127\"}",
+              "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=5.263, mean=5.263, max=5.263, sum=5.263 (1)\", \"tab\": \"General information\", \"score\": \"5.263\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"anatomy\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_anatomy\""
+              "mode": "\"closedbook\""
             }
           }
         },
         {
-          "evaluation_name": "College Physics",
+          "evaluation_name": "OpenbookQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "OpenbookQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on College Physics",
+            "evaluation_description": "EM on OpenbookQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.539,
+            "score": 0.97,
             "details": {
-              "description": "min=0.539, mean=0.539, max=0.539, sum=1.078 (2)",
+              "description": "min=0.97, mean=0.97, max=0.97, sum=0.97 (1)",
               "tab": "Accuracy",
-              "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.549, mean=0.549, max=0.549, sum=1.099 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5493535542488098\"}",
-              "College Biology - Observed inference time (s)": "{\"description\": \"min=0.6, mean=0.6, max=0.6, sum=1.199 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5995734184980392\"}",
-              "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.691, mean=0.691, max=0.691, sum=1.382 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6911867094039917\"}",
-              "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.609, mean=0.609, max=0.609, sum=1.219 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6092576813697815\"}",
-              "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.67, mean=0.67, max=0.67, sum=1.34 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6697626251705809\"}",
-              "College Physics - Observed inference time (s)": "{\"description\": \"min=0.706, mean=0.706, max=0.706, sum=1.412 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7058592660754335\"}",
-              "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Chemistry - # prompt tokens": "{\"description\": \"min=549.4, mean=549.4, max=549.4, sum=1098.8 (2)\", \"tab\": \"General information\", \"score\": \"549.4\"}",
-              "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}",
-              "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Biology - # prompt tokens": "{\"description\": \"min=473.917, mean=473.917, max=473.917, sum=947.833 (2)\", \"tab\": \"General information\", \"score\": \"473.9166666666667\"}",
-              "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Computer Science - # prompt tokens": "{\"description\": \"min=828.39, mean=828.39, max=828.39, sum=1656.78 (2)\", \"tab\": \"General information\", \"score\": \"828.39\"}",
-              "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Mathematics - # prompt tokens": "{\"description\": \"min=594.52, mean=594.52, max=594.52, sum=1189.04 (2)\", \"tab\": \"General information\", \"score\": \"594.52\"}",
-              "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}",
-              "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Medicine - # prompt tokens": "{\"description\": \"min=502.728, mean=502.728, max=502.728, sum=1005.457 (2)\", \"tab\": \"General information\", \"score\": \"502.728323699422\"}",
-              "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}",
-              "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Physics - # prompt tokens": "{\"description\": \"min=503.608, mean=503.608, max=503.608, sum=1007.216 (2)\", \"tab\": \"General information\", \"score\": \"503.6078431372549\"}",
-              "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.438, mean=0.438, max=0.438, sum=0.438 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4376141686439514\"}",
+              "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}",
+              "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "OpenbookQA - # prompt tokens": "{\"description\": \"min=249.782, mean=249.782, max=249.782, sum=249.782 (1)\", \"tab\": \"General information\", \"score\": \"249.782\"}",
+              "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"college_physics\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_college_physics\""
+              "dataset": "\"openbookqa\"",
+              "method": "\"multiple_choice_joint\""
             }
           }
         },
         {
-          "evaluation_name": "Computer Security",
+          "evaluation_name": "MMLU",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MMLU",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Computer Security",
+            "evaluation_description": "EM on MMLU",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.83,
+            "score": 0.711,
             "details": {
-              "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)",
+              "description": "min=0.53, mean=0.711, max=0.96, sum=3.555 (5)",
               "tab": "Accuracy",
-              "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.53, mean=0.53, max=0.53, sum=1.061 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5303381824493408\"}",
-              "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Computer Security - # prompt tokens": "{\"description\": \"min=378.54, mean=378.54, max=378.54, sum=757.08 (2)\", \"tab\": \"General information\", \"score\": \"378.54\"}",
-              "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MMLU - Observed inference time (s)": "{\"description\": \"min=0.53, mean=0.55, max=0.572, sum=2.749 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.5498773384847139\"}",
+              "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}",
+              "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MMLU - # prompt tokens": "{\"description\": \"min=373.44, mean=467.72, max=614.43, sum=2338.6 (5)\", \"tab\": \"General information\", \"score\": \"467.71996491228066\"}",
+              "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"computer_security\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_computer_security\""
+              "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]",
+              "method": "\"multiple_choice_joint\""
             }
           }
         },
         {
-          "evaluation_name": "Econometrics",
+          "evaluation_name": "MATH",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MATH",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Econometrics",
+            "evaluation_description": "Equivalent (CoT) on MATH",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.675,
+            "score": 0.833,
             "details": {
-              "description": "min=0.675, mean=0.675, max=0.675, sum=1.351 (2)",
+              "description": "min=0.684, mean=0.833, max=0.97, sum=5.83 (7)",
               "tab": "Accuracy",
-              "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.572, mean=0.572, max=0.572, sum=1.144 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5721135453173989\"}",
-              "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}",
-              "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Econometrics - # prompt tokens": "{\"description\": \"min=614.43, mean=614.43, max=614.43, sum=1228.86 (2)\", \"tab\": \"General information\", \"score\": \"614.4298245614035\"}",
-              "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MATH - Observed inference time (s)": "{\"description\": \"min=4.92, mean=6.678, max=8.338, sum=46.748 (7)\", \"tab\": \"Efficiency\", \"score\": \"6.678270916932833\"}",
+              "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}",
+              "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}",
+              "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MATH - # prompt tokens": "{\"description\": \"min=881.363, mean=1262.911, max=2197.577, sum=8840.376 (7)\", \"tab\": \"General information\", \"score\": \"1262.9108741840687\"}",
+              "MATH - # output tokens": "{\"description\": \"min=135.163, mean=189.561, max=219.316, sum=1326.926 (7)\", \"tab\": \"General information\", \"score\": \"189.56082409362702\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"econometrics\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_econometrics\""
+              "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]",
+              "level": "\"1\"",
+              "use_official_examples": "\"False\"",
+              "use_chain_of_thought": "\"True\""
             }
           }
         },
         {
-          "evaluation_name": "Global Facts",
+          "evaluation_name": "GSM8K",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "GSM8K",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Global Facts",
+            "evaluation_description": "EM on GSM8K",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.58,
+            "score": 0.824,
             "details": {
-              "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)",
+              "description": "min=0.824, mean=0.824, max=0.824, sum=0.824 (1)",
               "tab": "Accuracy",
-              "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.479, mean=0.479, max=0.479, sum=0.958 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.47900029182434084\"}",
-              "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Global Facts - # prompt tokens": "{\"description\": \"min=399.71, mean=399.71, max=399.71, sum=799.42 (2)\", \"tab\": \"General information\", \"score\": \"399.71\"}",
-              "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "GSM8K - Observed inference time (s)": "{\"description\": \"min=6.915, mean=6.915, max=6.915, sum=6.915 (1)\", \"tab\": \"Efficiency\", \"score\": \"6.91472976398468\"}",
+              "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "GSM8K - # prompt tokens": "{\"description\": \"min=959.035, mean=959.035, max=959.035, sum=959.035 (1)\", \"tab\": \"General information\", \"score\": \"959.035\"}",
+              "GSM8K - # output tokens": "{\"description\": \"min=141.712, mean=141.712, max=141.712, sum=141.712 (1)\", \"tab\": \"General information\", \"score\": \"141.712\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"global_facts\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_global_facts\""
+              "stop": "\"none\""
             }
           }
         },
         {
-          "evaluation_name": "Jurisprudence",
+          "evaluation_name": "LegalBench",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "LegalBench",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Jurisprudence",
+            "evaluation_description": "EM on LegalBench",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.88,
+            "score": 0.727,
             "details": {
-              "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)",
+              "description": "min=0.417, mean=0.727, max=0.947, sum=3.637 (5)",
               "tab": "Accuracy",
-              "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.539, mean=0.539, max=0.539, sum=1.079 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5393155504156042\"}",
-              "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}",
-              "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Jurisprudence - # prompt tokens": "{\"description\": \"min=394.639, mean=394.639, max=394.639, sum=789.278 (2)\", \"tab\": \"General information\", \"score\": \"394.6388888888889\"}",
-              "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.514, mean=0.608, max=0.803, sum=3.041 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.6081070231398068\"}",
+              "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}",
+              "LegalBench - # train": "{\"description\": \"min=4, mean=4.8, max=5, sum=24 (5)\", \"tab\": \"General information\", \"score\": \"4.8\"}",
+              "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "LegalBench - # prompt tokens": "{\"description\": \"min=207.442, mean=1524.163, max=6311.388, sum=7620.815 (5)\", \"tab\": \"General information\", \"score\": \"1524.162971355988\"}",
+              "LegalBench - # output tokens": "{\"description\": \"min=1, mean=1.325, max=2.032, sum=6.626 (5)\", \"tab\": \"General information\", \"score\": \"1.3251168793919403\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"jurisprudence\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_jurisprudence\""
+              "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]"
             }
           }
         },
         {
-          "evaluation_name": "Philosophy",
+          "evaluation_name": "MedQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MedQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Philosophy",
+            "evaluation_description": "EM on MedQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.868,
+            "score": 0.783,
             "details": {
-              "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)",
+              "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)",
               "tab": "Accuracy",
-              "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.543, mean=0.543, max=0.543, sum=1.087 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5434573969273705\"}",
-              "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}",
-              "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Philosophy - # prompt tokens": "{\"description\": \"min=329.084, mean=329.084, max=329.084, sum=658.167 (2)\", \"tab\": \"General information\", \"score\": \"329.08360128617363\"}",
-              "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MedQA - Observed inference time (s)": "{\"description\": \"min=0.455, mean=0.455, max=0.455, sum=0.455 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4549296101329341\"}",
+              "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}",
+              "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MedQA - # prompt tokens": "{\"description\": \"min=1027.414, mean=1027.414, max=1027.414, sum=1027.414 (1)\", \"tab\": \"General information\", \"score\": \"1027.4135188866799\"}",
+              "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "\"philosophy\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_philosophy\""
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Professional Psychology",
+          "evaluation_name": "WMT 2014",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "WMT 2014",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Professional Psychology",
+            "evaluation_description": "BLEU-4 on WMT 2014",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.873,
+            "score": 0.218,
             "details": {
-              "description": "min=0.873, mean=0.873, max=0.873, sum=1.745 (2)",
+              "description": "min=0.169, mean=0.218, max=0.264, sum=1.088 (5)",
               "tab": "Accuracy",
-              "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.579, mean=0.579, max=0.579, sum=1.159 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5794552100055358\"}",
-              "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.59, mean=0.59, max=0.59, sum=1.18 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5898241354218612\"}",
-              "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.639, mean=0.639, max=0.639, sum=1.278 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6388053317424371\"}",
-              "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.671, mean=0.671, max=0.671, sum=1.342 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6712259284031936\"}",
-              "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}",
-              "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Medicine - # prompt tokens": "{\"description\": \"min=1094.585, mean=1094.585, max=1094.585, sum=2189.169 (2)\", \"tab\": \"General information\", \"score\": \"1094.5845588235295\"}",
-              "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}",
-              "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Accounting - # prompt tokens": "{\"description\": \"min=658.592, mean=658.592, max=658.592, sum=1317.184 (2)\", \"tab\": \"General information\", \"score\": \"658.5921985815603\"}",
-              "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}",
-              "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Law - # prompt tokens": "{\"description\": \"min=1637.787, mean=1637.787, max=1637.787, sum=3275.574 (2)\", \"tab\": \"General information\", \"score\": \"1637.7868318122555\"}",
-              "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}",
-              "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Psychology - # prompt tokens": "{\"description\": \"min=575.114, mean=575.114, max=575.114, sum=1150.229 (2)\", \"tab\": \"General information\", \"score\": \"575.1143790849674\"}",
-              "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=1.131, mean=1.185, max=1.222, sum=5.925 (5)\", \"tab\": \"Efficiency\", \"score\": \"1.1850423664020953\"}",
+              "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}",
+              "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "WMT 2014 - # prompt tokens": "{\"description\": \"min=124.901, mean=148.043, max=168.185, sum=740.213 (5)\", \"tab\": \"General information\", \"score\": \"148.04258583116683\"}",
+              "WMT 2014 - # output tokens": "{\"description\": \"min=23.744, mean=25.264, max=25.938, sum=126.322 (5)\", \"tab\": \"General information\", \"score\": \"25.26444840571953\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"professional_psychology\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_professional_psychology\""
+              "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]"
             }
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_id": "helm_mmlu/openai_gpt-4-turbo-2024-04-09/1774096312.00548",
+      "retrieved_timestamp": "1774096312.00548",
+      "source_metadata": {
+        "source_name": "helm_mmlu",
+        "source_type": "documentation",
+        "source_organization_name": "crfm",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "helm",
+        "version": "unknown"
+      },
+      "benchmark": "helm_mmlu",
+      "evaluation_results": [
         {
-          "evaluation_name": "Us Foreign Policy",
+          "evaluation_name": "MMLU All Subjects",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -459,36 +403,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Us Foreign Policy",
+            "evaluation_description": "EM on MMLU All Subjects",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.96,
+            "score": 0.813,
             "details": {
-              "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)",
+              "description": "min=0.515, mean=0.813, max=0.974, sum=92.65 (114)",
               "tab": "Accuracy",
-              "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.558, mean=0.558, max=0.558, sum=1.115 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.557673556804657\"}",
-              "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=422.79, mean=422.79, max=422.79, sum=845.58 (2)\", \"tab\": \"General information\", \"score\": \"422.79\"}",
-              "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.479, mean=0.617, max=0.934, sum=70.3 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.6166649052297876\"}",
+              "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}",
+              "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=275.561, mean=614.852, max=2798.073, sum=70093.086 (114)\", \"tab\": \"General information\", \"score\": \"614.851634217556\"}",
+              "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"us_foreign_policy\"",
+              "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_us_foreign_policy\""
+              "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
             }
           }
         },
         {
-          "evaluation_name": "Astronomy",
+          "evaluation_name": "Abstract Algebra",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -497,36 +441,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Astronomy",
+            "evaluation_description": "EM on Abstract Algebra",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.941,
+            "score": 0.56,
             "details": {
-              "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)",
+              "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)",
               "tab": "Accuracy",
-              "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.666, mean=0.666, max=0.666, sum=1.332 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6662032525790366\"}",
-              "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}",
-              "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Astronomy - # prompt tokens": "{\"description\": \"min=579.691, mean=579.691, max=579.691, sum=1159.382 (2)\", \"tab\": \"General information\", \"score\": \"579.6907894736842\"}",
-              "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.54, mean=0.54, max=0.54, sum=1.08 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.539907853603363\"}",
+              "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Abstract Algebra - # prompt tokens": "{\"description\": \"min=373.44, mean=373.44, max=373.44, sum=746.88 (2)\", \"tab\": \"General information\", \"score\": \"373.44\"}",
+              "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"astronomy\"",
+              "subject": "\"abstract_algebra\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_astronomy\""
+              "groups": "\"mmlu_abstract_algebra\""
             }
           }
         },
         {
-          "evaluation_name": "Business Ethics",
+          "evaluation_name": "Anatomy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -535,36 +479,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Business Ethics",
+            "evaluation_description": "EM on Anatomy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.82,
+            "score": 0.822,
             "details": {
-              "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)",
+              "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)",
               "tab": "Accuracy",
-              "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.598, mean=0.598, max=0.598, sum=1.196 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5981367039680481\"}",
-              "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Business Ethics - # prompt tokens": "{\"description\": \"min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)\", \"tab\": \"General information\", \"score\": \"569.52\"}",
-              "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.53, mean=0.53, max=0.53, sum=1.06 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5299274744810881\"}",
+              "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}",
+              "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Anatomy - # prompt tokens": "{\"description\": \"min=353.978, mean=353.978, max=353.978, sum=707.956 (2)\", \"tab\": \"General information\", \"score\": \"353.97777777777776\"}",
+              "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"business_ethics\"",
+              "subject": "\"anatomy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_business_ethics\""
+              "groups": "\"mmlu_anatomy\""
             }
           }
         },
         {
-          "evaluation_name": "Clinical Knowledge",
+          "evaluation_name": "College Physics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -573,36 +517,66 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Clinical Knowledge",
+            "evaluation_description": "EM on College Physics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.83,
+            "score": 0.539,
             "details": {
-              "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)",
+              "description": "min=0.539, mean=0.539, max=0.539, sum=1.078 (2)",
               "tab": "Accuracy",
-              "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.591, mean=0.591, max=0.591, sum=1.183 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5912713131814633\"}",
-              "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}",
-              "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=397.947, mean=397.947, max=397.947, sum=795.894 (2)\", \"tab\": \"General information\", \"score\": \"397.94716981132075\"}",
-              "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.549, mean=0.549, max=0.549, sum=1.099 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5493535542488098\"}",
+              "College Biology - Observed inference time (s)": "{\"description\": \"min=0.6, mean=0.6, max=0.6, sum=1.199 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5995734184980392\"}",
+              "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.691, mean=0.691, max=0.691, sum=1.382 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6911867094039917\"}",
+              "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.609, mean=0.609, max=0.609, sum=1.219 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6092576813697815\"}",
+              "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.67, mean=0.67, max=0.67, sum=1.34 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6697626251705809\"}",
+              "College Physics - Observed inference time (s)": "{\"description\": \"min=0.706, mean=0.706, max=0.706, sum=1.412 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7058592660754335\"}",
+              "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Chemistry - # prompt tokens": "{\"description\": \"min=549.4, mean=549.4, max=549.4, sum=1098.8 (2)\", \"tab\": \"General information\", \"score\": \"549.4\"}",
+              "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}",
+              "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Biology - # prompt tokens": "{\"description\": \"min=473.917, mean=473.917, max=473.917, sum=947.833 (2)\", \"tab\": \"General information\", \"score\": \"473.9166666666667\"}",
+              "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Computer Science - # prompt tokens": "{\"description\": \"min=828.39, mean=828.39, max=828.39, sum=1656.78 (2)\", \"tab\": \"General information\", \"score\": \"828.39\"}",
+              "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Mathematics - # prompt tokens": "{\"description\": \"min=594.52, mean=594.52, max=594.52, sum=1189.04 (2)\", \"tab\": \"General information\", \"score\": \"594.52\"}",
+              "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}",
+              "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Medicine - # prompt tokens": "{\"description\": \"min=502.728, mean=502.728, max=502.728, sum=1005.457 (2)\", \"tab\": \"General information\", \"score\": \"502.728323699422\"}",
+              "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}",
+              "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Physics - # prompt tokens": "{\"description\": \"min=503.608, mean=503.608, max=503.608, sum=1007.216 (2)\", \"tab\": \"General information\", \"score\": \"503.6078431372549\"}",
+              "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"clinical_knowledge\"",
+              "subject": "\"college_physics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_clinical_knowledge\""
+              "groups": "\"mmlu_college_physics\""
             }
           }
         },
         {
-          "evaluation_name": "Conceptual Physics",
+          "evaluation_name": "Computer Security",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -611,36 +585,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Conceptual Physics",
+            "evaluation_description": "EM on Computer Security",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.894,
+            "score": 0.83,
             "details": {
-              "description": "min=0.894, mean=0.894, max=0.894, sum=1.787 (2)",
+              "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)",
               "tab": "Accuracy",
-              "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.685, mean=0.685, max=0.685, sum=1.369 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.684603402969685\"}",
-              "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}",
-              "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Conceptual Physics - # prompt tokens": "{\"description\": \"min=304.838, mean=304.838, max=304.838, sum=609.677 (2)\", \"tab\": \"General information\", \"score\": \"304.83829787234043\"}",
-              "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.53, mean=0.53, max=0.53, sum=1.061 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5303381824493408\"}",
+              "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Computer Security - # prompt tokens": "{\"description\": \"min=378.54, mean=378.54, max=378.54, sum=757.08 (2)\", \"tab\": \"General information\", \"score\": \"378.54\"}",
+              "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"conceptual_physics\"",
+              "subject": "\"computer_security\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_conceptual_physics\""
+              "groups": "\"mmlu_computer_security\""
             }
           }
         },
         {
-          "evaluation_name": "Electrical Engineering",
+          "evaluation_name": "Econometrics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -649,36 +623,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Electrical Engineering",
+            "evaluation_description": "EM on Econometrics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.752,
+            "score": 0.675,
             "details": {
-              "description": "min=0.752, mean=0.752, max=0.752, sum=1.503 (2)",
+              "description": "min=0.675, mean=0.675, max=0.675, sum=1.351 (2)",
               "tab": "Accuracy",
-              "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.649, mean=0.649, max=0.649, sum=1.297 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6487039006989578\"}",
-              "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}",
-              "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Electrical Engineering - # prompt tokens": "{\"description\": \"min=440.641, mean=440.641, max=440.641, sum=881.283 (2)\", \"tab\": \"General information\", \"score\": \"440.6413793103448\"}",
-              "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.572, mean=0.572, max=0.572, sum=1.144 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5721135453173989\"}",
+              "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}",
+              "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Econometrics - # prompt tokens": "{\"description\": \"min=614.43, mean=614.43, max=614.43, sum=1228.86 (2)\", \"tab\": \"General information\", \"score\": \"614.4298245614035\"}",
+              "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"electrical_engineering\"",
+              "subject": "\"econometrics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_electrical_engineering\""
+              "groups": "\"mmlu_econometrics\""
             }
           }
         },
         {
-          "evaluation_name": "Elementary Mathematics",
+          "evaluation_name": "Global Facts",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -687,36 +661,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Elementary Mathematics",
+            "evaluation_description": "EM on Global Facts",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.72,
+            "score": 0.58,
             "details": {
-              "description": "min=0.72, mean=0.72, max=0.72, sum=1.439 (2)",
+              "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)",
               "tab": "Accuracy",
-              "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.708, mean=0.708, max=0.708, sum=1.417 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.708430844009238\"}",
-              "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}",
-              "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=531.862, mean=531.862, max=531.862, sum=1063.725 (2)\", \"tab\": \"General information\", \"score\": \"531.8624338624338\"}",
-              "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.479, mean=0.479, max=0.479, sum=0.958 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.47900029182434084\"}",
+              "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Global Facts - # prompt tokens": "{\"description\": \"min=399.71, mean=399.71, max=399.71, sum=799.42 (2)\", \"tab\": \"General information\", \"score\": \"399.71\"}",
+              "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"elementary_mathematics\"",
+              "subject": "\"global_facts\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_elementary_mathematics\""
+              "groups": "\"mmlu_global_facts\""
             }
           }
         },
         {
-          "evaluation_name": "Formal Logic",
+          "evaluation_name": "Jurisprudence",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -725,36 +699,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Formal Logic",
+            "evaluation_description": "EM on Jurisprudence",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.706,
+            "score": 0.88,
             "details": {
-              "description": "min=0.706, mean=0.706, max=0.706, sum=1.413 (2)",
+              "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)",
               "tab": "Accuracy",
-              "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.635, mean=0.635, max=0.635, sum=1.27 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6347800322941372\"}",
-              "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}",
-              "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Formal Logic - # prompt tokens": "{\"description\": \"min=606.762, mean=606.762, max=606.762, sum=1213.524 (2)\", \"tab\": \"General information\", \"score\": \"606.7619047619048\"}",
-              "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.539, mean=0.539, max=0.539, sum=1.079 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5393155504156042\"}",
+              "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}",
+              "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Jurisprudence - # prompt tokens": "{\"description\": \"min=394.639, mean=394.639, max=394.639, sum=789.278 (2)\", \"tab\": \"General information\", \"score\": \"394.6388888888889\"}",
+              "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"formal_logic\"",
+              "subject": "\"jurisprudence\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_formal_logic\""
+              "groups": "\"mmlu_jurisprudence\""
             }
           }
         },
         {
-          "evaluation_name": "High School World History",
+          "evaluation_name": "Philosophy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -763,114 +737,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on High School World History",
+            "evaluation_description": "EM on Philosophy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.941,
+            "score": 0.868,
             "details": {
-              "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)",
+              "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)",
               "tab": "Accuracy",
-              "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.674, mean=0.674, max=0.674, sum=1.348 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6741217144073979\"}",
-              "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.673, mean=0.673, max=0.673, sum=1.346 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6728476491467706\"}",
-              "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.626, mean=0.626, max=0.626, sum=1.252 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6261640882492066\"}",
-              "High School European History - Observed inference time (s)": "{\"description\": \"min=0.747, mean=0.747, max=0.747, sum=1.495 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7474224538514108\"}",
-              "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=1.335 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6672574221485793\"}",
-              "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.683, mean=0.683, max=0.683, sum=1.366 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6831059715290762\"}",
-              "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.613, mean=0.613, max=0.613, sum=1.226 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6132381714307344\"}",
-              "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.594, mean=0.594, max=0.594, sum=1.188 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5939316025486698\"}",
-              "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.585, mean=0.585, max=0.585, sum=1.169 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5845635728675778\"}",
-              "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.934, mean=0.934, max=0.934, sum=1.868 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.9341671135251886\"}",
-              "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.741, mean=0.741, max=0.741, sum=1.482 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7410666920723171\"}",
-              "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.72, mean=0.72, max=0.72, sum=1.439 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7196061655327126\"}",
-              "High School US History - Observed inference time (s)": "{\"description\": \"min=0.745, mean=0.745, max=0.745, sum=1.491 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7454434785188413\"}",
-              "High School World History - Observed inference time (s)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=1.333 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6665283818788166\"}",
-              "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}",
-              "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Biology - # prompt tokens": "{\"description\": \"min=513.677, mean=513.677, max=513.677, sum=1027.355 (2)\", \"tab\": \"General information\", \"score\": \"513.6774193548387\"}",
-              "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}",
-              "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Chemistry - # prompt tokens": "{\"description\": \"min=496.714, mean=496.714, max=496.714, sum=993.429 (2)\", \"tab\": \"General information\", \"score\": \"496.7142857142857\"}",
-              "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Computer Science - # prompt tokens": "{\"description\": \"min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)\", \"tab\": \"General information\", \"score\": \"867.78\"}",
-              "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}",
-              "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School European History - # prompt tokens": "{\"description\": \"min=2798.073, mean=2798.073, max=2798.073, sum=5596.145 (2)\", \"tab\": \"General information\", \"score\": \"2798.072727272727\"}",
-              "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}",
-              "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Geography - # prompt tokens": "{\"description\": \"min=372.045, mean=372.045, max=372.045, sum=744.091 (2)\", \"tab\": \"General information\", \"score\": \"372.04545454545456\"}",
-              "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}",
-              "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Government And Politics - # prompt tokens": "{\"description\": \"min=465.824, mean=465.824, max=465.824, sum=931.648 (2)\", \"tab\": \"General information\", \"score\": \"465.8238341968912\"}",
-              "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}",
-              "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=371.562, mean=371.562, max=371.562, sum=743.123 (2)\", \"tab\": \"General information\", \"score\": \"371.5615384615385\"}",
-              "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}",
-              "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Mathematics - # prompt tokens": "{\"description\": \"min=532.374, mean=532.374, max=532.374, sum=1064.748 (2)\", \"tab\": \"General information\", \"score\": \"532.3740740740741\"}",
-              "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}",
-              "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Microeconomics - # prompt tokens": "{\"description\": \"min=399.025, mean=399.025, max=399.025, sum=798.05 (2)\", \"tab\": \"General information\", \"score\": \"399.02521008403363\"}",
-              "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}",
-              "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Physics - # prompt tokens": "{\"description\": \"min=560.464, mean=560.464, max=560.464, sum=1120.927 (2)\", \"tab\": \"General information\", \"score\": \"560.4635761589404\"}",
-              "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}",
-              "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Psychology - # prompt tokens": "{\"description\": \"min=495.246, mean=495.246, max=495.246, sum=990.492 (2)\", \"tab\": \"General information\", \"score\": \"495.24587155963303\"}",
-              "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}",
-              "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Statistics - # prompt tokens": "{\"description\": \"min=795.699, mean=795.699, max=795.699, sum=1591.398 (2)\", \"tab\": \"General information\", \"score\": \"795.699074074074\"}",
-              "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}",
-              "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School US History - # prompt tokens": "{\"description\": \"min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)\", \"tab\": \"General information\", \"score\": \"2217.8088235294117\"}",
-              "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}",
-              "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School World History - # prompt tokens": "{\"description\": \"min=1428.27, mean=1428.27, max=1428.27, sum=2856.54 (2)\", \"tab\": \"General information\", \"score\": \"1428.2700421940929\"}",
-              "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.543, mean=0.543, max=0.543, sum=1.087 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5434573969273705\"}",
+              "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}",
+              "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Philosophy - # prompt tokens": "{\"description\": \"min=329.084, mean=329.084, max=329.084, sum=658.167 (2)\", \"tab\": \"General information\", \"score\": \"329.08360128617363\"}",
+              "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"high_school_world_history\"",
+              "subject": "\"philosophy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_high_school_world_history\""
+              "groups": "\"mmlu_philosophy\""
             }
           }
         },
         {
-          "evaluation_name": "Human Sexuality",
+          "evaluation_name": "Professional Psychology",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -879,42 +775,54 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Human Sexuality",
+            "evaluation_description": "EM on Professional Psychology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.901,
+            "score": 0.873,
             "details": {
-              "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)",
+              "description": "min=0.873, mean=0.873, max=0.873, sum=1.745 (2)",
               "tab": "Accuracy",
-              "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.656, mean=0.656, max=0.656, sum=1.313 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6564141239286003\"}",
-              "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=0.613, mean=0.613, max=0.613, sum=1.226 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6131143715545422\"}",
-              "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}",
-              "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Human Aging - # prompt tokens": "{\"description\": \"min=319.906, mean=319.906, max=319.906, sum=639.812 (2)\", \"tab\": \"General information\", \"score\": \"319.90582959641256\"}",
-              "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}",
-              "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Human Sexuality - # prompt tokens": "{\"description\": \"min=341.183, mean=341.183, max=341.183, sum=682.366 (2)\", \"tab\": \"General information\", \"score\": \"341.1832061068702\"}",
-              "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.579, mean=0.579, max=0.579, sum=1.159 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5794552100055358\"}",
+              "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.59, mean=0.59, max=0.59, sum=1.18 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5898241354218612\"}",
+              "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.639, mean=0.639, max=0.639, sum=1.278 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6388053317424371\"}",
+              "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.671, mean=0.671, max=0.671, sum=1.342 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6712259284031936\"}",
+              "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}",
+              "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Medicine - # prompt tokens": "{\"description\": \"min=1094.585, mean=1094.585, max=1094.585, sum=2189.169 (2)\", \"tab\": \"General information\", \"score\": \"1094.5845588235295\"}",
+              "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}",
+              "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Accounting - # prompt tokens": "{\"description\": \"min=658.592, mean=658.592, max=658.592, sum=1317.184 (2)\", \"tab\": \"General information\", \"score\": \"658.5921985815603\"}",
+              "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}",
+              "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Law - # prompt tokens": "{\"description\": \"min=1637.787, mean=1637.787, max=1637.787, sum=3275.574 (2)\", \"tab\": \"General information\", \"score\": \"1637.7868318122555\"}",
+              "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}",
+              "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Psychology - # prompt tokens": "{\"description\": \"min=575.114, mean=575.114, max=575.114, sum=1150.229 (2)\", \"tab\": \"General information\", \"score\": \"575.1143790849674\"}",
+              "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"human_sexuality\"",
+              "subject": "\"professional_psychology\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_human_sexuality\""
+              "groups": "\"mmlu_professional_psychology\""
             }
           }
         },
         {
-          "evaluation_name": "International Law",
+          "evaluation_name": "Us Foreign Policy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -923,36 +831,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on International Law",
+            "evaluation_description": "EM on Us Foreign Policy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.942,
+            "score": 0.96,
             "details": {
-              "description": "min=0.942, mean=0.942, max=0.942, sum=1.884 (2)",
+              "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)",
               "tab": "Accuracy",
-              "International Law - Observed inference time (s)": "{\"description\": \"min=0.63, mean=0.63, max=0.63, sum=1.26 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6297830116650289\"}",
-              "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}",
-              "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "International Law - # prompt tokens": "{\"description\": \"min=639.851, mean=639.851, max=639.851, sum=1279.702 (2)\", \"tab\": \"General information\", \"score\": \"639.8512396694215\"}",
-              "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.558, mean=0.558, max=0.558, sum=1.115 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.557673556804657\"}",
+              "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=422.79, mean=422.79, max=422.79, sum=845.58 (2)\", \"tab\": \"General information\", \"score\": \"422.79\"}",
+              "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"international_law\"",
+              "subject": "\"us_foreign_policy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_international_law\""
+              "groups": "\"mmlu_us_foreign_policy\""
             }
           }
         },
         {
-          "evaluation_name": "Logical Fallacies",
+          "evaluation_name": "Astronomy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -961,36 +869,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Logical Fallacies",
+            "evaluation_description": "EM on Astronomy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.871,
+            "score": 0.941,
             "details": {
-              "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)",
+              "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)",
               "tab": "Accuracy",
-              "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.585, mean=0.585, max=0.585, sum=1.171 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.585445927695994\"}",
-              "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}",
-              "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Logical Fallacies - # prompt tokens": "{\"description\": \"min=449.595, mean=449.595, max=449.595, sum=899.19 (2)\", \"tab\": \"General information\", \"score\": \"449.5950920245399\"}",
-              "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.666, mean=0.666, max=0.666, sum=1.332 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6662032525790366\"}",
+              "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}",
+              "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Astronomy - # prompt tokens": "{\"description\": \"min=579.691, mean=579.691, max=579.691, sum=1159.382 (2)\", \"tab\": \"General information\", \"score\": \"579.6907894736842\"}",
+              "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"logical_fallacies\"",
+              "subject": "\"astronomy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_logical_fallacies\""
+              "groups": "\"mmlu_astronomy\""
             }
           }
         },
         {
-          "evaluation_name": "Machine Learning",
+          "evaluation_name": "Business Ethics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -999,36 +907,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Machine Learning",
+            "evaluation_description": "EM on Business Ethics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.741,
+            "score": 0.82,
             "details": {
-              "description": "min=0.741, mean=0.741, max=0.741, sum=1.482 (2)",
+              "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)",
               "tab": "Accuracy",
-              "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.718, mean=0.718, max=0.718, sum=1.436 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.718035706451961\"}",
-              "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}",
-              "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Machine Learning - # prompt tokens": "{\"description\": \"min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)\", \"tab\": \"General information\", \"score\": \"668.0535714285714\"}",
-              "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.598, mean=0.598, max=0.598, sum=1.196 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5981367039680481\"}",
+              "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Business Ethics - # prompt tokens": "{\"description\": \"min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)\", \"tab\": \"General information\", \"score\": \"569.52\"}",
+              "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"machine_learning\"",
+              "subject": "\"business_ethics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_machine_learning\""
+              "groups": "\"mmlu_business_ethics\""
             }
           }
         },
         {
-          "evaluation_name": "Management",
+          "evaluation_name": "Clinical Knowledge",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1037,36 +945,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Management",
+            "evaluation_description": "EM on Clinical Knowledge",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.883,
+            "score": 0.83,
             "details": {
-              "description": "min=0.883, mean=0.883, max=0.883, sum=1.767 (2)",
+              "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)",
               "tab": "Accuracy",
-              "Management - Observed inference time (s)": "{\"description\": \"min=0.592, mean=0.592, max=0.592, sum=1.184 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5921963488013999\"}",
-              "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}",
-              "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Management - # prompt tokens": "{\"description\": \"min=283.796, mean=283.796, max=283.796, sum=567.592 (2)\", \"tab\": \"General information\", \"score\": \"283.79611650485435\"}",
-              "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.591, mean=0.591, max=0.591, sum=1.183 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5912713131814633\"}",
+              "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}",
+              "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=397.947, mean=397.947, max=397.947, sum=795.894 (2)\", \"tab\": \"General information\", \"score\": \"397.94716981132075\"}",
+              "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"management\"",
+              "subject": "\"clinical_knowledge\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_management\""
+              "groups": "\"mmlu_clinical_knowledge\""
             }
           }
         },
         {
-          "evaluation_name": "Marketing",
+          "evaluation_name": "Conceptual Physics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1075,36 +983,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Marketing",
+            "evaluation_description": "EM on Conceptual Physics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.949,
+            "score": 0.894,
             "details": {
-              "description": "min=0.949, mean=0.949, max=0.949, sum=1.897 (2)",
+              "description": "min=0.894, mean=0.894, max=0.894, sum=1.787 (2)",
               "tab": "Accuracy",
-              "Marketing - Observed inference time (s)": "{\"description\": \"min=0.588, mean=0.588, max=0.588, sum=1.176 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5880082672477788\"}",
-              "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}",
-              "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Marketing - # prompt tokens": "{\"description\": \"min=404.218, mean=404.218, max=404.218, sum=808.436 (2)\", \"tab\": \"General information\", \"score\": \"404.21794871794873\"}",
-              "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.685, mean=0.685, max=0.685, sum=1.369 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.684603402969685\"}",
+              "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}",
+              "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Conceptual Physics - # prompt tokens": "{\"description\": \"min=304.838, mean=304.838, max=304.838, sum=609.677 (2)\", \"tab\": \"General information\", \"score\": \"304.83829787234043\"}",
+              "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"marketing\"",
+              "subject": "\"conceptual_physics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_marketing\""
+              "groups": "\"mmlu_conceptual_physics\""
             }
           }
         },
         {
-          "evaluation_name": "Medical Genetics",
+          "evaluation_name": "Electrical Engineering",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1113,36 +1021,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Medical Genetics",
+            "evaluation_description": "EM on Electrical Engineering",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.92,
+            "score": 0.752,
             "details": {
-              "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)",
+              "description": "min=0.752, mean=0.752, max=0.752, sum=1.503 (2)",
               "tab": "Accuracy",
-              "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.52, mean=0.52, max=0.52, sum=1.04 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5201336288452149\"}",
-              "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Medical Genetics - # prompt tokens": "{\"description\": \"min=341, mean=341, max=341, sum=682 (2)\", \"tab\": \"General information\", \"score\": \"341.0\"}",
-              "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.649, mean=0.649, max=0.649, sum=1.297 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6487039006989578\"}",
+              "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}",
+              "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Electrical Engineering - # prompt tokens": "{\"description\": \"min=440.641, mean=440.641, max=440.641, sum=881.283 (2)\", \"tab\": \"General information\", \"score\": \"440.6413793103448\"}",
+              "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"medical_genetics\"",
+              "subject": "\"electrical_engineering\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_medical_genetics\""
+              "groups": "\"mmlu_electrical_engineering\""
             }
           }
         },
         {
-          "evaluation_name": "Miscellaneous",
+          "evaluation_name": "Elementary Mathematics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1151,36 +1059,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Miscellaneous",
+            "evaluation_description": "EM on Elementary Mathematics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.945,
+            "score": 0.72,
             "details": {
-              "description": "min=0.945, mean=0.945, max=0.945, sum=1.89 (2)",
+              "description": "min=0.72, mean=0.72, max=0.72, sum=1.439 (2)",
               "tab": "Accuracy",
-              "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.565, mean=0.565, max=0.565, sum=1.13 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5650817577561809\"}",
-              "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}",
-              "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Miscellaneous - # prompt tokens": "{\"description\": \"min=299.925, mean=299.925, max=299.925, sum=599.849 (2)\", \"tab\": \"General information\", \"score\": \"299.92464878671774\"}",
-              "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.708, mean=0.708, max=0.708, sum=1.417 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.708430844009238\"}",
+              "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}",
+              "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=531.862, mean=531.862, max=531.862, sum=1063.725 (2)\", \"tab\": \"General information\", \"score\": \"531.8624338624338\"}",
+              "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"miscellaneous\"",
+              "subject": "\"elementary_mathematics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_miscellaneous\""
+              "groups": "\"mmlu_elementary_mathematics\""
             }
           }
         },
         {
-          "evaluation_name": "Moral Scenarios",
+          "evaluation_name": "Formal Logic",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1189,42 +1097,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Moral Scenarios",
+            "evaluation_description": "EM on Formal Logic",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.803,
+            "score": 0.706,
             "details": {
-              "description": "min=0.803, mean=0.803, max=0.803, sum=1.607 (2)",
+              "description": "min=0.706, mean=0.706, max=0.706, sum=1.413 (2)",
               "tab": "Accuracy",
-              "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.564, mean=0.564, max=0.564, sum=1.129 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5643301023913256\"}",
-              "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.599, mean=0.599, max=0.599, sum=1.197 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5985688052363902\"}",
-              "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}",
-              "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Moral Disputes - # prompt tokens": "{\"description\": \"min=476.145, mean=476.145, max=476.145, sum=952.289 (2)\", \"tab\": \"General information\", \"score\": \"476.1445086705202\"}",
-              "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}",
-              "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Moral Scenarios - # prompt tokens": "{\"description\": \"min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)\", \"tab\": \"General information\", \"score\": \"656.454748603352\"}",
-              "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.635, mean=0.635, max=0.635, sum=1.27 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6347800322941372\"}",
+              "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}",
+              "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Formal Logic - # prompt tokens": "{\"description\": \"min=606.762, mean=606.762, max=606.762, sum=1213.524 (2)\", \"tab\": \"General information\", \"score\": \"606.7619047619048\"}",
+              "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"moral_scenarios\"",
+              "subject": "\"formal_logic\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_moral_scenarios\""
+              "groups": "\"mmlu_formal_logic\""
             }
           }
         },
         {
-          "evaluation_name": "Nutrition",
+          "evaluation_name": "High School World History",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1233,36 +1135,114 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Nutrition",
+            "evaluation_description": "EM on High School World History",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.892,
+            "score": 0.941,
             "details": {
-              "description": "min=0.892, mean=0.892, max=0.892, sum=1.784 (2)",
+              "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)",
               "tab": "Accuracy",
-              "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.532, mean=0.532, max=0.532, sum=1.063 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5316595968857311\"}",
-              "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}",
-              "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Nutrition - # prompt tokens": "{\"description\": \"min=586.817, mean=586.817, max=586.817, sum=1173.634 (2)\", \"tab\": \"General information\", \"score\": \"586.8169934640523\"}",
-              "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.674, mean=0.674, max=0.674, sum=1.348 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6741217144073979\"}",
+              "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.673, mean=0.673, max=0.673, sum=1.346 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6728476491467706\"}",
+              "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.626, mean=0.626, max=0.626, sum=1.252 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6261640882492066\"}",
+              "High School European History - Observed inference time (s)": "{\"description\": \"min=0.747, mean=0.747, max=0.747, sum=1.495 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7474224538514108\"}",
+              "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=1.335 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6672574221485793\"}",
+              "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.683, mean=0.683, max=0.683, sum=1.366 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6831059715290762\"}",
+              "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.613, mean=0.613, max=0.613, sum=1.226 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6132381714307344\"}",
+              "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.594, mean=0.594, max=0.594, sum=1.188 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5939316025486698\"}",
+              "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.585, mean=0.585, max=0.585, sum=1.169 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5845635728675778\"}",
+              "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.934, mean=0.934, max=0.934, sum=1.868 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.9341671135251886\"}",
+              "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.741, mean=0.741, max=0.741, sum=1.482 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7410666920723171\"}",
+              "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.72, mean=0.72, max=0.72, sum=1.439 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7196061655327126\"}",
+              "High School US History - Observed inference time (s)": "{\"description\": \"min=0.745, mean=0.745, max=0.745, sum=1.491 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7454434785188413\"}",
+              "High School World History - Observed inference time (s)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=1.333 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6665283818788166\"}",
+              "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}",
+              "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Biology - # prompt tokens": "{\"description\": \"min=513.677, mean=513.677, max=513.677, sum=1027.355 (2)\", \"tab\": \"General information\", \"score\": \"513.6774193548387\"}",
+              "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}",
+              "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Chemistry - # prompt tokens": "{\"description\": \"min=496.714, mean=496.714, max=496.714, sum=993.429 (2)\", \"tab\": \"General information\", \"score\": \"496.7142857142857\"}",
+              "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Computer Science - # prompt tokens": "{\"description\": \"min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)\", \"tab\": \"General information\", \"score\": \"867.78\"}",
+              "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}",
+              "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School European History - # prompt tokens": "{\"description\": \"min=2798.073, mean=2798.073, max=2798.073, sum=5596.145 (2)\", \"tab\": \"General information\", \"score\": \"2798.072727272727\"}",
+              "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}",
+              "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Geography - # prompt tokens": "{\"description\": \"min=372.045, mean=372.045, max=372.045, sum=744.091 (2)\", \"tab\": \"General information\", \"score\": \"372.04545454545456\"}",
+              "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}",
+              "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Government And Politics - # prompt tokens": "{\"description\": \"min=465.824, mean=465.824, max=465.824, sum=931.648 (2)\", \"tab\": \"General information\", \"score\": \"465.8238341968912\"}",
+              "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}",
+              "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=371.562, mean=371.562, max=371.562, sum=743.123 (2)\", \"tab\": \"General information\", \"score\": \"371.5615384615385\"}",
+              "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}",
+              "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Mathematics - # prompt tokens": "{\"description\": \"min=532.374, mean=532.374, max=532.374, sum=1064.748 (2)\", \"tab\": \"General information\", \"score\": \"532.3740740740741\"}",
+              "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}",
+              "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Microeconomics - # prompt tokens": "{\"description\": \"min=399.025, mean=399.025, max=399.025, sum=798.05 (2)\", \"tab\": \"General information\", \"score\": \"399.02521008403363\"}",
+              "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}",
+              "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Physics - # prompt tokens": "{\"description\": \"min=560.464, mean=560.464, max=560.464, sum=1120.927 (2)\", \"tab\": \"General information\", \"score\": \"560.4635761589404\"}",
+              "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}",
+              "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Psychology - # prompt tokens": "{\"description\": \"min=495.246, mean=495.246, max=495.246, sum=990.492 (2)\", \"tab\": \"General information\", \"score\": \"495.24587155963303\"}",
+              "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}",
+              "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Statistics - # prompt tokens": "{\"description\": \"min=795.699, mean=795.699, max=795.699, sum=1591.398 (2)\", \"tab\": \"General information\", \"score\": \"795.699074074074\"}",
+              "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}",
+              "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School US History - # prompt tokens": "{\"description\": \"min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)\", \"tab\": \"General information\", \"score\": \"2217.8088235294117\"}",
+              "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}",
+              "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School World History - # prompt tokens": "{\"description\": \"min=1428.27, mean=1428.27, max=1428.27, sum=2856.54 (2)\", \"tab\": \"General information\", \"score\": \"1428.2700421940929\"}",
+              "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"nutrition\"",
+              "subject": "\"high_school_world_history\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_nutrition\""
+              "groups": "\"mmlu_high_school_world_history\""
             }
           }
         },
         {
-          "evaluation_name": "Prehistory",
+          "evaluation_name": "Human Sexuality",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1271,36 +1251,42 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Prehistory",
+            "evaluation_description": "EM on Human Sexuality",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.92,
+            "score": 0.901,
             "details": {
-              "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)",
+              "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)",
               "tab": "Accuracy",
-              "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.54, mean=0.54, max=0.54, sum=1.079 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5397091279795141\"}",
-              "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}",
-              "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Prehistory - # prompt tokens": "{\"description\": \"min=514.559, mean=514.559, max=514.559, sum=1029.117 (2)\", \"tab\": \"General information\", \"score\": \"514.5586419753087\"}",
-              "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.656, mean=0.656, max=0.656, sum=1.313 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6564141239286003\"}",
+              "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=0.613, mean=0.613, max=0.613, sum=1.226 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6131143715545422\"}",
+              "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}",
+              "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Human Aging - # prompt tokens": "{\"description\": \"min=319.906, mean=319.906, max=319.906, sum=639.812 (2)\", \"tab\": \"General information\", \"score\": \"319.90582959641256\"}",
+              "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}",
+              "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Human Sexuality - # prompt tokens": "{\"description\": \"min=341.183, mean=341.183, max=341.183, sum=682.366 (2)\", \"tab\": \"General information\", \"score\": \"341.1832061068702\"}",
+              "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"prehistory\"",
+              "subject": "\"human_sexuality\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_prehistory\""
+              "groups": "\"mmlu_human_sexuality\""
             }
           }
         },
         {
-          "evaluation_name": "Public Relations",
+          "evaluation_name": "International Law",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1309,36 +1295,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Public Relations",
+            "evaluation_description": "EM on International Law",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.755,
+            "score": 0.942,
             "details": {
-              "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)",
+              "description": "min=0.942, mean=0.942, max=0.942, sum=1.884 (2)",
               "tab": "Accuracy",
-              "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.584, mean=0.584, max=0.584, sum=1.168 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5840315688740123\"}",
-              "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}",
-              "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Public Relations - # prompt tokens": "{\"description\": \"min=405.318, mean=405.318, max=405.318, sum=810.636 (2)\", \"tab\": \"General information\", \"score\": \"405.3181818181818\"}",
-              "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "International Law - Observed inference time (s)": "{\"description\": \"min=0.63, mean=0.63, max=0.63, sum=1.26 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6297830116650289\"}",
+              "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}",
+              "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "International Law - # prompt tokens": "{\"description\": \"min=639.851, mean=639.851, max=639.851, sum=1279.702 (2)\", \"tab\": \"General information\", \"score\": \"639.8512396694215\"}",
+              "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"public_relations\"",
+              "subject": "\"international_law\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_public_relations\""
+              "groups": "\"mmlu_international_law\""
             }
           }
         },
         {
-          "evaluation_name": "Security Studies",
+          "evaluation_name": "Logical Fallacies",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1347,36 +1333,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Security Studies",
+            "evaluation_description": "EM on Logical Fallacies",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8,
+            "score": 0.871,
             "details": {
-              "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)",
+              "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)",
               "tab": "Accuracy",
-              "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.529, mean=0.529, max=0.529, sum=1.058 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.529095221538933\"}",
-              "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}",
-              "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Security Studies - # prompt tokens": "{\"description\": \"min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)\", \"tab\": \"General information\", \"score\": \"1164.4734693877551\"}",
-              "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.585, mean=0.585, max=0.585, sum=1.171 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.585445927695994\"}",
+              "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}",
+              "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Logical Fallacies - # prompt tokens": "{\"description\": \"min=449.595, mean=449.595, max=449.595, sum=899.19 (2)\", \"tab\": \"General information\", \"score\": \"449.5950920245399\"}",
+              "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"security_studies\"",
+              "subject": "\"logical_fallacies\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_security_studies\""
+              "groups": "\"mmlu_logical_fallacies\""
             }
           }
         },
         {
-          "evaluation_name": "Sociology",
+          "evaluation_name": "Machine Learning",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1385,36 +1371,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Sociology",
+            "evaluation_description": "EM on Machine Learning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.915,
+            "score": 0.741,
             "details": {
-              "description": "min=0.915, mean=0.915, max=0.915, sum=1.831 (2)",
+              "description": "min=0.741, mean=0.741, max=0.741, sum=1.482 (2)",
               "tab": "Accuracy",
-              "Sociology - Observed inference time (s)": "{\"description\": \"min=0.52, mean=0.52, max=0.52, sum=1.04 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5199050891458692\"}",
-              "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}",
-              "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Sociology - # prompt tokens": "{\"description\": \"min=445.522, mean=445.522, max=445.522, sum=891.045 (2)\", \"tab\": \"General information\", \"score\": \"445.5223880597015\"}",
-              "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.718, mean=0.718, max=0.718, sum=1.436 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.718035706451961\"}",
+              "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}",
+              "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Machine Learning - # prompt tokens": "{\"description\": \"min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)\", \"tab\": \"General information\", \"score\": \"668.0535714285714\"}",
+              "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"sociology\"",
+              "subject": "\"machine_learning\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_sociology\""
+              "groups": "\"mmlu_machine_learning\""
             }
           }
         },
         {
-          "evaluation_name": "Virology",
+          "evaluation_name": "Management",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1423,36 +1409,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Virology",
+            "evaluation_description": "EM on Management",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.602,
+            "score": 0.883,
             "details": {
-              "description": "min=0.602, mean=0.602, max=0.602, sum=1.205 (2)",
+              "description": "min=0.883, mean=0.883, max=0.883, sum=1.767 (2)",
               "tab": "Accuracy",
-              "Virology - Observed inference time (s)": "{\"description\": \"min=0.523, mean=0.523, max=0.523, sum=1.045 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5226844951330897\"}",
-              "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}",
-              "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Virology - # prompt tokens": "{\"description\": \"min=343.09, mean=343.09, max=343.09, sum=686.181 (2)\", \"tab\": \"General information\", \"score\": \"343.0903614457831\"}",
-              "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Management - Observed inference time (s)": "{\"description\": \"min=0.592, mean=0.592, max=0.592, sum=1.184 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5921963488013999\"}",
+              "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}",
+              "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Management - # prompt tokens": "{\"description\": \"min=283.796, mean=283.796, max=283.796, sum=567.592 (2)\", \"tab\": \"General information\", \"score\": \"283.79611650485435\"}",
+              "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"virology\"",
+              "subject": "\"management\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_virology\""
+              "groups": "\"mmlu_management\""
             }
           }
         },
         {
-          "evaluation_name": "World Religions",
+          "evaluation_name": "Marketing",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1461,36 +1447,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on World Religions",
+            "evaluation_description": "EM on Marketing",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.848,
+            "score": 0.949,
             "details": {
-              "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)",
+              "description": "min=0.949, mean=0.949, max=0.949, sum=1.897 (2)",
               "tab": "Accuracy",
-              "World Religions - Observed inference time (s)": "{\"description\": \"min=0.494, mean=0.494, max=0.494, sum=0.988 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.49407080739562276\"}",
-              "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}",
-              "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "World Religions - # prompt tokens": "{\"description\": \"min=275.561, mean=275.561, max=275.561, sum=551.123 (2)\", \"tab\": \"General information\", \"score\": \"275.56140350877195\"}",
-              "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Marketing - Observed inference time (s)": "{\"description\": \"min=0.588, mean=0.588, max=0.588, sum=1.176 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5880082672477788\"}",
+              "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}",
+              "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Marketing - # prompt tokens": "{\"description\": \"min=404.218, mean=404.218, max=404.218, sum=808.436 (2)\", \"tab\": \"General information\", \"score\": \"404.21794871794873\"}",
+              "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"world_religions\"",
+              "subject": "\"marketing\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_world_religions\""
+              "groups": "\"mmlu_marketing\""
             }
           }
         },
         {
-          "evaluation_name": "Mean win rate",
+          "evaluation_name": "Medical Genetics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1499,404 +1485,418 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "How many models this model outperforms on average (over columns).",
+            "evaluation_description": "EM on Medical Genetics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.351,
+            "score": 0.92,
             "details": {
-              "description": "",
-              "tab": "Efficiency"
+              "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)",
+              "tab": "Accuracy",
+              "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.52, mean=0.52, max=0.52, sum=1.04 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5201336288452149\"}",
+              "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Medical Genetics - # prompt tokens": "{\"description\": \"min=341, mean=341, max=341, sum=682 (2)\", \"tab\": \"General information\", \"score\": \"341.0\"}",
+              "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"medical_genetics\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_medical_genetics\""
+            }
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": {
-        "additional_details": {
-          "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
-          "method": "\"multiple_choice_joint\"",
-          "eval_split": "\"test\"",
-          "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
-        }
-      }
-    },
-    {
-      "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1774096306.427425",
-      "retrieved_timestamp": "1774096306.427425",
-      "source_metadata": {
-        "source_name": "helm_lite",
-        "source_type": "documentation",
-        "source_organization_name": "crfm",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "helm",
-        "version": "unknown"
-      },
-      "benchmark": "helm_lite",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Mean win rate",
+          "evaluation_name": "Miscellaneous",
           "source_data": {
-            "dataset_name": "helm_lite",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "How many models this model outperforms on average (over columns).",
+            "evaluation_description": "EM on Miscellaneous",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.864,
+            "score": 0.945,
             "details": {
-              "description": "",
+              "description": "min=0.945, mean=0.945, max=0.945, sum=1.89 (2)",
               "tab": "Accuracy",
-              "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.4568414481897628\"}",
-              "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}"
+              "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.565, mean=0.565, max=0.565, sum=1.13 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5650817577561809\"}",
+              "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}",
+              "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Miscellaneous - # prompt tokens": "{\"description\": \"min=299.925, mean=299.925, max=299.925, sum=599.849 (2)\", \"tab\": \"General information\", \"score\": \"299.92464878671774\"}",
+              "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"miscellaneous\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_miscellaneous\""
+            }
           }
         },
         {
-          "evaluation_name": "NarrativeQA",
+          "evaluation_name": "Moral Scenarios",
           "source_data": {
-            "dataset_name": "NarrativeQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "F1 on NarrativeQA",
+            "evaluation_description": "EM on Moral Scenarios",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.761,
+            "score": 0.803,
             "details": {
-              "description": "min=0.761, mean=0.761, max=0.761, sum=0.761 (1)",
+              "description": "min=0.803, mean=0.803, max=0.803, sum=1.607 (2)",
               "tab": "Accuracy",
-              "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.804, mean=0.804, max=0.804, sum=0.804 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.8043310716118611\"}",
-              "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}",
-              "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NarrativeQA - # prompt tokens": "{\"description\": \"min=3495.67, mean=3495.67, max=3495.67, sum=3495.67 (1)\", \"tab\": \"General information\", \"score\": \"3495.6704225352114\"}",
-              "NarrativeQA - # output tokens": "{\"description\": \"min=6.037, mean=6.037, max=6.037, sum=6.037 (1)\", \"tab\": \"General information\", \"score\": \"6.0366197183098596\"}"
+              "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.564, mean=0.564, max=0.564, sum=1.129 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5643301023913256\"}",
+              "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.599, mean=0.599, max=0.599, sum=1.197 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5985688052363902\"}",
+              "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}",
+              "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Moral Disputes - # prompt tokens": "{\"description\": \"min=476.145, mean=476.145, max=476.145, sum=952.289 (2)\", \"tab\": \"General information\", \"score\": \"476.1445086705202\"}",
+              "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}",
+              "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Moral Scenarios - # prompt tokens": "{\"description\": \"min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)\", \"tab\": \"General information\", \"score\": \"656.454748603352\"}",
+              "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"moral_scenarios\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_moral_scenarios\""
+            }
           }
         },
         {
-          "evaluation_name": "NaturalQuestions (closed-book)",
+          "evaluation_name": "Nutrition",
           "source_data": {
-            "dataset_name": "NaturalQuestions (closed-book)",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "F1 on NaturalQuestions (closed-book)",
+            "evaluation_description": "EM on Nutrition",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.482,
+            "score": 0.892,
             "details": {
-              "description": "min=0.482, mean=0.482, max=0.482, sum=0.482 (1)",
+              "description": "min=0.892, mean=0.892, max=0.892, sum=1.784 (2)",
               "tab": "Accuracy",
-              "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.712, mean=0.712, max=0.712, sum=0.712 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.7120162718296051\"}",
-              "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.605, mean=0.605, max=0.605, sum=0.605 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.6052222681045533\"}",
-              "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "NaturalQuestions (open-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1728.593, mean=1728.593, max=1728.593, sum=1728.593 (1)\", \"tab\": \"General information\", \"score\": \"1728.593\"}",
-              "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=5.902, mean=5.902, max=5.902, sum=5.902 (1)\", \"tab\": \"General information\", \"score\": \"5.902\"}",
-              "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=139.127, mean=139.127, max=139.127, sum=139.127 (1)\", \"tab\": \"General information\", \"score\": \"139.127\"}",
-              "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=5.263, mean=5.263, max=5.263, sum=5.263 (1)\", \"tab\": \"General information\", \"score\": \"5.263\"}"
+              "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.532, mean=0.532, max=0.532, sum=1.063 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5316595968857311\"}",
+              "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}",
+              "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Nutrition - # prompt tokens": "{\"description\": \"min=586.817, mean=586.817, max=586.817, sum=1173.634 (2)\", \"tab\": \"General information\", \"score\": \"586.8169934640523\"}",
+              "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "mode": "\"closedbook\""
+              "subject": "\"nutrition\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_nutrition\""
             }
           }
         },
         {
-          "evaluation_name": "OpenbookQA",
+          "evaluation_name": "Prehistory",
           "source_data": {
-            "dataset_name": "OpenbookQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on OpenbookQA",
+            "evaluation_description": "EM on Prehistory",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.97,
+            "score": 0.92,
             "details": {
-              "description": "min=0.97, mean=0.97, max=0.97, sum=0.97 (1)",
+              "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)",
               "tab": "Accuracy",
-              "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.438, mean=0.438, max=0.438, sum=0.438 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4376141686439514\"}",
-              "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}",
-              "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "OpenbookQA - # prompt tokens": "{\"description\": \"min=249.782, mean=249.782, max=249.782, sum=249.782 (1)\", \"tab\": \"General information\", \"score\": \"249.782\"}",
-              "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.54, mean=0.54, max=0.54, sum=1.079 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5397091279795141\"}",
+              "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}",
+              "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Prehistory - # prompt tokens": "{\"description\": \"min=514.559, mean=514.559, max=514.559, sum=1029.117 (2)\", \"tab\": \"General information\", \"score\": \"514.5586419753087\"}",
+              "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "dataset": "\"openbookqa\"",
-              "method": "\"multiple_choice_joint\""
+              "subject": "\"prehistory\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_prehistory\""
             }
           }
         },
         {
-          "evaluation_name": "MMLU",
+          "evaluation_name": "Public Relations",
           "source_data": {
-            "dataset_name": "MMLU",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MMLU",
+            "evaluation_description": "EM on Public Relations",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.711,
+            "score": 0.755,
             "details": {
-              "description": "min=0.53, mean=0.711, max=0.96, sum=3.555 (5)",
+              "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)",
               "tab": "Accuracy",
-              "MMLU - Observed inference time (s)": "{\"description\": \"min=0.53, mean=0.55, max=0.572, sum=2.749 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.5498773384847139\"}",
-              "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}",
-              "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MMLU - # prompt tokens": "{\"description\": \"min=373.44, mean=467.72, max=614.43, sum=2338.6 (5)\", \"tab\": \"General information\", \"score\": \"467.71996491228066\"}",
-              "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.584, mean=0.584, max=0.584, sum=1.168 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5840315688740123\"}",
+              "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}",
+              "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Public Relations - # prompt tokens": "{\"description\": \"min=405.318, mean=405.318, max=405.318, sum=810.636 (2)\", \"tab\": \"General information\", \"score\": \"405.3181818181818\"}",
+              "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]",
-              "method": "\"multiple_choice_joint\""
+              "subject": "\"public_relations\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_public_relations\""
             }
           }
         },
         {
-          "evaluation_name": "MATH",
+          "evaluation_name": "Security Studies",
           "source_data": {
-            "dataset_name": "MATH",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "Equivalent (CoT) on MATH",
+            "evaluation_description": "EM on Security Studies",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.833,
+            "score": 0.8,
             "details": {
-              "description": "min=0.684, mean=0.833, max=0.97, sum=5.83 (7)",
+              "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)",
               "tab": "Accuracy",
-              "MATH - Observed inference time (s)": "{\"description\": \"min=4.92, mean=6.678, max=8.338, sum=46.748 (7)\", \"tab\": \"Efficiency\", \"score\": \"6.678270916932833\"}",
-              "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}",
-              "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}",
-              "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MATH - # prompt tokens": "{\"description\": \"min=881.363, mean=1262.911, max=2197.577, sum=8840.376 (7)\", \"tab\": \"General information\", \"score\": \"1262.9108741840687\"}",
-              "MATH - # output tokens": "{\"description\": \"min=135.163, mean=189.561, max=219.316, sum=1326.926 (7)\", \"tab\": \"General information\", \"score\": \"189.56082409362702\"}"
+              "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.529, mean=0.529, max=0.529, sum=1.058 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.529095221538933\"}",
+              "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}",
+              "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Security Studies - # prompt tokens": "{\"description\": \"min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)\", \"tab\": \"General information\", \"score\": \"1164.4734693877551\"}",
+              "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]",
-              "level": "\"1\"",
-              "use_official_examples": "\"False\"",
-              "use_chain_of_thought": "\"True\""
+              "subject": "\"security_studies\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_security_studies\""
             }
           }
         },
         {
-          "evaluation_name": "GSM8K",
+          "evaluation_name": "Sociology",
           "source_data": {
-            "dataset_name": "GSM8K",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on GSM8K",
+            "evaluation_description": "EM on Sociology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.824,
+            "score": 0.915,
             "details": {
-              "description": "min=0.824, mean=0.824, max=0.824, sum=0.824 (1)",
+              "description": "min=0.915, mean=0.915, max=0.915, sum=1.831 (2)",
               "tab": "Accuracy",
-              "GSM8K - Observed inference time (s)": "{\"description\": \"min=6.915, mean=6.915, max=6.915, sum=6.915 (1)\", \"tab\": \"Efficiency\", \"score\": \"6.91472976398468\"}",
-              "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "GSM8K - # prompt tokens": "{\"description\": \"min=959.035, mean=959.035, max=959.035, sum=959.035 (1)\", \"tab\": \"General information\", \"score\": \"959.035\"}",
-              "GSM8K - # output tokens": "{\"description\": \"min=141.712, mean=141.712, max=141.712, sum=141.712 (1)\", \"tab\": \"General information\", \"score\": \"141.712\"}"
+              "Sociology - Observed inference time (s)": "{\"description\": \"min=0.52, mean=0.52, max=0.52, sum=1.04 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5199050891458692\"}",
+              "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}",
+              "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Sociology - # prompt tokens": "{\"description\": \"min=445.522, mean=445.522, max=445.522, sum=891.045 (2)\", \"tab\": \"General information\", \"score\": \"445.5223880597015\"}",
+              "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "stop": "\"none\""
+              "subject": "\"sociology\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_sociology\""
             }
           }
         },
         {
-          "evaluation_name": "LegalBench",
+          "evaluation_name": "Virology",
           "source_data": {
-            "dataset_name": "LegalBench",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on LegalBench",
+            "evaluation_description": "EM on Virology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.727,
+            "score": 0.602,
             "details": {
-              "description": "min=0.417, mean=0.727, max=0.947, sum=3.637 (5)",
+              "description": "min=0.602, mean=0.602, max=0.602, sum=1.205 (2)",
               "tab": "Accuracy",
-              "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.514, mean=0.608, max=0.803, sum=3.041 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.6081070231398068\"}",
-              "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}",
-              "LegalBench - # train": "{\"description\": \"min=4, mean=4.8, max=5, sum=24 (5)\", \"tab\": \"General information\", \"score\": \"4.8\"}",
-              "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "LegalBench - # prompt tokens": "{\"description\": \"min=207.442, mean=1524.163, max=6311.388, sum=7620.815 (5)\", \"tab\": \"General information\", \"score\": \"1524.162971355988\"}",
-              "LegalBench - # output tokens": "{\"description\": \"min=1, mean=1.325, max=2.032, sum=6.626 (5)\", \"tab\": \"General information\", \"score\": \"1.3251168793919403\"}"
+              "Virology - Observed inference time (s)": "{\"description\": \"min=0.523, mean=0.523, max=0.523, sum=1.045 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5226844951330897\"}",
+              "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}",
+              "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Virology - # prompt tokens": "{\"description\": \"min=343.09, mean=343.09, max=343.09, sum=686.181 (2)\", \"tab\": \"General information\", \"score\": \"343.0903614457831\"}",
+              "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]"
+              "subject": "\"virology\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_virology\""
             }
           }
         },
         {
-          "evaluation_name": "MedQA",
+          "evaluation_name": "World Religions",
           "source_data": {
-            "dataset_name": "MedQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MedQA",
+            "evaluation_description": "EM on World Religions",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.783,
+            "score": 0.848,
             "details": {
-              "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)",
+              "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)",
               "tab": "Accuracy",
-              "MedQA - Observed inference time (s)": "{\"description\": \"min=0.455, mean=0.455, max=0.455, sum=0.455 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4549296101329341\"}",
-              "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}",
-              "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MedQA - # prompt tokens": "{\"description\": \"min=1027.414, mean=1027.414, max=1027.414, sum=1027.414 (1)\", \"tab\": \"General information\", \"score\": \"1027.4135188866799\"}",
-              "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "World Religions - Observed inference time (s)": "{\"description\": \"min=0.494, mean=0.494, max=0.494, sum=0.988 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.49407080739562276\"}",
+              "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}",
+              "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "World Religions - # prompt tokens": "{\"description\": \"min=275.561, mean=275.561, max=275.561, sum=551.123 (2)\", \"tab\": \"General information\", \"score\": \"275.56140350877195\"}",
+              "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"world_religions\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_world_religions\""
+            }
           }
         },
         {
-          "evaluation_name": "WMT 2014",
+          "evaluation_name": "Mean win rate",
           "source_data": {
-            "dataset_name": "WMT 2014",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "BLEU-4 on WMT 2014",
+            "evaluation_description": "How many models this model outperforms on average (over columns).",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.218,
+            "score": 0.351,
             "details": {
-              "description": "min=0.169, mean=0.218, max=0.264, sum=1.088 (5)",
-              "tab": "Accuracy",
-              "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=1.131, mean=1.185, max=1.222, sum=5.925 (5)\", \"tab\": \"Efficiency\", \"score\": \"1.1850423664020953\"}",
-              "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}",
-              "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "WMT 2014 - # prompt tokens": "{\"description\": \"min=124.901, mean=148.043, max=168.185, sum=740.213 (5)\", \"tab\": \"General information\", \"score\": \"148.04258583116683\"}",
-              "WMT 2014 - # output tokens": "{\"description\": \"min=23.744, mean=25.264, max=25.938, sum=126.322 (5)\", \"tab\": \"General information\", \"score\": \"25.26444840571953\"}"
+              "description": "",
+              "tab": "Efficiency"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]"
-            }
+            "additional_details": {}
           }
         }
       ],
       "detailed_evaluation_results": null,
       "generation_config": {
-        "additional_details": {}
+        "additional_details": {
+          "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
+          "method": "\"multiple_choice_joint\"",
+          "eval_split": "\"test\"",
+          "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
+        }
       }
     },
     {
diff --git a/data/models/openai_gpt-4o-2024-08-06.json b/data/models/openai_gpt-4o-2024-08-06.json
index 4523783fb76c33e56985cf8706dfcde93c76d009..ca15abfce433f9cf3b132cc5fefadfcef719867a 100644
--- a/data/models/openai_gpt-4o-2024-08-06.json
+++ b/data/models/openai_gpt-4o-2024-08-06.json
@@ -1900,10 +1900,10 @@
       }
     },
     {
-      "evaluation_id": "reward-bench/openai_gpt-4o-2024-08-06/1766412838.146816",
+      "evaluation_id": "reward-bench-2/openai_gpt-4o-2024-08-06/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -1922,128 +1922,104 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8673
+            "score": 0.6493
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9609
+            "score": 0.5684
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.761
+            "score": 0.3312
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Math",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Math score - measures mathematical reasoning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8811
+            "score": 0.623
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8661
+            "score": 0.8619
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": null
-    },
-    {
-      "evaluation_id": "reward-bench-2/openai_gpt-4o-2024-08-06/1766412838.146816",
-      "retrieved_timestamp": "1766412838.146816",
-      "source_metadata": {
-        "source_name": "RewardBench 2",
-        "source_type": "documentation",
-        "source_organization_name": "Allen Institute for AI",
-        "source_organization_url": "https://allenai.org",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "rewardbench",
-        "version": "0.1.3",
-        "additional_details": {
-          "subsets": "Chat, Chat Hard, Safety, Reasoning",
-          "hf_space": "allenai/reward-bench"
-        }
-      },
-      "benchmark": "reward-bench",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Score",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6493
+            "score": 0.7293
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
@@ -2052,111 +2028,135 @@
           }
         },
         {
-          "evaluation_name": "Factuality",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5684
+            "score": 0.7819
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
             "hf_repo": "allenai/reward-bench-2-results"
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": null
+    },
+    {
+      "evaluation_id": "reward-bench/openai_gpt-4o-2024-08-06/1766412838.146816",
+      "retrieved_timestamp": "1766412838.146816",
+      "source_metadata": {
+        "source_name": "RewardBench",
+        "source_type": "documentation",
+        "source_organization_name": "Allen Institute for AI",
+        "source_organization_url": "https://allenai.org",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "rewardbench",
+        "version": "0.1.3",
+        "additional_details": {
+          "subsets": "Chat, Chat Hard, Safety, Reasoning",
+          "hf_space": "allenai/reward-bench"
+        }
+      },
+      "benchmark": "reward-bench",
+      "evaluation_results": [
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3312
+            "score": 0.8673
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.623
+            "score": 0.9609
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8619
+            "score": 0.761
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7293
+            "score": 0.8811
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7819
+            "score": 0.8661
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
diff --git a/data/models/openai_gpt-5-2025-08-07.json b/data/models/openai_gpt-5-2025-08-07.json
index 0853492fcc4bbd45f07185718454686350fe1be2..532bfb57f278c67f06480977b0f49dd924ee82c1 100644
--- a/data/models/openai_gpt-5-2025-08-07.json
+++ b/data/models/openai_gpt-5-2025-08-07.json
@@ -1264,13 +1264,13 @@
       }
     },
     {
-      "evaluation_id": "livecodebenchpro/gpt-5-2025-08-07/1760492095.8105888",
-      "retrieved_timestamp": "1760492095.8105888",
+      "evaluation_id": "livecodebenchpro/gpt-5-2025-08-07/1770683238.099205",
+      "retrieved_timestamp": "1770683238.099205",
       "source_metadata": {
-        "source_organization_name": "New York University,  Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy",
-        "evaluator_relationship": "third_party",
         "source_name": "Live Code Bench Pro",
-        "source_type": "documentation"
+        "source_type": "documentation",
+        "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy",
+        "evaluator_relationship": "third_party"
       },
       "eval_library": {
         "name": "unknown",
@@ -1280,62 +1280,62 @@
       "evaluation_results": [
         {
           "evaluation_name": "Hard Problems",
-          "metric_config": {
-            "evaluation_description": "Pass@1 on Hard Problems",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0,
-            "max_score": 1
-          },
-          "score_details": {
-            "score": 0.04225352112676056
-          },
           "source_data": {
             "dataset_name": "Hard Problems",
             "source_type": "url",
             "url": [
               "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live"
             ]
-          }
-        },
-        {
-          "evaluation_name": "Medium Problems",
+          },
           "metric_config": {
-            "evaluation_description": "Pass@1 on Medium Problems",
+            "evaluation_description": "Pass@1 on Hard Problems",
             "lower_is_better": false,
             "score_type": "continuous",
-            "min_score": 0,
-            "max_score": 1
+            "min_score": 0.0,
+            "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4084507042253521
-          },
+            "score": 0.0423
+          }
+        },
+        {
+          "evaluation_name": "Medium Problems",
           "source_data": {
             "dataset_name": "Medium Problems",
             "source_type": "url",
             "url": [
               "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live"
             ]
-          }
-        },
-        {
-          "evaluation_name": "Easy Problems",
+          },
           "metric_config": {
-            "evaluation_description": "Pass@1 on Easy Problems",
+            "evaluation_description": "Pass@1 on Medium Problems",
             "lower_is_better": false,
             "score_type": "continuous",
-            "min_score": 0,
-            "max_score": 1
+            "min_score": 0.0,
+            "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8873239436619719
-          },
+            "score": 0.4085
+          }
+        },
+        {
+          "evaluation_name": "Easy Problems",
           "source_data": {
             "dataset_name": "Easy Problems",
             "source_type": "url",
             "url": [
               "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live"
             ]
+          },
+          "metric_config": {
+            "evaluation_description": "Pass@1 on Easy Problems",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.9014
           }
         }
       ],
@@ -1343,13 +1343,13 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "livecodebenchpro/gpt-5-2025-08-07/1770683238.099205",
-      "retrieved_timestamp": "1770683238.099205",
+      "evaluation_id": "livecodebenchpro/gpt-5-2025-08-07/1760492095.8105888",
+      "retrieved_timestamp": "1760492095.8105888",
       "source_metadata": {
+        "source_organization_name": "New York University,  Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy",
+        "evaluator_relationship": "third_party",
         "source_name": "Live Code Bench Pro",
-        "source_type": "documentation",
-        "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy",
-        "evaluator_relationship": "third_party"
+        "source_type": "documentation"
       },
       "eval_library": {
         "name": "unknown",
@@ -1359,62 +1359,62 @@
       "evaluation_results": [
         {
           "evaluation_name": "Hard Problems",
+          "metric_config": {
+            "evaluation_description": "Pass@1 on Hard Problems",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0,
+            "max_score": 1
+          },
+          "score_details": {
+            "score": 0.04225352112676056
+          },
           "source_data": {
             "dataset_name": "Hard Problems",
             "source_type": "url",
             "url": [
               "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live"
             ]
-          },
+          }
+        },
+        {
+          "evaluation_name": "Medium Problems",
           "metric_config": {
-            "evaluation_description": "Pass@1 on Hard Problems",
+            "evaluation_description": "Pass@1 on Medium Problems",
             "lower_is_better": false,
             "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
+            "min_score": 0,
+            "max_score": 1
           },
           "score_details": {
-            "score": 0.0423
-          }
-        },
-        {
-          "evaluation_name": "Medium Problems",
+            "score": 0.4084507042253521
+          },
           "source_data": {
             "dataset_name": "Medium Problems",
             "source_type": "url",
             "url": [
               "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live"
             ]
-          },
+          }
+        },
+        {
+          "evaluation_name": "Easy Problems",
           "metric_config": {
-            "evaluation_description": "Pass@1 on Medium Problems",
+            "evaluation_description": "Pass@1 on Easy Problems",
             "lower_is_better": false,
             "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
+            "min_score": 0,
+            "max_score": 1
           },
           "score_details": {
-            "score": 0.4085
-          }
-        },
-        {
-          "evaluation_name": "Easy Problems",
+            "score": 0.8873239436619719
+          },
           "source_data": {
             "dataset_name": "Easy Problems",
             "source_type": "url",
             "url": [
               "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live"
             ]
-          },
-          "metric_config": {
-            "evaluation_description": "Pass@1 on Easy Problems",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.9014
           }
         }
       ],
diff --git a/data/models/openai_gpt-5-codex.json b/data/models/openai_gpt-5-codex.json
index c82c5f95f908380f688e9ef7118281bf4bf2ade4..a875eb11356a27d3b1b0f42875c13e7560d24fec 100644
--- a/data/models/openai_gpt-5-codex.json
+++ b/data/models/openai_gpt-5-codex.json
@@ -4,13 +4,13 @@
     "id": "openai/gpt-5-codex",
     "developer": "OpenAI",
     "additional_details": {
-      "agent_name": "Mini-SWE-Agent",
-      "agent_organization": "Princeton"
+      "agent_name": "Codex CLI",
+      "agent_organization": "OpenAI"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-5-codex/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5-codex/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-03",
+          "evaluation_timestamp": "2025-11-04",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,17 +43,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 41.3,
+            "score": 44.3,
             "uncertainty": {
               "standard_error": {
-                "value": 2.8
+                "value": 2.7
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Codex\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Codex\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +70,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Codex\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Codex\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -84,7 +84,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5-codex/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-5-codex/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +108,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-10-31",
+          "evaluation_timestamp": "2025-11-03",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,17 +117,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 43.4,
+            "score": 41.3,
             "uncertainty": {
               "standard_error": {
-                "value": 2.9
+                "value": 2.8
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Codex\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Codex\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -144,7 +144,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Codex\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Codex\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -158,7 +158,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5-codex/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5-codex/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -182,7 +182,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-04",
+          "evaluation_timestamp": "2025-10-31",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -191,17 +191,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 44.3,
+            "score": 43.4,
             "uncertainty": {
               "standard_error": {
-                "value": 2.7
+                "value": 2.9
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Codex\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Codex\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -218,7 +218,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Codex\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Codex\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/openai_gpt-5-mini.json b/data/models/openai_gpt-5-mini.json
index d47eef41739a278ed131ade530579316195a6b05..8229a45bea599ef0cf4ec037999fa4d4fdccb65f 100644
--- a/data/models/openai_gpt-5-mini.json
+++ b/data/models/openai_gpt-5-mini.json
@@ -84,7 +84,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-5-mini/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/openhands__gpt-5-mini/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +108,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-03",
+          "evaluation_timestamp": "2025-11-02",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,17 +117,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 22.2,
+            "score": 29.2,
             "uncertainty": {
               "standard_error": {
-                "value": 2.6
+                "value": 2.8
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Mini\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Mini\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -144,7 +144,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Mini\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Mini\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -158,7 +158,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/openhands__gpt-5-mini/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-5-mini/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -182,7 +182,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-02",
+          "evaluation_timestamp": "2025-11-03",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -191,17 +191,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 29.2,
+            "score": 22.2,
             "uncertainty": {
               "standard_error": {
-                "value": 2.8
+                "value": 2.6
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Mini\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Mini\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -218,7 +218,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Mini\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Mini\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -232,7 +232,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5-mini/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/spoox-m__gpt-5-mini/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -256,7 +256,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-10-31",
+          "evaluation_timestamp": "2025-12-24",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -265,17 +265,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 24.0,
+            "score": 34.8,
             "uncertainty": {
               "standard_error": {
-                "value": 2.5
+                "value": 2.7
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Mini\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"spoox-m\" -m \"GPT-5-Mini\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -292,7 +292,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Mini\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"spoox-m\" -m \"GPT-5-Mini\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -306,7 +306,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/spoox-m__gpt-5-mini/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5-mini/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -330,7 +330,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-24",
+          "evaluation_timestamp": "2025-10-31",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -339,17 +339,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 34.8,
+            "score": 24.0,
             "uncertainty": {
               "standard_error": {
-                "value": 2.7
+                "value": 2.5
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"spoox-m\" -m \"GPT-5-Mini\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Mini\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -366,7 +366,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"spoox-m\" -m \"GPT-5-Mini\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Mini\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/openai_gpt-5-nano.json b/data/models/openai_gpt-5-nano.json
index ce03029c7ef9b6ec65527da76a59725ce6e990db..9816cdcd5d573c1531a9f756bbbe26744d018229 100644
--- a/data/models/openai_gpt-5-nano.json
+++ b/data/models/openai_gpt-5-nano.json
@@ -4,13 +4,13 @@
     "id": "openai/gpt-5-nano",
     "developer": "OpenAI",
     "additional_details": {
-      "agent_name": "Codex CLI",
-      "agent_organization": "OpenAI"
+      "agent_name": "OpenHands",
+      "agent_organization": "OpenHands"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5-nano/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/openhands__gpt-5-nano/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-04",
+          "evaluation_timestamp": "2025-11-02",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,17 +43,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 11.5,
+            "score": 9.9,
             "uncertainty": {
               "standard_error": {
-                "value": 2.3
+                "value": 2.1
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Nano\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Nano\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +70,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Nano\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Nano\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -232,7 +232,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/openhands__gpt-5-nano/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5-nano/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -256,7 +256,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-02",
+          "evaluation_timestamp": "2025-11-04",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -265,17 +265,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 9.9,
+            "score": 11.5,
             "uncertainty": {
               "standard_error": {
-                "value": 2.1
+                "value": 2.3
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Nano\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Nano\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -292,7 +292,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Nano\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Nano\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/openai_gpt-5.1-codex.json b/data/models/openai_gpt-5.1-codex.json
index c0cb4dd134e66b0f125db24b4289a9c34bf024c0..a2aac5e629083b81e1729673348d7d31d575cd9b 100644
--- a/data/models/openai_gpt-5.1-codex.json
+++ b/data/models/openai_gpt-5.1-codex.json
@@ -4,13 +4,13 @@
     "id": "openai/gpt-5.1-codex",
     "developer": "OpenAI",
     "additional_details": {
-      "agent_name": "Crux",
-      "agent_organization": "Roam"
+      "agent_name": "Terminus 2",
+      "agent_organization": "Terminal Bench"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/crux__gpt-5.1-codex/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5.1-codex/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-16",
+          "evaluation_timestamp": "2025-11-17",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,17 +43,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 57.8,
+            "score": 36.9,
             "uncertainty": {
               "standard_error": {
-                "value": 2.9
+                "value": 3.2
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"GPT-5.1-Codex\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.1-Codex\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +70,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"GPT-5.1-Codex\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.1-Codex\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -84,7 +84,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5.1-codex/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/letta-code__gpt-5.1-codex/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +108,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-17",
+          "evaluation_timestamp": "2025-12-17",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,17 +117,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 36.9,
+            "score": 53.5,
             "uncertainty": {
               "standard_error": {
-                "value": 3.2
+                "value": 2.8
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.1-Codex\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"GPT-5.1-Codex\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -144,7 +144,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.1-Codex\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"GPT-5.1-Codex\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -158,7 +158,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/letta-code__gpt-5.1-codex/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/crux__gpt-5.1-codex/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -182,7 +182,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-17",
+          "evaluation_timestamp": "2025-11-16",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -191,17 +191,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 53.5,
+            "score": 57.8,
             "uncertainty": {
               "standard_error": {
-                "value": 2.8
+                "value": 2.9
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"GPT-5.1-Codex\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"GPT-5.1-Codex\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -218,7 +218,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"GPT-5.1-Codex\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"GPT-5.1-Codex\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/openai_gpt-5.2-2025-12-11.json b/data/models/openai_gpt-5.2-2025-12-11.json
index 6e12cfe05d65d67e592592610eaa134b705b88c4..b70c28d6ee613e835fe13ebe0ae4b3e2cc5d0ae8 100644
--- a/data/models/openai_gpt-5.2-2025-12-11.json
+++ b/data/models/openai_gpt-5.2-2025-12-11.json
@@ -78,7 +78,7 @@
       }
     },
     {
-      "evaluation_id": "appworld/test_normal/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "appworld/test_normal/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -125,8 +125,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "Claude Code CLI",
-                  "agent_framework": "claude_code"
+                  "agent_name": "OpenAI Solo",
+                  "agent_framework": "openai_solo"
                 }
               }
             }
@@ -138,15 +138,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "Claude Code CLI",
-              "agent_framework": "claude_code"
+              "agent_name": "OpenAI Solo",
+              "agent_framework": "openai_solo"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "appworld/test_normal/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "appworld/test_normal/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -193,8 +193,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "OpenAI Solo",
-                  "agent_framework": "openai_solo"
+                  "agent_name": "LiteLLM Tool Calling",
+                  "agent_framework": "tool_calling"
                 }
               }
             }
@@ -206,8 +206,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "OpenAI Solo",
-              "agent_framework": "openai_solo"
+              "agent_name": "LiteLLM Tool Calling",
+              "agent_framework": "tool_calling"
             }
           }
         }
@@ -282,7 +282,7 @@
       }
     },
     {
-      "evaluation_id": "appworld/test_normal/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "appworld/test_normal/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -329,8 +329,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling",
-                  "agent_framework": "tool_calling"
+                  "agent_name": "Claude Code CLI",
+                  "agent_framework": "claude_code"
                 }
               }
             }
@@ -342,15 +342,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling",
-              "agent_framework": "tool_calling"
+              "agent_name": "Claude Code CLI",
+              "agent_framework": "claude_code"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "browsecompplus/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "browsecompplus/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -382,23 +382,23 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.46,
+            "score": 0.48,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.3",
-              "total_run_cost": "29.78",
-              "average_steps": "8.14",
-              "percent_finished": "0.99"
+              "average_agent_cost": "0.38",
+              "total_run_cost": "38.21",
+              "average_steps": "14.27",
+              "percent_finished": "1.0"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling",
-                  "agent_framework": "tool_calling"
+                  "agent_name": "OpenAI Solo",
+                  "agent_framework": "openai_solo"
                 }
               }
             }
@@ -410,15 +410,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling",
-              "agent_framework": "tool_calling"
+              "agent_name": "OpenAI Solo",
+              "agent_framework": "openai_solo"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "browsecompplus/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "browsecompplus/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -450,14 +450,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.26,
+            "score": 0.46,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.17",
-              "total_run_cost": "17.31",
-              "average_steps": "6.57",
+              "average_agent_cost": "0.3",
+              "total_run_cost": "29.78",
+              "average_steps": "8.14",
               "percent_finished": "0.99"
             }
           },
@@ -465,8 +465,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "SmolAgents Code",
-                  "agent_framework": "smolagents_code"
+                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
+                  "agent_framework": "tool_calling_with_shortlisting"
                 }
               }
             }
@@ -478,15 +478,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "SmolAgents Code",
-              "agent_framework": "smolagents_code"
+              "agent_name": "LiteLLM Tool Calling with Shortlisting",
+              "agent_framework": "tool_calling_with_shortlisting"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "browsecompplus/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "browsecompplus/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -518,23 +518,23 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.43,
+            "score": 0.46,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.43",
-              "total_run_cost": "43.11",
-              "average_steps": "8.97",
-              "percent_finished": "1.0"
+              "average_agent_cost": "0.3",
+              "total_run_cost": "29.78",
+              "average_steps": "8.14",
+              "percent_finished": "0.99"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "Claude Code CLI",
-                  "agent_framework": "claude_code"
+                  "agent_name": "LiteLLM Tool Calling",
+                  "agent_framework": "tool_calling"
                 }
               }
             }
@@ -546,15 +546,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "Claude Code CLI",
-              "agent_framework": "claude_code"
+              "agent_name": "LiteLLM Tool Calling",
+              "agent_framework": "tool_calling"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "browsecompplus/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "browsecompplus/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -586,14 +586,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.46,
+            "score": 0.26,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.3",
-              "total_run_cost": "29.78",
-              "average_steps": "8.14",
+              "average_agent_cost": "0.17",
+              "total_run_cost": "17.31",
+              "average_steps": "6.57",
               "percent_finished": "0.99"
             }
           },
@@ -601,8 +601,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
-                  "agent_framework": "tool_calling_with_shortlisting"
+                  "agent_name": "SmolAgents Code",
+                  "agent_framework": "smolagents_code"
                 }
               }
             }
@@ -614,15 +614,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling with Shortlisting",
-              "agent_framework": "tool_calling_with_shortlisting"
+              "agent_name": "SmolAgents Code",
+              "agent_framework": "smolagents_code"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "browsecompplus/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "browsecompplus/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -654,14 +654,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.48,
+            "score": 0.43,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.38",
-              "total_run_cost": "38.21",
-              "average_steps": "14.27",
+              "average_agent_cost": "0.43",
+              "total_run_cost": "43.11",
+              "average_steps": "8.97",
               "percent_finished": "1.0"
             }
           },
@@ -669,8 +669,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "OpenAI Solo",
-                  "agent_framework": "openai_solo"
+                  "agent_name": "Claude Code CLI",
+                  "agent_framework": "claude_code"
                 }
               }
             }
@@ -682,8 +682,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "OpenAI Solo",
-              "agent_framework": "openai_solo"
+              "agent_name": "Claude Code CLI",
+              "agent_framework": "claude_code"
             }
           }
         }
@@ -769,7 +769,7 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "swe-bench/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "swe-bench/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -801,14 +801,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5253,
+            "score": 0.58,
             "uncertainty": {
-              "num_samples": 99
+              "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.45",
-              "total_run_cost": "44.58",
-              "average_steps": "19.98",
+              "average_agent_cost": "0.94",
+              "total_run_cost": "93.98",
+              "average_steps": "23.99",
               "percent_finished": "1.0"
             }
           },
@@ -816,8 +816,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "SmolAgents Code",
-                  "agent_framework": "smolagents_code"
+                  "agent_name": "Claude Code CLI",
+                  "agent_framework": "claude_code"
                 }
               }
             }
@@ -829,8 +829,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "SmolAgents Code",
-              "agent_framework": "smolagents_code"
+              "agent_name": "Claude Code CLI",
+              "agent_framework": "claude_code"
             }
           }
         }
@@ -905,7 +905,7 @@
       }
     },
     {
-      "evaluation_id": "swe-bench/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "swe-bench/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -937,14 +937,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.58,
+            "score": 0.57,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.94",
-              "total_run_cost": "93.98",
-              "average_steps": "23.99",
+              "average_agent_cost": "0.25",
+              "total_run_cost": "24.76",
+              "average_steps": "20.47",
               "percent_finished": "1.0"
             }
           },
@@ -952,8 +952,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "Claude Code CLI",
-                  "agent_framework": "claude_code"
+                  "agent_name": "LiteLLM Tool Calling",
+                  "agent_framework": "tool_calling"
                 }
               }
             }
@@ -965,15 +965,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "Claude Code CLI",
-              "agent_framework": "claude_code"
+              "agent_name": "LiteLLM Tool Calling",
+              "agent_framework": "tool_calling"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "swe-bench/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "swe-bench/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1005,14 +1005,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.57,
+            "score": 0.5253,
             "uncertainty": {
-              "num_samples": 100
+              "num_samples": 99
             },
             "details": {
-              "average_agent_cost": "0.25",
-              "total_run_cost": "24.76",
-              "average_steps": "20.47",
+              "average_agent_cost": "0.45",
+              "total_run_cost": "44.58",
+              "average_steps": "19.98",
               "percent_finished": "1.0"
             }
           },
@@ -1020,8 +1020,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling",
-                  "agent_framework": "tool_calling"
+                  "agent_name": "SmolAgents Code",
+                  "agent_framework": "smolagents_code"
                 }
               }
             }
@@ -1033,8 +1033,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling",
-              "agent_framework": "tool_calling"
+              "agent_name": "SmolAgents Code",
+              "agent_framework": "smolagents_code"
             }
           }
         }
@@ -1109,7 +1109,7 @@
       }
     },
     {
-      "evaluation_id": "tau-bench-2/airline/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/airline/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1141,14 +1141,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.54,
+            "score": 0.48,
             "uncertainty": {
               "num_samples": 50
             },
             "details": {
-              "average_agent_cost": "0.13",
-              "total_run_cost": "6.96",
-              "average_steps": "11.22",
+              "average_agent_cost": "0.21",
+              "total_run_cost": "11.23",
+              "average_steps": "10.18",
               "percent_finished": "1.0"
             }
           },
@@ -1156,8 +1156,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
-                  "agent_framework": "tool_calling_with_shortlisting"
+                  "agent_name": "Claude Code CLI",
+                  "agent_framework": "claude_code"
                 }
               }
             }
@@ -1169,8 +1169,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling with Shortlisting",
-              "agent_framework": "tool_calling_with_shortlisting"
+              "agent_name": "Claude Code CLI",
+              "agent_framework": "claude_code"
             }
           }
         }
@@ -1245,7 +1245,7 @@
       }
     },
     {
-      "evaluation_id": "tau-bench-2/airline/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/airline/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1277,14 +1277,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.48,
+            "score": 0.54,
             "uncertainty": {
               "num_samples": 50
             },
             "details": {
-              "average_agent_cost": "0.21",
-              "total_run_cost": "11.23",
-              "average_steps": "10.18",
+              "average_agent_cost": "0.13",
+              "total_run_cost": "6.96",
+              "average_steps": "11.22",
               "percent_finished": "1.0"
             }
           },
@@ -1292,8 +1292,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "Claude Code CLI",
-                  "agent_framework": "claude_code"
+                  "agent_name": "LiteLLM Tool Calling",
+                  "agent_framework": "tool_calling"
                 }
               }
             }
@@ -1305,15 +1305,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "Claude Code CLI",
-              "agent_framework": "claude_code"
+              "agent_name": "LiteLLM Tool Calling",
+              "agent_framework": "tool_calling"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/airline/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/airline/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1360,8 +1360,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling",
-                  "agent_framework": "tool_calling"
+                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
+                  "agent_framework": "tool_calling_with_shortlisting"
                 }
               }
             }
@@ -1373,15 +1373,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling",
-              "agent_framework": "tool_calling"
+              "agent_name": "LiteLLM Tool Calling with Shortlisting",
+              "agent_framework": "tool_calling_with_shortlisting"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/airline/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/retail/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1394,33 +1394,33 @@
         "name": "exgentic",
         "version": "0.1.0"
       },
-      "benchmark": "tau-bench-2_airline",
+      "benchmark": "tau-bench-2_retail",
       "evaluation_results": [
         {
-          "evaluation_name": "tau-bench-2/airline",
+          "evaluation_name": "tau-bench-2/retail",
           "source_data": {
-            "dataset_name": "tau-bench-2/airline",
+            "dataset_name": "tau-bench-2/retail",
             "source_type": "url",
             "url": [
               "https://github.com/Exgentic/exgentic"
             ]
           },
           "metric_config": {
-            "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)",
+            "evaluation_description": "Tau Bench 2 benchmark evaluation (retail subset)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6,
+            "score": 0.68,
             "uncertainty": {
-              "num_samples": 50
+              "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.29",
-              "total_run_cost": "15.28",
-              "average_steps": "10.68",
+              "average_agent_cost": "0.25",
+              "total_run_cost": "26.27",
+              "average_steps": "11.08",
               "percent_finished": "1.0"
             }
           },
@@ -1449,7 +1449,7 @@
       }
     },
     {
-      "evaluation_id": "tau-bench-2/retail/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/retail/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1481,23 +1481,23 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5354,
+            "score": 0.51,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.11",
-              "total_run_cost": "11.54",
-              "average_steps": "9.55",
-              "percent_finished": "0.99"
+              "average_agent_cost": "0.12",
+              "total_run_cost": "12.63",
+              "average_steps": "9.92",
+              "percent_finished": "0.98"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "OpenAI Solo",
-                  "agent_framework": "openai_solo"
+                  "agent_name": "Claude Code CLI",
+                  "agent_framework": "claude_code"
                 }
               }
             }
@@ -1509,15 +1509,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "OpenAI Solo",
-              "agent_framework": "openai_solo"
+              "agent_name": "Claude Code CLI",
+              "agent_framework": "claude_code"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/retail/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/airline/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1530,33 +1530,33 @@
         "name": "exgentic",
         "version": "0.1.0"
       },
-      "benchmark": "tau-bench-2_retail",
+      "benchmark": "tau-bench-2_airline",
       "evaluation_results": [
         {
-          "evaluation_name": "tau-bench-2/retail",
+          "evaluation_name": "tau-bench-2/airline",
           "source_data": {
-            "dataset_name": "tau-bench-2/retail",
+            "dataset_name": "tau-bench-2/airline",
             "source_type": "url",
             "url": [
               "https://github.com/Exgentic/exgentic"
             ]
           },
           "metric_config": {
-            "evaluation_description": "Tau Bench 2 benchmark evaluation (retail subset)",
+            "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.73,
+            "score": 0.6,
             "uncertainty": {
-              "num_samples": 100
+              "num_samples": 50
             },
             "details": {
-              "average_agent_cost": "0.11",
-              "total_run_cost": "12.27",
-              "average_steps": "10.33",
+              "average_agent_cost": "0.29",
+              "total_run_cost": "15.28",
+              "average_steps": "10.68",
               "percent_finished": "1.0"
             }
           },
@@ -1564,8 +1564,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
-                  "agent_framework": "tool_calling_with_shortlisting"
+                  "agent_name": "SmolAgents Code",
+                  "agent_framework": "smolagents_code"
                 }
               }
             }
@@ -1577,15 +1577,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling with Shortlisting",
-              "agent_framework": "tool_calling_with_shortlisting"
+              "agent_name": "SmolAgents Code",
+              "agent_framework": "smolagents_code"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/retail/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/retail/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1617,23 +1617,23 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.73,
+            "score": 0.5354,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
               "average_agent_cost": "0.11",
-              "total_run_cost": "12.27",
-              "average_steps": "10.33",
-              "percent_finished": "1.0"
+              "total_run_cost": "11.54",
+              "average_steps": "9.55",
+              "percent_finished": "0.99"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling",
-                  "agent_framework": "tool_calling"
+                  "agent_name": "OpenAI Solo",
+                  "agent_framework": "openai_solo"
                 }
               }
             }
@@ -1645,15 +1645,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling",
-              "agent_framework": "tool_calling"
+              "agent_name": "OpenAI Solo",
+              "agent_framework": "openai_solo"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/retail/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/retail/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1685,23 +1685,23 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.51,
+            "score": 0.73,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.12",
-              "total_run_cost": "12.63",
-              "average_steps": "9.92",
-              "percent_finished": "0.98"
+              "average_agent_cost": "0.11",
+              "total_run_cost": "12.27",
+              "average_steps": "10.33",
+              "percent_finished": "1.0"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "Claude Code CLI",
-                  "agent_framework": "claude_code"
+                  "agent_name": "LiteLLM Tool Calling",
+                  "agent_framework": "tool_calling"
                 }
               }
             }
@@ -1713,15 +1713,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "Claude Code CLI",
-              "agent_framework": "claude_code"
+              "agent_name": "LiteLLM Tool Calling",
+              "agent_framework": "tool_calling"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/retail/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/retail/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1753,14 +1753,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.68,
+            "score": 0.73,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.25",
-              "total_run_cost": "26.27",
-              "average_steps": "11.08",
+              "average_agent_cost": "0.11",
+              "total_run_cost": "12.27",
+              "average_steps": "10.33",
               "percent_finished": "1.0"
             }
           },
@@ -1768,8 +1768,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "SmolAgents Code",
-                  "agent_framework": "smolagents_code"
+                  "agent_name": "LiteLLM Tool Calling with Shortlisting",
+                  "agent_framework": "tool_calling_with_shortlisting"
                 }
               }
             }
@@ -1781,15 +1781,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "SmolAgents Code",
-              "agent_framework": "smolagents_code"
+              "agent_name": "LiteLLM Tool Calling with Shortlisting",
+              "agent_framework": "tool_calling_with_shortlisting"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/telecom/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/telecom/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -1821,14 +1821,14 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.71,
+            "score": 0.53,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.3",
-              "total_run_cost": "35.31",
-              "average_steps": "10.11",
+              "average_agent_cost": "0.15",
+              "total_run_cost": "18.88",
+              "average_steps": "9.92",
               "percent_finished": "1.0"
             }
           },
@@ -1836,8 +1836,8 @@
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "SmolAgents Code",
-                  "agent_framework": "smolagents_code"
+                  "agent_name": "OpenAI Solo",
+                  "agent_framework": "openai_solo"
                 }
               }
             }
@@ -1849,8 +1849,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "SmolAgents Code",
-              "agent_framework": "smolagents_code"
+              "agent_name": "OpenAI Solo",
+              "agent_framework": "openai_solo"
             }
           }
         }
@@ -1993,7 +1993,7 @@
       }
     },
     {
-      "evaluation_id": "tau-bench-2/telecom/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -2025,23 +2025,23 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.53,
+            "score": 0.5354,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.15",
-              "total_run_cost": "18.88",
-              "average_steps": "9.92",
-              "percent_finished": "1.0"
+              "average_agent_cost": "0.14",
+              "total_run_cost": "19.92",
+              "average_steps": "10.18",
+              "percent_finished": "0.99"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "OpenAI Solo",
-                  "agent_framework": "openai_solo"
+                  "agent_name": "LiteLLM Tool Calling",
+                  "agent_framework": "tool_calling"
                 }
               }
             }
@@ -2053,15 +2053,15 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "OpenAI Solo",
-              "agent_framework": "openai_solo"
+              "agent_name": "LiteLLM Tool Calling",
+              "agent_framework": "tool_calling"
             }
           }
         }
       }
     },
     {
-      "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504",
+      "evaluation_id": "tau-bench-2/telecom/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504",
       "retrieved_timestamp": "1774263615.0201504",
       "source_metadata": {
         "source_name": "Exgentic Open Agent Leaderboard",
@@ -2093,23 +2093,23 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5354,
+            "score": 0.71,
             "uncertainty": {
               "num_samples": 100
             },
             "details": {
-              "average_agent_cost": "0.14",
-              "total_run_cost": "19.92",
-              "average_steps": "10.18",
-              "percent_finished": "0.99"
+              "average_agent_cost": "0.3",
+              "total_run_cost": "35.31",
+              "average_steps": "10.11",
+              "percent_finished": "1.0"
             }
           },
           "generation_config": {
             "generation_args": {
               "agentic_eval_config": {
                 "additional_details": {
-                  "agent_name": "LiteLLM Tool Calling",
-                  "agent_framework": "tool_calling"
+                  "agent_name": "SmolAgents Code",
+                  "agent_framework": "smolagents_code"
                 }
               }
             }
@@ -2121,8 +2121,8 @@
         "generation_args": {
           "agentic_eval_config": {
             "additional_details": {
-              "agent_name": "LiteLLM Tool Calling",
-              "agent_framework": "tool_calling"
+              "agent_name": "SmolAgents Code",
+              "agent_framework": "smolagents_code"
             }
           }
         }
diff --git a/data/models/openai_gpt-5.2.json b/data/models/openai_gpt-5.2.json
index 77930b98006acfd2c6efa9a1e831cbd44874ee48..26fe4dd6a4fc6668b751e0d5655642084b59ba75 100644
--- a/data/models/openai_gpt-5.2.json
+++ b/data/models/openai_gpt-5.2.json
@@ -4,13 +4,13 @@
     "id": "openai/gpt-5.2",
     "developer": "OpenAI",
     "additional_details": {
-      "agent_name": "Codex CLI",
-      "agent_organization": "OpenAI"
+      "agent_name": "Droid",
+      "agent_organization": "Factory"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5.2/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/droid__gpt-5.2/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-18",
+          "evaluation_timestamp": "2025-12-24",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,17 +43,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 62.9,
+            "score": 64.9,
             "uncertainty": {
               "standard_error": {
-                "value": 3.0
+                "value": 2.8
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5.2\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"GPT-5.2\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +70,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5.2\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"GPT-5.2\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -84,7 +84,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/droid__gpt-5.2/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5.2/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +108,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-24",
+          "evaluation_timestamp": "2025-12-12",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,17 +117,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 64.9,
+            "score": 54.0,
             "uncertainty": {
               "standard_error": {
-                "value": 2.8
+                "value": 2.9
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"GPT-5.2\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.2\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -144,7 +144,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"GPT-5.2\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.2\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -158,7 +158,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5.2/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/mux__gpt-5.2/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -182,7 +182,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-12-12",
+          "evaluation_timestamp": "2026-01-17",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -191,17 +191,11 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 54.0,
-            "uncertainty": {
-              "standard_error": {
-                "value": 2.9
-              },
-              "num_samples": 435
-            }
+            "score": 60.7
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.2\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"GPT-5.2\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -218,7 +212,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.2\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"GPT-5.2\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -232,7 +226,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/mux__gpt-5.2/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5.2/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -256,7 +250,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-01-17",
+          "evaluation_timestamp": "2025-12-18",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -265,11 +259,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 60.7
+            "score": 62.9,
+            "uncertainty": {
+              "standard_error": {
+                "value": 3.0
+              },
+              "num_samples": 435
+            }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"GPT-5.2\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5.2\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -286,7 +286,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"GPT-5.2\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5.2\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/openai_gpt-5.3-codex.json b/data/models/openai_gpt-5.3-codex.json
index bc1283e272bffaad3e91b4e9076eec602a9b6d32..d48b8e986526c7f815b5efdb141e51c853537ca1 100644
--- a/data/models/openai_gpt-5.3-codex.json
+++ b/data/models/openai_gpt-5.3-codex.json
@@ -4,13 +4,13 @@
     "id": "openai/gpt-5.3-codex",
     "developer": "OpenAI",
     "additional_details": {
-      "agent_name": "Simple Codex",
-      "agent_organization": "OpenAI"
+      "agent_name": "Terminus 2",
+      "agent_organization": "Terminal Bench"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/simple-codex__gpt-5.3-codex/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5.3-codex/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-02-06",
+          "evaluation_timestamp": "2026-02-05",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,17 +43,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 75.1,
+            "score": 64.7,
             "uncertainty": {
               "standard_error": {
-                "value": 2.4
+                "value": 2.7
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Simple Codex\" -m \"GPT-5.3-Codex\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.3-Codex\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +70,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Simple Codex\" -m \"GPT-5.3-Codex\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.3-Codex\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -158,7 +158,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/codebrain-1__gpt-5.3-codex/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/simple-codex__gpt-5.3-codex/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -182,7 +182,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-02-10",
+          "evaluation_timestamp": "2026-02-06",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -191,17 +191,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 70.3,
+            "score": 75.1,
             "uncertainty": {
               "standard_error": {
-                "value": 2.6
+                "value": 2.4
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"CodeBrain-1\" -m \"GPT-5.3-Codex\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Simple Codex\" -m \"GPT-5.3-Codex\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -218,7 +218,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"CodeBrain-1\" -m \"GPT-5.3-Codex\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Simple Codex\" -m \"GPT-5.3-Codex\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -232,7 +232,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/droid__gpt-5.3-codex/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/codebrain-1__gpt-5.3-codex/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -256,7 +256,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-02-24",
+          "evaluation_timestamp": "2026-02-10",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -265,17 +265,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 77.3,
+            "score": 70.3,
             "uncertainty": {
               "standard_error": {
-                "value": 2.2
+                "value": 2.6
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"GPT-5.3-Codex\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"CodeBrain-1\" -m \"GPT-5.3-Codex\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -292,7 +292,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"GPT-5.3-Codex\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"CodeBrain-1\" -m \"GPT-5.3-Codex\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -306,7 +306,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5.3-codex/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/droid__gpt-5.3-codex/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -330,7 +330,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2026-02-05",
+          "evaluation_timestamp": "2026-02-24",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -339,17 +339,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 64.7,
+            "score": 77.3,
             "uncertainty": {
               "standard_error": {
-                "value": 2.7
+                "value": 2.2
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.3-Codex\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"GPT-5.3-Codex\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -366,7 +366,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.3-Codex\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"GPT-5.3-Codex\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/openai_gpt-5.json b/data/models/openai_gpt-5.json
index 27068eec1e4a30557b2050b38502f142637aeb61..539597ca005b6b7d3304665ed895ff7bb0860332 100644
--- a/data/models/openai_gpt-5.json
+++ b/data/models/openai_gpt-5.json
@@ -4,13 +4,13 @@
     "id": "openai/gpt-5",
     "developer": "OpenAI",
     "additional_details": {
-      "agent_name": "Codex CLI",
-      "agent_organization": "OpenAI"
+      "agent_name": "Terminus 2",
+      "agent_organization": "Terminal Bench"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-04",
+          "evaluation_timestamp": "2025-10-31",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,17 +43,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 49.6,
+            "score": 35.2,
             "uncertainty": {
               "standard_error": {
-                "value": 2.9
+                "value": 3.1
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +70,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -232,7 +232,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -256,7 +256,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-10-31",
+          "evaluation_timestamp": "2025-11-04",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -265,17 +265,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 35.2,
+            "score": 49.6,
             "uncertainty": {
               "standard_error": {
-                "value": 3.1
+                "value": 2.9
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -292,7 +292,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/openai_gpt-oss-120b.json b/data/models/openai_gpt-oss-120b.json
index 396f861a54b2b1a7762f9a8f0810f5cd018d5e52..cc688cac8bf3c3ea698efc0de68329e08bff64c1 100644
--- a/data/models/openai_gpt-oss-120b.json
+++ b/data/models/openai_gpt-oss-120b.json
@@ -310,7 +310,7 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-oss-120b/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-oss-120b/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -334,7 +334,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-01",
+          "evaluation_timestamp": "2025-11-03",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -343,17 +343,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 18.7,
+            "score": 14.2,
             "uncertainty": {
               "standard_error": {
-                "value": 2.7
+                "value": 2.3
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-OSS-120B\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-OSS-120B\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -370,7 +370,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-OSS-120B\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-OSS-120B\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -384,7 +384,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-oss-120b/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-oss-120b/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -408,7 +408,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-03",
+          "evaluation_timestamp": "2025-11-01",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -417,17 +417,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 14.2,
+            "score": 18.7,
             "uncertainty": {
               "standard_error": {
-                "value": 2.3
+                "value": 2.7
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-OSS-120B\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-OSS-120B\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -444,7 +444,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-OSS-120B\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-OSS-120B\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/openai_gpt-oss-20b.json b/data/models/openai_gpt-oss-20b.json
index d658cd5464b4783e6e0372fda0cb20e4e0a6a422..584e35a6df7897b2cd4e9e1cdb5c6db8d87d90ab 100644
--- a/data/models/openai_gpt-oss-20b.json
+++ b/data/models/openai_gpt-oss-20b.json
@@ -310,7 +310,7 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-oss-20b/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-oss-20b/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -334,7 +334,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-03",
+          "evaluation_timestamp": "2025-11-01",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -343,17 +343,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 3.4,
+            "score": 3.1,
             "uncertainty": {
               "standard_error": {
-                "value": 1.4
+                "value": 1.5
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-OSS-20B\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-OSS-20B\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -370,7 +370,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-OSS-20B\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-OSS-20B\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -384,7 +384,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-oss-20b/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-oss-20b/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -408,7 +408,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-01",
+          "evaluation_timestamp": "2025-11-03",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -417,17 +417,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 3.1,
+            "score": 3.4,
             "uncertainty": {
               "standard_error": {
-                "value": 1.5
+                "value": 1.4
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-OSS-20B\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-OSS-20B\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -444,7 +444,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-OSS-20B\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-OSS-20B\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/openai_gpt_5.2.json b/data/models/openai_gpt_5.2.json
index e5de01feca478b0842db2b2533d006446492021b..fec8e3d9659540e746cd368872cad0ec3c496416 100644
--- a/data/models/openai_gpt_5.2.json
+++ b/data/models/openai_gpt_5.2.json
@@ -7,10 +7,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "apex-agents/openai_gpt-5.2/1773260200",
+      "evaluation_id": "ace/openai_gpt-5.2/1773260200",
       "retrieved_timestamp": "1773260200",
       "source_metadata": {
-        "source_name": "Mercor APEX-Agents Leaderboard",
+        "source_name": "Mercor ACE Leaderboard",
         "source_type": "evaluation_run",
         "source_organization_name": "Mercor",
         "source_organization_url": "https://www.mercor.com",
@@ -20,24 +20,24 @@
         "name": "archipelago",
         "version": "1.0.0"
       },
-      "benchmark": "apex-agents",
+      "benchmark": "ace",
       "evaluation_results": [
         {
-          "evaluation_name": "Overall Pass@1",
+          "evaluation_name": "Overall Score",
           "source_data": {
-            "dataset_name": "apex-agents",
+            "dataset_name": "ace",
             "source_type": "hf_dataset",
-            "hf_repo": "mercor/apex-agents"
+            "hf_repo": "Mercor/ACE"
           },
           "metric_config": {
-            "evaluation_description": "Overall Pass@1 (dataset card / paper snapshot).",
+            "evaluation_description": "Overall ACE score across all consumer-task domains.",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0,
             "max_score": 1
           },
           "score_details": {
-            "score": 0.23,
+            "score": 0.515,
             "uncertainty": {
               "confidence_interval": {
                 "lower": -0.032,
@@ -53,28 +53,21 @@
           }
         },
         {
-          "evaluation_name": "Overall Pass@8",
+          "evaluation_name": "Food Score",
           "source_data": {
-            "dataset_name": "apex-agents",
+            "dataset_name": "ace",
             "source_type": "hf_dataset",
-            "hf_repo": "mercor/apex-agents"
+            "hf_repo": "Mercor/ACE"
           },
           "metric_config": {
-            "evaluation_description": "Overall Pass@8 (dataset card / paper snapshot).",
+            "evaluation_description": "Food domain score.",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0,
             "max_score": 1
           },
           "score_details": {
-            "score": 0.4,
-            "uncertainty": {
-              "confidence_interval": {
-                "lower": -0.044,
-                "upper": 0.044,
-                "method": "bootstrap"
-              }
-            }
+            "score": 0.65
           },
           "generation_config": {
             "additional_details": {
@@ -83,44 +76,75 @@
           }
         },
         {
-          "evaluation_name": "Overall Mean Score",
+          "evaluation_name": "Gaming Score",
           "source_data": {
-            "dataset_name": "apex-agents",
+            "dataset_name": "ace",
             "source_type": "hf_dataset",
-            "hf_repo": "mercor/apex-agents"
+            "hf_repo": "Mercor/ACE"
           },
           "metric_config": {
-            "evaluation_description": "Overall mean rubric score.",
+            "evaluation_description": "Gaming domain score.",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0,
             "max_score": 1
           },
           "score_details": {
-            "score": 0.387
+            "score": 0.578
           },
           "generation_config": {
             "additional_details": {
               "run_setting": "High"
             }
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": {
+        "additional_details": {
+          "run_setting": "High"
+        }
+      }
+    },
+    {
+      "evaluation_id": "apex-agents/openai_gpt-5.2/1773260200",
+      "retrieved_timestamp": "1773260200",
+      "source_metadata": {
+        "source_name": "Mercor APEX-Agents Leaderboard",
+        "source_type": "evaluation_run",
+        "source_organization_name": "Mercor",
+        "source_organization_url": "https://www.mercor.com",
+        "evaluator_relationship": "first_party"
+      },
+      "eval_library": {
+        "name": "archipelago",
+        "version": "1.0.0"
+      },
+      "benchmark": "apex-agents",
+      "evaluation_results": [
         {
-          "evaluation_name": "Investment Banking Pass@1",
+          "evaluation_name": "Overall Pass@1",
           "source_data": {
             "dataset_name": "apex-agents",
             "source_type": "hf_dataset",
             "hf_repo": "mercor/apex-agents"
           },
           "metric_config": {
-            "evaluation_description": "Investment banking world Pass@1.",
+            "evaluation_description": "Overall Pass@1 (dataset card / paper snapshot).",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0,
             "max_score": 1
           },
           "score_details": {
-            "score": 0.273
+            "score": 0.23,
+            "uncertainty": {
+              "confidence_interval": {
+                "lower": -0.032,
+                "upper": 0.032,
+                "method": "bootstrap"
+              }
+            }
           },
           "generation_config": {
             "additional_details": {
@@ -129,21 +153,28 @@
           }
         },
         {
-          "evaluation_name": "Management Consulting Pass@1",
+          "evaluation_name": "Overall Pass@8",
           "source_data": {
             "dataset_name": "apex-agents",
             "source_type": "hf_dataset",
             "hf_repo": "mercor/apex-agents"
           },
           "metric_config": {
-            "evaluation_description": "Management consulting world Pass@1.",
+            "evaluation_description": "Overall Pass@8 (dataset card / paper snapshot).",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0,
             "max_score": 1
           },
           "score_details": {
-            "score": 0.227
+            "score": 0.4,
+            "uncertainty": {
+              "confidence_interval": {
+                "lower": -0.044,
+                "upper": 0.044,
+                "method": "bootstrap"
+              }
+            }
           },
           "generation_config": {
             "additional_details": {
@@ -152,21 +183,21 @@
           }
         },
         {
-          "evaluation_name": "Corporate Law Pass@1",
+          "evaluation_name": "Overall Mean Score",
           "source_data": {
             "dataset_name": "apex-agents",
             "source_type": "hf_dataset",
             "hf_repo": "mercor/apex-agents"
           },
           "metric_config": {
-            "evaluation_description": "Corporate law world Pass@1.",
+            "evaluation_description": "Overall mean rubric score.",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0,
             "max_score": 1
           },
           "score_details": {
-            "score": 0.189
+            "score": 0.387
           },
           "generation_config": {
             "additional_details": {
@@ -175,75 +206,44 @@
           }
         },
         {
-          "evaluation_name": "Corporate Lawyer Mean Score",
+          "evaluation_name": "Investment Banking Pass@1",
           "source_data": {
             "dataset_name": "apex-agents",
             "source_type": "hf_dataset",
             "hf_repo": "mercor/apex-agents"
           },
           "metric_config": {
-            "evaluation_description": "Corporate lawyer world mean score.",
+            "evaluation_description": "Investment banking world Pass@1.",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0,
             "max_score": 1
           },
           "score_details": {
-            "score": 0.443
+            "score": 0.273
           },
           "generation_config": {
             "additional_details": {
               "run_setting": "High"
             }
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": {
-        "additional_details": {
-          "run_setting": "High"
-        }
-      }
-    },
-    {
-      "evaluation_id": "ace/openai_gpt-5.2/1773260200",
-      "retrieved_timestamp": "1773260200",
-      "source_metadata": {
-        "source_name": "Mercor ACE Leaderboard",
-        "source_type": "evaluation_run",
-        "source_organization_name": "Mercor",
-        "source_organization_url": "https://www.mercor.com",
-        "evaluator_relationship": "first_party"
-      },
-      "eval_library": {
-        "name": "archipelago",
-        "version": "1.0.0"
-      },
-      "benchmark": "ace",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Overall Score",
+          "evaluation_name": "Management Consulting Pass@1",
           "source_data": {
-            "dataset_name": "ace",
+            "dataset_name": "apex-agents",
             "source_type": "hf_dataset",
-            "hf_repo": "Mercor/ACE"
+            "hf_repo": "mercor/apex-agents"
           },
           "metric_config": {
-            "evaluation_description": "Overall ACE score across all consumer-task domains.",
+            "evaluation_description": "Management consulting world Pass@1.",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0,
             "max_score": 1
           },
           "score_details": {
-            "score": 0.515,
-            "uncertainty": {
-              "confidence_interval": {
-                "lower": -0.032,
-                "upper": 0.032,
-                "method": "bootstrap"
-              }
-            }
+            "score": 0.227
           },
           "generation_config": {
             "additional_details": {
@@ -252,21 +252,21 @@
           }
         },
         {
-          "evaluation_name": "Food Score",
+          "evaluation_name": "Corporate Law Pass@1",
           "source_data": {
-            "dataset_name": "ace",
+            "dataset_name": "apex-agents",
             "source_type": "hf_dataset",
-            "hf_repo": "Mercor/ACE"
+            "hf_repo": "mercor/apex-agents"
           },
           "metric_config": {
-            "evaluation_description": "Food domain score.",
+            "evaluation_description": "Corporate law world Pass@1.",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0,
             "max_score": 1
           },
           "score_details": {
-            "score": 0.65
+            "score": 0.189
           },
           "generation_config": {
             "additional_details": {
@@ -275,21 +275,21 @@
           }
         },
         {
-          "evaluation_name": "Gaming Score",
+          "evaluation_name": "Corporate Lawyer Mean Score",
           "source_data": {
-            "dataset_name": "ace",
+            "dataset_name": "apex-agents",
             "source_type": "hf_dataset",
-            "hf_repo": "Mercor/ACE"
+            "hf_repo": "mercor/apex-agents"
           },
           "metric_config": {
-            "evaluation_description": "Gaming domain score.",
+            "evaluation_description": "Corporate lawyer world mean score.",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0,
             "max_score": 1
           },
           "score_details": {
-            "score": 0.578
+            "score": 0.443
           },
           "generation_config": {
             "additional_details": {
diff --git a/data/models/openassistant_oasst-rm-2-pythia-6.9b-epoch-1.json b/data/models/openassistant_oasst-rm-2-pythia-6.9b-epoch-1.json
index cdfb6729d8ee2c00438e79bd148b5b9825fbce54..34afb9663c74112d98337da6fd83b3594c465f50 100644
--- a/data/models/openassistant_oasst-rm-2-pythia-6.9b-epoch-1.json
+++ b/data/models/openassistant_oasst-rm-2-pythia-6.9b-epoch-1.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench-2/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1/1766412838.146816",
+      "evaluation_id": "reward-bench/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,127 +31,109 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.2653
-          },
-          "source_data": {
-            "dataset_name": "RewardBench 2",
-            "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
-          }
-        },
-        {
-          "evaluation_name": "Factuality",
-          "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3979
+            "score": 0.615
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2875
+            "score": 0.9246
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.377
+            "score": 0.3728
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3289
+            "score": 0.5446
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1535
+            "score": 0.5855
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Prior Sets (0.5 weight)",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.047
+            "score": 0.6801
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
@@ -159,10 +141,10 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "reward-bench/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1/1766412838.146816",
+      "evaluation_id": "reward-bench-2/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -181,109 +163,127 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.615
+            "score": 0.2653
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9246
+            "score": 0.3979
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3728
+            "score": 0.2875
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
+          }
+        },
+        {
+          "evaluation_name": "Math",
+          "metric_config": {
+            "evaluation_description": "Math score - measures mathematical reasoning",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.377
+          },
+          "source_data": {
+            "dataset_name": "RewardBench 2",
+            "source_type": "hf_dataset",
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5446
+            "score": 0.3289
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5855
+            "score": 0.1535
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Prior Sets (0.5 weight)",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6801
+            "score": 0.047
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
diff --git a/data/models/openassistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5.json b/data/models/openassistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5.json
index d028cad4784e7392a6a614670c5c27a9b8900f31..4f200b5bed952e0790f7951e7076e297510a1c46 100644
--- a/data/models/openassistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5.json
+++ b/data/models/openassistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5/1766412838.146816",
+      "evaluation_id": "reward-bench-2/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,109 +31,127 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6901
+            "score": 0.2648
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8855
+            "score": 0.3179
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4868
+            "score": 0.2625
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
+          }
+        },
+        {
+          "evaluation_name": "Math",
+          "metric_config": {
+            "evaluation_description": "Math score - measures mathematical reasoning",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.3934
+          },
+          "source_data": {
+            "dataset_name": "RewardBench 2",
+            "source_type": "hf_dataset",
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6311
+            "score": 0.3244
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7752
+            "score": 0.2707
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Prior Sets (0.5 weight)",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6533
+            "score": 0.0198
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
@@ -141,10 +159,10 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "reward-bench-2/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5/1766412838.146816",
+      "evaluation_id": "reward-bench/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -163,127 +181,109 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.2648
-          },
-          "source_data": {
-            "dataset_name": "RewardBench 2",
-            "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
-          }
-        },
-        {
-          "evaluation_name": "Factuality",
-          "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3179
+            "score": 0.6901
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2625
+            "score": 0.8855
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3934
+            "score": 0.4868
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3244
+            "score": 0.6311
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2707
+            "score": 0.7752
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Prior Sets (0.5 weight)",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0198
+            "score": 0.6533
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
diff --git a/data/models/openbmb_eurus-rm-7b.json b/data/models/openbmb_eurus-rm-7b.json
index 44637ca17276f082725f61866dcfa3229cc54274..e1154a660c89f219cec7ec844602d9d8fdfa08ad 100644
--- a/data/models/openbmb_eurus-rm-7b.json
+++ b/data/models/openbmb_eurus-rm-7b.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench/openbmb_Eurus-RM-7b/1766412838.146816",
+      "evaluation_id": "reward-bench-2/openbmb_Eurus-RM-7b/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,109 +31,127 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8159
+            "score": 0.5806
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9804
+            "score": 0.6
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6557
+            "score": 0.3438
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
+          }
+        },
+        {
+          "evaluation_name": "Math",
+          "metric_config": {
+            "evaluation_description": "Math score - measures mathematical reasoning",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.5683
+          },
+          "source_data": {
+            "dataset_name": "RewardBench 2",
+            "source_type": "hf_dataset",
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8135
+            "score": 0.6267
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8633
+            "score": 0.7475
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Prior Sets (0.5 weight)",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7172
+            "score": 0.5972
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
@@ -141,10 +159,10 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "reward-bench-2/openbmb_Eurus-RM-7b/1766412838.146816",
+      "evaluation_id": "reward-bench/openbmb_Eurus-RM-7b/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -163,127 +181,109 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.5806
-          },
-          "source_data": {
-            "dataset_name": "RewardBench 2",
-            "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
-          }
-        },
-        {
-          "evaluation_name": "Factuality",
-          "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6
+            "score": 0.8159
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3438
+            "score": 0.9804
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5683
+            "score": 0.6557
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6267
+            "score": 0.8135
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7475
+            "score": 0.8633
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Prior Sets (0.5 weight)",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5972
+            "score": 0.7172
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
diff --git a/data/models/pku-alignment_beaver-7b-v1.0-cost.json b/data/models/pku-alignment_beaver-7b-v1.0-cost.json
index 8e786484059e3101c1249c1cce8c4b2be81faa5a..3777eba3edfdc470c669a503ac85994bf8139135 100644
--- a/data/models/pku-alignment_beaver-7b-v1.0-cost.json
+++ b/data/models/pku-alignment_beaver-7b-v1.0-cost.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v1.0-cost/1766412838.146816",
+      "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v1.0-cost/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,109 +31,127 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5798
+            "score": 0.3332
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6173
+            "score": 0.3263
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4232
+            "score": 0.2313
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
+          }
+        },
+        {
+          "evaluation_name": "Math",
+          "metric_config": {
+            "evaluation_description": "Math score - measures mathematical reasoning",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.3989
+          },
+          "source_data": {
+            "dataset_name": "RewardBench 2",
+            "source_type": "hf_dataset",
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7351
+            "score": 0.7589
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5482
+            "score": 0.2939
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Prior Sets (0.5 weight)",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.57
+            "score": -0.01
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
@@ -141,10 +159,10 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v1.0-cost/1766412838.146816",
+      "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v1.0-cost/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -163,127 +181,109 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.3332
-          },
-          "source_data": {
-            "dataset_name": "RewardBench 2",
-            "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
-          }
-        },
-        {
-          "evaluation_name": "Factuality",
-          "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3263
+            "score": 0.5798
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2313
+            "score": 0.6173
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3989
+            "score": 0.4232
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7589
+            "score": 0.7351
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2939
+            "score": 0.5482
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Prior Sets (0.5 weight)",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": -0.01
+            "score": 0.57
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
diff --git a/data/models/pku-alignment_beaver-7b-v1.0-reward.json b/data/models/pku-alignment_beaver-7b-v1.0-reward.json
index adf890d6a20deeb5311651870c4e5638866b9733..ee66fd95461643d78856676dc7b45eb953fc109f 100644
--- a/data/models/pku-alignment_beaver-7b-v1.0-reward.json
+++ b/data/models/pku-alignment_beaver-7b-v1.0-reward.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v1.0-reward/1766412838.146816",
+      "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v1.0-reward/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,127 +31,109 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.1606
-          },
-          "source_data": {
-            "dataset_name": "RewardBench 2",
-            "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
-          }
-        },
-        {
-          "evaluation_name": "Factuality",
-          "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2105
+            "score": 0.4727
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2938
+            "score": 0.8184
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2623
+            "score": 0.2873
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1422
+            "score": 0.3757
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0646
+            "score": 0.346
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Prior Sets (0.5 weight)",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": -0.01
+            "score": 0.5993
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
@@ -159,10 +141,10 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v1.0-reward/1766412838.146816",
+      "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v1.0-reward/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -181,109 +163,127 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4727
+            "score": 0.1606
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8184
+            "score": 0.2105
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2873
+            "score": 0.2938
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
+          }
+        },
+        {
+          "evaluation_name": "Math",
+          "metric_config": {
+            "evaluation_description": "Math score - measures mathematical reasoning",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.2623
+          },
+          "source_data": {
+            "dataset_name": "RewardBench 2",
+            "source_type": "hf_dataset",
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3757
+            "score": 0.1422
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.346
+            "score": 0.0646
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Prior Sets (0.5 weight)",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5993
+            "score": -0.01
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
diff --git a/data/models/pku-alignment_beaver-7b-v2.0-reward.json b/data/models/pku-alignment_beaver-7b-v2.0-reward.json
index e6954fa8026e0ae406f243f530096227275cb53f..4c7c399de00819c04f2b08eddfc902f45548a34f 100644
--- a/data/models/pku-alignment_beaver-7b-v2.0-reward.json
+++ b/data/models/pku-alignment_beaver-7b-v2.0-reward.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816",
+      "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,127 +31,109 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.2544
-          },
-          "source_data": {
-            "dataset_name": "RewardBench 2",
-            "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
-          }
-        },
-        {
-          "evaluation_name": "Factuality",
-          "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2168
+            "score": 0.6366
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2562
+            "score": 0.8994
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3825
+            "score": 0.364
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3156
+            "score": 0.6041
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2606
+            "score": 0.6887
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Prior Sets (0.5 weight)",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0944
+            "score": 0.6171
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
@@ -159,10 +141,10 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816",
+      "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -181,109 +163,127 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6366
+            "score": 0.2544
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8994
+            "score": 0.2168
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.364
+            "score": 0.2562
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
+          }
+        },
+        {
+          "evaluation_name": "Math",
+          "metric_config": {
+            "evaluation_description": "Math score - measures mathematical reasoning",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.3825
+          },
+          "source_data": {
+            "dataset_name": "RewardBench 2",
+            "source_type": "hf_dataset",
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6041
+            "score": 0.3156
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6887
+            "score": 0.2606
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Prior Sets (0.5 weight)",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6171
+            "score": 0.0944
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
diff --git a/data/models/primeintellect_intellect-1.json b/data/models/primeintellect_intellect-1.json
index 7d9ec915394b34f60309abc9ed073a5aa8bce5ab..b8fa0710a43fde559062289de677b131f6527599 100644
--- a/data/models/primeintellect_intellect-1.json
+++ b/data/models/primeintellect_intellect-1.json
@@ -5,7 +5,7 @@
     "developer": "PrimeIntellect",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "LlamaForCausalLM",
       "params_billions": "10.211"
     }
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.274
+            "score": 0.276
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.25
+            "score": 0.2534
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3753
+            "score": 0.3339
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.112
+            "score": 0.1123
           }
         }
       ],
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.276
+            "score": 0.274
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2534
+            "score": 0.25
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3339
+            "score": 0.3753
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1123
+            "score": 0.112
           }
         }
       ],
diff --git a/data/models/princeton-nlp_llama-3-8b-prolong-512k-instruct.json b/data/models/princeton-nlp_llama-3-8b-prolong-512k-instruct.json
index a2239eea28a466521ab226132351f6bc4c958619..b88fd125d78454888c7b70c3de2f72cbe2fd7732 100644
--- a/data/models/princeton-nlp_llama-3-8b-prolong-512k-instruct.json
+++ b/data/models/princeton-nlp_llama-3-8b-prolong-512k-instruct.json
@@ -5,7 +5,7 @@
     "developer": "princeton-nlp",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "LlamaForCausalLM",
       "params_billions": "8.03"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3978
+            "score": 0.5508
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4983
+            "score": 0.5028
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0582
+            "score": 0.0529
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.281
+            "score": 0.2861
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.425
+            "score": 0.4266
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3246
+            "score": 0.3231
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5508
+            "score": 0.3978
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5028
+            "score": 0.4983
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0529
+            "score": 0.0582
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2861
+            "score": 0.281
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4266
+            "score": 0.425
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3231
+            "score": 0.3246
           }
         }
       ],
diff --git a/data/models/quazim0t0_odb-14b-sce.json b/data/models/quazim0t0_odb-14b-sce.json
index 0894dd13c3bc882b56c1b90b77e354b0b8fcf1b3..b2854fae2e7ea9ad5a32eb4ec453ef9c1365f1b7 100644
--- a/data/models/quazim0t0_odb-14b-sce.json
+++ b/data/models/quazim0t0_odb-14b-sce.json
@@ -6,8 +6,8 @@
     "inference_platform": "unknown",
     "additional_details": {
       "precision": "bfloat16",
-      "architecture": "Unknown",
-      "params_billions": "0.0",
+      "architecture": "LlamaForCausalLM",
+      "params_billions": "14.66",
       "model_id_aliases": [
         "Quazim0t0/ODB-14b-sce"
       ]
@@ -15,7 +15,7 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "hfopenllm_v2/Quazim0t0_ODB-14B-sce/1773936498.240187",
+      "evaluation_id": "hfopenllm_v2/Quazim0t0_ODB-14b-sce/1773936498.240187",
       "retrieved_timestamp": "1773936498.240187",
       "source_metadata": {
         "source_name": "HF Open LLM v2",
@@ -47,7 +47,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2922
+            "score": 0.7016
           }
         },
         {
@@ -65,7 +65,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6559
+            "score": 0.6942
           }
         },
         {
@@ -83,7 +83,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2545
+            "score": 0.4116
           }
         },
         {
@@ -101,7 +101,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2659
+            "score": 0.3624
           }
         },
         {
@@ -119,7 +119,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3929
+            "score": 0.4571
           }
         },
         {
@@ -137,7 +137,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5207
+            "score": 0.5411
           }
         }
       ],
@@ -145,7 +145,7 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "hfopenllm_v2/Quazim0t0_ODB-14b-sce/1773936498.240187",
+      "evaluation_id": "hfopenllm_v2/Quazim0t0_ODB-14B-sce/1773936498.240187",
       "retrieved_timestamp": "1773936498.240187",
       "source_metadata": {
         "source_name": "HF Open LLM v2",
@@ -177,7 +177,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7016
+            "score": 0.2922
           }
         },
         {
@@ -195,7 +195,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6942
+            "score": 0.6559
           }
         },
         {
@@ -213,7 +213,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4116
+            "score": 0.2545
           }
         },
         {
@@ -231,7 +231,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3624
+            "score": 0.2659
           }
         },
         {
@@ -249,7 +249,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4571
+            "score": 0.3929
           }
         },
         {
@@ -267,7 +267,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5411
+            "score": 0.5207
           }
         }
       ],
diff --git a/data/models/qwen_qwen2.5-0.5b-instruct.json b/data/models/qwen_qwen2.5-0.5b-instruct.json
index c0e268fa987678ac81eb89b230a531c20916f2db..09847e1fd5f51b0203501d1e930a33e69907de86 100644
--- a/data/models/qwen_qwen2.5-0.5b-instruct.json
+++ b/data/models/qwen_qwen2.5-0.5b-instruct.json
@@ -5,9 +5,9 @@
     "developer": "Qwen",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "bfloat16",
+      "precision": "float16",
       "architecture": "Qwen2ForCausalLM",
-      "params_billions": "0.5"
+      "params_billions": "0.494"
     }
   },
   "evaluations": [
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3071
+            "score": 0.3153
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3341
+            "score": 0.3322
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0
+            "score": 0.1035
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2576
+            "score": 0.2592
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3329
+            "score": 0.3342
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1697
+            "score": 0.172
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3153
+            "score": 0.3071
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3322
+            "score": 0.3341
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1035
+            "score": 0.0
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2592
+            "score": 0.2576
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3342
+            "score": 0.3329
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.172
+            "score": 0.1697
           }
         }
       ],
diff --git a/data/models/qwen_qwen2.5-3b-instruct.json b/data/models/qwen_qwen2.5-3b-instruct.json
index f61e692a8920e6654870904e7032a5b9b962252b..a7b912165d684c557d1da01d4fe87e78c2addb23 100644
--- a/data/models/qwen_qwen2.5-3b-instruct.json
+++ b/data/models/qwen_qwen2.5-3b-instruct.json
@@ -140,6 +140,195 @@
       ],
       "detailed_evaluation_results": null,
       "generation_config": null
+    },
+    {
+      "evaluation_id": "theory_of_mind/hf_Qwen_Qwen2.5-3B-Instruct/1772541652.0",
+      "retrieved_timestamp": "1774793718.284365",
+      "source_metadata": {
+        "source_name": "inspect_ai",
+        "source_type": "evaluation_run",
+        "source_organization_name": "unknown",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "inspect",
+        "version": "inspect_ai:0.3.185"
+      },
+      "benchmark": "theory_of_mind",
+      "evaluation_results": [
+        {
+          "evaluation_name": "accuracy on theory_of_mind for scorer model_graded_fact",
+          "source_data": {
+            "dataset_name": "theory_of_mind",
+            "source_type": "hf_dataset",
+            "hf_repo": "example://theory_of_mind",
+            "samples_number": 100,
+            "sample_ids": [
+              "1",
+              "2",
+              "3",
+              "4",
+              "5",
+              "6",
+              "7",
+              "8",
+              "9",
+              "10",
+              "11",
+              "12",
+              "13",
+              "14",
+              "15",
+              "16",
+              "17",
+              "18",
+              "19",
+              "20",
+              "21",
+              "22",
+              "23",
+              "24",
+              "25",
+              "26",
+              "27",
+              "28",
+              "29",
+              "30",
+              "31",
+              "32",
+              "33",
+              "34",
+              "35",
+              "36",
+              "37",
+              "38",
+              "39",
+              "40",
+              "41",
+              "42",
+              "43",
+              "44",
+              "45",
+              "46",
+              "47",
+              "48",
+              "49",
+              "50",
+              "51",
+              "52",
+              "53",
+              "54",
+              "55",
+              "56",
+              "57",
+              "58",
+              "59",
+              "60",
+              "61",
+              "62",
+              "63",
+              "64",
+              "65",
+              "66",
+              "67",
+              "68",
+              "69",
+              "70",
+              "71",
+              "72",
+              "73",
+              "74",
+              "75",
+              "76",
+              "77",
+              "78",
+              "79",
+              "80",
+              "81",
+              "82",
+              "83",
+              "84",
+              "85",
+              "86",
+              "87",
+              "88",
+              "89",
+              "90",
+              "91",
+              "92",
+              "93",
+              "94",
+              "95",
+              "96",
+              "97",
+              "98",
+              "99",
+              "100"
+            ],
+            "additional_details": {
+              "shuffled": "False"
+            }
+          },
+          "evaluation_timestamp": "1772541652.0",
+          "metric_config": {
+            "evaluation_description": "accuracy",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.78,
+            "uncertainty": {
+              "standard_error": {
+                "value": 0.04163331998932266
+              },
+              "num_samples": 100
+            }
+          },
+          "generation_config": {
+            "generation_args": {
+              "reasoning": false,
+              "agentic_eval_config": {
+                "available_tools": []
+              },
+              "eval_plan": {
+                "name": "plan",
+                "steps": [
+                  "{\"solver\": \"generate\", \"params\": {\"tool_calls\": \"loop\", \"kwargs\": {}}, \"params_passed\": {}}"
+                ],
+                "config": {}
+              },
+              "eval_limits": {},
+              "sandbox": {}
+            }
+          }
+        }
+      ],
+      "detailed_evaluation_results": {
+        "format": "jsonl",
+        "file_path": "data/theory_of_mind/Qwen/Qwen2.5-3B-Instruct/30ed1a75-5bfd-4405-abce-b0fd5e0165ba_samples.jsonl",
+        "hash_algorithm": "sha256",
+        "checksum": "22c5bd6a8da6c54dfb409425283b1e136f76a225daa48c28b963f3be1f13d697",
+        "total_rows": 100
+      },
+      "generation_config": {
+        "generation_args": {
+          "reasoning": false,
+          "agentic_eval_config": {
+            "available_tools": []
+          },
+          "eval_plan": {
+            "name": "plan",
+            "steps": [
+              "{\"solver\": \"generate\", \"params\": {\"tool_calls\": \"loop\", \"kwargs\": {}}, \"params_passed\": {}}"
+            ],
+            "config": {}
+          },
+          "eval_limits": {},
+          "sandbox": {}
+        }
+      }
     }
   ]
 }
\ No newline at end of file
diff --git a/data/models/qwen_qwen2.5-coder-7b-instruct.json b/data/models/qwen_qwen2.5-coder-7b-instruct.json
index 96b0a8af52fb442affa2ad7ba1ed017d80ec2231..0cdcd4044d4f077bb1870a78d07a83c3bbc7d0d0 100644
--- a/data/models/qwen_qwen2.5-coder-7b-instruct.json
+++ b/data/models/qwen_qwen2.5-coder-7b-instruct.json
@@ -5,7 +5,7 @@
     "developer": "Qwen",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "bfloat16",
+      "precision": "float16",
       "architecture": "Qwen2ForCausalLM",
       "params_billions": "7.616"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6101
+            "score": 0.6147
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5008
+            "score": 0.4999
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3716
+            "score": 0.031
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2919
+            "score": 0.2936
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4073
+            "score": 0.4099
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3352
+            "score": 0.3354
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6147
+            "score": 0.6101
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4999
+            "score": 0.5008
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.031
+            "score": 0.3716
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2936
+            "score": 0.2919
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4099
+            "score": 0.4073
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3354
+            "score": 0.3352
           }
         }
       ],
diff --git a/data/models/ray2333_grm-gemma2-2b-rewardmodel-ft.json b/data/models/ray2333_grm-gemma2-2b-rewardmodel-ft.json
index 24dd55ac376e51965493e81885df295559db7184..cd8835eaf989dbd622c51262fef9b72b10534d67 100644
--- a/data/models/ray2333_grm-gemma2-2b-rewardmodel-ft.json
+++ b/data/models/ray2333_grm-gemma2-2b-rewardmodel-ft.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench-2/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816",
+      "evaluation_id": "reward-bench/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,104 +31,128 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5966
+            "score": 0.8839
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Factuality",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5305
+            "score": 0.9302
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3125
+            "score": 0.7719
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5902
+            "score": 0.9216
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9222
+            "score": 0.912
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": null
+    },
+    {
+      "evaluation_id": "reward-bench-2/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816",
+      "retrieved_timestamp": "1766412838.146816",
+      "source_metadata": {
+        "source_name": "RewardBench 2",
+        "source_type": "documentation",
+        "source_organization_name": "Allen Institute for AI",
+        "source_organization_url": "https://allenai.org",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "rewardbench",
+        "version": "0.1.3",
+        "additional_details": {
+          "subsets": "Chat, Chat Hard, Safety, Reasoning",
+          "hf_space": "allenai/reward-bench"
+        }
+      },
+      "benchmark": "reward-bench",
+      "evaluation_results": [
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7455
+            "score": 0.5966
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
@@ -137,135 +161,111 @@
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4788
+            "score": 0.5305
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
             "hf_repo": "allenai/reward-bench-2-results"
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": null
-    },
-    {
-      "evaluation_id": "reward-bench/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816",
-      "retrieved_timestamp": "1766412838.146816",
-      "source_metadata": {
-        "source_name": "RewardBench",
-        "source_type": "documentation",
-        "source_organization_name": "Allen Institute for AI",
-        "source_organization_url": "https://allenai.org",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "rewardbench",
-        "version": "0.1.3",
-        "additional_details": {
-          "subsets": "Chat, Chat Hard, Safety, Reasoning",
-          "hf_space": "allenai/reward-bench"
-        }
-      },
-      "benchmark": "reward-bench",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Score",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8839
+            "score": 0.3125
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Math",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Math score - measures mathematical reasoning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9302
+            "score": 0.5902
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7719
+            "score": 0.9222
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9216
+            "score": 0.7455
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.912
+            "score": 0.4788
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
diff --git a/data/models/ray2333_grm-llama3-8b-sftreg.json b/data/models/ray2333_grm-llama3-8b-sftreg.json
index bd70639489991e3e8880cad041284119bd35ecc9..15f35c842dfd5169bfd85bae53f76a90d5850421 100644
--- a/data/models/ray2333_grm-llama3-8b-sftreg.json
+++ b/data/models/ray2333_grm-llama3-8b-sftreg.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816",
+      "evaluation_id": "reward-bench-2/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,109 +31,127 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8542
+            "score": 0.6089
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.986
+            "score": 0.6189
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6776
+            "score": 0.3875
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
+          }
+        },
+        {
+          "evaluation_name": "Math",
+          "metric_config": {
+            "evaluation_description": "Math score - measures mathematical reasoning",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.5792
+          },
+          "source_data": {
+            "dataset_name": "RewardBench 2",
+            "source_type": "hf_dataset",
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8919
+            "score": 0.7867
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9229
+            "score": 0.6828
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Prior Sets (0.5 weight)",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7309
+            "score": 0.5981
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
@@ -141,10 +159,10 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "reward-bench-2/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816",
+      "evaluation_id": "reward-bench/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -163,127 +181,109 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.6089
-          },
-          "source_data": {
-            "dataset_name": "RewardBench 2",
-            "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
-          }
-        },
-        {
-          "evaluation_name": "Factuality",
-          "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6189
+            "score": 0.8542
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3875
+            "score": 0.986
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5792
+            "score": 0.6776
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7867
+            "score": 0.8919
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6828
+            "score": 0.9229
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Prior Sets (0.5 weight)",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5981
+            "score": 0.7309
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
diff --git a/data/models/recoilme_gemma-2-ataraxy-gemmasutra-9b-slerp.json b/data/models/recoilme_gemma-2-ataraxy-gemmasutra-9b-slerp.json
index 07191bcaa576a8b8b67650d88a341b91a6316705..f26fe1b9038e8edc381d5411187a5c6b2c7cbf6e 100644
--- a/data/models/recoilme_gemma-2-ataraxy-gemmasutra-9b-slerp.json
+++ b/data/models/recoilme_gemma-2-ataraxy-gemmasutra-9b-slerp.json
@@ -5,7 +5,7 @@
     "developer": "recoilme",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "bfloat16",
+      "precision": "float16",
       "architecture": "Gemma2ForCausalLM",
       "params_billions": "10.159"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2854
+            "score": 0.7649
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5984
+            "score": 0.5974
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1005
+            "score": 0.0174
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3297
+            "score": 0.3305
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4607
+            "score": 0.4245
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4162
+            "score": 0.4207
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7649
+            "score": 0.2854
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5974
+            "score": 0.5984
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0174
+            "score": 0.1005
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3305
+            "score": 0.3297
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4245
+            "score": 0.4607
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4207
+            "score": 0.4162
           }
         }
       ],
diff --git a/data/models/recoilme_recoilme-gemma-2-9b-v0.2.json b/data/models/recoilme_recoilme-gemma-2-9b-v0.2.json
index 83bfdb05cda442c3bac351d424fa26cf0ae864fe..34121094195afd5811c50d769ba5e5e07669b30d 100644
--- a/data/models/recoilme_recoilme-gemma-2-9b-v0.2.json
+++ b/data/models/recoilme_recoilme-gemma-2-9b-v0.2.json
@@ -5,7 +5,7 @@
     "developer": "recoilme",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "Gemma2ForCausalLM",
       "params_billions": "10.159"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7592
+            "score": 0.2747
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6026
+            "score": 0.6031
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0529
+            "score": 0.0831
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3289
+            "score": 0.3305
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4099
+            "score": 0.4686
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4163
+            "score": 0.4122
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2747
+            "score": 0.7592
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6031
+            "score": 0.6026
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0831
+            "score": 0.0529
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3305
+            "score": 0.3289
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4686
+            "score": 0.4099
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4122
+            "score": 0.4163
           }
         }
       ],
diff --git a/data/models/recoilme_recoilme-gemma-2-9b-v0.3.json b/data/models/recoilme_recoilme-gemma-2-9b-v0.3.json
index d5cadb462419994ffdabe8b6cdbd73be57e99fa8..812871dcfb75c54e3765d2c8d03eeadc60d898aa 100644
--- a/data/models/recoilme_recoilme-gemma-2-9b-v0.3.json
+++ b/data/models/recoilme_recoilme-gemma-2-9b-v0.3.json
@@ -5,7 +5,7 @@
     "developer": "recoilme",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "bfloat16",
+      "precision": "float16",
       "architecture": "Gemma2ForCausalLM",
       "params_billions": "10.159"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5761
+            "score": 0.7439
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.602
+            "score": 0.5993
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1888
+            "score": 0.0876
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3372
+            "score": 0.3238
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4632
+            "score": 0.4204
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4039
+            "score": 0.4072
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7439
+            "score": 0.5761
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5993
+            "score": 0.602
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0876
+            "score": 0.1888
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3238
+            "score": 0.3372
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4204
+            "score": 0.4632
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4072
+            "score": 0.4039
           }
         }
       ],
diff --git a/data/models/replete-ai_replete-llm-qwen2-7b.json b/data/models/replete-ai_replete-llm-qwen2-7b.json
index 627d67572ee0cd0135e173767115d5ee7360b5e6..e51b10d1bb4ff7d76603042a5cdf25b16b9bc87f 100644
--- a/data/models/replete-ai_replete-llm-qwen2-7b.json
+++ b/data/models/replete-ai_replete-llm-qwen2-7b.json
@@ -5,7 +5,7 @@
     "developer": "Replete-AI",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "Qwen2ForCausalLM",
       "params_billions": "7.616"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0905
+            "score": 0.0932
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2985
+            "score": 0.2977
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2534
+            "score": 0.2475
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3848
+            "score": 0.3941
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1158
+            "score": 0.1157
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0932
+            "score": 0.0905
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2977
+            "score": 0.2985
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2475
+            "score": 0.2534
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3941
+            "score": 0.3848
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1157
+            "score": 0.1158
           }
         }
       ],
diff --git a/data/models/sfairxc_fsfairx-llama3-rm-v0.1.json b/data/models/sfairxc_fsfairx-llama3-rm-v0.1.json
index 83ff30916d3f35c6333d5f652927c45d63836756..ecef2ed14b1e2210796bfe9e4943404f7ebd3b7c 100644
--- a/data/models/sfairxc_fsfairx-llama3-rm-v0.1.json
+++ b/data/models/sfairxc_fsfairx-llama3-rm-v0.1.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816",
+      "evaluation_id": "reward-bench-2/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,109 +31,127 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8338
+            "score": 0.6292
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9944
+            "score": 0.5916
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6513
+            "score": 0.4188
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
+          }
+        },
+        {
+          "evaluation_name": "Math",
+          "metric_config": {
+            "evaluation_description": "Math score - measures mathematical reasoning",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.6284
+          },
+          "source_data": {
+            "dataset_name": "RewardBench 2",
+            "source_type": "hf_dataset",
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8676
+            "score": 0.7667
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8644
+            "score": 0.7051
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Prior Sets (0.5 weight)",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7492
+            "score": 0.6647
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
@@ -141,10 +159,10 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "reward-bench-2/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816",
+      "evaluation_id": "reward-bench/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -163,127 +181,109 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.6292
-          },
-          "source_data": {
-            "dataset_name": "RewardBench 2",
-            "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
-          }
-        },
-        {
-          "evaluation_name": "Factuality",
-          "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5916
+            "score": 0.8338
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4188
+            "score": 0.9944
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6284
+            "score": 0.6513
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7667
+            "score": 0.8676
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7051
+            "score": 0.8644
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Prior Sets (0.5 weight)",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6647
+            "score": 0.7492
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
diff --git a/data/models/skywork_skywork-reward-gemma-2-27b-v0.2.json b/data/models/skywork_skywork-reward-gemma-2-27b-v0.2.json
index 52952d2b07ce3bbc9c007749585e45b758ae4213..4109d07feac30d842a656539e17ae764cf441c79 100644
--- a/data/models/skywork_skywork-reward-gemma-2-27b-v0.2.json
+++ b/data/models/skywork_skywork-reward-gemma-2-27b-v0.2.json
@@ -142,10 +142,10 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816",
+      "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -164,128 +164,104 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9426
+            "score": 0.7531
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9609
+            "score": 0.7674
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8991
+            "score": 0.375
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Math",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Math score - measures mathematical reasoning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9297
+            "score": 0.6721
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9807
+            "score": 0.9689
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": null
-    },
-    {
-      "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816",
-      "retrieved_timestamp": "1766412838.146816",
-      "source_metadata": {
-        "source_name": "RewardBench 2",
-        "source_type": "documentation",
-        "source_organization_name": "Allen Institute for AI",
-        "source_organization_url": "https://allenai.org",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "rewardbench",
-        "version": "0.1.3",
-        "additional_details": {
-          "subsets": "Chat, Chat Hard, Safety, Reasoning",
-          "hf_space": "allenai/reward-bench"
-        }
-      },
-      "benchmark": "reward-bench",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Score",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7531
+            "score": 0.9172
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
@@ -294,111 +270,135 @@
           }
         },
         {
-          "evaluation_name": "Factuality",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7674
+            "score": 0.8182
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
             "hf_repo": "allenai/reward-bench-2-results"
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": null
+    },
+    {
+      "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816",
+      "retrieved_timestamp": "1766412838.146816",
+      "source_metadata": {
+        "source_name": "RewardBench",
+        "source_type": "documentation",
+        "source_organization_name": "Allen Institute for AI",
+        "source_organization_url": "https://allenai.org",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "rewardbench",
+        "version": "0.1.3",
+        "additional_details": {
+          "subsets": "Chat, Chat Hard, Safety, Reasoning",
+          "hf_space": "allenai/reward-bench"
+        }
+      },
+      "benchmark": "reward-bench",
+      "evaluation_results": [
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.375
+            "score": 0.9426
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6721
+            "score": 0.9609
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9689
+            "score": 0.8991
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9172
+            "score": 0.9297
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8182
+            "score": 0.9807
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
diff --git a/data/models/skywork_skywork-reward-gemma-2-27b.json b/data/models/skywork_skywork-reward-gemma-2-27b.json
index c2b742dbc17aa9720f66d090e08eb784ec7accdc..0b4cc11a7f17a4535cc37f935fdae034a6214bce 100644
--- a/data/models/skywork_skywork-reward-gemma-2-27b.json
+++ b/data/models/skywork_skywork-reward-gemma-2-27b.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Gemma-2-27B/1766412838.146816",
+      "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Gemma-2-27B/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,104 +31,128 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7576
+            "score": 0.938
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Factuality",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7368
+            "score": 0.9581
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4031
+            "score": 0.9145
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7049
+            "score": 0.9189
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9422
+            "score": 0.9606
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": null
+    },
+    {
+      "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Gemma-2-27B/1766412838.146816",
+      "retrieved_timestamp": "1766412838.146816",
+      "source_metadata": {
+        "source_name": "RewardBench 2",
+        "source_type": "documentation",
+        "source_organization_name": "Allen Institute for AI",
+        "source_organization_url": "https://allenai.org",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "rewardbench",
+        "version": "0.1.3",
+        "additional_details": {
+          "subsets": "Chat, Chat Hard, Safety, Reasoning",
+          "hf_space": "allenai/reward-bench"
+        }
+      },
+      "benchmark": "reward-bench",
+      "evaluation_results": [
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9323
+            "score": 0.7576
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
@@ -137,135 +161,111 @@
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8261
+            "score": 0.7368
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
             "hf_repo": "allenai/reward-bench-2-results"
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": null
-    },
-    {
-      "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Gemma-2-27B/1766412838.146816",
-      "retrieved_timestamp": "1766412838.146816",
-      "source_metadata": {
-        "source_name": "RewardBench",
-        "source_type": "documentation",
-        "source_organization_name": "Allen Institute for AI",
-        "source_organization_url": "https://allenai.org",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "rewardbench",
-        "version": "0.1.3",
-        "additional_details": {
-          "subsets": "Chat, Chat Hard, Safety, Reasoning",
-          "hf_space": "allenai/reward-bench"
-        }
-      },
-      "benchmark": "reward-bench",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Score",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.938
+            "score": 0.4031
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Math",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Math score - measures mathematical reasoning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9581
+            "score": 0.7049
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9145
+            "score": 0.9422
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9189
+            "score": 0.9323
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9606
+            "score": 0.8261
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
diff --git a/data/models/skywork_skywork-reward-llama-3.1-8b.json b/data/models/skywork_skywork-reward-llama-3.1-8b.json
index dedd0015bc30c7a59cc8db5e1fbdb3b6b6cbc978..d700e3f49377a01e2f9d11b4773879e002a65e98 100644
--- a/data/models/skywork_skywork-reward-llama-3.1-8b.json
+++ b/data/models/skywork_skywork-reward-llama-3.1-8b.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Llama-3.1-8B/1766412838.146816",
+      "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Llama-3.1-8B/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,128 +31,104 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9252
+            "score": 0.7314
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9581
+            "score": 0.6989
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8728
+            "score": 0.425
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Math",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Math score - measures mathematical reasoning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9081
+            "score": 0.6284
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.962
+            "score": 0.9333
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": null
-    },
-    {
-      "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Llama-3.1-8B/1766412838.146816",
-      "retrieved_timestamp": "1766412838.146816",
-      "source_metadata": {
-        "source_name": "RewardBench 2",
-        "source_type": "documentation",
-        "source_organization_name": "Allen Institute for AI",
-        "source_organization_url": "https://allenai.org",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "rewardbench",
-        "version": "0.1.3",
-        "additional_details": {
-          "subsets": "Chat, Chat Hard, Safety, Reasoning",
-          "hf_space": "allenai/reward-bench"
-        }
-      },
-      "benchmark": "reward-bench",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Score",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7314
+            "score": 0.9616
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
@@ -161,111 +137,135 @@
           }
         },
         {
-          "evaluation_name": "Factuality",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6989
+            "score": 0.741
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
             "hf_repo": "allenai/reward-bench-2-results"
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": null
+    },
+    {
+      "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Llama-3.1-8B/1766412838.146816",
+      "retrieved_timestamp": "1766412838.146816",
+      "source_metadata": {
+        "source_name": "RewardBench",
+        "source_type": "documentation",
+        "source_organization_name": "Allen Institute for AI",
+        "source_organization_url": "https://allenai.org",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "rewardbench",
+        "version": "0.1.3",
+        "additional_details": {
+          "subsets": "Chat, Chat Hard, Safety, Reasoning",
+          "hf_space": "allenai/reward-bench"
+        }
+      },
+      "benchmark": "reward-bench",
+      "evaluation_results": [
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.425
+            "score": 0.9252
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6284
+            "score": 0.9581
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9333
+            "score": 0.8728
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9616
+            "score": 0.9081
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.741
+            "score": 0.962
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
diff --git a/data/models/skywork_skywork-vl-reward-7b.json b/data/models/skywork_skywork-vl-reward-7b.json
index 651d1416fd84d9618565234fc2f23befa272cb51..d1caca7afd32adac0ce3eaccc5894c6b1d1db99d 100644
--- a/data/models/skywork_skywork-vl-reward-7b.json
+++ b/data/models/skywork_skywork-vl-reward-7b.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench-2/Skywork_Skywork-VL-Reward-7B/1766412838.146816",
+      "evaluation_id": "reward-bench/Skywork_Skywork-VL-Reward-7B/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,104 +31,128 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6885
+            "score": 0.9007
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Factuality",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6063
+            "score": 0.8994
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.35
+            "score": 0.875
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6339
+            "score": 0.9108
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8911
+            "score": 0.9176
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": null
+    },
+    {
+      "evaluation_id": "reward-bench-2/Skywork_Skywork-VL-Reward-7B/1766412838.146816",
+      "retrieved_timestamp": "1766412838.146816",
+      "source_metadata": {
+        "source_name": "RewardBench 2",
+        "source_type": "documentation",
+        "source_organization_name": "Allen Institute for AI",
+        "source_organization_url": "https://allenai.org",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "rewardbench",
+        "version": "0.1.3",
+        "additional_details": {
+          "subsets": "Chat, Chat Hard, Safety, Reasoning",
+          "hf_space": "allenai/reward-bench"
+        }
+      },
+      "benchmark": "reward-bench",
+      "evaluation_results": [
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8909
+            "score": 0.6885
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
@@ -137,135 +161,111 @@
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7586
+            "score": 0.6063
           },
           "source_data": {
             "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
             "hf_repo": "allenai/reward-bench-2-results"
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": null
-    },
-    {
-      "evaluation_id": "reward-bench/Skywork_Skywork-VL-Reward-7B/1766412838.146816",
-      "retrieved_timestamp": "1766412838.146816",
-      "source_metadata": {
-        "source_name": "RewardBench",
-        "source_type": "documentation",
-        "source_organization_name": "Allen Institute for AI",
-        "source_organization_url": "https://allenai.org",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "rewardbench",
-        "version": "0.1.3",
-        "additional_details": {
-          "subsets": "Chat, Chat Hard, Safety, Reasoning",
-          "hf_space": "allenai/reward-bench"
-        }
-      },
-      "benchmark": "reward-bench",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Score",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9007
+            "score": 0.35
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Math",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Math score - measures mathematical reasoning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8994
+            "score": 0.6339
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.875
+            "score": 0.8911
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Safety",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9108
+            "score": 0.8909
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9176
+            "score": 0.7586
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
diff --git a/data/models/snowflake_snowflake-arctic-instruct.json b/data/models/snowflake_snowflake-arctic-instruct.json
index 332fb7788a04f01cd7fc12206b12004a2af1d5d3..dcce9cb71bbe18fac2e40bbf0810fef5adcc4078 100644
--- a/data/models/snowflake_snowflake-arctic-instruct.json
+++ b/data/models/snowflake_snowflake-arctic-instruct.json
@@ -7,10 +7,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1774096312.00548",
-      "retrieved_timestamp": "1774096312.00548",
+      "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1774096306.427425",
+      "retrieved_timestamp": "1774096306.427425",
       "source_metadata": {
-        "source_name": "helm_mmlu",
+        "source_name": "helm_lite",
         "source_type": "documentation",
         "source_organization_name": "crfm",
         "evaluator_relationship": "third_party"
@@ -19,438 +19,382 @@
         "name": "helm",
         "version": "unknown"
       },
-      "benchmark": "helm_mmlu",
+      "benchmark": "helm_lite",
       "evaluation_results": [
         {
-          "evaluation_name": "MMLU All Subjects",
+          "evaluation_name": "Mean win rate",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "helm_lite",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MMLU All Subjects",
+            "evaluation_description": "How many models this model outperforms on average (over columns).",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.677,
+            "score": 0.338,
             "details": {
-              "description": "min=0.28, mean=0.677, max=0.912, sum=77.129 (114)",
+              "description": "",
               "tab": "Accuracy",
-              "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.35, mean=0.42, max=0.544, sum=47.89 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.4200856614493726\"}",
-              "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}",
-              "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=304.474, mean=706.682, max=3159.636, sum=80561.749 (114)\", \"tab\": \"General information\", \"score\": \"706.6820126388612\"}",
-              "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.7606242197253433\"}",
+              "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Abstract Algebra",
+          "evaluation_name": "NarrativeQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "NarrativeQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Abstract Algebra",
+            "evaluation_description": "F1 on NarrativeQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.35,
+            "score": 0.654,
             "details": {
-              "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)",
+              "description": "min=0.654, mean=0.654, max=0.654, sum=0.654 (1)",
               "tab": "Accuracy",
-              "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.377, mean=0.377, max=0.377, sum=0.753 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.37665764808654784\"}",
-              "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Abstract Algebra - # prompt tokens": "{\"description\": \"min=397.65, mean=397.65, max=397.65, sum=795.3 (2)\", \"tab\": \"General information\", \"score\": \"397.65\"}",
-              "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.624, mean=0.624, max=0.624, sum=0.624 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.6239793220036466\"}",
+              "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}",
+              "NarrativeQA - # train": "{\"description\": \"min=4.262, mean=4.262, max=4.262, sum=4.262 (1)\", \"tab\": \"General information\", \"score\": \"4.261971830985916\"}",
+              "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NarrativeQA - # prompt tokens": "{\"description\": \"min=3603.217, mean=3603.217, max=3603.217, sum=3603.217 (1)\", \"tab\": \"General information\", \"score\": \"3603.2169014084507\"}",
+              "NarrativeQA - # output tokens": "{\"description\": \"min=11.907, mean=11.907, max=11.907, sum=11.907 (1)\", \"tab\": \"General information\", \"score\": \"11.907042253521126\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "\"abstract_algebra\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_abstract_algebra\""
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Anatomy",
+          "evaluation_name": "NaturalQuestions (closed-book)",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "NaturalQuestions (closed-book)",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Anatomy",
+            "evaluation_description": "F1 on NaturalQuestions (closed-book)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.652,
+            "score": 0.39,
             "details": {
-              "description": "min=0.652, mean=0.652, max=0.652, sum=1.304 (2)",
+              "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)",
               "tab": "Accuracy",
-              "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.365, mean=0.365, max=0.365, sum=0.731 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3654881194785789\"}",
-              "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}",
-              "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Anatomy - # prompt tokens": "{\"description\": \"min=409.133, mean=409.133, max=409.133, sum=818.267 (2)\", \"tab\": \"General information\", \"score\": \"409.1333333333333\"}",
-              "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.636, mean=0.636, max=0.636, sum=0.636 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.6355201268196106\"}",
+              "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.469, mean=0.469, max=0.469, sum=0.469 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4687326259613037\"}",
+              "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "NaturalQuestions (open-book) - # train": "{\"description\": \"min=4.825, mean=4.825, max=4.825, sum=4.825 (1)\", \"tab\": \"General information\", \"score\": \"4.825\"}",
+              "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0.028, mean=0.028, max=0.028, sum=0.028 (1)\", \"tab\": \"General information\", \"score\": \"0.028\"}",
+              "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=2311.514, mean=2311.514, max=2311.514, sum=2311.514 (1)\", \"tab\": \"General information\", \"score\": \"2311.514\"}",
+              "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=18.701, mean=18.701, max=18.701, sum=18.701 (1)\", \"tab\": \"General information\", \"score\": \"18.701\"}",
+              "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=166.383, mean=166.383, max=166.383, sum=166.383 (1)\", \"tab\": \"General information\", \"score\": \"166.383\"}",
+              "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=14.473, mean=14.473, max=14.473, sum=14.473 (1)\", \"tab\": \"General information\", \"score\": \"14.473\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"anatomy\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_anatomy\""
+              "mode": "\"closedbook\""
             }
           }
         },
         {
-          "evaluation_name": "College Physics",
+          "evaluation_name": "OpenbookQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "OpenbookQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on College Physics",
+            "evaluation_description": "EM on OpenbookQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.461,
+            "score": 0.828,
             "details": {
-              "description": "min=0.461, mean=0.461, max=0.461, sum=0.922 (2)",
+              "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)",
               "tab": "Accuracy",
-              "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.35, mean=0.35, max=0.35, sum=0.701 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3502761268615723\"}",
-              "College Biology - Observed inference time (s)": "{\"description\": \"min=0.421, mean=0.421, max=0.421, sum=0.842 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.421069688267178\"}",
-              "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.427, mean=0.427, max=0.427, sum=0.853 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4266632032394409\"}",
-              "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.429, mean=0.429, max=0.429, sum=0.858 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42887043952941895\"}",
-              "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.434, mean=0.434, max=0.434, sum=0.869 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4343285574389331\"}",
-              "College Physics - Observed inference time (s)": "{\"description\": \"min=0.421, mean=0.421, max=0.421, sum=0.842 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4209739086674709\"}",
-              "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Chemistry - # prompt tokens": "{\"description\": \"min=622.43, mean=622.43, max=622.43, sum=1244.86 (2)\", \"tab\": \"General information\", \"score\": \"622.43\"}",
-              "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}",
-              "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Biology - # prompt tokens": "{\"description\": \"min=553.632, mean=553.632, max=553.632, sum=1107.264 (2)\", \"tab\": \"General information\", \"score\": \"553.6319444444445\"}",
-              "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Computer Science - # prompt tokens": "{\"description\": \"min=901.14, mean=901.14, max=901.14, sum=1802.28 (2)\", \"tab\": \"General information\", \"score\": \"901.14\"}",
-              "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Mathematics - # prompt tokens": "{\"description\": \"min=646.96, mean=646.96, max=646.96, sum=1293.92 (2)\", \"tab\": \"General information\", \"score\": \"646.96\"}",
-              "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}",
-              "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Medicine - # prompt tokens": "{\"description\": \"min=608.671, mean=608.671, max=608.671, sum=1217.341 (2)\", \"tab\": \"General information\", \"score\": \"608.6705202312139\"}",
-              "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}",
-              "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "College Physics - # prompt tokens": "{\"description\": \"min=551.873, mean=551.873, max=551.873, sum=1103.745 (2)\", \"tab\": \"General information\", \"score\": \"551.8725490196078\"}",
-              "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.284, mean=0.284, max=0.284, sum=0.284 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.2840936713218689\"}",
+              "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}",
+              "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "OpenbookQA - # prompt tokens": "{\"description\": \"min=291.574, mean=291.574, max=291.574, sum=291.574 (1)\", \"tab\": \"General information\", \"score\": \"291.574\"}",
+              "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"college_physics\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_college_physics\""
+              "dataset": "\"openbookqa\"",
+              "method": "\"multiple_choice_joint\""
             }
           }
         },
         {
-          "evaluation_name": "Computer Security",
+          "evaluation_name": "MMLU",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MMLU",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Computer Security",
+            "evaluation_description": "EM on MMLU",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.84,
+            "score": 0.575,
             "details": {
-              "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)",
+              "description": "min=0.31, mean=0.575, max=0.88, sum=2.876 (5)",
               "tab": "Accuracy",
-              "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.412, mean=0.412, max=0.412, sum=0.825 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41247488737106325\"}",
-              "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Computer Security - # prompt tokens": "{\"description\": \"min=428.17, mean=428.17, max=428.17, sum=856.34 (2)\", \"tab\": \"General information\", \"score\": \"428.17\"}",
-              "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MMLU - Observed inference time (s)": "{\"description\": \"min=0.293, mean=0.303, max=0.317, sum=1.516 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.30325288054817606\"}",
+              "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}",
+              "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MMLU - # prompt tokens": "{\"description\": \"min=406.65, mean=531.547, max=693.675, sum=2657.735 (5)\", \"tab\": \"General information\", \"score\": \"531.5470877192982\"}",
+              "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"computer_security\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_computer_security\""
+              "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]",
+              "method": "\"multiple_choice_joint\""
             }
           }
         },
         {
-          "evaluation_name": "Econometrics",
+          "evaluation_name": "MATH",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MATH",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Econometrics",
+            "evaluation_description": "Equivalent (CoT) on MATH",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5,
+            "score": 0.519,
             "details": {
-              "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)",
+              "description": "min=0.316, mean=0.519, max=0.785, sum=3.636 (7)",
               "tab": "Accuracy",
-              "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.436, mean=0.436, max=0.436, sum=0.873 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.436487873395284\"}",
-              "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}",
-              "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Econometrics - # prompt tokens": "{\"description\": \"min=684.675, mean=684.675, max=684.675, sum=1369.351 (2)\", \"tab\": \"General information\", \"score\": \"684.6754385964912\"}",
-              "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MATH - Observed inference time (s)": "{\"description\": \"min=1.482, mean=1.724, max=1.995, sum=12.068 (7)\", \"tab\": \"Efficiency\", \"score\": \"1.723981539653867\"}",
+              "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}",
+              "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}",
+              "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MATH - # prompt tokens": "{\"description\": \"min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)\", \"tab\": \"General information\", \"score\": \"1438.6362030100095\"}",
+              "MATH - # output tokens": "{\"description\": \"min=82.872, mean=98.802, max=122.233, sum=691.615 (7)\", \"tab\": \"General information\", \"score\": \"98.80208187931566\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"econometrics\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_econometrics\""
+              "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]",
+              "level": "\"1\"",
+              "use_official_examples": "\"False\"",
+              "use_chain_of_thought": "\"True\""
             }
           }
         },
         {
-          "evaluation_name": "Global Facts",
+          "evaluation_name": "GSM8K",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "GSM8K",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Global Facts",
+            "evaluation_description": "EM on GSM8K",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.39,
+            "score": 0.768,
             "details": {
-              "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)",
+              "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)",
               "tab": "Accuracy",
-              "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.839 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41951879262924197\"}",
-              "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Global Facts - # prompt tokens": "{\"description\": \"min=484.54, mean=484.54, max=484.54, sum=969.08 (2)\", \"tab\": \"General information\", \"score\": \"484.54\"}",
-              "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "GSM8K - Observed inference time (s)": "{\"description\": \"min=2.961, mean=2.961, max=2.961, sum=2.961 (1)\", \"tab\": \"Efficiency\", \"score\": \"2.9610197002887726\"}",
+              "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
+              "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "GSM8K - # prompt tokens": "{\"description\": \"min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)\", \"tab\": \"General information\", \"score\": \"1207.746\"}",
+              "GSM8K - # output tokens": "{\"description\": \"min=189.305, mean=189.305, max=189.305, sum=189.305 (1)\", \"tab\": \"General information\", \"score\": \"189.305\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"global_facts\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_global_facts\""
+              "stop": "\"none\""
             }
           }
         },
         {
-          "evaluation_name": "Jurisprudence",
+          "evaluation_name": "LegalBench",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "LegalBench",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Jurisprudence",
+            "evaluation_description": "EM on LegalBench",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.741,
+            "score": 0.588,
             "details": {
-              "description": "min=0.741, mean=0.741, max=0.741, sum=1.481 (2)",
+              "description": "min=0.351, mean=0.588, max=0.874, sum=2.94 (5)",
               "tab": "Accuracy",
-              "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.422, mean=0.422, max=0.422, sum=0.843 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.421647725281892\"}",
-              "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}",
-              "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Jurisprudence - # prompt tokens": "{\"description\": \"min=449.898, mean=449.898, max=449.898, sum=899.796 (2)\", \"tab\": \"General information\", \"score\": \"449.89814814814815\"}",
-              "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.292, mean=0.346, max=0.462, sum=1.729 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.34576316386866485\"}",
+              "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}",
+              "LegalBench - # train": "{\"description\": \"min=1.81, mean=4.162, max=5, sum=20.81 (5)\", \"tab\": \"General information\", \"score\": \"4.162040816326531\"}",
+              "LegalBench - truncated": "{\"description\": \"min=0, mean=0.002, max=0.008, sum=0.008 (5)\", \"tab\": \"General information\", \"score\": \"0.0016326530612244899\"}",
+              "LegalBench - # prompt tokens": "{\"description\": \"min=239.137, mean=1024.722, max=3561.237, sum=5123.61 (5)\", \"tab\": \"General information\", \"score\": \"1024.7220443430492\"}",
+              "LegalBench - # output tokens": "{\"description\": \"min=2, mean=2.438, max=3.421, sum=12.188 (5)\", \"tab\": \"General information\", \"score\": \"2.4375592890361366\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"jurisprudence\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_jurisprudence\""
+              "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]"
             }
           }
         },
         {
-          "evaluation_name": "Philosophy",
+          "evaluation_name": "MedQA",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "MedQA",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Philosophy",
+            "evaluation_description": "EM on MedQA",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.752,
+            "score": 0.581,
             "details": {
-              "description": "min=0.752, mean=0.752, max=0.752, sum=1.505 (2)",
+              "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)",
               "tab": "Accuracy",
-              "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.418, mean=0.418, max=0.418, sum=0.837 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.418486426497579\"}",
-              "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}",
-              "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Philosophy - # prompt tokens": "{\"description\": \"min=372.122, mean=372.122, max=372.122, sum=744.244 (2)\", \"tab\": \"General information\", \"score\": \"372.12218649517683\"}",
-              "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MedQA - Observed inference time (s)": "{\"description\": \"min=0.313, mean=0.313, max=0.313, sum=0.313 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.31300480038697864\"}",
+              "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}",
+              "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MedQA - # prompt tokens": "{\"description\": \"min=1243.901, mean=1243.901, max=1243.901, sum=1243.901 (1)\", \"tab\": \"General information\", \"score\": \"1243.9005964214712\"}",
+              "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "subject": "\"philosophy\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_philosophy\""
-            }
+            "additional_details": {}
           }
         },
         {
-          "evaluation_name": "Professional Psychology",
+          "evaluation_name": "WMT 2014",
           "source_data": {
-            "dataset_name": "helm_mmlu",
+            "dataset_name": "WMT 2014",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
+              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Professional Psychology",
+            "evaluation_description": "BLEU-4 on WMT 2014",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.724,
+            "score": 0.172,
             "details": {
-              "description": "min=0.724, mean=0.724, max=0.724, sum=1.448 (2)",
+              "description": "min=0.09, mean=0.172, max=0.217, sum=0.86 (5)",
               "tab": "Accuracy",
-              "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.445, mean=0.445, max=0.445, sum=0.89 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4448305149288738\"}",
-              "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.443, mean=0.443, max=0.443, sum=0.887 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44340477683019974\"}",
-              "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.531, mean=0.531, max=0.531, sum=1.062 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.531202322345669\"}",
-              "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.423, mean=0.423, max=0.423, sum=0.847 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42342418120577446\"}",
-              "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}",
-              "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Medicine - # prompt tokens": "{\"description\": \"min=1330.647, mean=1330.647, max=1330.647, sum=2661.294 (2)\", \"tab\": \"General information\", \"score\": \"1330.6470588235295\"}",
-              "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}",
-              "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Accounting - # prompt tokens": "{\"description\": \"min=823.277, mean=823.277, max=823.277, sum=1646.553 (2)\", \"tab\": \"General information\", \"score\": \"823.2765957446809\"}",
-              "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}",
-              "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Law - # prompt tokens": "{\"description\": \"min=1915.007, mean=1915.007, max=1915.007, sum=3830.014 (2)\", \"tab\": \"General information\", \"score\": \"1915.0071707953064\"}",
-              "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}",
-              "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Professional Psychology - # prompt tokens": "{\"description\": \"min=650.078, mean=650.078, max=650.078, sum=1300.157 (2)\", \"tab\": \"General information\", \"score\": \"650.0784313725491\"}",
-              "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=0.65, mean=0.681, max=0.702, sum=3.405 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.681007040066764\"}",
+              "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}",
+              "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "WMT 2014 - # prompt tokens": "{\"description\": \"min=145.523, mean=160.288, max=182.972, sum=801.438 (5)\", \"tab\": \"General information\", \"score\": \"160.28751290334915\"}",
+              "WMT 2014 - # output tokens": "{\"description\": \"min=28.596, mean=30.59, max=31.485, sum=152.951 (5)\", \"tab\": \"General information\", \"score\": \"30.59012702630372\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"professional_psychology\"",
-              "method": "\"multiple_choice_joint\"",
-              "eval_split": "\"test\"",
-              "groups": "\"mmlu_professional_psychology\""
+              "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]"
             }
           }
-        },
+        }
+      ],
+      "detailed_evaluation_results": null,
+      "generation_config": {
+        "additional_details": {}
+      }
+    },
+    {
+      "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1774096312.00548",
+      "retrieved_timestamp": "1774096312.00548",
+      "source_metadata": {
+        "source_name": "helm_mmlu",
+        "source_type": "documentation",
+        "source_organization_name": "crfm",
+        "evaluator_relationship": "third_party"
+      },
+      "eval_library": {
+        "name": "helm",
+        "version": "unknown"
+      },
+      "benchmark": "helm_mmlu",
+      "evaluation_results": [
         {
-          "evaluation_name": "Us Foreign Policy",
+          "evaluation_name": "MMLU All Subjects",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -459,36 +403,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Us Foreign Policy",
+            "evaluation_description": "EM on MMLU All Subjects",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.88,
+            "score": 0.677,
             "details": {
-              "description": "min=0.88, mean=0.88, max=0.88, sum=1.76 (2)",
+              "description": "min=0.28, mean=0.677, max=0.912, sum=77.129 (114)",
               "tab": "Accuracy",
-              "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.424, mean=0.424, max=0.424, sum=0.848 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42398189067840575\"}",
-              "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=479.81, mean=479.81, max=479.81, sum=959.62 (2)\", \"tab\": \"General information\", \"score\": \"479.81\"}",
-              "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.35, mean=0.42, max=0.544, sum=47.89 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.4200856614493726\"}",
+              "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}",
+              "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=304.474, mean=706.682, max=3159.636, sum=80561.749 (114)\", \"tab\": \"General information\", \"score\": \"706.6820126388612\"}",
+              "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"us_foreign_policy\"",
+              "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_us_foreign_policy\""
+              "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
             }
           }
         },
         {
-          "evaluation_name": "Astronomy",
+          "evaluation_name": "Abstract Algebra",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -497,36 +441,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Astronomy",
+            "evaluation_description": "EM on Abstract Algebra",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.763,
+            "score": 0.35,
             "details": {
-              "description": "min=0.763, mean=0.763, max=0.763, sum=1.526 (2)",
+              "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)",
               "tab": "Accuracy",
-              "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.424, mean=0.424, max=0.424, sum=0.848 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42381788398090164\"}",
-              "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}",
-              "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Astronomy - # prompt tokens": "{\"description\": \"min=681.079, mean=681.079, max=681.079, sum=1362.158 (2)\", \"tab\": \"General information\", \"score\": \"681.078947368421\"}",
-              "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.377, mean=0.377, max=0.377, sum=0.753 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.37665764808654784\"}",
+              "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Abstract Algebra - # prompt tokens": "{\"description\": \"min=397.65, mean=397.65, max=397.65, sum=795.3 (2)\", \"tab\": \"General information\", \"score\": \"397.65\"}",
+              "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"astronomy\"",
+              "subject": "\"abstract_algebra\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_astronomy\""
+              "groups": "\"mmlu_abstract_algebra\""
             }
           }
         },
         {
-          "evaluation_name": "Business Ethics",
+          "evaluation_name": "Anatomy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -535,36 +479,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Business Ethics",
+            "evaluation_description": "EM on Anatomy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.69,
+            "score": 0.652,
             "details": {
-              "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)",
+              "description": "min=0.652, mean=0.652, max=0.652, sum=1.304 (2)",
               "tab": "Accuracy",
-              "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.432, mean=0.432, max=0.432, sum=0.863 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4315712761878967\"}",
-              "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Business Ethics - # prompt tokens": "{\"description\": \"min=674.44, mean=674.44, max=674.44, sum=1348.88 (2)\", \"tab\": \"General information\", \"score\": \"674.44\"}",
-              "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.365, mean=0.365, max=0.365, sum=0.731 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3654881194785789\"}",
+              "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}",
+              "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Anatomy - # prompt tokens": "{\"description\": \"min=409.133, mean=409.133, max=409.133, sum=818.267 (2)\", \"tab\": \"General information\", \"score\": \"409.1333333333333\"}",
+              "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"business_ethics\"",
+              "subject": "\"anatomy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_business_ethics\""
+              "groups": "\"mmlu_anatomy\""
             }
           }
         },
         {
-          "evaluation_name": "Clinical Knowledge",
+          "evaluation_name": "College Physics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -573,36 +517,66 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Clinical Knowledge",
+            "evaluation_description": "EM on College Physics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.781,
+            "score": 0.461,
             "details": {
-              "description": "min=0.781, mean=0.781, max=0.781, sum=1.562 (2)",
+              "description": "min=0.461, mean=0.461, max=0.461, sum=0.922 (2)",
               "tab": "Accuracy",
-              "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.841 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4204666920428006\"}",
-              "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}",
-              "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=487.374, mean=487.374, max=487.374, sum=974.747 (2)\", \"tab\": \"General information\", \"score\": \"487.3735849056604\"}",
-              "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.35, mean=0.35, max=0.35, sum=0.701 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3502761268615723\"}",
+              "College Biology - Observed inference time (s)": "{\"description\": \"min=0.421, mean=0.421, max=0.421, sum=0.842 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.421069688267178\"}",
+              "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.427, mean=0.427, max=0.427, sum=0.853 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4266632032394409\"}",
+              "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.429, mean=0.429, max=0.429, sum=0.858 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42887043952941895\"}",
+              "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.434, mean=0.434, max=0.434, sum=0.869 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4343285574389331\"}",
+              "College Physics - Observed inference time (s)": "{\"description\": \"min=0.421, mean=0.421, max=0.421, sum=0.842 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4209739086674709\"}",
+              "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Chemistry - # prompt tokens": "{\"description\": \"min=622.43, mean=622.43, max=622.43, sum=1244.86 (2)\", \"tab\": \"General information\", \"score\": \"622.43\"}",
+              "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}",
+              "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Biology - # prompt tokens": "{\"description\": \"min=553.632, mean=553.632, max=553.632, sum=1107.264 (2)\", \"tab\": \"General information\", \"score\": \"553.6319444444445\"}",
+              "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Computer Science - # prompt tokens": "{\"description\": \"min=901.14, mean=901.14, max=901.14, sum=1802.28 (2)\", \"tab\": \"General information\", \"score\": \"901.14\"}",
+              "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Mathematics - # prompt tokens": "{\"description\": \"min=646.96, mean=646.96, max=646.96, sum=1293.92 (2)\", \"tab\": \"General information\", \"score\": \"646.96\"}",
+              "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}",
+              "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Medicine - # prompt tokens": "{\"description\": \"min=608.671, mean=608.671, max=608.671, sum=1217.341 (2)\", \"tab\": \"General information\", \"score\": \"608.6705202312139\"}",
+              "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}",
+              "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "College Physics - # prompt tokens": "{\"description\": \"min=551.873, mean=551.873, max=551.873, sum=1103.745 (2)\", \"tab\": \"General information\", \"score\": \"551.8725490196078\"}",
+              "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"clinical_knowledge\"",
+              "subject": "\"college_physics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_clinical_knowledge\""
+              "groups": "\"mmlu_college_physics\""
             }
           }
         },
         {
-          "evaluation_name": "Conceptual Physics",
+          "evaluation_name": "Computer Security",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -611,36 +585,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Conceptual Physics",
+            "evaluation_description": "EM on Computer Security",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.634,
+            "score": 0.84,
             "details": {
-              "description": "min=0.634, mean=0.634, max=0.634, sum=1.268 (2)",
+              "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)",
               "tab": "Accuracy",
-              "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.412, mean=0.412, max=0.412, sum=0.824 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4118805824442113\"}",
-              "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}",
-              "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Conceptual Physics - # prompt tokens": "{\"description\": \"min=333.153, mean=333.153, max=333.153, sum=666.306 (2)\", \"tab\": \"General information\", \"score\": \"333.1531914893617\"}",
-              "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.412, mean=0.412, max=0.412, sum=0.825 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41247488737106325\"}",
+              "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Computer Security - # prompt tokens": "{\"description\": \"min=428.17, mean=428.17, max=428.17, sum=856.34 (2)\", \"tab\": \"General information\", \"score\": \"428.17\"}",
+              "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"conceptual_physics\"",
+              "subject": "\"computer_security\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_conceptual_physics\""
+              "groups": "\"mmlu_computer_security\""
             }
           }
         },
         {
-          "evaluation_name": "Electrical Engineering",
+          "evaluation_name": "Econometrics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -649,36 +623,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Electrical Engineering",
+            "evaluation_description": "EM on Econometrics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.662,
+            "score": 0.5,
             "details": {
-              "description": "min=0.662, mean=0.662, max=0.662, sum=1.324 (2)",
+              "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)",
               "tab": "Accuracy",
-              "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.428, mean=0.428, max=0.428, sum=0.856 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42821227435407966\"}",
-              "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}",
-              "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Electrical Engineering - # prompt tokens": "{\"description\": \"min=497.779, mean=497.779, max=497.779, sum=995.559 (2)\", \"tab\": \"General information\", \"score\": \"497.7793103448276\"}",
-              "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.436, mean=0.436, max=0.436, sum=0.873 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.436487873395284\"}",
+              "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}",
+              "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Econometrics - # prompt tokens": "{\"description\": \"min=684.675, mean=684.675, max=684.675, sum=1369.351 (2)\", \"tab\": \"General information\", \"score\": \"684.6754385964912\"}",
+              "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"electrical_engineering\"",
+              "subject": "\"econometrics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_electrical_engineering\""
+              "groups": "\"mmlu_econometrics\""
             }
           }
         },
         {
-          "evaluation_name": "Elementary Mathematics",
+          "evaluation_name": "Global Facts",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -687,36 +661,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Elementary Mathematics",
+            "evaluation_description": "EM on Global Facts",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.481,
+            "score": 0.39,
             "details": {
-              "description": "min=0.481, mean=0.481, max=0.481, sum=0.963 (2)",
+              "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)",
               "tab": "Accuracy",
-              "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.427, mean=0.427, max=0.427, sum=0.853 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4265344634888664\"}",
-              "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}",
-              "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=609.156, mean=609.156, max=609.156, sum=1218.312 (2)\", \"tab\": \"General information\", \"score\": \"609.1560846560847\"}",
-              "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.839 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41951879262924197\"}",
+              "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Global Facts - # prompt tokens": "{\"description\": \"min=484.54, mean=484.54, max=484.54, sum=969.08 (2)\", \"tab\": \"General information\", \"score\": \"484.54\"}",
+              "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"elementary_mathematics\"",
+              "subject": "\"global_facts\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_elementary_mathematics\""
+              "groups": "\"mmlu_global_facts\""
             }
           }
         },
         {
-          "evaluation_name": "Formal Logic",
+          "evaluation_name": "Jurisprudence",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -725,36 +699,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Formal Logic",
+            "evaluation_description": "EM on Jurisprudence",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.444,
+            "score": 0.741,
             "details": {
-              "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)",
+              "description": "min=0.741, mean=0.741, max=0.741, sum=1.481 (2)",
               "tab": "Accuracy",
-              "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.411, mean=0.411, max=0.411, sum=0.821 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4107102117841206\"}",
-              "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}",
-              "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Formal Logic - # prompt tokens": "{\"description\": \"min=691.81, mean=691.81, max=691.81, sum=1383.619 (2)\", \"tab\": \"General information\", \"score\": \"691.8095238095239\"}",
-              "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.422, mean=0.422, max=0.422, sum=0.843 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.421647725281892\"}",
+              "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}",
+              "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Jurisprudence - # prompt tokens": "{\"description\": \"min=449.898, mean=449.898, max=449.898, sum=899.796 (2)\", \"tab\": \"General information\", \"score\": \"449.89814814814815\"}",
+              "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"formal_logic\"",
+              "subject": "\"jurisprudence\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_formal_logic\""
+              "groups": "\"mmlu_jurisprudence\""
             }
           }
         },
         {
-          "evaluation_name": "High School World History",
+          "evaluation_name": "Philosophy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -763,114 +737,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on High School World History",
+            "evaluation_description": "EM on Philosophy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.827,
+            "score": 0.752,
             "details": {
-              "description": "min=0.827, mean=0.827, max=0.827, sum=1.654 (2)",
+              "description": "min=0.752, mean=0.752, max=0.752, sum=1.505 (2)",
               "tab": "Accuracy",
-              "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.424, mean=0.424, max=0.424, sum=0.847 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42357982127897204\"}",
-              "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.412, mean=0.412, max=0.412, sum=0.825 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41242665375394777\"}",
-              "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.445, mean=0.445, max=0.445, sum=0.89 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44495458364486695\"}",
-              "High School European History - Observed inference time (s)": "{\"description\": \"min=0.544, mean=0.544, max=0.544, sum=1.088 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5441486705433238\"}",
-              "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.415, mean=0.415, max=0.415, sum=0.83 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4149725003675981\"}",
-              "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.383, mean=0.383, max=0.383, sum=0.766 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.38312110629106433\"}",
-              "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.403, mean=0.403, max=0.403, sum=0.807 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4034240123553154\"}",
-              "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.39, mean=0.39, max=0.39, sum=0.779 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.38954139285617406\"}",
-              "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.399, mean=0.399, max=0.399, sum=0.798 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3992174813727371\"}",
-              "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.409, mean=0.409, max=0.409, sum=0.819 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40926165138648835\"}",
-              "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.408, mean=0.408, max=0.408, sum=0.816 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4081065694126514\"}",
-              "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.417, mean=0.417, max=0.417, sum=0.833 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4166152830477114\"}",
-              "High School US History - Observed inference time (s)": "{\"description\": \"min=0.45, mean=0.45, max=0.45, sum=0.901 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4504043985815609\"}",
-              "High School World History - Observed inference time (s)": "{\"description\": \"min=0.416, mean=0.416, max=0.416, sum=0.833 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4162542166086189\"}",
-              "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}",
-              "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Biology - # prompt tokens": "{\"description\": \"min=596.894, mean=596.894, max=596.894, sum=1193.787 (2)\", \"tab\": \"General information\", \"score\": \"596.8935483870968\"}",
-              "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}",
-              "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Chemistry - # prompt tokens": "{\"description\": \"min=568.665, mean=568.665, max=568.665, sum=1137.33 (2)\", \"tab\": \"General information\", \"score\": \"568.6650246305419\"}",
-              "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Computer Science - # prompt tokens": "{\"description\": \"min=988.57, mean=988.57, max=988.57, sum=1977.14 (2)\", \"tab\": \"General information\", \"score\": \"988.57\"}",
-              "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}",
-              "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School European History - # prompt tokens": "{\"description\": \"min=3159.636, mean=3159.636, max=3159.636, sum=6319.273 (2)\", \"tab\": \"General information\", \"score\": \"3159.6363636363635\"}",
-              "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}",
-              "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Geography - # prompt tokens": "{\"description\": \"min=436.657, mean=436.657, max=436.657, sum=873.313 (2)\", \"tab\": \"General information\", \"score\": \"436.65656565656565\"}",
-              "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}",
-              "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Government And Politics - # prompt tokens": "{\"description\": \"min=527.927, mean=527.927, max=527.927, sum=1055.855 (2)\", \"tab\": \"General information\", \"score\": \"527.9274611398964\"}",
-              "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}",
-              "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=445.662, mean=445.662, max=445.662, sum=891.323 (2)\", \"tab\": \"General information\", \"score\": \"445.66153846153844\"}",
-              "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}",
-              "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Mathematics - # prompt tokens": "{\"description\": \"min=579.181, mean=579.181, max=579.181, sum=1158.363 (2)\", \"tab\": \"General information\", \"score\": \"579.1814814814815\"}",
-              "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}",
-              "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Microeconomics - # prompt tokens": "{\"description\": \"min=449.492, mean=449.492, max=449.492, sum=898.983 (2)\", \"tab\": \"General information\", \"score\": \"449.49159663865544\"}",
-              "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}",
-              "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Physics - # prompt tokens": "{\"description\": \"min=621.788, mean=621.788, max=621.788, sum=1243.576 (2)\", \"tab\": \"General information\", \"score\": \"621.7880794701987\"}",
-              "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}",
-              "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Psychology - # prompt tokens": "{\"description\": \"min=585.919, mean=585.919, max=585.919, sum=1171.839 (2)\", \"tab\": \"General information\", \"score\": \"585.9192660550459\"}",
-              "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}",
-              "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School Statistics - # prompt tokens": "{\"description\": \"min=908.208, mean=908.208, max=908.208, sum=1816.417 (2)\", \"tab\": \"General information\", \"score\": \"908.2083333333334\"}",
-              "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}",
-              "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School US History - # prompt tokens": "{\"description\": \"min=2535.324, mean=2535.324, max=2535.324, sum=5070.647 (2)\", \"tab\": \"General information\", \"score\": \"2535.323529411765\"}",
-              "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}",
-              "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "High School World History - # prompt tokens": "{\"description\": \"min=1638.219, mean=1638.219, max=1638.219, sum=3276.439 (2)\", \"tab\": \"General information\", \"score\": \"1638.2194092827003\"}",
-              "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.418, mean=0.418, max=0.418, sum=0.837 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.418486426497579\"}",
+              "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}",
+              "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Philosophy - # prompt tokens": "{\"description\": \"min=372.122, mean=372.122, max=372.122, sum=744.244 (2)\", \"tab\": \"General information\", \"score\": \"372.12218649517683\"}",
+              "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"high_school_world_history\"",
+              "subject": "\"philosophy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_high_school_world_history\""
+              "groups": "\"mmlu_philosophy\""
             }
           }
         },
         {
-          "evaluation_name": "Human Sexuality",
+          "evaluation_name": "Professional Psychology",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -879,42 +775,54 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Human Sexuality",
+            "evaluation_description": "EM on Professional Psychology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.847,
+            "score": 0.724,
             "details": {
-              "description": "min=0.847, mean=0.847, max=0.847, sum=1.695 (2)",
+              "description": "min=0.724, mean=0.724, max=0.724, sum=1.448 (2)",
               "tab": "Accuracy",
-              "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.401, mean=0.401, max=0.401, sum=0.802 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4010318255745242\"}",
-              "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.787 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39331119843111695\"}",
-              "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}",
-              "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Human Aging - # prompt tokens": "{\"description\": \"min=361.26, mean=361.26, max=361.26, sum=722.52 (2)\", \"tab\": \"General information\", \"score\": \"361.26008968609864\"}",
-              "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}",
-              "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Human Sexuality - # prompt tokens": "{\"description\": \"min=403.382, mean=403.382, max=403.382, sum=806.763 (2)\", \"tab\": \"General information\", \"score\": \"403.381679389313\"}",
-              "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.445, mean=0.445, max=0.445, sum=0.89 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4448305149288738\"}",
+              "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.443, mean=0.443, max=0.443, sum=0.887 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44340477683019974\"}",
+              "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.531, mean=0.531, max=0.531, sum=1.062 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.531202322345669\"}",
+              "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.423, mean=0.423, max=0.423, sum=0.847 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42342418120577446\"}",
+              "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}",
+              "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Medicine - # prompt tokens": "{\"description\": \"min=1330.647, mean=1330.647, max=1330.647, sum=2661.294 (2)\", \"tab\": \"General information\", \"score\": \"1330.6470588235295\"}",
+              "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}",
+              "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Accounting - # prompt tokens": "{\"description\": \"min=823.277, mean=823.277, max=823.277, sum=1646.553 (2)\", \"tab\": \"General information\", \"score\": \"823.2765957446809\"}",
+              "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}",
+              "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Law - # prompt tokens": "{\"description\": \"min=1915.007, mean=1915.007, max=1915.007, sum=3830.014 (2)\", \"tab\": \"General information\", \"score\": \"1915.0071707953064\"}",
+              "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}",
+              "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Professional Psychology - # prompt tokens": "{\"description\": \"min=650.078, mean=650.078, max=650.078, sum=1300.157 (2)\", \"tab\": \"General information\", \"score\": \"650.0784313725491\"}",
+              "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"human_sexuality\"",
+              "subject": "\"professional_psychology\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_human_sexuality\""
+              "groups": "\"mmlu_professional_psychology\""
             }
           }
         },
         {
-          "evaluation_name": "International Law",
+          "evaluation_name": "Us Foreign Policy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -923,36 +831,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on International Law",
+            "evaluation_description": "EM on Us Foreign Policy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.826,
+            "score": 0.88,
             "details": {
-              "description": "min=0.826, mean=0.826, max=0.826, sum=1.653 (2)",
+              "description": "min=0.88, mean=0.88, max=0.88, sum=1.76 (2)",
               "tab": "Accuracy",
-              "International Law - Observed inference time (s)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.841 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42040472779392213\"}",
-              "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}",
-              "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "International Law - # prompt tokens": "{\"description\": \"min=729.463, mean=729.463, max=729.463, sum=1458.926 (2)\", \"tab\": \"General information\", \"score\": \"729.4628099173553\"}",
-              "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.424, mean=0.424, max=0.424, sum=0.848 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42398189067840575\"}",
+              "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=479.81, mean=479.81, max=479.81, sum=959.62 (2)\", \"tab\": \"General information\", \"score\": \"479.81\"}",
+              "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"international_law\"",
+              "subject": "\"us_foreign_policy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_international_law\""
+              "groups": "\"mmlu_us_foreign_policy\""
             }
           }
         },
         {
-          "evaluation_name": "Logical Fallacies",
+          "evaluation_name": "Astronomy",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -961,36 +869,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Logical Fallacies",
+            "evaluation_description": "EM on Astronomy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.779,
+            "score": 0.763,
             "details": {
-              "description": "min=0.779, mean=0.779, max=0.779, sum=1.558 (2)",
+              "description": "min=0.763, mean=0.763, max=0.763, sum=1.526 (2)",
               "tab": "Accuracy",
-              "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.404, mean=0.404, max=0.404, sum=0.809 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4043445353127696\"}",
-              "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}",
-              "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Logical Fallacies - # prompt tokens": "{\"description\": \"min=502.755, mean=502.755, max=502.755, sum=1005.509 (2)\", \"tab\": \"General information\", \"score\": \"502.7546012269939\"}",
-              "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.424, mean=0.424, max=0.424, sum=0.848 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42381788398090164\"}",
+              "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}",
+              "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Astronomy - # prompt tokens": "{\"description\": \"min=681.079, mean=681.079, max=681.079, sum=1362.158 (2)\", \"tab\": \"General information\", \"score\": \"681.078947368421\"}",
+              "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"logical_fallacies\"",
+              "subject": "\"astronomy\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_logical_fallacies\""
+              "groups": "\"mmlu_astronomy\""
             }
           }
         },
         {
-          "evaluation_name": "Machine Learning",
+          "evaluation_name": "Business Ethics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -999,36 +907,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Machine Learning",
+            "evaluation_description": "EM on Business Ethics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.473,
+            "score": 0.69,
             "details": {
-              "description": "min=0.473, mean=0.473, max=0.473, sum=0.946 (2)",
+              "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)",
               "tab": "Accuracy",
-              "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.421, mean=0.421, max=0.421, sum=0.842 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42122456644262585\"}",
-              "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}",
-              "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Machine Learning - # prompt tokens": "{\"description\": \"min=730.402, mean=730.402, max=730.402, sum=1460.804 (2)\", \"tab\": \"General information\", \"score\": \"730.4017857142857\"}",
-              "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.432, mean=0.432, max=0.432, sum=0.863 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4315712761878967\"}",
+              "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Business Ethics - # prompt tokens": "{\"description\": \"min=674.44, mean=674.44, max=674.44, sum=1348.88 (2)\", \"tab\": \"General information\", \"score\": \"674.44\"}",
+              "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"machine_learning\"",
+              "subject": "\"business_ethics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_machine_learning\""
+              "groups": "\"mmlu_business_ethics\""
             }
           }
         },
         {
-          "evaluation_name": "Management",
+          "evaluation_name": "Clinical Knowledge",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1037,36 +945,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Management",
+            "evaluation_description": "EM on Clinical Knowledge",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.796,
+            "score": 0.781,
             "details": {
-              "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)",
+              "description": "min=0.781, mean=0.781, max=0.781, sum=1.562 (2)",
               "tab": "Accuracy",
-              "Management - Observed inference time (s)": "{\"description\": \"min=0.392, mean=0.392, max=0.392, sum=0.785 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.392485206566968\"}",
-              "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}",
-              "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Management - # prompt tokens": "{\"description\": \"min=315.777, mean=315.777, max=315.777, sum=631.553 (2)\", \"tab\": \"General information\", \"score\": \"315.77669902912623\"}",
-              "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.841 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4204666920428006\"}",
+              "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}",
+              "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=487.374, mean=487.374, max=487.374, sum=974.747 (2)\", \"tab\": \"General information\", \"score\": \"487.3735849056604\"}",
+              "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"management\"",
+              "subject": "\"clinical_knowledge\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_management\""
+              "groups": "\"mmlu_clinical_knowledge\""
             }
           }
         },
         {
-          "evaluation_name": "Marketing",
+          "evaluation_name": "Conceptual Physics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1075,36 +983,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Marketing",
+            "evaluation_description": "EM on Conceptual Physics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.902,
+            "score": 0.634,
             "details": {
-              "description": "min=0.902, mean=0.902, max=0.902, sum=1.803 (2)",
+              "description": "min=0.634, mean=0.634, max=0.634, sum=1.268 (2)",
               "tab": "Accuracy",
-              "Marketing - Observed inference time (s)": "{\"description\": \"min=0.407, mean=0.407, max=0.407, sum=0.813 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.406507401384859\"}",
-              "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}",
-              "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Marketing - # prompt tokens": "{\"description\": \"min=472.628, mean=472.628, max=472.628, sum=945.256 (2)\", \"tab\": \"General information\", \"score\": \"472.62820512820514\"}",
-              "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.412, mean=0.412, max=0.412, sum=0.824 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4118805824442113\"}",
+              "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}",
+              "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Conceptual Physics - # prompt tokens": "{\"description\": \"min=333.153, mean=333.153, max=333.153, sum=666.306 (2)\", \"tab\": \"General information\", \"score\": \"333.1531914893617\"}",
+              "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"marketing\"",
+              "subject": "\"conceptual_physics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_marketing\""
+              "groups": "\"mmlu_conceptual_physics\""
             }
           }
         },
         {
-          "evaluation_name": "Medical Genetics",
+          "evaluation_name": "Electrical Engineering",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1113,36 +1021,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Medical Genetics",
+            "evaluation_description": "EM on Electrical Engineering",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.76,
+            "score": 0.662,
             "details": {
-              "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)",
+              "description": "min=0.662, mean=0.662, max=0.662, sum=1.324 (2)",
               "tab": "Accuracy",
-              "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.417, mean=0.417, max=0.417, sum=0.835 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41734427213668823\"}",
-              "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
-              "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Medical Genetics - # prompt tokens": "{\"description\": \"min=408.14, mean=408.14, max=408.14, sum=816.28 (2)\", \"tab\": \"General information\", \"score\": \"408.14\"}",
-              "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.428, mean=0.428, max=0.428, sum=0.856 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42821227435407966\"}",
+              "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}",
+              "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Electrical Engineering - # prompt tokens": "{\"description\": \"min=497.779, mean=497.779, max=497.779, sum=995.559 (2)\", \"tab\": \"General information\", \"score\": \"497.7793103448276\"}",
+              "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"medical_genetics\"",
+              "subject": "\"electrical_engineering\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_medical_genetics\""
+              "groups": "\"mmlu_electrical_engineering\""
             }
           }
         },
         {
-          "evaluation_name": "Miscellaneous",
+          "evaluation_name": "Elementary Mathematics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1151,36 +1059,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Miscellaneous",
+            "evaluation_description": "EM on Elementary Mathematics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.875,
+            "score": 0.481,
             "details": {
-              "description": "min=0.875, mean=0.875, max=0.875, sum=1.75 (2)",
+              "description": "min=0.481, mean=0.481, max=0.481, sum=0.963 (2)",
               "tab": "Accuracy",
-              "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.407, mean=0.407, max=0.407, sum=0.814 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40693108880200146\"}",
-              "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}",
-              "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Miscellaneous - # prompt tokens": "{\"description\": \"min=345.913, mean=345.913, max=345.913, sum=691.826 (2)\", \"tab\": \"General information\", \"score\": \"345.9131545338442\"}",
-              "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.427, mean=0.427, max=0.427, sum=0.853 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4265344634888664\"}",
+              "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}",
+              "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=609.156, mean=609.156, max=609.156, sum=1218.312 (2)\", \"tab\": \"General information\", \"score\": \"609.1560846560847\"}",
+              "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"miscellaneous\"",
+              "subject": "\"elementary_mathematics\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_miscellaneous\""
+              "groups": "\"mmlu_elementary_mathematics\""
             }
           }
         },
         {
-          "evaluation_name": "Moral Scenarios",
+          "evaluation_name": "Formal Logic",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1189,42 +1097,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Moral Scenarios",
+            "evaluation_description": "EM on Formal Logic",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.28,
+            "score": 0.444,
             "details": {
-              "description": "min=0.28, mean=0.28, max=0.28, sum=0.561 (2)",
+              "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)",
               "tab": "Accuracy",
-              "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.424, mean=0.424, max=0.424, sum=0.848 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4239204674097844\"}",
-              "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.433, mean=0.433, max=0.433, sum=0.866 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43297034721800737\"}",
-              "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}",
-              "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Moral Disputes - # prompt tokens": "{\"description\": \"min=542.506, mean=542.506, max=542.506, sum=1085.012 (2)\", \"tab\": \"General information\", \"score\": \"542.5057803468208\"}",
-              "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}",
-              "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Moral Scenarios - # prompt tokens": "{\"description\": \"min=756.479, mean=756.479, max=756.479, sum=1512.959 (2)\", \"tab\": \"General information\", \"score\": \"756.4793296089385\"}",
-              "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.411, mean=0.411, max=0.411, sum=0.821 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4107102117841206\"}",
+              "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}",
+              "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Formal Logic - # prompt tokens": "{\"description\": \"min=691.81, mean=691.81, max=691.81, sum=1383.619 (2)\", \"tab\": \"General information\", \"score\": \"691.8095238095239\"}",
+              "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"moral_scenarios\"",
+              "subject": "\"formal_logic\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_moral_scenarios\""
+              "groups": "\"mmlu_formal_logic\""
             }
           }
         },
         {
-          "evaluation_name": "Nutrition",
+          "evaluation_name": "High School World History",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1233,36 +1135,114 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Nutrition",
+            "evaluation_description": "EM on High School World History",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.725,
+            "score": 0.827,
             "details": {
-              "description": "min=0.725, mean=0.725, max=0.725, sum=1.451 (2)",
+              "description": "min=0.827, mean=0.827, max=0.827, sum=1.654 (2)",
               "tab": "Accuracy",
-              "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.417, mean=0.417, max=0.417, sum=0.835 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41727598430284485\"}",
-              "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}",
-              "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Nutrition - # prompt tokens": "{\"description\": \"min=695.922, mean=695.922, max=695.922, sum=1391.843 (2)\", \"tab\": \"General information\", \"score\": \"695.9215686274509\"}",
-              "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.424, mean=0.424, max=0.424, sum=0.847 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42357982127897204\"}",
+              "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.412, mean=0.412, max=0.412, sum=0.825 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41242665375394777\"}",
+              "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.445, mean=0.445, max=0.445, sum=0.89 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44495458364486695\"}",
+              "High School European History - Observed inference time (s)": "{\"description\": \"min=0.544, mean=0.544, max=0.544, sum=1.088 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5441486705433238\"}",
+              "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.415, mean=0.415, max=0.415, sum=0.83 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4149725003675981\"}",
+              "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.383, mean=0.383, max=0.383, sum=0.766 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.38312110629106433\"}",
+              "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.403, mean=0.403, max=0.403, sum=0.807 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4034240123553154\"}",
+              "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.39, mean=0.39, max=0.39, sum=0.779 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.38954139285617406\"}",
+              "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.399, mean=0.399, max=0.399, sum=0.798 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3992174813727371\"}",
+              "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.409, mean=0.409, max=0.409, sum=0.819 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40926165138648835\"}",
+              "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.408, mean=0.408, max=0.408, sum=0.816 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4081065694126514\"}",
+              "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.417, mean=0.417, max=0.417, sum=0.833 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4166152830477114\"}",
+              "High School US History - Observed inference time (s)": "{\"description\": \"min=0.45, mean=0.45, max=0.45, sum=0.901 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4504043985815609\"}",
+              "High School World History - Observed inference time (s)": "{\"description\": \"min=0.416, mean=0.416, max=0.416, sum=0.833 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4162542166086189\"}",
+              "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}",
+              "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Biology - # prompt tokens": "{\"description\": \"min=596.894, mean=596.894, max=596.894, sum=1193.787 (2)\", \"tab\": \"General information\", \"score\": \"596.8935483870968\"}",
+              "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}",
+              "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Chemistry - # prompt tokens": "{\"description\": \"min=568.665, mean=568.665, max=568.665, sum=1137.33 (2)\", \"tab\": \"General information\", \"score\": \"568.6650246305419\"}",
+              "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Computer Science - # prompt tokens": "{\"description\": \"min=988.57, mean=988.57, max=988.57, sum=1977.14 (2)\", \"tab\": \"General information\", \"score\": \"988.57\"}",
+              "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}",
+              "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School European History - # prompt tokens": "{\"description\": \"min=3159.636, mean=3159.636, max=3159.636, sum=6319.273 (2)\", \"tab\": \"General information\", \"score\": \"3159.6363636363635\"}",
+              "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}",
+              "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Geography - # prompt tokens": "{\"description\": \"min=436.657, mean=436.657, max=436.657, sum=873.313 (2)\", \"tab\": \"General information\", \"score\": \"436.65656565656565\"}",
+              "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}",
+              "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Government And Politics - # prompt tokens": "{\"description\": \"min=527.927, mean=527.927, max=527.927, sum=1055.855 (2)\", \"tab\": \"General information\", \"score\": \"527.9274611398964\"}",
+              "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}",
+              "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=445.662, mean=445.662, max=445.662, sum=891.323 (2)\", \"tab\": \"General information\", \"score\": \"445.66153846153844\"}",
+              "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}",
+              "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Mathematics - # prompt tokens": "{\"description\": \"min=579.181, mean=579.181, max=579.181, sum=1158.363 (2)\", \"tab\": \"General information\", \"score\": \"579.1814814814815\"}",
+              "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}",
+              "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Microeconomics - # prompt tokens": "{\"description\": \"min=449.492, mean=449.492, max=449.492, sum=898.983 (2)\", \"tab\": \"General information\", \"score\": \"449.49159663865544\"}",
+              "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}",
+              "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Physics - # prompt tokens": "{\"description\": \"min=621.788, mean=621.788, max=621.788, sum=1243.576 (2)\", \"tab\": \"General information\", \"score\": \"621.7880794701987\"}",
+              "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}",
+              "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Psychology - # prompt tokens": "{\"description\": \"min=585.919, mean=585.919, max=585.919, sum=1171.839 (2)\", \"tab\": \"General information\", \"score\": \"585.9192660550459\"}",
+              "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}",
+              "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School Statistics - # prompt tokens": "{\"description\": \"min=908.208, mean=908.208, max=908.208, sum=1816.417 (2)\", \"tab\": \"General information\", \"score\": \"908.2083333333334\"}",
+              "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}",
+              "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School US History - # prompt tokens": "{\"description\": \"min=2535.324, mean=2535.324, max=2535.324, sum=5070.647 (2)\", \"tab\": \"General information\", \"score\": \"2535.323529411765\"}",
+              "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}",
+              "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "High School World History - # prompt tokens": "{\"description\": \"min=1638.219, mean=1638.219, max=1638.219, sum=3276.439 (2)\", \"tab\": \"General information\", \"score\": \"1638.2194092827003\"}",
+              "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"nutrition\"",
+              "subject": "\"high_school_world_history\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_nutrition\""
+              "groups": "\"mmlu_high_school_world_history\""
             }
           }
         },
         {
-          "evaluation_name": "Prehistory",
+          "evaluation_name": "Human Sexuality",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1271,36 +1251,42 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Prehistory",
+            "evaluation_description": "EM on Human Sexuality",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.79,
+            "score": 0.847,
             "details": {
-              "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)",
+              "description": "min=0.847, mean=0.847, max=0.847, sum=1.695 (2)",
               "tab": "Accuracy",
-              "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.43, mean=0.43, max=0.43, sum=0.861 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4303552037403907\"}",
-              "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}",
-              "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Prehistory - # prompt tokens": "{\"description\": \"min=619.185, mean=619.185, max=619.185, sum=1238.37 (2)\", \"tab\": \"General information\", \"score\": \"619.1851851851852\"}",
-              "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.401, mean=0.401, max=0.401, sum=0.802 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4010318255745242\"}",
+              "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.787 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39331119843111695\"}",
+              "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}",
+              "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Human Aging - # prompt tokens": "{\"description\": \"min=361.26, mean=361.26, max=361.26, sum=722.52 (2)\", \"tab\": \"General information\", \"score\": \"361.26008968609864\"}",
+              "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}",
+              "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Human Sexuality - # prompt tokens": "{\"description\": \"min=403.382, mean=403.382, max=403.382, sum=806.763 (2)\", \"tab\": \"General information\", \"score\": \"403.381679389313\"}",
+              "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"prehistory\"",
+              "subject": "\"human_sexuality\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_prehistory\""
+              "groups": "\"mmlu_human_sexuality\""
             }
           }
         },
         {
-          "evaluation_name": "Public Relations",
+          "evaluation_name": "International Law",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1309,36 +1295,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Public Relations",
+            "evaluation_description": "EM on International Law",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.664,
+            "score": 0.826,
             "details": {
-              "description": "min=0.664, mean=0.664, max=0.664, sum=1.327 (2)",
+              "description": "min=0.826, mean=0.826, max=0.826, sum=1.653 (2)",
               "tab": "Accuracy",
-              "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.428, mean=0.428, max=0.428, sum=0.855 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42750670259649104\"}",
-              "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}",
-              "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Public Relations - # prompt tokens": "{\"description\": \"min=474.827, mean=474.827, max=474.827, sum=949.655 (2)\", \"tab\": \"General information\", \"score\": \"474.8272727272727\"}",
-              "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "International Law - Observed inference time (s)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.841 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42040472779392213\"}",
+              "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}",
+              "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "International Law - # prompt tokens": "{\"description\": \"min=729.463, mean=729.463, max=729.463, sum=1458.926 (2)\", \"tab\": \"General information\", \"score\": \"729.4628099173553\"}",
+              "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"public_relations\"",
+              "subject": "\"international_law\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_public_relations\""
+              "groups": "\"mmlu_international_law\""
             }
           }
         },
         {
-          "evaluation_name": "Security Studies",
+          "evaluation_name": "Logical Fallacies",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1347,36 +1333,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Security Studies",
+            "evaluation_description": "EM on Logical Fallacies",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.78,
+            "score": 0.779,
             "details": {
-              "description": "min=0.78, mean=0.78, max=0.78, sum=1.559 (2)",
+              "description": "min=0.779, mean=0.779, max=0.779, sum=1.558 (2)",
               "tab": "Accuracy",
-              "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.466, mean=0.466, max=0.466, sum=0.933 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4662662194699657\"}",
-              "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}",
-              "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Security Studies - # prompt tokens": "{\"description\": \"min=1377.531, mean=1377.531, max=1377.531, sum=2755.061 (2)\", \"tab\": \"General information\", \"score\": \"1377.530612244898\"}",
-              "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.404, mean=0.404, max=0.404, sum=0.809 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4043445353127696\"}",
+              "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}",
+              "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Logical Fallacies - # prompt tokens": "{\"description\": \"min=502.755, mean=502.755, max=502.755, sum=1005.509 (2)\", \"tab\": \"General information\", \"score\": \"502.7546012269939\"}",
+              "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"security_studies\"",
+              "subject": "\"logical_fallacies\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_security_studies\""
+              "groups": "\"mmlu_logical_fallacies\""
             }
           }
         },
         {
-          "evaluation_name": "Sociology",
+          "evaluation_name": "Machine Learning",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1385,36 +1371,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Sociology",
+            "evaluation_description": "EM on Machine Learning",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.891,
+            "score": 0.473,
             "details": {
-              "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)",
+              "description": "min=0.473, mean=0.473, max=0.473, sum=0.946 (2)",
               "tab": "Accuracy",
-              "Sociology - Observed inference time (s)": "{\"description\": \"min=0.416, mean=0.416, max=0.416, sum=0.832 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4159522590352528\"}",
-              "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}",
-              "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Sociology - # prompt tokens": "{\"description\": \"min=508.478, mean=508.478, max=508.478, sum=1016.955 (2)\", \"tab\": \"General information\", \"score\": \"508.4776119402985\"}",
-              "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.421, mean=0.421, max=0.421, sum=0.842 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42122456644262585\"}",
+              "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}",
+              "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Machine Learning - # prompt tokens": "{\"description\": \"min=730.402, mean=730.402, max=730.402, sum=1460.804 (2)\", \"tab\": \"General information\", \"score\": \"730.4017857142857\"}",
+              "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"sociology\"",
+              "subject": "\"machine_learning\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_sociology\""
+              "groups": "\"mmlu_machine_learning\""
             }
           }
         },
         {
-          "evaluation_name": "Virology",
+          "evaluation_name": "Management",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1423,36 +1409,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on Virology",
+            "evaluation_description": "EM on Management",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.536,
+            "score": 0.796,
             "details": {
-              "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)",
+              "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)",
               "tab": "Accuracy",
-              "Virology - Observed inference time (s)": "{\"description\": \"min=0.405, mean=0.405, max=0.405, sum=0.809 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40467354332108096\"}",
-              "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}",
-              "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "Virology - # prompt tokens": "{\"description\": \"min=405.108, mean=405.108, max=405.108, sum=810.217 (2)\", \"tab\": \"General information\", \"score\": \"405.10843373493975\"}",
-              "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Management - Observed inference time (s)": "{\"description\": \"min=0.392, mean=0.392, max=0.392, sum=0.785 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.392485206566968\"}",
+              "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}",
+              "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Management - # prompt tokens": "{\"description\": \"min=315.777, mean=315.777, max=315.777, sum=631.553 (2)\", \"tab\": \"General information\", \"score\": \"315.77669902912623\"}",
+              "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"virology\"",
+              "subject": "\"management\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_virology\""
+              "groups": "\"mmlu_management\""
             }
           }
         },
         {
-          "evaluation_name": "World Religions",
+          "evaluation_name": "Marketing",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1461,36 +1447,36 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on World Religions",
+            "evaluation_description": "EM on Marketing",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.854,
+            "score": 0.902,
             "details": {
-              "description": "min=0.854, mean=0.854, max=0.854, sum=1.708 (2)",
+              "description": "min=0.902, mean=0.902, max=0.902, sum=1.803 (2)",
               "tab": "Accuracy",
-              "World Religions - Observed inference time (s)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.787 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39336834455791275\"}",
-              "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}",
-              "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "World Religions - # prompt tokens": "{\"description\": \"min=304.474, mean=304.474, max=304.474, sum=608.947 (2)\", \"tab\": \"General information\", \"score\": \"304.4736842105263\"}",
-              "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Marketing - Observed inference time (s)": "{\"description\": \"min=0.407, mean=0.407, max=0.407, sum=0.813 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.406507401384859\"}",
+              "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}",
+              "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Marketing - # prompt tokens": "{\"description\": \"min=472.628, mean=472.628, max=472.628, sum=945.256 (2)\", \"tab\": \"General information\", \"score\": \"472.62820512820514\"}",
+              "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "\"world_religions\"",
+              "subject": "\"marketing\"",
               "method": "\"multiple_choice_joint\"",
               "eval_split": "\"test\"",
-              "groups": "\"mmlu_world_religions\""
+              "groups": "\"mmlu_marketing\""
             }
           }
         },
         {
-          "evaluation_name": "Mean win rate",
+          "evaluation_name": "Medical Genetics",
           "source_data": {
             "dataset_name": "helm_mmlu",
             "source_type": "url",
@@ -1499,404 +1485,418 @@
             ]
           },
           "metric_config": {
-            "evaluation_description": "How many models this model outperforms on average (over columns).",
+            "evaluation_description": "EM on Medical Genetics",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.565,
+            "score": 0.76,
             "details": {
-              "description": "",
-              "tab": "Efficiency"
+              "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)",
+              "tab": "Accuracy",
+              "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.417, mean=0.417, max=0.417, sum=0.835 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41734427213668823\"}",
+              "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}",
+              "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Medical Genetics - # prompt tokens": "{\"description\": \"min=408.14, mean=408.14, max=408.14, sum=816.28 (2)\", \"tab\": \"General information\", \"score\": \"408.14\"}",
+              "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"medical_genetics\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_medical_genetics\""
+            }
           }
-        }
-      ],
-      "detailed_evaluation_results": null,
-      "generation_config": {
-        "additional_details": {
-          "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
-          "method": "\"multiple_choice_joint\"",
-          "eval_split": "\"test\"",
-          "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
-        }
-      }
-    },
-    {
-      "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1774096306.427425",
-      "retrieved_timestamp": "1774096306.427425",
-      "source_metadata": {
-        "source_name": "helm_lite",
-        "source_type": "documentation",
-        "source_organization_name": "crfm",
-        "evaluator_relationship": "third_party"
-      },
-      "eval_library": {
-        "name": "helm",
-        "version": "unknown"
-      },
-      "benchmark": "helm_lite",
-      "evaluation_results": [
+        },
         {
-          "evaluation_name": "Mean win rate",
+          "evaluation_name": "Miscellaneous",
           "source_data": {
-            "dataset_name": "helm_lite",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "How many models this model outperforms on average (over columns).",
+            "evaluation_description": "EM on Miscellaneous",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.338,
+            "score": 0.875,
             "details": {
-              "description": "",
+              "description": "min=0.875, mean=0.875, max=0.875, sum=1.75 (2)",
               "tab": "Accuracy",
-              "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.7606242197253433\"}",
-              "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}"
+              "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.407, mean=0.407, max=0.407, sum=0.814 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40693108880200146\"}",
+              "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}",
+              "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Miscellaneous - # prompt tokens": "{\"description\": \"min=345.913, mean=345.913, max=345.913, sum=691.826 (2)\", \"tab\": \"General information\", \"score\": \"345.9131545338442\"}",
+              "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"miscellaneous\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_miscellaneous\""
+            }
           }
         },
         {
-          "evaluation_name": "NarrativeQA",
+          "evaluation_name": "Moral Scenarios",
           "source_data": {
-            "dataset_name": "NarrativeQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "F1 on NarrativeQA",
+            "evaluation_description": "EM on Moral Scenarios",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.654,
+            "score": 0.28,
             "details": {
-              "description": "min=0.654, mean=0.654, max=0.654, sum=0.654 (1)",
+              "description": "min=0.28, mean=0.28, max=0.28, sum=0.561 (2)",
               "tab": "Accuracy",
-              "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.624, mean=0.624, max=0.624, sum=0.624 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.6239793220036466\"}",
-              "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}",
-              "NarrativeQA - # train": "{\"description\": \"min=4.262, mean=4.262, max=4.262, sum=4.262 (1)\", \"tab\": \"General information\", \"score\": \"4.261971830985916\"}",
-              "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NarrativeQA - # prompt tokens": "{\"description\": \"min=3603.217, mean=3603.217, max=3603.217, sum=3603.217 (1)\", \"tab\": \"General information\", \"score\": \"3603.2169014084507\"}",
-              "NarrativeQA - # output tokens": "{\"description\": \"min=11.907, mean=11.907, max=11.907, sum=11.907 (1)\", \"tab\": \"General information\", \"score\": \"11.907042253521126\"}"
+              "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.424, mean=0.424, max=0.424, sum=0.848 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4239204674097844\"}",
+              "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.433, mean=0.433, max=0.433, sum=0.866 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43297034721800737\"}",
+              "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}",
+              "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Moral Disputes - # prompt tokens": "{\"description\": \"min=542.506, mean=542.506, max=542.506, sum=1085.012 (2)\", \"tab\": \"General information\", \"score\": \"542.5057803468208\"}",
+              "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
+              "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}",
+              "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Moral Scenarios - # prompt tokens": "{\"description\": \"min=756.479, mean=756.479, max=756.479, sum=1512.959 (2)\", \"tab\": \"General information\", \"score\": \"756.4793296089385\"}",
+              "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"moral_scenarios\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_moral_scenarios\""
+            }
           }
         },
         {
-          "evaluation_name": "NaturalQuestions (closed-book)",
+          "evaluation_name": "Nutrition",
           "source_data": {
-            "dataset_name": "NaturalQuestions (closed-book)",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "F1 on NaturalQuestions (closed-book)",
+            "evaluation_description": "EM on Nutrition",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.39,
+            "score": 0.725,
             "details": {
-              "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)",
+              "description": "min=0.725, mean=0.725, max=0.725, sum=1.451 (2)",
               "tab": "Accuracy",
-              "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.636, mean=0.636, max=0.636, sum=0.636 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.6355201268196106\"}",
-              "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.469, mean=0.469, max=0.469, sum=0.469 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4687326259613037\"}",
-              "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "NaturalQuestions (open-book) - # train": "{\"description\": \"min=4.825, mean=4.825, max=4.825, sum=4.825 (1)\", \"tab\": \"General information\", \"score\": \"4.825\"}",
-              "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0.028, mean=0.028, max=0.028, sum=0.028 (1)\", \"tab\": \"General information\", \"score\": \"0.028\"}",
-              "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=2311.514, mean=2311.514, max=2311.514, sum=2311.514 (1)\", \"tab\": \"General information\", \"score\": \"2311.514\"}",
-              "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=18.701, mean=18.701, max=18.701, sum=18.701 (1)\", \"tab\": \"General information\", \"score\": \"18.701\"}",
-              "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=166.383, mean=166.383, max=166.383, sum=166.383 (1)\", \"tab\": \"General information\", \"score\": \"166.383\"}",
-              "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=14.473, mean=14.473, max=14.473, sum=14.473 (1)\", \"tab\": \"General information\", \"score\": \"14.473\"}"
+              "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.417, mean=0.417, max=0.417, sum=0.835 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41727598430284485\"}",
+              "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}",
+              "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Nutrition - # prompt tokens": "{\"description\": \"min=695.922, mean=695.922, max=695.922, sum=1391.843 (2)\", \"tab\": \"General information\", \"score\": \"695.9215686274509\"}",
+              "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "mode": "\"closedbook\""
+              "subject": "\"nutrition\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_nutrition\""
             }
           }
         },
         {
-          "evaluation_name": "OpenbookQA",
+          "evaluation_name": "Prehistory",
           "source_data": {
-            "dataset_name": "OpenbookQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on OpenbookQA",
+            "evaluation_description": "EM on Prehistory",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.828,
+            "score": 0.79,
             "details": {
-              "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)",
+              "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)",
               "tab": "Accuracy",
-              "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.284, mean=0.284, max=0.284, sum=0.284 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.2840936713218689\"}",
-              "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}",
-              "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "OpenbookQA - # prompt tokens": "{\"description\": \"min=291.574, mean=291.574, max=291.574, sum=291.574 (1)\", \"tab\": \"General information\", \"score\": \"291.574\"}",
-              "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.43, mean=0.43, max=0.43, sum=0.861 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4303552037403907\"}",
+              "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}",
+              "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Prehistory - # prompt tokens": "{\"description\": \"min=619.185, mean=619.185, max=619.185, sum=1238.37 (2)\", \"tab\": \"General information\", \"score\": \"619.1851851851852\"}",
+              "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "dataset": "\"openbookqa\"",
-              "method": "\"multiple_choice_joint\""
+              "subject": "\"prehistory\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_prehistory\""
             }
           }
         },
         {
-          "evaluation_name": "MMLU",
+          "evaluation_name": "Public Relations",
           "source_data": {
-            "dataset_name": "MMLU",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MMLU",
+            "evaluation_description": "EM on Public Relations",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.575,
+            "score": 0.664,
             "details": {
-              "description": "min=0.31, mean=0.575, max=0.88, sum=2.876 (5)",
+              "description": "min=0.664, mean=0.664, max=0.664, sum=1.327 (2)",
               "tab": "Accuracy",
-              "MMLU - Observed inference time (s)": "{\"description\": \"min=0.293, mean=0.303, max=0.317, sum=1.516 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.30325288054817606\"}",
-              "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}",
-              "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MMLU - # prompt tokens": "{\"description\": \"min=406.65, mean=531.547, max=693.675, sum=2657.735 (5)\", \"tab\": \"General information\", \"score\": \"531.5470877192982\"}",
-              "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.428, mean=0.428, max=0.428, sum=0.855 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42750670259649104\"}",
+              "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}",
+              "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Public Relations - # prompt tokens": "{\"description\": \"min=474.827, mean=474.827, max=474.827, sum=949.655 (2)\", \"tab\": \"General information\", \"score\": \"474.8272727272727\"}",
+              "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]",
-              "method": "\"multiple_choice_joint\""
+              "subject": "\"public_relations\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_public_relations\""
             }
           }
         },
         {
-          "evaluation_name": "MATH",
+          "evaluation_name": "Security Studies",
           "source_data": {
-            "dataset_name": "MATH",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "Equivalent (CoT) on MATH",
+            "evaluation_description": "EM on Security Studies",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.519,
+            "score": 0.78,
             "details": {
-              "description": "min=0.316, mean=0.519, max=0.785, sum=3.636 (7)",
+              "description": "min=0.78, mean=0.78, max=0.78, sum=1.559 (2)",
               "tab": "Accuracy",
-              "MATH - Observed inference time (s)": "{\"description\": \"min=1.482, mean=1.724, max=1.995, sum=12.068 (7)\", \"tab\": \"Efficiency\", \"score\": \"1.723981539653867\"}",
-              "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}",
-              "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}",
-              "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MATH - # prompt tokens": "{\"description\": \"min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)\", \"tab\": \"General information\", \"score\": \"1438.6362030100095\"}",
-              "MATH - # output tokens": "{\"description\": \"min=82.872, mean=98.802, max=122.233, sum=691.615 (7)\", \"tab\": \"General information\", \"score\": \"98.80208187931566\"}"
+              "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.466, mean=0.466, max=0.466, sum=0.933 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4662662194699657\"}",
+              "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}",
+              "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Security Studies - # prompt tokens": "{\"description\": \"min=1377.531, mean=1377.531, max=1377.531, sum=2755.061 (2)\", \"tab\": \"General information\", \"score\": \"1377.530612244898\"}",
+              "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]",
-              "level": "\"1\"",
-              "use_official_examples": "\"False\"",
-              "use_chain_of_thought": "\"True\""
+              "subject": "\"security_studies\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_security_studies\""
             }
           }
         },
         {
-          "evaluation_name": "GSM8K",
+          "evaluation_name": "Sociology",
           "source_data": {
-            "dataset_name": "GSM8K",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on GSM8K",
+            "evaluation_description": "EM on Sociology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.768,
+            "score": 0.891,
             "details": {
-              "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)",
+              "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)",
               "tab": "Accuracy",
-              "GSM8K - Observed inference time (s)": "{\"description\": \"min=2.961, mean=2.961, max=2.961, sum=2.961 (1)\", \"tab\": \"Efficiency\", \"score\": \"2.9610197002887726\"}",
-              "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}",
-              "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "GSM8K - # prompt tokens": "{\"description\": \"min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)\", \"tab\": \"General information\", \"score\": \"1207.746\"}",
-              "GSM8K - # output tokens": "{\"description\": \"min=189.305, mean=189.305, max=189.305, sum=189.305 (1)\", \"tab\": \"General information\", \"score\": \"189.305\"}"
+              "Sociology - Observed inference time (s)": "{\"description\": \"min=0.416, mean=0.416, max=0.416, sum=0.832 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4159522590352528\"}",
+              "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}",
+              "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Sociology - # prompt tokens": "{\"description\": \"min=508.478, mean=508.478, max=508.478, sum=1016.955 (2)\", \"tab\": \"General information\", \"score\": \"508.4776119402985\"}",
+              "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "stop": "\"none\""
+              "subject": "\"sociology\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_sociology\""
             }
           }
         },
         {
-          "evaluation_name": "LegalBench",
+          "evaluation_name": "Virology",
           "source_data": {
-            "dataset_name": "LegalBench",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on LegalBench",
+            "evaluation_description": "EM on Virology",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.588,
+            "score": 0.536,
             "details": {
-              "description": "min=0.351, mean=0.588, max=0.874, sum=2.94 (5)",
+              "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)",
               "tab": "Accuracy",
-              "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.292, mean=0.346, max=0.462, sum=1.729 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.34576316386866485\"}",
-              "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}",
-              "LegalBench - # train": "{\"description\": \"min=1.81, mean=4.162, max=5, sum=20.81 (5)\", \"tab\": \"General information\", \"score\": \"4.162040816326531\"}",
-              "LegalBench - truncated": "{\"description\": \"min=0, mean=0.002, max=0.008, sum=0.008 (5)\", \"tab\": \"General information\", \"score\": \"0.0016326530612244899\"}",
-              "LegalBench - # prompt tokens": "{\"description\": \"min=239.137, mean=1024.722, max=3561.237, sum=5123.61 (5)\", \"tab\": \"General information\", \"score\": \"1024.7220443430492\"}",
-              "LegalBench - # output tokens": "{\"description\": \"min=2, mean=2.438, max=3.421, sum=12.188 (5)\", \"tab\": \"General information\", \"score\": \"2.4375592890361366\"}"
+              "Virology - Observed inference time (s)": "{\"description\": \"min=0.405, mean=0.405, max=0.405, sum=0.809 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40467354332108096\"}",
+              "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}",
+              "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "Virology - # prompt tokens": "{\"description\": \"min=405.108, mean=405.108, max=405.108, sum=810.217 (2)\", \"tab\": \"General information\", \"score\": \"405.10843373493975\"}",
+              "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
             "additional_details": {
-              "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]"
+              "subject": "\"virology\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_virology\""
             }
           }
         },
         {
-          "evaluation_name": "MedQA",
+          "evaluation_name": "World Religions",
           "source_data": {
-            "dataset_name": "MedQA",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "EM on MedQA",
+            "evaluation_description": "EM on World Religions",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.581,
+            "score": 0.854,
             "details": {
-              "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)",
+              "description": "min=0.854, mean=0.854, max=0.854, sum=1.708 (2)",
               "tab": "Accuracy",
-              "MedQA - Observed inference time (s)": "{\"description\": \"min=0.313, mean=0.313, max=0.313, sum=0.313 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.31300480038697864\"}",
-              "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}",
-              "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
-              "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "MedQA - # prompt tokens": "{\"description\": \"min=1243.901, mean=1243.901, max=1243.901, sum=1243.901 (1)\", \"tab\": \"General information\", \"score\": \"1243.9005964214712\"}",
-              "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
+              "World Religions - Observed inference time (s)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.787 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39336834455791275\"}",
+              "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}",
+              "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}",
+              "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
+              "World Religions - # prompt tokens": "{\"description\": \"min=304.474, mean=304.474, max=304.474, sum=608.947 (2)\", \"tab\": \"General information\", \"score\": \"304.4736842105263\"}",
+              "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}"
             }
           },
           "generation_config": {
-            "additional_details": {}
+            "additional_details": {
+              "subject": "\"world_religions\"",
+              "method": "\"multiple_choice_joint\"",
+              "eval_split": "\"test\"",
+              "groups": "\"mmlu_world_religions\""
+            }
           }
         },
         {
-          "evaluation_name": "WMT 2014",
+          "evaluation_name": "Mean win rate",
           "source_data": {
-            "dataset_name": "WMT 2014",
+            "dataset_name": "helm_mmlu",
             "source_type": "url",
             "url": [
-              "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json"
+              "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
             ]
           },
           "metric_config": {
-            "evaluation_description": "BLEU-4 on WMT 2014",
+            "evaluation_description": "How many models this model outperforms on average (over columns).",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.172,
+            "score": 0.565,
             "details": {
-              "description": "min=0.09, mean=0.172, max=0.217, sum=0.86 (5)",
-              "tab": "Accuracy",
-              "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=0.65, mean=0.681, max=0.702, sum=3.405 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.681007040066764\"}",
-              "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}",
-              "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}",
-              "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}",
-              "WMT 2014 - # prompt tokens": "{\"description\": \"min=145.523, mean=160.288, max=182.972, sum=801.438 (5)\", \"tab\": \"General information\", \"score\": \"160.28751290334915\"}",
-              "WMT 2014 - # output tokens": "{\"description\": \"min=28.596, mean=30.59, max=31.485, sum=152.951 (5)\", \"tab\": \"General information\", \"score\": \"30.59012702630372\"}"
+              "description": "",
+              "tab": "Efficiency"
             }
           },
           "generation_config": {
-            "additional_details": {
-              "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]"
-            }
+            "additional_details": {}
           }
         }
       ],
       "detailed_evaluation_results": null,
       "generation_config": {
-        "additional_details": {}
+        "additional_details": {
+          "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]",
+          "method": "\"multiple_choice_joint\"",
+          "eval_split": "\"test\"",
+          "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]"
+        }
       }
     }
   ]
diff --git a/data/models/ucla-agi_llama-3-instruct-8b-sppo-iter3.json b/data/models/ucla-agi_llama-3-instruct-8b-sppo-iter3.json
index dd0fde633a69c27555ebadb4cd9d2b225a2b1309..6c8b4b0133a49d420e13a89373c13c0888830855 100644
--- a/data/models/ucla-agi_llama-3-instruct-8b-sppo-iter3.json
+++ b/data/models/ucla-agi_llama-3-instruct-8b-sppo-iter3.json
@@ -5,7 +5,7 @@
     "developer": "UCLA-AGI",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "LlamaForCausalLM",
       "params_billions": "8.03"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6703
+            "score": 0.6834
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5076
+            "score": 0.508
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0718
+            "score": 0.0959
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3647
+            "score": 0.3661
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3658
+            "score": 0.3644
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6834
+            "score": 0.6703
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.508
+            "score": 0.5076
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0959
+            "score": 0.0718
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3661
+            "score": 0.3647
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3644
+            "score": 0.3658
           }
         }
       ],
diff --git a/data/models/valiantlabs_llama3.1-8b-fireplace2.json b/data/models/valiantlabs_llama3.1-8b-fireplace2.json
index 0a050b1b1736e6133f5ceae5b758cb4a60844c7f..1328c2c7f3791f083d4f447f5573aa125ba6707d 100644
--- a/data/models/valiantlabs_llama3.1-8b-fireplace2.json
+++ b/data/models/valiantlabs_llama3.1-8b-fireplace2.json
@@ -5,7 +5,7 @@
     "developer": "ValiantLabs",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "LlamaForCausalLM",
       "params_billions": "8.03"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5483
+            "score": 0.5328
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.461
+            "score": 0.4613
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0582
+            "score": 0.0876
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2886
+            "score": 0.2894
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3433
+            "score": 0.3367
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2407
+            "score": 0.2424
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5328
+            "score": 0.5483
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4613
+            "score": 0.461
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0876
+            "score": 0.0582
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2894
+            "score": 0.2886
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3367
+            "score": 0.3433
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2424
+            "score": 0.2407
           }
         }
       ],
diff --git a/data/models/valiantlabs_llama3.1-8b-shiningvaliant2.json b/data/models/valiantlabs_llama3.1-8b-shiningvaliant2.json
index 0736460b872bea97a209be68cef1f113fa7d9f3d..f3e37b204fa779a9e21a0521a813b464c3fe641b 100644
--- a/data/models/valiantlabs_llama3.1-8b-shiningvaliant2.json
+++ b/data/models/valiantlabs_llama3.1-8b-shiningvaliant2.json
@@ -5,7 +5,7 @@
     "developer": "ValiantLabs",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "LlamaForCausalLM",
       "params_billions": "8.03"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2678
+            "score": 0.6496
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4429
+            "score": 0.4774
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0521
+            "score": 0.0566
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.302
+            "score": 0.3104
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3959
+            "score": 0.3909
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2927
+            "score": 0.3382
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6496
+            "score": 0.2678
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4774
+            "score": 0.4429
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0566
+            "score": 0.0521
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3104
+            "score": 0.302
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3909
+            "score": 0.3959
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3382
+            "score": 0.2927
           }
         }
       ],
diff --git a/data/models/virnect_llama-3-korean-8b.json b/data/models/virnect_llama-3-korean-8b.json
index 99d18345be715ff626474cdbe5ae8e088352cbe8..10825c433e16c0218b8189ab67e3befd38673ff7 100644
--- a/data/models/virnect_llama-3-korean-8b.json
+++ b/data/models/virnect_llama-3-korean-8b.json
@@ -5,7 +5,7 @@
     "developer": "VIRNECT",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "LlamaForCausalLM",
       "params_billions": "8.03"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5058
+            "score": 0.5021
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4908
+            "score": 0.4918
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0929
+            "score": 0.108
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3662
+            "score": 0.3648
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3539
+            "score": 0.3536
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5021
+            "score": 0.5058
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4918
+            "score": 0.4908
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.108
+            "score": 0.0929
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3648
+            "score": 0.3662
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3536
+            "score": 0.3539
           }
         }
       ],
diff --git a/data/models/weqweasdas_hh_rlhf_rm_open_llama_3b.json b/data/models/weqweasdas_hh_rlhf_rm_open_llama_3b.json
index f9d2b14f0a0f79f11e39957c0f38b89aa1e78ac9..564c2ceeb0768d947ec7e8507c351558f76e3907 100644
--- a/data/models/weqweasdas_hh_rlhf_rm_open_llama_3b.json
+++ b/data/models/weqweasdas_hh_rlhf_rm_open_llama_3b.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816",
+      "evaluation_id": "reward-bench-2/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,109 +31,127 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5027
+            "score": 0.2498
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8184
+            "score": 0.3642
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3728
+            "score": 0.275
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
+          }
+        },
+        {
+          "evaluation_name": "Math",
+          "metric_config": {
+            "evaluation_description": "Math score - measures mathematical reasoning",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.3497
+          },
+          "source_data": {
+            "dataset_name": "RewardBench 2",
+            "source_type": "hf_dataset",
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4149
+            "score": 0.24
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3281
+            "score": 0.2384
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Prior Sets (0.5 weight)",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6564
+            "score": 0.0315
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
@@ -141,10 +159,10 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "reward-bench-2/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816",
+      "evaluation_id": "reward-bench/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -163,127 +181,109 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.2498
-          },
-          "source_data": {
-            "dataset_name": "RewardBench 2",
-            "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
-          }
-        },
-        {
-          "evaluation_name": "Factuality",
-          "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3642
+            "score": 0.5027
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.275
+            "score": 0.8184
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3497
+            "score": 0.3728
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.24
+            "score": 0.4149
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2384
+            "score": 0.3281
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Prior Sets (0.5 weight)",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0315
+            "score": 0.6564
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
diff --git a/data/models/weqweasdas_rm-gemma-2b.json b/data/models/weqweasdas_rm-gemma-2b.json
index 23bdb9776ed416ae7ab56cbae8b9e5655ba27e0c..b1151d30882a530616e4e5f2252a198e5a25a8a2 100644
--- a/data/models/weqweasdas_rm-gemma-2b.json
+++ b/data/models/weqweasdas_rm-gemma-2b.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench/weqweasdas_RM-Gemma-2B/1766412838.146816",
+      "evaluation_id": "reward-bench-2/weqweasdas_RM-Gemma-2B/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,109 +31,127 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6549
+            "score": 0.3057
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9441
+            "score": 0.3705
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4079
+            "score": 0.2812
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
+          }
+        },
+        {
+          "evaluation_name": "Math",
+          "metric_config": {
+            "evaluation_description": "Math score - measures mathematical reasoning",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.4317
+          },
+          "source_data": {
+            "dataset_name": "RewardBench 2",
+            "source_type": "hf_dataset",
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4986
+            "score": 0.3311
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7637
+            "score": 0.2343
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Prior Sets (0.5 weight)",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6652
+            "score": 0.1851
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
@@ -141,10 +159,10 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "reward-bench-2/weqweasdas_RM-Gemma-2B/1766412838.146816",
+      "evaluation_id": "reward-bench/weqweasdas_RM-Gemma-2B/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -163,127 +181,109 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.3057
-          },
-          "source_data": {
-            "dataset_name": "RewardBench 2",
-            "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
-          }
-        },
-        {
-          "evaluation_name": "Factuality",
-          "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3705
+            "score": 0.6549
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2812
+            "score": 0.9441
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4317
+            "score": 0.4079
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3311
+            "score": 0.4986
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.2343
+            "score": 0.7637
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Prior Sets (0.5 weight)",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.1851
+            "score": 0.6652
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
diff --git a/data/models/weqweasdas_rm-mistral-7b.json b/data/models/weqweasdas_rm-mistral-7b.json
index 014b8589e308fa2571a3ca85971364da616341ae..2c0c95b4657b4530753b94c6c05b68b220f49072 100644
--- a/data/models/weqweasdas_rm-mistral-7b.json
+++ b/data/models/weqweasdas_rm-mistral-7b.json
@@ -9,10 +9,10 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "reward-bench/weqweasdas_RM-Mistral-7B/1766412838.146816",
+      "evaluation_id": "reward-bench-2/weqweasdas_RM-Mistral-7B/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench",
+        "source_name": "RewardBench 2",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -31,109 +31,127 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench Score",
+            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7982
+            "score": 0.596
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat",
+          "evaluation_name": "Factuality",
           "metric_config": {
-            "evaluation_description": "Chat accuracy - includes easy chat subsets",
+            "evaluation_description": "Factuality score - measures factual accuracy",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.9665
+            "score": 0.5937
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Chat Hard",
+          "evaluation_name": "Precise IF",
           "metric_config": {
-            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
+            "evaluation_description": "Precise Instruction Following score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6053
+            "score": 0.3438
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
+          }
+        },
+        {
+          "evaluation_name": "Math",
+          "metric_config": {
+            "evaluation_description": "Math score - measures mathematical reasoning",
+            "lower_is_better": false,
+            "score_type": "continuous",
+            "min_score": 0.0,
+            "max_score": 1.0
+          },
+          "score_details": {
+            "score": 0.5956
+          },
+          "source_data": {
+            "dataset_name": "RewardBench 2",
+            "source_type": "hf_dataset",
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety accuracy - includes safety subsets",
+            "evaluation_description": "Safety score - measures safety awareness",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.8703
+            "score": 0.6911
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Reasoning",
+          "evaluation_name": "Focus",
           "metric_config": {
-            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
+            "evaluation_description": "Focus score - measures response focus",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7736
+            "score": 0.7293
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         },
         {
-          "evaluation_name": "Prior Sets (0.5 weight)",
+          "evaluation_name": "Ties",
           "metric_config": {
-            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
+            "evaluation_description": "Ties score - ability to identify tie cases",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.753
+            "score": 0.6226
           },
           "source_data": {
-            "dataset_name": "RewardBench",
+            "dataset_name": "RewardBench 2",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench"
+            "hf_repo": "allenai/reward-bench-2-results"
           }
         }
       ],
@@ -141,10 +159,10 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "reward-bench-2/weqweasdas_RM-Mistral-7B/1766412838.146816",
+      "evaluation_id": "reward-bench/weqweasdas_RM-Mistral-7B/1766412838.146816",
       "retrieved_timestamp": "1766412838.146816",
       "source_metadata": {
-        "source_name": "RewardBench 2",
+        "source_name": "RewardBench",
         "source_type": "documentation",
         "source_organization_name": "Allen Institute for AI",
         "source_organization_url": "https://allenai.org",
@@ -163,127 +181,109 @@
         {
           "evaluation_name": "Score",
           "metric_config": {
-            "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)",
-            "lower_is_better": false,
-            "score_type": "continuous",
-            "min_score": 0.0,
-            "max_score": 1.0
-          },
-          "score_details": {
-            "score": 0.596
-          },
-          "source_data": {
-            "dataset_name": "RewardBench 2",
-            "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
-          }
-        },
-        {
-          "evaluation_name": "Factuality",
-          "metric_config": {
-            "evaluation_description": "Factuality score - measures factual accuracy",
+            "evaluation_description": "Overall RewardBench Score",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5937
+            "score": 0.7982
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Precise IF",
+          "evaluation_name": "Chat",
           "metric_config": {
-            "evaluation_description": "Precise Instruction Following score",
+            "evaluation_description": "Chat accuracy - includes easy chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3438
+            "score": 0.9665
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Math",
+          "evaluation_name": "Chat Hard",
           "metric_config": {
-            "evaluation_description": "Math score - measures mathematical reasoning",
+            "evaluation_description": "Chat Hard accuracy - includes hard chat subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5956
+            "score": 0.6053
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
           "evaluation_name": "Safety",
           "metric_config": {
-            "evaluation_description": "Safety score - measures safety awareness",
+            "evaluation_description": "Safety accuracy - includes safety subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6911
+            "score": 0.8703
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Focus",
+          "evaluation_name": "Reasoning",
           "metric_config": {
-            "evaluation_description": "Focus score - measures response focus",
+            "evaluation_description": "Reasoning accuracy - includes code and math subsets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7293
+            "score": 0.7736
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         },
         {
-          "evaluation_name": "Ties",
+          "evaluation_name": "Prior Sets (0.5 weight)",
           "metric_config": {
-            "evaluation_description": "Ties score - ability to identify tie cases",
+            "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets",
             "lower_is_better": false,
             "score_type": "continuous",
             "min_score": 0.0,
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6226
+            "score": 0.753
           },
           "source_data": {
-            "dataset_name": "RewardBench 2",
+            "dataset_name": "RewardBench",
             "source_type": "hf_dataset",
-            "hf_repo": "allenai/reward-bench-2-results"
+            "hf_repo": "allenai/reward-bench"
           }
         }
       ],
diff --git a/data/models/xai_grok-3-mini.json b/data/models/xai_grok-3-mini.json
index 6f7e913322e4d12afd1b4e9815b3c829b5eb051d..7fda2d0643a4e1f5eb9d98f1dc44e9ec85970d5d 100644
--- a/data/models/xai_grok-3-mini.json
+++ b/data/models/xai_grok-3-mini.json
@@ -10,8 +10,8 @@
   },
   "evaluations": [
     {
-      "evaluation_id": "global-mmlu-lite/xai_grok-3-mini/1773936583.743359",
-      "retrieved_timestamp": "1773936583.743359",
+      "evaluation_id": "global-mmlu-lite/xai_grok-3-mini/1773936496.366405",
+      "retrieved_timestamp": "1773936496.366405",
       "source_metadata": {
         "source_name": "Global MMLU Lite Leaderboard",
         "source_type": "documentation",
@@ -525,8 +525,8 @@
       "generation_config": null
     },
     {
-      "evaluation_id": "global-mmlu-lite/xai_grok-3-mini/1773936496.366405",
-      "retrieved_timestamp": "1773936496.366405",
+      "evaluation_id": "global-mmlu-lite/xai_grok-3-mini/1773936583.743359",
+      "retrieved_timestamp": "1773936583.743359",
       "source_metadata": {
         "source_name": "Global MMLU Lite Leaderboard",
         "source_type": "documentation",
diff --git a/data/models/xai_grok-4.json b/data/models/xai_grok-4.json
index 2396de921bef3d760583d06f283159a7a4b7cc87..8fb7b1b24e602a14c14ed4217ea5e7cd9fdf0b69 100644
--- a/data/models/xai_grok-4.json
+++ b/data/models/xai_grok-4.json
@@ -84,7 +84,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__grok-4/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__grok-4/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +108,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-10-31",
+          "evaluation_timestamp": "2025-11-03",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,7 +117,7 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 23.1,
+            "score": 25.4,
             "uncertainty": {
               "standard_error": {
                 "value": 2.9
@@ -127,7 +127,7 @@
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok 4\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok 4\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -144,7 +144,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok 4\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok 4\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -158,7 +158,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__grok-4/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__grok-4/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -182,7 +182,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-03",
+          "evaluation_timestamp": "2025-10-31",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -191,7 +191,7 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 25.4,
+            "score": 23.1,
             "uncertainty": {
               "standard_error": {
                 "value": 2.9
@@ -201,7 +201,7 @@
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok 4\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok 4\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -218,7 +218,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok 4\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok 4\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/xai_grok-code-fast-1.json b/data/models/xai_grok-code-fast-1.json
index 3dead5de67b9b087729b3bc778973bf9f0bdd596..80a497dcbe0e067252e23ff5d2567dcecf97e827 100644
--- a/data/models/xai_grok-code-fast-1.json
+++ b/data/models/xai_grok-code-fast-1.json
@@ -4,13 +4,13 @@
     "id": "xai/grok-code-fast-1",
     "developer": "xAI",
     "additional_details": {
-      "agent_name": "Terminus 2",
-      "agent_organization": "Terminal Bench"
+      "agent_name": "Mini-SWE-Agent",
+      "agent_organization": "Princeton"
     }
   },
   "evaluations": [
     {
-      "evaluation_id": "terminal-bench-2.0/terminus-2__grok-code-fast-1/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__grok-code-fast-1/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -34,7 +34,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-10-31",
+          "evaluation_timestamp": "2025-11-03",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -43,17 +43,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 14.2,
+            "score": 25.8,
             "uncertainty": {
               "standard_error": {
-                "value": 2.5
+                "value": 2.6
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok Code Fast 1\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok Code Fast 1\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -70,7 +70,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok Code Fast 1\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok Code Fast 1\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
@@ -84,7 +84,7 @@
       }
     },
     {
-      "evaluation_id": "terminal-bench-2.0/mini-swe-agent__grok-code-fast-1/1773776901.772108",
+      "evaluation_id": "terminal-bench-2.0/terminus-2__grok-code-fast-1/1773776901.772108",
       "retrieved_timestamp": "1773776901.772108",
       "source_metadata": {
         "source_name": "Terminal-Bench 2.0",
@@ -108,7 +108,7 @@
               "https://www.tbench.ai/leaderboard/terminal-bench/2.0"
             ]
           },
-          "evaluation_timestamp": "2025-11-03",
+          "evaluation_timestamp": "2025-10-31",
           "metric_config": {
             "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each",
             "lower_is_better": false,
@@ -117,17 +117,17 @@
             "max_score": 100.0
           },
           "score_details": {
-            "score": 25.8,
+            "score": 14.2,
             "uncertainty": {
               "standard_error": {
-                "value": 2.6
+                "value": 2.5
               },
               "num_samples": 435
             }
           },
           "generation_config": {
             "generation_args": {
-              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok Code Fast 1\" -k 5",
+              "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok Code Fast 1\" -k 5",
               "agentic_eval_config": {
                 "available_tools": [
                   {
@@ -144,7 +144,7 @@
       "detailed_evaluation_results": null,
       "generation_config": {
         "generation_args": {
-          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok Code Fast 1\" -k 5",
+          "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok Code Fast 1\" -k 5",
           "agentic_eval_config": {
             "available_tools": [
               {
diff --git a/data/models/ycros_bagelmisterytour-v2-8x7b.json b/data/models/ycros_bagelmisterytour-v2-8x7b.json
index c7b7f840ab350df665db2f8b289c03a4556651c3..ba69aabd10b1f09ccc48e0969d876027b03e3a4b 100644
--- a/data/models/ycros_bagelmisterytour-v2-8x7b.json
+++ b/data/models/ycros_bagelmisterytour-v2-8x7b.json
@@ -5,7 +5,7 @@
     "developer": "ycros",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "float16",
+      "precision": "bfloat16",
       "architecture": "MixtralForCausalLM",
       "params_billions": "46.703"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5994
+            "score": 0.6262
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5159
+            "score": 0.5142
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0785
+            "score": 0.0937
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3045
+            "score": 0.3079
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4203
+            "score": 0.4138
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3473
+            "score": 0.3481
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6262
+            "score": 0.5994
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5142
+            "score": 0.5159
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0937
+            "score": 0.0785
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3079
+            "score": 0.3045
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4138
+            "score": 0.4203
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3481
+            "score": 0.3473
           }
         }
       ],
diff --git a/data/models/yoyo-ai_qwen2.5-14b-yoyo-1010.json b/data/models/yoyo-ai_qwen2.5-14b-yoyo-1010.json
index e42112fd11a9b36734c8e8c4176443d8815fdc44..2a296ce898176000fe4f075772b9bc2bcb8d3b71 100644
--- a/data/models/yoyo-ai_qwen2.5-14b-yoyo-1010.json
+++ b/data/models/yoyo-ai_qwen2.5-14b-yoyo-1010.json
@@ -5,7 +5,7 @@
     "developer": "YOYO-AI",
     "inference_platform": "unknown",
     "additional_details": {
-      "precision": "bfloat16",
+      "precision": "float16",
       "architecture": "Qwen2ForCausalLM",
       "params_billions": "14.77"
     }
@@ -44,7 +44,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.7905
+            "score": 0.5899
           }
         },
         {
@@ -62,7 +62,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.6406
+            "score": 0.654
           }
         },
         {
@@ -80,7 +80,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.0
+            "score": 0.4509
           }
         },
         {
@@ -98,7 +98,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3163
+            "score": 0.3834
           }
         },
         {
@@ -116,7 +116,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4181
+            "score": 0.4744
           }
         },
         {
@@ -134,7 +134,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4944
+            "score": 0.5376
           }
         }
       ],
@@ -174,7 +174,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5899
+            "score": 0.7905
           }
         },
         {
@@ -192,7 +192,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.654
+            "score": 0.6406
           }
         },
         {
@@ -210,7 +210,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4509
+            "score": 0.0
           }
         },
         {
@@ -228,7 +228,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.3834
+            "score": 0.3163
           }
         },
         {
@@ -246,7 +246,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.4744
+            "score": 0.4181
           }
         },
         {
@@ -264,7 +264,7 @@
             "max_score": 1.0
           },
           "score_details": {
-            "score": 0.5376
+            "score": 0.4944
           }
         }
       ],