diff --git a/data/benchmarks.json b/data/benchmarks.json index b2c2822ebba1561c909be7c113905d12c56e46d9..b89fad043205746a1fe3c3d97fb375c96123e519 100644 --- a/data/benchmarks.json +++ b/data/benchmarks.json @@ -45,7 +45,7 @@ }, { "benchmark": "hfopenllm_v2", - "model_count": 4494 + "model_count": 4493 }, { "benchmark": "la_leaderboard", @@ -78,5 +78,9 @@ { "benchmark": "terminal-bench-2.0", "model_count": 37 + }, + { + "benchmark": "theory_of_mind", + "model_count": 1 } ] \ No newline at end of file diff --git a/data/benchmarks/appworld_test_normal.json b/data/benchmarks/appworld_test_normal.json index 6b56627ff81e57642084626a8f27419d8d068837..27bfa4d8d1ae3df861f951b1a84e2236e37e45c3 100644 --- a/data/benchmarks/appworld_test_normal.json +++ b/data/benchmarks/appworld_test_normal.json @@ -5,7 +5,7 @@ "name": "claude-opus-4-5", "developer": "Anthropic", "scores": { - "appworld/test_normal": 0.68 + "appworld/test_normal": 0.7 } }, { @@ -13,7 +13,7 @@ "name": "gemini-3-pro-preview", "developer": "Google", "scores": { - "appworld/test_normal": 0.13 + "appworld/test_normal": 0.55 } }, { diff --git a/data/benchmarks/browsecompplus.json b/data/benchmarks/browsecompplus.json index cbe338e54d9ae32cab6bb8e3bb3af6af8c5948a3..fa5cad4bfa9a7ba3be365ad38a5955a6ca39cc9b 100644 --- a/data/benchmarks/browsecompplus.json +++ b/data/benchmarks/browsecompplus.json @@ -13,7 +13,7 @@ "name": "gemini-3-pro-preview", "developer": "Google", "scores": { - "browsecompplus": 0.48 + "browsecompplus": 0.3333 } }, { @@ -21,7 +21,7 @@ "name": "gpt-5.2-2025-12-11", "developer": "OpenAI", "scores": { - "browsecompplus": 0.48 + "browsecompplus": 0.43 } } ] diff --git a/data/benchmarks/hfopenllm_v2.json b/data/benchmarks/hfopenllm_v2.json index b2e8b3aab80c98d2c6986f81a5c4e1c543ca42ae..caf5db92e1eb1042b92e8b5d4090c0a48c4f57e2 100644 --- a/data/benchmarks/hfopenllm_v2.json +++ b/data/benchmarks/hfopenllm_v2.json @@ -2176,12 +2176,12 @@ "name": "LION-Gemma-2b-dpo-v1.0", "developer": "Columbia-NLP", "scores": { - "IFEval": 0.3278, - "BBH": 0.392, - "MATH Level 5": 0.0431, - "GPQA": 0.2492, - "MUSR": 0.412, - "MMLU-PRO": 0.1666 + "IFEval": 0.3102, + "BBH": 0.3881, + "MATH Level 5": 0.0536, + "GPQA": 0.2534, + "MUSR": 0.4081, + "MMLU-PRO": 0.1665 } }, { @@ -3229,12 +3229,12 @@ "name": "PathfinderAI", "developer": "Daemontatox", "scores": { - "IFEval": 0.3745, - "BBH": 0.6668, - "MATH Level 5": 0.4758, - "GPQA": 0.3943, - "MUSR": 0.4858, - "MMLU-PRO": 0.5593 + "IFEval": 0.4855, + "BBH": 0.6627, + "MATH Level 5": 0.4841, + "GPQA": 0.3096, + "MUSR": 0.4256, + "MMLU-PRO": 0.5542 } }, { @@ -4321,12 +4321,12 @@ "name": "Llama-3.1-8b-ITA", "developer": "DeepMount00", "scores": { - "IFEval": 0.7917, - "BBH": 0.5109, - "MATH Level 5": 0.1088, - "GPQA": 0.2878, - "MUSR": 0.4136, - "MMLU-PRO": 0.3876 + "IFEval": 0.5365, + "BBH": 0.517, + "MATH Level 5": 0.1707, + "GPQA": 0.3062, + "MUSR": 0.4487, + "MMLU-PRO": 0.396 } }, { @@ -4646,12 +4646,12 @@ "name": "MN-12B-LilithFrame", "developer": "DoppelReflEx", "scores": { - "IFEval": 0.451, - "BBH": 0.4944, - "MATH Level 5": 0.1156, - "GPQA": 0.3196, - "MUSR": 0.3896, - "MMLU-PRO": 0.3256 + "IFEval": 0.436, + "BBH": 0.4956, + "MATH Level 5": 0.0589, + "GPQA": 0.3205, + "MUSR": 0.3843, + "MMLU-PRO": 0.3237 } }, { @@ -9144,12 +9144,12 @@ "name": "SmolLM2-135M-Instruct", "developer": "HuggingFaceTB", "scores": { - "IFEval": 0.0593, - "BBH": 0.3135, - "MATH Level 5": 0.0144, - "GPQA": 0.2341, - "MUSR": 0.3871, - "MMLU-PRO": 0.1092 + "IFEval": 0.2883, + "BBH": 0.3124, + "MATH Level 5": 0.003, + "GPQA": 0.2357, + "MUSR": 0.3662, + "MMLU-PRO": 0.1115 } }, { @@ -13057,12 +13057,12 @@ "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_XA", "developer": "LeroyDyer", "scores": { - "IFEval": 0.3579, - "BBH": 0.4477, - "MATH Level 5": 0.0423, - "GPQA": 0.3096, - "MUSR": 0.4134, - "MMLU-PRO": 0.2376 + "IFEval": 0.3798, + "BBH": 0.4483, + "MATH Level 5": 0.04, + "GPQA": 0.3129, + "MUSR": 0.4148, + "MMLU-PRO": 0.2389 } }, { @@ -16874,19 +16874,6 @@ "MMLU-PRO": 0.232 } }, - { - "model_id": "NousResearch/Yarn-Llama-2-7b-128k", - "name": "Yarn-Llama-2-7b-128k", - "developer": "NousResearch", - "scores": { - "IFEval": 0.1485, - "BBH": 0.3248, - "MATH Level 5": 0.0151, - "GPQA": 0.2601, - "MUSR": 0.3967, - "MMLU-PRO": 0.1791 - } - }, { "model_id": "NousResearch/Yarn-Llama-2-7b-64k", "name": "Yarn-Llama-2-7b-64k", @@ -17204,12 +17191,12 @@ "name": "code-yi", "developer": "Omkar1102", "scores": { - "IFEval": 0.2254, - "BBH": 0.275, + "IFEval": 0.2148, + "BBH": 0.276, "MATH Level 5": 0.0, - "GPQA": 0.2576, - "MUSR": 0.3762, - "MMLU-PRO": 0.1123 + "GPQA": 0.2508, + "MUSR": 0.3802, + "MMLU-PRO": 0.1126 } }, { @@ -18141,11 +18128,11 @@ "developer": "PrimeIntellect", "scores": { "IFEval": 0.1757, - "BBH": 0.276, + "BBH": 0.274, "MATH Level 5": 0.0, - "GPQA": 0.2534, - "MUSR": 0.3339, - "MMLU-PRO": 0.1123 + "GPQA": 0.25, + "MUSR": 0.3753, + "MMLU-PRO": 0.112 } }, { @@ -18712,12 +18699,12 @@ "name": "ODB-14B-sce", "developer": "Quazim0t0", "scores": { - "IFEval": 0.7016, - "BBH": 0.6942, - "MATH Level 5": 0.4116, - "GPQA": 0.3624, - "MUSR": 0.4571, - "MMLU-PRO": 0.5411 + "IFEval": 0.2922, + "BBH": 0.6559, + "MATH Level 5": 0.2545, + "GPQA": 0.2659, + "MUSR": 0.3929, + "MMLU-PRO": 0.5207 } }, { @@ -19466,12 +19453,12 @@ "name": "Qwen2.5-0.5B-Instruct", "developer": "Qwen", "scores": { - "IFEval": 0.3153, - "BBH": 0.3322, - "MATH Level 5": 0.1035, - "GPQA": 0.2592, - "MUSR": 0.3342, - "MMLU-PRO": 0.172 + "IFEval": 0.3071, + "BBH": 0.3341, + "MATH Level 5": 0.0, + "GPQA": 0.2576, + "MUSR": 0.3329, + "MMLU-PRO": 0.1697 } }, { @@ -19726,12 +19713,12 @@ "name": "Qwen2.5-Coder-7B-Instruct", "developer": "Qwen", "scores": { - "IFEval": 0.6147, - "BBH": 0.4999, - "MATH Level 5": 0.031, - "GPQA": 0.2936, - "MUSR": 0.4099, - "MMLU-PRO": 0.3354 + "IFEval": 0.6101, + "BBH": 0.5008, + "MATH Level 5": 0.3716, + "GPQA": 0.2919, + "MUSR": 0.4073, + "MMLU-PRO": 0.3352 } }, { @@ -19986,12 +19973,12 @@ "name": "Replete-LLM-Qwen2-7b", "developer": "Replete-AI", "scores": { - "IFEval": 0.0932, - "BBH": 0.2977, + "IFEval": 0.0905, + "BBH": 0.2985, "MATH Level 5": 0.0, - "GPQA": 0.2475, - "MUSR": 0.3941, - "MMLU-PRO": 0.1157 + "GPQA": 0.2534, + "MUSR": 0.3848, + "MMLU-PRO": 0.1158 } }, { @@ -24653,12 +24640,12 @@ "name": "Llama-3-Instruct-8B-SPPO-Iter3", "developer": "UCLA-AGI", "scores": { - "IFEval": 0.6834, - "BBH": 0.508, - "MATH Level 5": 0.0959, + "IFEval": 0.6703, + "BBH": 0.5076, + "MATH Level 5": 0.0718, "GPQA": 0.2651, - "MUSR": 0.3661, - "MMLU-PRO": 0.3644 + "MUSR": 0.3647, + "MMLU-PRO": 0.3658 } }, { @@ -25004,12 +24991,12 @@ "name": "llama-3-Korean-8B", "developer": "VIRNECT", "scores": { - "IFEval": 0.5021, - "BBH": 0.4918, - "MATH Level 5": 0.108, + "IFEval": 0.5058, + "BBH": 0.4908, + "MATH Level 5": 0.0929, "GPQA": 0.271, - "MUSR": 0.3648, - "MMLU-PRO": 0.3536 + "MUSR": 0.3662, + "MMLU-PRO": 0.3539 } }, { @@ -25108,12 +25095,12 @@ "name": "Llama3.1-8B-Fireplace2", "developer": "ValiantLabs", "scores": { - "IFEval": 0.5328, - "BBH": 0.4613, - "MATH Level 5": 0.0876, - "GPQA": 0.2894, - "MUSR": 0.3367, - "MMLU-PRO": 0.2424 + "IFEval": 0.5483, + "BBH": 0.461, + "MATH Level 5": 0.0582, + "GPQA": 0.2886, + "MUSR": 0.3433, + "MMLU-PRO": 0.2407 } }, { @@ -25121,12 +25108,12 @@ "name": "Llama3.1-8B-ShiningValiant2", "developer": "ValiantLabs", "scores": { - "IFEval": 0.6496, - "BBH": 0.4774, - "MATH Level 5": 0.0566, - "GPQA": 0.3104, - "MUSR": 0.3909, - "MMLU-PRO": 0.3382 + "IFEval": 0.2678, + "BBH": 0.4429, + "MATH Level 5": 0.0521, + "GPQA": 0.302, + "MUSR": 0.3959, + "MMLU-PRO": 0.2927 } }, { @@ -25654,12 +25641,12 @@ "name": "Qwen2.5-14B-YOYO-1010", "developer": "YOYO-AI", "scores": { - "IFEval": 0.5899, - "BBH": 0.654, - "MATH Level 5": 0.4509, - "GPQA": 0.3834, - "MUSR": 0.4744, - "MMLU-PRO": 0.5376 + "IFEval": 0.7905, + "BBH": 0.6406, + "MATH Level 5": 0.0, + "GPQA": 0.3163, + "MUSR": 0.4181, + "MMLU-PRO": 0.4944 } }, { @@ -26603,12 +26590,12 @@ "name": "QAIMath-Qwen2.5-7B-TIES", "developer": "adriszmar", "scores": { - "IFEval": 0.1685, - "BBH": 0.3124, - "MATH Level 5": 0.0015, - "GPQA": 0.2492, - "MUSR": 0.3963, - "MMLU-PRO": 0.1066 + "IFEval": 0.1746, + "BBH": 0.3126, + "MATH Level 5": 0.0, + "GPQA": 0.245, + "MUSR": 0.4096, + "MMLU-PRO": 0.1087 } }, { @@ -26889,12 +26876,12 @@ "name": "Llama-3.1-Storm-8B", "developer": "akjindal53244", "scores": { - "IFEval": 0.8051, - "BBH": 0.5189, - "MATH Level 5": 0.1722, - "GPQA": 0.3263, + "IFEval": 0.8033, + "BBH": 0.5196, + "MATH Level 5": 0.1624, + "GPQA": 0.3096, "MUSR": 0.4028, - "MMLU-PRO": 0.3803 + "MMLU-PRO": 0.3812 } }, { @@ -26915,12 +26902,12 @@ "name": "Llama-3.1-Tulu-3-70B", "developer": "allenai", "scores": { - "IFEval": 0.8379, - "BBH": 0.6157, - "MATH Level 5": 0.3829, + "IFEval": 0.8291, + "BBH": 0.6164, + "MATH Level 5": 0.4502, "GPQA": 0.3733, - "MUSR": 0.4988, - "MMLU-PRO": 0.4656 + "MUSR": 0.4948, + "MMLU-PRO": 0.4645 } }, { @@ -31647,12 +31634,12 @@ "name": "dolphin-2.9.2-Phi-3-Medium-abliterated", "developer": "cognitivecomputations", "scores": { - "IFEval": 0.4124, - "BBH": 0.6383, - "MATH Level 5": 0.182, - "GPQA": 0.3289, - "MUSR": 0.4349, - "MMLU-PRO": 0.4525 + "IFEval": 0.3613, + "BBH": 0.6123, + "MATH Level 5": 0.1239, + "GPQA": 0.328, + "MUSR": 0.4112, + "MMLU-PRO": 0.4494 } }, { @@ -31790,12 +31777,12 @@ "name": "llama-43m-beta", "developer": "cpayne1303", "scores": { - "IFEval": 0.1916, - "BBH": 0.2977, - "MATH Level 5": 0.0, + "IFEval": 0.1949, + "BBH": 0.2965, + "MATH Level 5": 0.0045, "GPQA": 0.2685, - "MUSR": 0.3872, - "MMLU-PRO": 0.1132 + "MUSR": 0.3885, + "MMLU-PRO": 0.1111 } }, { @@ -32167,12 +32154,12 @@ "name": "Llama-3-8B-Orpo-v0.1", "developer": "dfurman", "scores": { - "IFEval": 0.2835, - "BBH": 0.3842, - "MATH Level 5": 0.0521, - "GPQA": 0.2609, - "MUSR": 0.3566, - "MMLU-PRO": 0.2298 + "IFEval": 0.3, + "BBH": 0.3853, + "MATH Level 5": 0.0415, + "GPQA": 0.2617, + "MUSR": 0.3579, + "MMLU-PRO": 0.2281 } }, { @@ -34663,12 +34650,12 @@ "name": "gemma-2-2b", "developer": "Google", "scores": { - "IFEval": 0.2018, - "BBH": 0.3709, - "MATH Level 5": 0.0302, + "IFEval": 0.1993, + "BBH": 0.3656, + "MATH Level 5": 0.0287, "GPQA": 0.2626, - "MUSR": 0.4219, - "MMLU-PRO": 0.2217 + "MUSR": 0.4232, + "MMLU-PRO": 0.218 } }, { @@ -34689,12 +34676,12 @@ "name": "gemma-2-2b-jpn-it", "developer": "Google", "scores": { - "IFEval": 0.5078, - "BBH": 0.4226, - "MATH Level 5": 0.0347, - "GPQA": 0.2852, - "MUSR": 0.3964, - "MMLU-PRO": 0.2578 + "IFEval": 0.5288, + "BBH": 0.4178, + "MATH Level 5": 0.0476, + "GPQA": 0.2752, + "MUSR": 0.3728, + "MMLU-PRO": 0.2467 } }, { @@ -37705,12 +37692,12 @@ "name": "Kosmos-EVAA-Fusion-8B", "developer": "jaspionjader", "scores": { - "IFEval": 0.4345, - "BBH": 0.5419, - "MATH Level 5": 0.1292, - "GPQA": 0.3087, + "IFEval": 0.4418, + "BBH": 0.5406, + "MATH Level 5": 0.1352, + "GPQA": 0.3062, "MUSR": 0.4277, - "MMLU-PRO": 0.3854 + "MMLU-PRO": 0.386 } }, { @@ -42359,12 +42346,12 @@ "name": "Mistral-v0.3-7B-ORPO", "developer": "llmat", "scores": { - "IFEval": 0.364, - "BBH": 0.4005, - "MATH Level 5": 0.0015, - "GPQA": 0.2693, - "MUSR": 0.3529, - "MMLU-PRO": 0.2301 + "IFEval": 0.377, + "BBH": 0.3978, + "MATH Level 5": 0.0242, + "GPQA": 0.2668, + "MUSR": 0.3555, + "MMLU-PRO": 0.2278 } }, { @@ -44478,12 +44465,12 @@ "name": "Mixtral-8x7B-v0.1", "developer": "mistralai", "scores": { - "IFEval": 0.2326, - "BBH": 0.5098, - "MATH Level 5": 0.0937, - "GPQA": 0.3205, - "MUSR": 0.4413, - "MMLU-PRO": 0.3871 + "IFEval": 0.2415, + "BBH": 0.5087, + "MATH Level 5": 0.102, + "GPQA": 0.3138, + "MUSR": 0.4321, + "MMLU-PRO": 0.385 } }, { @@ -44738,12 +44725,12 @@ "name": "NeuralDaredevil-8B-abliterated", "developer": "mlabonne", "scores": { - "IFEval": 0.4162, - "BBH": 0.5124, - "MATH Level 5": 0.0853, - "GPQA": 0.3029, - "MUSR": 0.415, - "MMLU-PRO": 0.3802 + "IFEval": 0.7561, + "BBH": 0.5111, + "MATH Level 5": 0.0906, + "GPQA": 0.3062, + "MUSR": 0.4019, + "MMLU-PRO": 0.3841 } }, { @@ -45076,12 +45063,12 @@ "name": "Mistral-Nemo-Kurdish-Instruct", "developer": "nazimali", "scores": { - "IFEval": 0.4964, - "BBH": 0.4699, - "MATH Level 5": 0.0045, - "GPQA": 0.2827, - "MUSR": 0.3979, - "MMLU-PRO": 0.3063 + "IFEval": 0.486, + "BBH": 0.4721, + "MATH Level 5": 0.0846, + "GPQA": 0.2844, + "MUSR": 0.4006, + "MMLU-PRO": 0.3087 } }, { @@ -46779,12 +46766,12 @@ "name": "franqwenstein-35b", "developer": "nisten", "scores": { - "IFEval": 0.3914, - "BBH": 0.6591, - "MATH Level 5": 0.3044, - "GPQA": 0.3591, - "MUSR": 0.4681, - "MMLU-PRO": 0.5611 + "IFEval": 0.3799, + "BBH": 0.6647, + "MATH Level 5": 0.3406, + "GPQA": 0.4035, + "MUSR": 0.494, + "MMLU-PRO": 0.5731 } }, { @@ -48729,12 +48716,12 @@ "name": "Llama-3-8B-ProLong-512k-Instruct", "developer": "princeton-nlp", "scores": { - "IFEval": 0.5508, - "BBH": 0.5028, - "MATH Level 5": 0.0529, - "GPQA": 0.2861, - "MUSR": 0.4266, - "MMLU-PRO": 0.3231 + "IFEval": 0.3978, + "BBH": 0.4983, + "MATH Level 5": 0.0582, + "GPQA": 0.281, + "MUSR": 0.425, + "MMLU-PRO": 0.3246 } }, { @@ -51303,12 +51290,12 @@ "name": "Gemma-2-Ataraxy-Gemmasutra-9B-slerp", "developer": "recoilme", "scores": { - "IFEval": 0.7649, - "BBH": 0.5974, - "MATH Level 5": 0.0174, - "GPQA": 0.3305, - "MUSR": 0.4245, - "MMLU-PRO": 0.4207 + "IFEval": 0.2854, + "BBH": 0.5984, + "MATH Level 5": 0.1005, + "GPQA": 0.3297, + "MUSR": 0.4607, + "MMLU-PRO": 0.4162 } }, { @@ -51329,12 +51316,12 @@ "name": "recoilme-gemma-2-9B-v0.2", "developer": "recoilme", "scores": { - "IFEval": 0.2747, - "BBH": 0.6031, - "MATH Level 5": 0.0831, - "GPQA": 0.3305, - "MUSR": 0.4686, - "MMLU-PRO": 0.4122 + "IFEval": 0.7592, + "BBH": 0.6026, + "MATH Level 5": 0.0529, + "GPQA": 0.3289, + "MUSR": 0.4099, + "MMLU-PRO": 0.4163 } }, { @@ -51342,12 +51329,12 @@ "name": "recoilme-gemma-2-9B-v0.3", "developer": "recoilme", "scores": { - "IFEval": 0.7439, - "BBH": 0.5993, - "MATH Level 5": 0.0876, - "GPQA": 0.3238, - "MUSR": 0.4204, - "MMLU-PRO": 0.4072 + "IFEval": 0.5761, + "BBH": 0.602, + "MATH Level 5": 0.1888, + "GPQA": 0.3372, + "MUSR": 0.4632, + "MMLU-PRO": 0.4039 } }, { @@ -56997,12 +56984,12 @@ "name": "BagelMIsteryTour-v2-8x7B", "developer": "ycros", "scores": { - "IFEval": 0.6262, - "BBH": 0.5142, - "MATH Level 5": 0.0937, - "GPQA": 0.3079, - "MUSR": 0.4138, - "MMLU-PRO": 0.3481 + "IFEval": 0.5994, + "BBH": 0.5159, + "MATH Level 5": 0.0785, + "GPQA": 0.3045, + "MUSR": 0.4203, + "MMLU-PRO": 0.3473 } }, { diff --git a/data/benchmarks/livecodebenchpro.json b/data/benchmarks/livecodebenchpro.json index 449e36cb0d36e98776192be2ede4fd274d74ec7e..896b3c77b4f3fcb0324b8662401bdd6aa70bb19b 100644 --- a/data/benchmarks/livecodebenchpro.json +++ b/data/benchmarks/livecodebenchpro.json @@ -205,9 +205,9 @@ "name": "gpt-5-2025-08-07", "developer": "OpenAI", "scores": { - "Hard Problems": 0.0423, - "Medium Problems": 0.4085, - "Easy Problems": 0.9014 + "Hard Problems": 0.04225352112676056, + "Medium Problems": 0.4084507042253521, + "Easy Problems": 0.8873239436619719 } }, { diff --git a/data/benchmarks/reward-bench.json b/data/benchmarks/reward-bench.json index 02bfaa8b54fd08d9d25532a11affc44ecd74ea63..d41f3c4a1d71f52cfb866cd53b58580a0eae02dd 100644 --- a/data/benchmarks/reward-bench.json +++ b/data/benchmarks/reward-bench.json @@ -453,16 +453,16 @@ "name": "LxzGordon/URM-LLaMa-3.1-8B", "developer": "LxzGordon", "scores": { - "Score": 0.9294, + "Score": 0.7394, + "Chat": 0.9553, + "Chat Hard": 0.8816, + "Safety": 0.9178, + "Reasoning": 0.9698, "Factuality": 0.6884, "Precise IF": 0.45, "Math": 0.6393, - "Safety": 0.9108, "Focus": 0.9758, - "Ties": 0.7653, - "Chat": 0.9553, - "Chat Hard": 0.8816, - "Reasoning": 0.9698 + "Ties": 0.7653 } }, { @@ -555,17 +555,17 @@ "name": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1", "developer": "OpenAssistant", "scores": { - "Score": 0.615, + "Score": 0.2653, + "Chat": 0.9246, + "Chat Hard": 0.3728, + "Safety": 0.3289, + "Reasoning": 0.5855, + "Prior Sets (0.5 weight)": 0.6801, "Factuality": 0.3979, "Precise IF": 0.2875, "Math": 0.377, - "Safety": 0.5446, "Focus": 0.1535, - "Ties": 0.047, - "Chat": 0.9246, - "Chat Hard": 0.3728, - "Reasoning": 0.5855, - "Prior Sets (0.5 weight)": 0.6801 + "Ties": 0.047 } }, { @@ -573,17 +573,17 @@ "name": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5", "developer": "OpenAssistant", "scores": { - "Score": 0.2648, - "Chat": 0.8855, - "Chat Hard": 0.4868, - "Safety": 0.3244, - "Reasoning": 0.7752, - "Prior Sets (0.5 weight)": 0.6533, + "Score": 0.6901, "Factuality": 0.3179, "Precise IF": 0.2625, "Math": 0.3934, + "Safety": 0.6311, "Focus": 0.2707, - "Ties": 0.0198 + "Ties": 0.0198, + "Chat": 0.8855, + "Chat Hard": 0.4868, + "Reasoning": 0.7752, + "Prior Sets (0.5 weight)": 0.6533 } }, { @@ -609,17 +609,17 @@ "name": "PKU-Alignment/beaver-7b-v1.0-cost", "developer": "PKU-Alignment", "scores": { - "Score": 0.3332, - "Chat": 0.6173, - "Chat Hard": 0.4232, - "Safety": 0.7589, - "Reasoning": 0.5482, - "Prior Sets (0.5 weight)": 0.57, + "Score": 0.5798, "Factuality": 0.3263, "Precise IF": 0.2313, "Math": 0.3989, + "Safety": 0.7351, "Focus": 0.2939, - "Ties": -0.01 + "Ties": -0.01, + "Chat": 0.6173, + "Chat Hard": 0.4232, + "Reasoning": 0.5482, + "Prior Sets (0.5 weight)": 0.57 } }, { @@ -627,17 +627,17 @@ "name": "PKU-Alignment/beaver-7b-v1.0-reward", "developer": "PKU-Alignment", "scores": { - "Score": 0.4727, + "Score": 0.1606, + "Chat": 0.8184, + "Chat Hard": 0.2873, + "Safety": 0.1422, + "Reasoning": 0.346, + "Prior Sets (0.5 weight)": 0.5993, "Factuality": 0.2105, "Precise IF": 0.2938, "Math": 0.2623, - "Safety": 0.3757, "Focus": 0.0646, - "Ties": -0.01, - "Chat": 0.8184, - "Chat Hard": 0.2873, - "Reasoning": 0.346, - "Prior Sets (0.5 weight)": 0.5993 + "Ties": -0.01 } }, { @@ -663,17 +663,17 @@ "name": "PKU-Alignment/beaver-7b-v2.0-reward", "developer": "PKU-Alignment", "scores": { - "Score": 0.6366, + "Score": 0.2544, + "Chat": 0.8994, + "Chat Hard": 0.364, + "Safety": 0.3156, + "Reasoning": 0.6887, + "Prior Sets (0.5 weight)": 0.6171, "Factuality": 0.2168, "Precise IF": 0.2562, "Math": 0.3825, - "Safety": 0.6041, "Focus": 0.2606, - "Ties": 0.0944, - "Chat": 0.8994, - "Chat Hard": 0.364, - "Reasoning": 0.6887, - "Prior Sets (0.5 weight)": 0.6171 + "Ties": 0.0944 } }, { @@ -921,16 +921,16 @@ "name": "Ray2333/GRM-gemma2-2B-rewardmodel-ft", "developer": "Ray2333", "scores": { - "Score": 0.8839, + "Score": 0.5966, + "Chat": 0.9302, + "Chat Hard": 0.7719, + "Safety": 0.9222, + "Reasoning": 0.912, "Factuality": 0.5305, "Precise IF": 0.3125, "Math": 0.5902, - "Safety": 0.9216, "Focus": 0.7455, - "Ties": 0.4788, - "Chat": 0.9302, - "Chat Hard": 0.7719, - "Reasoning": 0.912 + "Ties": 0.4788 } }, { @@ -956,17 +956,17 @@ "name": "Ray2333/GRM-llama3-8B-sftreg", "developer": "Ray2333", "scores": { - "Score": 0.6089, - "Chat": 0.986, - "Chat Hard": 0.6776, - "Safety": 0.7867, - "Reasoning": 0.9229, - "Prior Sets (0.5 weight)": 0.7309, + "Score": 0.8542, "Factuality": 0.6189, "Precise IF": 0.3875, "Math": 0.5792, + "Safety": 0.8919, "Focus": 0.6828, - "Ties": 0.5981 + "Ties": 0.5981, + "Chat": 0.986, + "Chat Hard": 0.6776, + "Reasoning": 0.9229, + "Prior Sets (0.5 weight)": 0.7309 } }, { @@ -1139,16 +1139,16 @@ "name": "Skywork/Skywork-Reward-Gemma-2-27B", "developer": "Skywork", "scores": { - "Score": 0.938, + "Score": 0.7576, + "Chat": 0.9581, + "Chat Hard": 0.9145, + "Safety": 0.9422, + "Reasoning": 0.9606, "Factuality": 0.7368, "Precise IF": 0.4031, "Math": 0.7049, - "Safety": 0.9189, "Focus": 0.9323, - "Ties": 0.8261, - "Chat": 0.9581, - "Chat Hard": 0.9145, - "Reasoning": 0.9606 + "Ties": 0.8261 } }, { @@ -1156,16 +1156,16 @@ "name": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2", "developer": "Skywork", "scores": { - "Score": 0.7531, - "Chat": 0.9609, - "Chat Hard": 0.8991, - "Safety": 0.9689, - "Reasoning": 0.9807, + "Score": 0.9426, "Factuality": 0.7674, "Precise IF": 0.375, "Math": 0.6721, + "Safety": 0.9297, "Focus": 0.9172, - "Ties": 0.8182 + "Ties": 0.8182, + "Chat": 0.9609, + "Chat Hard": 0.8991, + "Reasoning": 0.9807 } }, { @@ -1173,16 +1173,16 @@ "name": "Skywork/Skywork-Reward-Llama-3.1-8B", "developer": "Skywork", "scores": { - "Score": 0.7314, - "Chat": 0.9581, - "Chat Hard": 0.8728, - "Safety": 0.9333, - "Reasoning": 0.962, + "Score": 0.9252, "Factuality": 0.6989, "Precise IF": 0.425, "Math": 0.6284, + "Safety": 0.9081, "Focus": 0.9616, - "Ties": 0.741 + "Ties": 0.741, + "Chat": 0.9581, + "Chat Hard": 0.8728, + "Reasoning": 0.962 } }, { @@ -1305,16 +1305,16 @@ "name": "Skywork/Skywork-VL-Reward-7B", "developer": "Skywork", "scores": { - "Score": 0.9007, + "Score": 0.6885, + "Chat": 0.8994, + "Chat Hard": 0.875, + "Safety": 0.8911, + "Reasoning": 0.9176, "Factuality": 0.6063, "Precise IF": 0.35, "Math": 0.6339, - "Safety": 0.9108, "Focus": 0.8909, - "Ties": 0.7586, - "Chat": 0.8994, - "Chat Hard": 0.875, - "Reasoning": 0.9176 + "Ties": 0.7586 } }, { @@ -1379,9 +1379,9 @@ "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", "developer": "AI2", "scores": { - "Score": 0.7008, - "Chat": 0.9385, - "Chat Hard": 0.3882, + "Score": 0.6924, + "Chat": 0.9441, + "Chat Hard": 0.3575, "Safety": 0.7757 } }, @@ -1423,17 +1423,17 @@ "name": "allenai/Llama-3.1-70B-Instruct-RM-RB2", "developer": "allenai", "scores": { - "Score": 0.9021, + "Score": 0.7606, + "Chat": 0.9665, + "Chat Hard": 0.8355, + "Safety": 0.8844, + "Reasoning": 0.8969, + "Prior Sets (0.5 weight)": 0.0, "Factuality": 0.8126, "Precise IF": 0.4188, "Math": 0.6995, - "Safety": 0.9095, "Focus": 0.8646, - "Ties": 0.8835, - "Chat": 0.9665, - "Chat Hard": 0.8355, - "Reasoning": 0.8969, - "Prior Sets (0.5 weight)": 0.0 + "Ties": 0.8835 } }, { @@ -1459,17 +1459,17 @@ "name": "allenai/Llama-3.1-8B-Instruct-RM-RB2", "developer": "allenai", "scores": { - "Score": 0.8885, + "Score": 0.7285, + "Chat": 0.9581, + "Chat Hard": 0.8158, + "Safety": 0.8956, + "Reasoning": 0.887, + "Prior Sets (0.5 weight)": 0.0, "Factuality": 0.7432, "Precise IF": 0.4437, "Math": 0.6175, - "Safety": 0.8932, "Focus": 0.9071, - "Ties": 0.7638, - "Chat": 0.9581, - "Chat Hard": 0.8158, - "Reasoning": 0.887, - "Prior Sets (0.5 weight)": 0.0 + "Ties": 0.7638 } }, { @@ -1477,17 +1477,17 @@ "name": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", "developer": "allenai", "scores": { - "Score": 0.722, - "Chat": 0.9693, - "Chat Hard": 0.8268, - "Safety": 0.8689, - "Reasoning": 0.8583, - "Prior Sets (0.5 weight)": 0.0, + "Score": 0.8892, "Factuality": 0.8084, "Precise IF": 0.3688, "Math": 0.6776, + "Safety": 0.9027, "Focus": 0.7778, - "Ties": 0.8308 + "Ties": 0.8308, + "Chat": 0.9693, + "Chat Hard": 0.8268, + "Reasoning": 0.8583, + "Prior Sets (0.5 weight)": 0.0 } }, { @@ -1495,17 +1495,17 @@ "name": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", "developer": "allenai", "scores": { - "Score": 0.687, - "Chat": 0.9553, - "Chat Hard": 0.761, - "Safety": 0.86, - "Reasoning": 0.7898, - "Prior Sets (0.5 weight)": 0.0, + "Score": 0.8431, "Factuality": 0.7516, "Precise IF": 0.3875, "Math": 0.6284, + "Safety": 0.8662, "Focus": 0.8545, - "Ties": 0.6397 + "Ties": 0.6397, + "Chat": 0.9553, + "Chat Hard": 0.761, + "Reasoning": 0.7898, + "Prior Sets (0.5 weight)": 0.0 } }, { @@ -3784,16 +3784,16 @@ "name": "infly/INF-ORM-Llama3.1-70B", "developer": "infly", "scores": { - "Score": 0.7648, - "Chat": 0.9665, - "Chat Hard": 0.9101, - "Safety": 0.9644, - "Reasoning": 0.9912, + "Score": 0.9511, "Factuality": 0.7411, "Precise IF": 0.4188, "Math": 0.6995, + "Safety": 0.9365, "Focus": 0.903, - "Ties": 0.8622 + "Ties": 0.8622, + "Chat": 0.9665, + "Chat Hard": 0.9101, + "Reasoning": 0.9912 } }, { @@ -3835,16 +3835,16 @@ "name": "internlm/internlm2-7b-reward", "developer": "internlm", "scores": { - "Score": 0.8759, + "Score": 0.5335, + "Chat": 0.9916, + "Chat Hard": 0.6952, + "Safety": 0.5956, + "Reasoning": 0.9453, "Factuality": 0.4211, "Precise IF": 0.4, "Math": 0.5628, - "Safety": 0.8716, "Focus": 0.7051, - "Ties": 0.5164, - "Chat": 0.9916, - "Chat Hard": 0.6952, - "Reasoning": 0.9453 + "Ties": 0.5164 } }, { @@ -4014,16 +4014,16 @@ "name": "nicolinho/QRM-Gemma-2-27B", "developer": "nicolinho", "scores": { - "Score": 0.9444, + "Score": 0.7667, + "Chat": 0.9665, + "Chat Hard": 0.9013, + "Safety": 0.9578, + "Reasoning": 0.9826, "Factuality": 0.7853, "Precise IF": 0.3719, "Math": 0.6995, - "Safety": 0.927, "Focus": 0.9535, - "Ties": 0.8321, - "Chat": 0.9665, - "Chat Hard": 0.9013, - "Reasoning": 0.9826 + "Ties": 0.8321 } }, { @@ -4055,16 +4055,16 @@ "name": "nicolinho/QRM-Llama3.1-8B-v2", "developer": "nicolinho", "scores": { - "Score": 0.9314, + "Score": 0.7074, + "Chat": 0.9637, + "Chat Hard": 0.8684, + "Safety": 0.9467, + "Reasoning": 0.9677, "Factuality": 0.6653, "Precise IF": 0.4062, "Math": 0.612, - "Safety": 0.9257, "Focus": 0.8909, - "Ties": 0.7234, - "Chat": 0.9637, - "Chat Hard": 0.8684, - "Reasoning": 0.9677 + "Ties": 0.7234 } }, { @@ -4202,16 +4202,16 @@ "name": "GPT-4o 2024-08-06", "developer": "OpenAI", "scores": { - "Score": 0.6493, - "Chat": 0.9609, - "Chat Hard": 0.761, - "Safety": 0.8619, - "Reasoning": 0.8661, + "Score": 0.8673, "Factuality": 0.5684, "Precise IF": 0.3312, "Math": 0.623, + "Safety": 0.8811, "Focus": 0.7293, - "Ties": 0.7819 + "Ties": 0.7819, + "Chat": 0.9609, + "Chat Hard": 0.761, + "Reasoning": 0.8661 } }, { @@ -4249,17 +4249,17 @@ "name": "openbmb/Eurus-RM-7b", "developer": "openbmb", "scores": { - "Score": 0.5806, - "Chat": 0.9804, - "Chat Hard": 0.6557, - "Safety": 0.6267, - "Reasoning": 0.8633, - "Prior Sets (0.5 weight)": 0.7172, + "Score": 0.8159, "Factuality": 0.6, "Precise IF": 0.3438, "Math": 0.5683, + "Safety": 0.8135, "Focus": 0.7475, - "Ties": 0.5972 + "Ties": 0.5972, + "Chat": 0.9804, + "Chat Hard": 0.6557, + "Reasoning": 0.8633, + "Prior Sets (0.5 weight)": 0.7172 } }, { @@ -4370,17 +4370,17 @@ "name": "sfairXC/FsfairX-LLaMA3-RM-v0.1", "developer": "sfairXC", "scores": { - "Score": 0.6292, - "Chat": 0.9944, - "Chat Hard": 0.6513, - "Safety": 0.7667, - "Reasoning": 0.8644, - "Prior Sets (0.5 weight)": 0.7492, + "Score": 0.8338, "Factuality": 0.5916, "Precise IF": 0.4188, "Math": 0.6284, + "Safety": 0.8676, "Focus": 0.7051, - "Ties": 0.6647 + "Ties": 0.6647, + "Chat": 0.9944, + "Chat Hard": 0.6513, + "Reasoning": 0.8644, + "Prior Sets (0.5 weight)": 0.7492 } }, { @@ -4492,17 +4492,17 @@ "name": "weqweasdas/RM-Gemma-2B", "developer": "weqweasdas", "scores": { - "Score": 0.3057, - "Chat": 0.9441, - "Chat Hard": 0.4079, - "Safety": 0.3311, - "Reasoning": 0.7637, - "Prior Sets (0.5 weight)": 0.6652, + "Score": 0.6549, "Factuality": 0.3705, "Precise IF": 0.2812, "Math": 0.4317, + "Safety": 0.4986, "Focus": 0.2343, - "Ties": 0.1851 + "Ties": 0.1851, + "Chat": 0.9441, + "Chat Hard": 0.4079, + "Reasoning": 0.7637, + "Prior Sets (0.5 weight)": 0.6652 } }, { @@ -4541,17 +4541,17 @@ "name": "weqweasdas/RM-Mistral-7B", "developer": "weqweasdas", "scores": { - "Score": 0.596, - "Chat": 0.9665, - "Chat Hard": 0.6053, - "Safety": 0.6911, - "Reasoning": 0.7736, - "Prior Sets (0.5 weight)": 0.753, + "Score": 0.7982, "Factuality": 0.5937, "Precise IF": 0.3438, "Math": 0.5956, + "Safety": 0.8703, "Focus": 0.7293, - "Ties": 0.6226 + "Ties": 0.6226, + "Chat": 0.9665, + "Chat Hard": 0.6053, + "Reasoning": 0.7736, + "Prior Sets (0.5 weight)": 0.753 } }, { @@ -4559,17 +4559,17 @@ "name": "weqweasdas/hh_rlhf_rm_open_llama_3b", "developer": "weqweasdas", "scores": { - "Score": 0.2498, - "Chat": 0.8184, - "Chat Hard": 0.3728, - "Safety": 0.24, - "Reasoning": 0.3281, - "Prior Sets (0.5 weight)": 0.6564, + "Score": 0.5027, "Factuality": 0.3642, "Precise IF": 0.275, "Math": 0.3497, + "Safety": 0.4149, "Focus": 0.2384, - "Ties": 0.0315 + "Ties": 0.0315, + "Chat": 0.8184, + "Chat Hard": 0.3728, + "Reasoning": 0.3281, + "Prior Sets (0.5 weight)": 0.6564 } } ] diff --git a/data/benchmarks/swe-bench.json b/data/benchmarks/swe-bench.json index 1411ad930fa9e027d02d79c5b478c955d8c4e629..c5ac3821d8cf33006a01a757ed86c4602272b367 100644 --- a/data/benchmarks/swe-bench.json +++ b/data/benchmarks/swe-bench.json @@ -5,7 +5,7 @@ "name": "claude-opus-4-5", "developer": "Anthropic", "scores": { - "swe-bench": 0.65 + "swe-bench": 0.6061 } }, { @@ -13,7 +13,7 @@ "name": "gemini-3-pro-preview", "developer": "Google", "scores": { - "swe-bench": 0.7234 + "swe-bench": 0.71 } }, { diff --git a/data/benchmarks/tau-bench-2_airline.json b/data/benchmarks/tau-bench-2_airline.json index 8d2ca0689ae7644754daa6ce07597d00fd2d892b..f12d28637a77b58341be6f902aadfc1d22527d1a 100644 --- a/data/benchmarks/tau-bench-2_airline.json +++ b/data/benchmarks/tau-bench-2_airline.json @@ -5,7 +5,7 @@ "name": "claude-opus-4-5", "developer": "Anthropic", "scores": { - "tau-bench-2/airline": 0.72 + "tau-bench-2/airline": 0.66 } }, { @@ -13,7 +13,7 @@ "name": "gemini-3-pro-preview", "developer": "Google", "scores": { - "tau-bench-2/airline": 0.7 + "tau-bench-2/airline": 0.68 } }, { diff --git a/data/benchmarks/tau-bench-2_retail.json b/data/benchmarks/tau-bench-2_retail.json index 1f141a12075d4e893a2a3bae0a3a7637670fb4c8..8567872e9ca3424b440aa66923f7ea7a010a7290 100644 --- a/data/benchmarks/tau-bench-2_retail.json +++ b/data/benchmarks/tau-bench-2_retail.json @@ -21,7 +21,7 @@ "name": "gpt-5.2-2025-12-11", "developer": "OpenAI", "scores": { - "tau-bench-2/retail": 0.68 + "tau-bench-2/retail": 0.73 } } ] diff --git a/data/benchmarks/tau-bench-2_telecom.json b/data/benchmarks/tau-bench-2_telecom.json index 5e2e97c5a63c814404bfd0e936bb7f41ce63593e..717a8c139daad73d2bba1920f3f4fdded08dd42b 100644 --- a/data/benchmarks/tau-bench-2_telecom.json +++ b/data/benchmarks/tau-bench-2_telecom.json @@ -5,7 +5,7 @@ "name": "claude-opus-4-5", "developer": "Anthropic", "scores": { - "tau-bench-2/telecom": 0.76 + "tau-bench-2/telecom": 0.84 } }, { @@ -21,7 +21,7 @@ "name": "gpt-5.2-2025-12-11", "developer": "OpenAI", "scores": { - "tau-bench-2/telecom": 0.5354 + "tau-bench-2/telecom": 0.71 } } ] diff --git a/data/benchmarks/terminal-bench-2.0.json b/data/benchmarks/terminal-bench-2.0.json index 2a3783a3e17d1f920cd8ee2720ab236eb67f76b1..10b4a1e53538ae0245444d9327835077907520b5 100644 --- a/data/benchmarks/terminal-bench-2.0.json +++ b/data/benchmarks/terminal-bench-2.0.json @@ -21,7 +21,7 @@ "name": "Claude Opus 4.1", "developer": "Anthropic", "scores": { - "terminal-bench-2.0": 38.0 + "terminal-bench-2.0": 35.1 } }, { @@ -29,7 +29,7 @@ "name": "Claude Opus 4.5", "developer": "Anthropic", "scores": { - "terminal-bench-2.0": 59.1 + "terminal-bench-2.0": 52.1 } }, { @@ -37,7 +37,7 @@ "name": "Claude Opus 4.6", "developer": "Anthropic", "scores": { - "terminal-bench-2.0": 58.0 + "terminal-bench-2.0": 62.9 } }, { @@ -45,7 +45,7 @@ "name": "Claude Sonnet 4.5", "developer": "Anthropic", "scores": { - "terminal-bench-2.0": 43.1 + "terminal-bench-2.0": 42.6 } }, { @@ -61,7 +61,7 @@ "name": "Gemini 2.5 Flash", "developer": "Google", "scores": { - "terminal-bench-2.0": 17.1 + "terminal-bench-2.0": 16.9 } }, { @@ -77,7 +77,7 @@ "name": "Gemini 3 Flash", "developer": "Google", "scores": { - "terminal-bench-2.0": 51.0 + "terminal-bench-2.0": 47.4 } }, { @@ -109,7 +109,7 @@ "name": "MiniMax M2.1", "developer": "MiniMax", "scores": { - "terminal-bench-2.0": 29.2 + "terminal-bench-2.0": 36.6 } }, { @@ -125,7 +125,7 @@ "name": "Kimi K2 Instruct", "developer": "Moonshot AI", "scores": { - "terminal-bench-2.0": 26.7 + "terminal-bench-2.0": 27.8 } }, { @@ -149,7 +149,7 @@ "name": "Multiple", "developer": "Multiple", "scores": { - "terminal-bench-2.0": 71.0 + "terminal-bench-2.0": 72.4 } }, { @@ -157,7 +157,7 @@ "name": "GPT-5", "developer": "OpenAI", "scores": { - "terminal-bench-2.0": 35.2 + "terminal-bench-2.0": 49.6 } }, { @@ -165,7 +165,7 @@ "name": "GPT-5-Codex", "developer": "OpenAI", "scores": { - "terminal-bench-2.0": 44.3 + "terminal-bench-2.0": 43.4 } }, { @@ -173,7 +173,7 @@ "name": "GPT-5-Mini", "developer": "OpenAI", "scores": { - "terminal-bench-2.0": 34.8 + "terminal-bench-2.0": 24.0 } }, { @@ -181,7 +181,7 @@ "name": "GPT-5-Nano", "developer": "OpenAI", "scores": { - "terminal-bench-2.0": 9.9 + "terminal-bench-2.0": 11.5 } }, { @@ -197,7 +197,7 @@ "name": "GPT-5.1-Codex", "developer": "OpenAI", "scores": { - "terminal-bench-2.0": 53.5 + "terminal-bench-2.0": 57.8 } }, { @@ -221,7 +221,7 @@ "name": "GPT-5.2", "developer": "OpenAI", "scores": { - "terminal-bench-2.0": 60.7 + "terminal-bench-2.0": 62.9 } }, { @@ -237,7 +237,7 @@ "name": "GPT-5.3-Codex", "developer": "OpenAI", "scores": { - "terminal-bench-2.0": 64.7 + "terminal-bench-2.0": 77.3 } }, { @@ -245,7 +245,7 @@ "name": "GPT-OSS-120B", "developer": "OpenAI", "scores": { - "terminal-bench-2.0": 14.2 + "terminal-bench-2.0": 18.7 } }, { @@ -253,7 +253,7 @@ "name": "GPT-OSS-20B", "developer": "OpenAI", "scores": { - "terminal-bench-2.0": 3.1 + "terminal-bench-2.0": 3.4 } }, { @@ -261,7 +261,7 @@ "name": "Grok 4", "developer": "xAI", "scores": { - "terminal-bench-2.0": 25.4 + "terminal-bench-2.0": 23.1 } }, { @@ -269,7 +269,7 @@ "name": "Grok Code Fast 1", "developer": "xAI", "scores": { - "terminal-bench-2.0": 25.8 + "terminal-bench-2.0": 14.2 } }, { diff --git a/data/benchmarks/theory_of_mind.json b/data/benchmarks/theory_of_mind.json new file mode 100644 index 0000000000000000000000000000000000000000..8af892835f3a3bd725f97d45e09284631904bf11 --- /dev/null +++ b/data/benchmarks/theory_of_mind.json @@ -0,0 +1,12 @@ +{ + "models": [ + { + "model_id": "Qwen/Qwen2.5-3B-Instruct", + "name": "Qwen2.5-3B-Instruct", + "developer": "Qwen", + "scores": { + "accuracy on theory_of_mind for scorer model_graded_fact": 0.78 + } + } + ] +} \ No newline at end of file diff --git a/data/developers.json b/data/developers.json index 003e462b3fb10e7ebce2d70b3ed9418882ec3a93..f579192a07283e4405b9950165cfa67af8d0dcda 100644 --- a/data/developers.json +++ b/data/developers.json @@ -1917,7 +1917,7 @@ }, { "developer": "NousResearch", - "model_count": 19 + "model_count": 18 }, { "developer": "Novaciano", diff --git a/data/developers/adriszmar.json b/data/developers/adriszmar.json index acb90d745752909d8f96f323acb9caa9b19061ae..1f1d39916960942963a9c3c265196aea3657be38 100644 --- a/data/developers/adriszmar.json +++ b/data/developers/adriszmar.json @@ -7,12 +7,12 @@ "developer": "adriszmar", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.1685, - "hfopenllm_v2/BBH": 0.3124, - "hfopenllm_v2/MATH Level 5": 0.0015, - "hfopenllm_v2/GPQA": 0.2492, - "hfopenllm_v2/MUSR": 0.3963, - "hfopenllm_v2/MMLU-PRO": 0.1066 + "hfopenllm_v2/IFEval": 0.1746, + "hfopenllm_v2/BBH": 0.3126, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.245, + "hfopenllm_v2/MUSR": 0.4096, + "hfopenllm_v2/MMLU-PRO": 0.1087 } } ] diff --git a/data/developers/ai2.json b/data/developers/ai2.json index 4934c11b7806e647da8c3821dcfccba2566bb947..6ae2e91a5d501e1d313c59819f3bee806d5615b0 100644 --- a/data/developers/ai2.json +++ b/data/developers/ai2.json @@ -43,9 +43,9 @@ "developer": "AI2", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.7008, - "reward-bench/Chat": 0.9385, - "reward-bench/Chat Hard": 0.3882, + "reward-bench/Score": 0.6924, + "reward-bench/Chat": 0.9441, + "reward-bench/Chat Hard": 0.3575, "reward-bench/Safety": 0.7757 } }, diff --git a/data/developers/akjindal53244.json b/data/developers/akjindal53244.json index 237ea0357d953fdc2d416f7c27241c406836e723..86acdd918ca996450aa81c0111cbad62da1b8b17 100644 --- a/data/developers/akjindal53244.json +++ b/data/developers/akjindal53244.json @@ -7,12 +7,12 @@ "developer": "akjindal53244", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.8051, - "hfopenllm_v2/BBH": 0.5189, - "hfopenllm_v2/MATH Level 5": 0.1722, - "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/IFEval": 0.8033, + "hfopenllm_v2/BBH": 0.5196, + "hfopenllm_v2/MATH Level 5": 0.1624, + "hfopenllm_v2/GPQA": 0.3096, "hfopenllm_v2/MUSR": 0.4028, - "hfopenllm_v2/MMLU-PRO": 0.3803 + "hfopenllm_v2/MMLU-PRO": 0.3812 } } ] diff --git a/data/developers/allenai.json b/data/developers/allenai.json index 00cefcb6fd468cf299bbaf9888936aea413ac714..4d8014825fe4883af161e8da92543c33ea544e4b 100644 --- a/data/developers/allenai.json +++ b/data/developers/allenai.json @@ -63,17 +63,17 @@ "developer": "allenai", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.9021, + "reward-bench/Score": 0.7606, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.8355, + "reward-bench/Safety": 0.8844, + "reward-bench/Reasoning": 0.8969, + "reward-bench/Prior Sets (0.5 weight)": 0.0, "reward-bench/Factuality": 0.8126, "reward-bench/Precise IF": 0.4188, "reward-bench/Math": 0.6995, - "reward-bench/Safety": 0.9095, "reward-bench/Focus": 0.8646, - "reward-bench/Ties": 0.8835, - "reward-bench/Chat": 0.9665, - "reward-bench/Chat Hard": 0.8355, - "reward-bench/Reasoning": 0.8969, - "reward-bench/Prior Sets (0.5 weight)": 0.0 + "reward-bench/Ties": 0.8835 } }, { @@ -101,17 +101,17 @@ "developer": "allenai", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.8885, + "reward-bench/Score": 0.7285, + "reward-bench/Chat": 0.9581, + "reward-bench/Chat Hard": 0.8158, + "reward-bench/Safety": 0.8956, + "reward-bench/Reasoning": 0.887, + "reward-bench/Prior Sets (0.5 weight)": 0.0, "reward-bench/Factuality": 0.7432, "reward-bench/Precise IF": 0.4437, "reward-bench/Math": 0.6175, - "reward-bench/Safety": 0.8932, "reward-bench/Focus": 0.9071, - "reward-bench/Ties": 0.7638, - "reward-bench/Chat": 0.9581, - "reward-bench/Chat Hard": 0.8158, - "reward-bench/Reasoning": 0.887, - "reward-bench/Prior Sets (0.5 weight)": 0.0 + "reward-bench/Ties": 0.7638 } }, { @@ -120,12 +120,12 @@ "developer": "allenai", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.8379, - "hfopenllm_v2/BBH": 0.6157, - "hfopenllm_v2/MATH Level 5": 0.3829, + "hfopenllm_v2/IFEval": 0.8291, + "hfopenllm_v2/BBH": 0.6164, + "hfopenllm_v2/MATH Level 5": 0.4502, "hfopenllm_v2/GPQA": 0.3733, - "hfopenllm_v2/MUSR": 0.4988, - "hfopenllm_v2/MMLU-PRO": 0.4656 + "hfopenllm_v2/MUSR": 0.4948, + "hfopenllm_v2/MMLU-PRO": 0.4645 } }, { @@ -162,17 +162,17 @@ "developer": "allenai", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.722, - "reward-bench/Chat": 0.9693, - "reward-bench/Chat Hard": 0.8268, - "reward-bench/Safety": 0.8689, - "reward-bench/Reasoning": 0.8583, - "reward-bench/Prior Sets (0.5 weight)": 0.0, + "reward-bench/Score": 0.8892, "reward-bench/Factuality": 0.8084, "reward-bench/Precise IF": 0.3688, "reward-bench/Math": 0.6776, + "reward-bench/Safety": 0.9027, "reward-bench/Focus": 0.7778, - "reward-bench/Ties": 0.8308 + "reward-bench/Ties": 0.8308, + "reward-bench/Chat": 0.9693, + "reward-bench/Chat Hard": 0.8268, + "reward-bench/Reasoning": 0.8583, + "reward-bench/Prior Sets (0.5 weight)": 0.0 } }, { @@ -209,17 +209,17 @@ "developer": "allenai", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.687, - "reward-bench/Chat": 0.9553, - "reward-bench/Chat Hard": 0.761, - "reward-bench/Safety": 0.86, - "reward-bench/Reasoning": 0.7898, - "reward-bench/Prior Sets (0.5 weight)": 0.0, + "reward-bench/Score": 0.8431, "reward-bench/Factuality": 0.7516, "reward-bench/Precise IF": 0.3875, "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.8662, "reward-bench/Focus": 0.8545, - "reward-bench/Ties": 0.6397 + "reward-bench/Ties": 0.6397, + "reward-bench/Chat": 0.9553, + "reward-bench/Chat Hard": 0.761, + "reward-bench/Reasoning": 0.7898, + "reward-bench/Prior Sets (0.5 weight)": 0.0 } }, { diff --git a/data/developers/anthropic.json b/data/developers/anthropic.json index 674f01d5340b017e6a95b1abb6e16b7a263cdb47..e348f58bb2289a52a6840206a2b95ae3acade6c2 100644 --- a/data/developers/anthropic.json +++ b/data/developers/anthropic.json @@ -650,12 +650,12 @@ "developer": "Anthropic", "evaluator_relationship": null, "benchmark_scores": { - "appworld_test_normal/appworld/test_normal": 0.68, + "appworld_test_normal/appworld/test_normal": 0.7, "browsecompplus/browsecompplus": 0.61, - "swe-bench/swe-bench": 0.65, - "tau-bench-2_airline/tau-bench-2/airline": 0.72, + "swe-bench/swe-bench": 0.6061, + "tau-bench-2_airline/tau-bench-2/airline": 0.66, "tau-bench-2_retail/tau-bench-2/retail": 0.78, - "tau-bench-2_telecom/tau-bench-2/telecom": 0.76 + "tau-bench-2_telecom/tau-bench-2/telecom": 0.84 } }, { @@ -664,7 +664,7 @@ "developer": "Anthropic", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 38.0 + "terminal-bench-2.0/terminal-bench-2.0": 35.1 } }, { @@ -673,7 +673,7 @@ "developer": "Anthropic", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 59.1 + "terminal-bench-2.0/terminal-bench-2.0": 52.1 } }, { @@ -682,7 +682,7 @@ "developer": "Anthropic", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 58.0 + "terminal-bench-2.0/terminal-bench-2.0": 62.9 } }, { @@ -756,7 +756,7 @@ "developer": "Anthropic", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 43.1 + "terminal-bench-2.0/terminal-bench-2.0": 42.6 } }, { @@ -800,8 +800,6 @@ "developer": "Anthropic", "evaluator_relationship": null, "benchmark_scores": { - "ace/Overall Score": 0.478, - "ace/Gaming Score": 0.391, "apex-agents/Overall Pass@1": 0.184, "apex-agents/Overall Pass@8": 0.34, "apex-agents/Overall Mean Score": 0.348, @@ -809,6 +807,8 @@ "apex-agents/Management Consulting Pass@1": 0.132, "apex-agents/Corporate Law Pass@1": 0.202, "apex-agents/Corporate Lawyer Mean Score": 0.471, + "ace/Overall Score": 0.478, + "ace/Gaming Score": 0.391, "apex-v1/Medicine (MD) Score": 0.65 } }, diff --git a/data/developers/cognitivecomputations.json b/data/developers/cognitivecomputations.json index 27ef3ede420acfeab83ed5bb754062e32374c41a..292d5e513d244c8ebb441a809017ff16919c9354 100644 --- a/data/developers/cognitivecomputations.json +++ b/data/developers/cognitivecomputations.json @@ -77,12 +77,12 @@ "developer": "cognitivecomputations", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.4124, - "hfopenllm_v2/BBH": 0.6383, - "hfopenllm_v2/MATH Level 5": 0.182, - "hfopenllm_v2/GPQA": 0.3289, - "hfopenllm_v2/MUSR": 0.4349, - "hfopenllm_v2/MMLU-PRO": 0.4525 + "hfopenllm_v2/IFEval": 0.3613, + "hfopenllm_v2/BBH": 0.6123, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4112, + "hfopenllm_v2/MMLU-PRO": 0.4494 } }, { diff --git a/data/developers/columbia-nlp.json b/data/developers/columbia-nlp.json index b04d1f97bca6939c03a53d86cd396214edf82f72..11f5fed45eb39522787aef4e964f3d3e28d320c0 100644 --- a/data/developers/columbia-nlp.json +++ b/data/developers/columbia-nlp.json @@ -7,12 +7,12 @@ "developer": "Columbia-NLP", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.3278, - "hfopenllm_v2/BBH": 0.392, - "hfopenllm_v2/MATH Level 5": 0.0431, - "hfopenllm_v2/GPQA": 0.2492, - "hfopenllm_v2/MUSR": 0.412, - "hfopenllm_v2/MMLU-PRO": 0.1666 + "hfopenllm_v2/IFEval": 0.3102, + "hfopenllm_v2/BBH": 0.3881, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.4081, + "hfopenllm_v2/MMLU-PRO": 0.1665 } }, { diff --git a/data/developers/cpayne1303.json b/data/developers/cpayne1303.json index 6d735bd94a67b9fc86d407e5a74d4ec119a21a01..878ab50b2c306395a2d661382c84d0660b0f0d51 100644 --- a/data/developers/cpayne1303.json +++ b/data/developers/cpayne1303.json @@ -35,12 +35,12 @@ "developer": "cpayne1303", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.1916, - "hfopenllm_v2/BBH": 0.2977, - "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/IFEval": 0.1949, + "hfopenllm_v2/BBH": 0.2965, + "hfopenllm_v2/MATH Level 5": 0.0045, "hfopenllm_v2/GPQA": 0.2685, - "hfopenllm_v2/MUSR": 0.3872, - "hfopenllm_v2/MMLU-PRO": 0.1132 + "hfopenllm_v2/MUSR": 0.3885, + "hfopenllm_v2/MMLU-PRO": 0.1111 } }, { diff --git a/data/developers/daemontatox.json b/data/developers/daemontatox.json index 09a88a4f4f23ea30a1c40f4893201987d0954b30..3de1c87bc255ec29e11a7fcd9434ebc4d17ff27a 100644 --- a/data/developers/daemontatox.json +++ b/data/developers/daemontatox.json @@ -231,12 +231,12 @@ "developer": "Daemontatox", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.3745, - "hfopenllm_v2/BBH": 0.6668, - "hfopenllm_v2/MATH Level 5": 0.4758, - "hfopenllm_v2/GPQA": 0.3943, - "hfopenllm_v2/MUSR": 0.4858, - "hfopenllm_v2/MMLU-PRO": 0.5593 + "hfopenllm_v2/IFEval": 0.4855, + "hfopenllm_v2/BBH": 0.6627, + "hfopenllm_v2/MATH Level 5": 0.4841, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4256, + "hfopenllm_v2/MMLU-PRO": 0.5542 } }, { diff --git a/data/developers/deepmount00.json b/data/developers/deepmount00.json index 5505c28134c7ac10ef3a27978dd2745253c793a1..e898074e4a61782e05002f7b47eb2ee0411313aa 100644 --- a/data/developers/deepmount00.json +++ b/data/developers/deepmount00.json @@ -63,12 +63,12 @@ "developer": "DeepMount00", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.7917, - "hfopenllm_v2/BBH": 0.5109, - "hfopenllm_v2/MATH Level 5": 0.1088, - "hfopenllm_v2/GPQA": 0.2878, - "hfopenllm_v2/MUSR": 0.4136, - "hfopenllm_v2/MMLU-PRO": 0.3876 + "hfopenllm_v2/IFEval": 0.5365, + "hfopenllm_v2/BBH": 0.517, + "hfopenllm_v2/MATH Level 5": 0.1707, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4487, + "hfopenllm_v2/MMLU-PRO": 0.396 } }, { diff --git a/data/developers/dfurman.json b/data/developers/dfurman.json index 2947dc3ef503295f24886c787e729305dcebb026..7e28f4da929deb69b4028e08d70bcf7cf516d943 100644 --- a/data/developers/dfurman.json +++ b/data/developers/dfurman.json @@ -35,12 +35,12 @@ "developer": "dfurman", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.2835, - "hfopenllm_v2/BBH": 0.3842, - "hfopenllm_v2/MATH Level 5": 0.0521, - "hfopenllm_v2/GPQA": 0.2609, - "hfopenllm_v2/MUSR": 0.3566, - "hfopenllm_v2/MMLU-PRO": 0.2298 + "hfopenllm_v2/IFEval": 0.3, + "hfopenllm_v2/BBH": 0.3853, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3579, + "hfopenllm_v2/MMLU-PRO": 0.2281 } }, { diff --git a/data/developers/doppelreflex.json b/data/developers/doppelreflex.json index 4e478be7761570eec41b8a497db82c0ab081c8b0..0b78fc1cbec001d1df6b60b1fab265b2eab6799e 100644 --- a/data/developers/doppelreflex.json +++ b/data/developers/doppelreflex.json @@ -175,12 +175,12 @@ "developer": "DoppelReflEx", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.451, - "hfopenllm_v2/BBH": 0.4944, - "hfopenllm_v2/MATH Level 5": 0.1156, - "hfopenllm_v2/GPQA": 0.3196, - "hfopenllm_v2/MUSR": 0.3896, - "hfopenllm_v2/MMLU-PRO": 0.3256 + "hfopenllm_v2/IFEval": 0.436, + "hfopenllm_v2/BBH": 0.4956, + "hfopenllm_v2/MATH Level 5": 0.0589, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.3843, + "hfopenllm_v2/MMLU-PRO": 0.3237 } }, { diff --git a/data/developers/google.json b/data/developers/google.json index d8eb903319736c192b1ab41a5a5c1c01cb963ec6..a447ddbc58f3e10c0347885b529d40f4033f2cb3 100644 --- a/data/developers/google.json +++ b/data/developers/google.json @@ -139,6 +139,7 @@ "developer": "Google", "evaluator_relationship": null, "benchmark_scores": { + "ace/Gaming Score": 0.415, "apex-agents/Overall Pass@1": 0.24, "apex-agents/Overall Pass@8": 0.367, "apex-agents/Overall Mean Score": 0.395, @@ -146,7 +147,6 @@ "apex-agents/Management Consulting Pass@1": 0.193, "apex-agents/Corporate Law Pass@1": 0.259, "apex-agents/Corporate Lawyer Mean Score": 0.524, - "ace/Gaming Score": 0.415, "apex-v1/Overall Score": 0.64, "apex-v1/Consulting Score": 0.64 } @@ -157,6 +157,8 @@ "developer": "Google", "evaluator_relationship": null, "benchmark_scores": { + "ace/Overall Score": 0.47, + "ace/Gaming Score": 0.509, "apex-agents/Overall Pass@1": 0.184, "apex-agents/Overall Pass@8": 0.373, "apex-agents/Overall Mean Score": 0.341, @@ -164,8 +166,6 @@ "apex-agents/Management Consulting Pass@1": 0.124, "apex-agents/Corporate Law Pass@1": 0.239, "apex-agents/Corporate Lawyer Mean Score": 0.487, - "ace/Overall Score": 0.47, - "ace/Gaming Score": 0.509, "apex-v1/Overall Score": 0.643, "apex-v1/Consulting Score": 0.64, "apex-v1/Investment Banking Score": 0.63 @@ -723,7 +723,7 @@ "reward-bench/Safety": 0.909, "reward-bench/Focus": 0.841, "reward-bench/Ties": 0.809, - "terminal-bench-2.0/terminal-bench-2.0": 17.1 + "terminal-bench-2.0/terminal-bench-2.0": 16.9 } }, { @@ -861,7 +861,7 @@ "developer": "Google", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 51.0 + "terminal-bench-2.0/terminal-bench-2.0": 47.4 } }, { @@ -879,8 +879,8 @@ "developer": "Google", "evaluator_relationship": null, "benchmark_scores": { - "appworld_test_normal/appworld/test_normal": 0.13, - "browsecompplus/browsecompplus": 0.48, + "appworld_test_normal/appworld/test_normal": 0.55, + "browsecompplus/browsecompplus": 0.3333, "global-mmlu-lite/Global MMLU Lite": 0.9453, "global-mmlu-lite/Culturally Sensitive": 0.9397, "global-mmlu-lite/Culturally Agnostic": 0.9509, @@ -900,8 +900,8 @@ "global-mmlu-lite/Yoruba": 0.9425, "global-mmlu-lite/Chinese": 0.9475, "global-mmlu-lite/Burmese": 0.9425, - "swe-bench/swe-bench": 0.7234, - "tau-bench-2_airline/tau-bench-2/airline": 0.7, + "swe-bench/swe-bench": 0.71, + "tau-bench-2_airline/tau-bench-2/airline": 0.68, "tau-bench-2_retail/tau-bench-2/retail": 0.73, "tau-bench-2_telecom/tau-bench-2/telecom": 0.73 } @@ -1028,12 +1028,12 @@ "developer": "Google", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.2018, - "hfopenllm_v2/BBH": 0.3709, - "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/IFEval": 0.1993, + "hfopenllm_v2/BBH": 0.3656, + "hfopenllm_v2/MATH Level 5": 0.0287, "hfopenllm_v2/GPQA": 0.2626, - "hfopenllm_v2/MUSR": 0.4219, - "hfopenllm_v2/MMLU-PRO": 0.2217 + "hfopenllm_v2/MUSR": 0.4232, + "hfopenllm_v2/MMLU-PRO": 0.218 } }, { @@ -1056,12 +1056,12 @@ "developer": "Google", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.5078, - "hfopenllm_v2/BBH": 0.4226, - "hfopenllm_v2/MATH Level 5": 0.0347, - "hfopenllm_v2/GPQA": 0.2852, - "hfopenllm_v2/MUSR": 0.3964, - "hfopenllm_v2/MMLU-PRO": 0.2578 + "hfopenllm_v2/IFEval": 0.5288, + "hfopenllm_v2/BBH": 0.4178, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3728, + "hfopenllm_v2/MMLU-PRO": 0.2467 } }, { diff --git a/data/developers/huggingfacetb.json b/data/developers/huggingfacetb.json index bed31781473fb30427be579aab45ef01bf5054ce..1df25de945ccaf98a06fb9f3b47618bba05613d9 100644 --- a/data/developers/huggingfacetb.json +++ b/data/developers/huggingfacetb.json @@ -133,12 +133,12 @@ "developer": "HuggingFaceTB", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.0593, - "hfopenllm_v2/BBH": 0.3135, - "hfopenllm_v2/MATH Level 5": 0.0144, - "hfopenllm_v2/GPQA": 0.2341, - "hfopenllm_v2/MUSR": 0.3871, - "hfopenllm_v2/MMLU-PRO": 0.1092 + "hfopenllm_v2/IFEval": 0.2883, + "hfopenllm_v2/BBH": 0.3124, + "hfopenllm_v2/MATH Level 5": 0.003, + "hfopenllm_v2/GPQA": 0.2357, + "hfopenllm_v2/MUSR": 0.3662, + "hfopenllm_v2/MMLU-PRO": 0.1115 } }, { diff --git a/data/developers/infly.json b/data/developers/infly.json index fe3f0dc6f7a4b2c08dd2895544fd05de4f16df3c..d497bf1e2632542284e99f21127cba81c8ed1b97 100644 --- a/data/developers/infly.json +++ b/data/developers/infly.json @@ -7,16 +7,16 @@ "developer": "infly", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.7648, - "reward-bench/Chat": 0.9665, - "reward-bench/Chat Hard": 0.9101, - "reward-bench/Safety": 0.9644, - "reward-bench/Reasoning": 0.9912, + "reward-bench/Score": 0.9511, "reward-bench/Factuality": 0.7411, "reward-bench/Precise IF": 0.4188, "reward-bench/Math": 0.6995, + "reward-bench/Safety": 0.9365, "reward-bench/Focus": 0.903, - "reward-bench/Ties": 0.8622 + "reward-bench/Ties": 0.8622, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.9101, + "reward-bench/Reasoning": 0.9912 } } ] diff --git a/data/developers/internlm.json b/data/developers/internlm.json index 69708dbd584389b0c656d19a5340943c92e82210..fbcc7249ad65cfb92e1e09809d6765ef54a982e2 100644 --- a/data/developers/internlm.json +++ b/data/developers/internlm.json @@ -71,16 +71,16 @@ "developer": "internlm", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.8759, + "reward-bench/Score": 0.5335, + "reward-bench/Chat": 0.9916, + "reward-bench/Chat Hard": 0.6952, + "reward-bench/Safety": 0.5956, + "reward-bench/Reasoning": 0.9453, "reward-bench/Factuality": 0.4211, "reward-bench/Precise IF": 0.4, "reward-bench/Math": 0.5628, - "reward-bench/Safety": 0.8716, "reward-bench/Focus": 0.7051, - "reward-bench/Ties": 0.5164, - "reward-bench/Chat": 0.9916, - "reward-bench/Chat Hard": 0.6952, - "reward-bench/Reasoning": 0.9453 + "reward-bench/Ties": 0.5164 } }, { diff --git a/data/developers/jaspionjader.json b/data/developers/jaspionjader.json index 053d128582b4aa02040ae51cd0577288669f17e5..9d9d1e268e56a9945ae657deca0493de6a22ce3d 100644 --- a/data/developers/jaspionjader.json +++ b/data/developers/jaspionjader.json @@ -1477,12 +1477,12 @@ "developer": "jaspionjader", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.4345, - "hfopenllm_v2/BBH": 0.5419, - "hfopenllm_v2/MATH Level 5": 0.1292, - "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/IFEval": 0.4418, + "hfopenllm_v2/BBH": 0.5406, + "hfopenllm_v2/MATH Level 5": 0.1352, + "hfopenllm_v2/GPQA": 0.3062, "hfopenllm_v2/MUSR": 0.4277, - "hfopenllm_v2/MMLU-PRO": 0.3854 + "hfopenllm_v2/MMLU-PRO": 0.386 } }, { diff --git a/data/developers/leroydyer.json b/data/developers/leroydyer.json index e1aa95462e1fed8d4e011735c7846112031a21a1..119d29bab457804b8d64637d339b1f72d7389de3 100644 --- a/data/developers/leroydyer.json +++ b/data/developers/leroydyer.json @@ -707,12 +707,12 @@ "developer": "LeroyDyer", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.3579, - "hfopenllm_v2/BBH": 0.4477, - "hfopenllm_v2/MATH Level 5": 0.0423, - "hfopenllm_v2/GPQA": 0.3096, - "hfopenllm_v2/MUSR": 0.4134, - "hfopenllm_v2/MMLU-PRO": 0.2376 + "hfopenllm_v2/IFEval": 0.3798, + "hfopenllm_v2/BBH": 0.4483, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4148, + "hfopenllm_v2/MMLU-PRO": 0.2389 } }, { diff --git a/data/developers/llmat.json b/data/developers/llmat.json index 95633d3199310803501c256677a0fb788d0a08f7..d073eb81547c22e04e5363702192b3a9654d7362 100644 --- a/data/developers/llmat.json +++ b/data/developers/llmat.json @@ -7,12 +7,12 @@ "developer": "llmat", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.364, - "hfopenllm_v2/BBH": 0.4005, - "hfopenllm_v2/MATH Level 5": 0.0015, - "hfopenllm_v2/GPQA": 0.2693, - "hfopenllm_v2/MUSR": 0.3529, - "hfopenllm_v2/MMLU-PRO": 0.2301 + "hfopenllm_v2/IFEval": 0.377, + "hfopenllm_v2/BBH": 0.3978, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3555, + "hfopenllm_v2/MMLU-PRO": 0.2278 } } ] diff --git a/data/developers/lxzgordon.json b/data/developers/lxzgordon.json index e4ace3cc8f7193c8c711403c980535ee73fdd6d3..7f802cf733857054e01537f3ecf745a3fdb38a05 100644 --- a/data/developers/lxzgordon.json +++ b/data/developers/lxzgordon.json @@ -20,16 +20,16 @@ "developer": "LxzGordon", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.9294, + "reward-bench/Score": 0.7394, + "reward-bench/Chat": 0.9553, + "reward-bench/Chat Hard": 0.8816, + "reward-bench/Safety": 0.9178, + "reward-bench/Reasoning": 0.9698, "reward-bench/Factuality": 0.6884, "reward-bench/Precise IF": 0.45, "reward-bench/Math": 0.6393, - "reward-bench/Safety": 0.9108, "reward-bench/Focus": 0.9758, - "reward-bench/Ties": 0.7653, - "reward-bench/Chat": 0.9553, - "reward-bench/Chat Hard": 0.8816, - "reward-bench/Reasoning": 0.9698 + "reward-bench/Ties": 0.7653 } } ] diff --git a/data/developers/meta.json b/data/developers/meta.json index ed6538df893d5e651c8ce4719f630ed6d36178ec..801662ea5cb32e2dbb8c066d026c908449d6cd1d 100644 --- a/data/developers/meta.json +++ b/data/developers/meta.json @@ -471,6 +471,16 @@ "helm_capabilities/IFEval": 0.743, "helm_capabilities/WildBench": 0.686, "helm_capabilities/Omni-MATH": 0.137, + "helm_lite/Mean win rate": 0.303, + "helm_lite/NarrativeQA": 0.756, + "helm_lite/NaturalQuestions (closed-book)": 0.209, + "helm_lite/OpenbookQA": 0.74, + "helm_lite/MMLU": 0.5, + "helm_lite/MATH": 0.703, + "helm_lite/GSM8K": 0.798, + "helm_lite/LegalBench": 0.342, + "helm_lite/MedQA": 0.245, + "helm_lite/WMT 2014": 0.181, "helm_mmlu/MMLU All Subjects": 0.561, "helm_mmlu/Abstract Algebra": 0.26, "helm_mmlu/Anatomy": 0.459, @@ -506,17 +516,7 @@ "helm_mmlu/Sociology": 0.701, "helm_mmlu/Virology": 0.446, "helm_mmlu/World Religions": 0.789, - "helm_mmlu/Mean win rate": 0.475, - "helm_lite/Mean win rate": 0.303, - "helm_lite/NarrativeQA": 0.756, - "helm_lite/NaturalQuestions (closed-book)": 0.209, - "helm_lite/OpenbookQA": 0.74, - "helm_lite/MMLU": 0.5, - "helm_lite/MATH": 0.703, - "helm_lite/GSM8K": 0.798, - "helm_lite/LegalBench": 0.342, - "helm_lite/MedQA": 0.245, - "helm_lite/WMT 2014": 0.181 + "helm_mmlu/Mean win rate": 0.475 } }, { @@ -579,6 +579,16 @@ "developer": "Meta", "evaluator_relationship": null, "benchmark_scores": { + "helm_lite/Mean win rate": 0.819, + "helm_lite/NarrativeQA": 0.777, + "helm_lite/NaturalQuestions (closed-book)": 0.457, + "helm_lite/OpenbookQA": 0.942, + "helm_lite/MMLU": 0.703, + "helm_lite/MATH": 0.791, + "helm_lite/GSM8K": 0.936, + "helm_lite/LegalBench": 0.68, + "helm_lite/MedQA": 0.769, + "helm_lite/WMT 2014": 0.224, "helm_mmlu/MMLU All Subjects": 0.803, "helm_mmlu/Abstract Algebra": 0.52, "helm_mmlu/Anatomy": 0.8, @@ -614,17 +624,7 @@ "helm_mmlu/Sociology": 0.92, "helm_mmlu/Virology": 0.584, "helm_mmlu/World Religions": 0.901, - "helm_mmlu/Mean win rate": 0.773, - "helm_lite/Mean win rate": 0.819, - "helm_lite/NarrativeQA": 0.777, - "helm_lite/NaturalQuestions (closed-book)": 0.457, - "helm_lite/OpenbookQA": 0.942, - "helm_lite/MMLU": 0.703, - "helm_lite/MATH": 0.791, - "helm_lite/GSM8K": 0.936, - "helm_lite/LegalBench": 0.68, - "helm_lite/MedQA": 0.769, - "helm_lite/WMT 2014": 0.224 + "helm_mmlu/Mean win rate": 0.773 } }, { diff --git a/data/developers/minimax.json b/data/developers/minimax.json index b575a16ccd7fb272ceb4b3067b0a9e48f65cff08..3eb98fb6a6609e5f1cc4d74a47ecc2b74aaa9bb0 100644 --- a/data/developers/minimax.json +++ b/data/developers/minimax.json @@ -25,7 +25,7 @@ "developer": "MiniMax", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 29.2 + "terminal-bench-2.0/terminal-bench-2.0": 36.6 } }, { diff --git a/data/developers/mistralai.json b/data/developers/mistralai.json index 57b0b83dacb50720e9d21af9c48cd23ccbf0f7b9..168bb98ff1b313fc7d40f024df899fec3a02671f 100644 --- a/data/developers/mistralai.json +++ b/data/developers/mistralai.json @@ -69,6 +69,16 @@ "helm_capabilities/IFEval": 0.567, "helm_capabilities/WildBench": 0.66, "helm_capabilities/Omni-MATH": 0.072, + "helm_lite/Mean win rate": 0.196, + "helm_lite/NarrativeQA": 0.716, + "helm_lite/NaturalQuestions (closed-book)": 0.253, + "helm_lite/OpenbookQA": 0.79, + "helm_lite/MMLU": 0.51, + "helm_lite/MATH": 0.289, + "helm_lite/GSM8K": 0.538, + "helm_lite/LegalBench": 0.331, + "helm_lite/MedQA": 0.517, + "helm_lite/WMT 2014": 0.142, "helm_mmlu/MMLU All Subjects": 0.599, "helm_mmlu/Abstract Algebra": 0.27, "helm_mmlu/Anatomy": 0.585, @@ -105,16 +115,6 @@ "helm_mmlu/Virology": 0.47, "helm_mmlu/World Religions": 0.825, "helm_mmlu/Mean win rate": 0.509, - "helm_lite/Mean win rate": 0.196, - "helm_lite/NarrativeQA": 0.716, - "helm_lite/NaturalQuestions (closed-book)": 0.253, - "helm_lite/OpenbookQA": 0.79, - "helm_lite/MMLU": 0.51, - "helm_lite/MATH": 0.289, - "helm_lite/GSM8K": 0.538, - "helm_lite/LegalBench": 0.331, - "helm_lite/MedQA": 0.517, - "helm_lite/WMT 2014": 0.142, "hfopenllm_v2/IFEval": 0.5465, "hfopenllm_v2/BBH": 0.4722, "hfopenllm_v2/MATH Level 5": 0.0385, @@ -718,12 +718,12 @@ "developer": "mistralai", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.2326, - "hfopenllm_v2/BBH": 0.5098, - "hfopenllm_v2/MATH Level 5": 0.0937, - "hfopenllm_v2/GPQA": 0.3205, - "hfopenllm_v2/MUSR": 0.4413, - "hfopenllm_v2/MMLU-PRO": 0.3871 + "hfopenllm_v2/IFEval": 0.2415, + "hfopenllm_v2/BBH": 0.5087, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4321, + "hfopenllm_v2/MMLU-PRO": 0.385 } }, { diff --git a/data/developers/mlabonne.json b/data/developers/mlabonne.json index be86bd7fe732025f133b06dc7c412aa5e56b7119..2620a8c4e8931697abdcd44e4a4aae7c1e430da5 100644 --- a/data/developers/mlabonne.json +++ b/data/developers/mlabonne.json @@ -161,12 +161,12 @@ "developer": "mlabonne", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.4162, - "hfopenllm_v2/BBH": 0.5124, - "hfopenllm_v2/MATH Level 5": 0.0853, - "hfopenllm_v2/GPQA": 0.3029, - "hfopenllm_v2/MUSR": 0.415, - "hfopenllm_v2/MMLU-PRO": 0.3802 + "hfopenllm_v2/IFEval": 0.7561, + "hfopenllm_v2/BBH": 0.5111, + "hfopenllm_v2/MATH Level 5": 0.0906, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4019, + "hfopenllm_v2/MMLU-PRO": 0.3841 } }, { diff --git a/data/developers/moonshot_ai.json b/data/developers/moonshot_ai.json index d83f11402fcca0c39c33d09eed495f2aefd69384..746185ce773a539bd025922ab856fdcf2f8a1d9f 100644 --- a/data/developers/moonshot_ai.json +++ b/data/developers/moonshot_ai.json @@ -7,7 +7,7 @@ "developer": "Moonshot AI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 26.7 + "terminal-bench-2.0/terminal-bench-2.0": 27.8 } }, { diff --git a/data/developers/multiple.json b/data/developers/multiple.json index e235ffc0287578be5fd9fdf3ba4e4e1b232b5df8..34cdb844d495e12fd3a3820204fbda313306e211 100644 --- a/data/developers/multiple.json +++ b/data/developers/multiple.json @@ -7,7 +7,7 @@ "developer": "Multiple", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 71.0 + "terminal-bench-2.0/terminal-bench-2.0": 72.4 } } ] diff --git a/data/developers/nazimali.json b/data/developers/nazimali.json index 07c42beb48b2fc9e0ad1a1059178a9ce4071a591..34d47c9647d462b17a0fe6015c5c4f9fc00264e7 100644 --- a/data/developers/nazimali.json +++ b/data/developers/nazimali.json @@ -21,12 +21,12 @@ "developer": "nazimali", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.4964, - "hfopenllm_v2/BBH": 0.4699, - "hfopenllm_v2/MATH Level 5": 0.0045, - "hfopenllm_v2/GPQA": 0.2827, - "hfopenllm_v2/MUSR": 0.3979, - "hfopenllm_v2/MMLU-PRO": 0.3063 + "hfopenllm_v2/IFEval": 0.486, + "hfopenllm_v2/BBH": 0.4721, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4006, + "hfopenllm_v2/MMLU-PRO": 0.3087 } } ] diff --git a/data/developers/nicolinho.json b/data/developers/nicolinho.json index 551d4a5de698babd0e830b509f51bb11f4dd2ac7..79bf445ae201aa8b9add0559d92e4abd4fd3bebb 100644 --- a/data/developers/nicolinho.json +++ b/data/developers/nicolinho.json @@ -7,16 +7,16 @@ "developer": "nicolinho", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.9444, + "reward-bench/Score": 0.7667, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.9013, + "reward-bench/Safety": 0.9578, + "reward-bench/Reasoning": 0.9826, "reward-bench/Factuality": 0.7853, "reward-bench/Precise IF": 0.3719, "reward-bench/Math": 0.6995, - "reward-bench/Safety": 0.927, "reward-bench/Focus": 0.9535, - "reward-bench/Ties": 0.8321, - "reward-bench/Chat": 0.9665, - "reward-bench/Chat Hard": 0.9013, - "reward-bench/Reasoning": 0.9826 + "reward-bench/Ties": 0.8321 } }, { @@ -51,16 +51,16 @@ "developer": "nicolinho", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.9314, + "reward-bench/Score": 0.7074, + "reward-bench/Chat": 0.9637, + "reward-bench/Chat Hard": 0.8684, + "reward-bench/Safety": 0.9467, + "reward-bench/Reasoning": 0.9677, "reward-bench/Factuality": 0.6653, "reward-bench/Precise IF": 0.4062, "reward-bench/Math": 0.612, - "reward-bench/Safety": 0.9257, "reward-bench/Focus": 0.8909, - "reward-bench/Ties": 0.7234, - "reward-bench/Chat": 0.9637, - "reward-bench/Chat Hard": 0.8684, - "reward-bench/Reasoning": 0.9677 + "reward-bench/Ties": 0.7234 } } ] diff --git a/data/developers/nisten.json b/data/developers/nisten.json index 7b275a3c64fa267662b1c7ec09c2c6db9c0fbfc6..785709badefb68361515ca958f0d589a8268f14c 100644 --- a/data/developers/nisten.json +++ b/data/developers/nisten.json @@ -7,12 +7,12 @@ "developer": "nisten", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.3914, - "hfopenllm_v2/BBH": 0.6591, - "hfopenllm_v2/MATH Level 5": 0.3044, - "hfopenllm_v2/GPQA": 0.3591, - "hfopenllm_v2/MUSR": 0.4681, - "hfopenllm_v2/MMLU-PRO": 0.5611 + "hfopenllm_v2/IFEval": 0.3799, + "hfopenllm_v2/BBH": 0.6647, + "hfopenllm_v2/MATH Level 5": 0.3406, + "hfopenllm_v2/GPQA": 0.4035, + "hfopenllm_v2/MUSR": 0.494, + "hfopenllm_v2/MMLU-PRO": 0.5731 } }, { diff --git a/data/developers/nousresearch.json b/data/developers/nousresearch.json index 5eca3534d830aded2e15a419e7392ebd605b4769..68e17c3374e0831b38026cc5c7fe37546bb1fc55 100644 --- a/data/developers/nousresearch.json +++ b/data/developers/nousresearch.json @@ -200,20 +200,6 @@ "hfopenllm_v2/MMLU-PRO": 0.232 } }, - { - "id": "NousResearch/Yarn-Llama-2-7b-128k", - "name": "Yarn-Llama-2-7b-128k", - "developer": "NousResearch", - "evaluator_relationship": null, - "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.1485, - "hfopenllm_v2/BBH": 0.3248, - "hfopenllm_v2/MATH Level 5": 0.0151, - "hfopenllm_v2/GPQA": 0.2601, - "hfopenllm_v2/MUSR": 0.3967, - "hfopenllm_v2/MMLU-PRO": 0.1791 - } - }, { "id": "NousResearch/Yarn-Llama-2-7b-64k", "name": "Yarn-Llama-2-7b-64k", diff --git a/data/developers/omkar1102.json b/data/developers/omkar1102.json index 1d044781189744d770af993cfdb651c1e02eee6f..ca0270b469e58068ee5242b6801e2fbe782ddc0a 100644 --- a/data/developers/omkar1102.json +++ b/data/developers/omkar1102.json @@ -7,12 +7,12 @@ "developer": "Omkar1102", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.2254, - "hfopenllm_v2/BBH": 0.275, + "hfopenllm_v2/IFEval": 0.2148, + "hfopenllm_v2/BBH": 0.276, "hfopenllm_v2/MATH Level 5": 0.0, - "hfopenllm_v2/GPQA": 0.2576, - "hfopenllm_v2/MUSR": 0.3762, - "hfopenllm_v2/MMLU-PRO": 0.1123 + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.3802, + "hfopenllm_v2/MMLU-PRO": 0.1126 } } ] diff --git a/data/developers/openai.json b/data/developers/openai.json index 3a51dd8f7575d45f6d9b06bface9fb31355ba46d..463a516253b52dd581eb26a0054c21d8f415cdc5 100644 --- a/data/developers/openai.json +++ b/data/developers/openai.json @@ -163,16 +163,16 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { + "ace/Overall Score": 0.515, + "ace/Food Score": 0.65, + "ace/Gaming Score": 0.578, "apex-agents/Overall Pass@1": 0.23, "apex-agents/Overall Pass@8": 0.4, "apex-agents/Overall Mean Score": 0.387, "apex-agents/Investment Banking Pass@1": 0.273, "apex-agents/Management Consulting Pass@1": 0.227, "apex-agents/Corporate Law Pass@1": 0.189, - "apex-agents/Corporate Lawyer Mean Score": 0.443, - "ace/Overall Score": 0.515, - "ace/Food Score": 0.65, - "ace/Gaming Score": 0.578 + "apex-agents/Corporate Lawyer Mean Score": 0.443 } }, { @@ -300,13 +300,6 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "helm_instruct/Mean win rate": 0.689, - "helm_instruct/Anthropic RLHF dataset": 4.964, - "helm_instruct/Best ChatGPT Prompts": 4.986, - "helm_instruct/Koala test dataset": 4.987, - "helm_instruct/Open Assistant": 4.987, - "helm_instruct/Self Instruct": 4.99, - "helm_instruct/Vicuna": 4.992, "helm_classic/Mean win rate": 0.783, "helm_classic/MMLU": 0.391, "helm_classic/BoolQ": 0.87, @@ -322,6 +315,13 @@ "helm_classic/IMDB": 0.943, "helm_classic/CivilComments": 0.696, "helm_classic/RAFT": 0.748, + "helm_instruct/Mean win rate": 0.689, + "helm_instruct/Anthropic RLHF dataset": 4.964, + "helm_instruct/Best ChatGPT Prompts": 4.986, + "helm_instruct/Koala test dataset": 4.987, + "helm_instruct/Open Assistant": 4.987, + "helm_instruct/Self Instruct": 4.99, + "helm_instruct/Vicuna": 4.992, "helm_lite/Mean win rate": 0.358, "helm_lite/NarrativeQA": 0.655, "helm_lite/NaturalQuestions (closed-book)": 0.335, @@ -405,6 +405,16 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { + "helm_lite/Mean win rate": 0.867, + "helm_lite/NarrativeQA": 0.768, + "helm_lite/NaturalQuestions (closed-book)": 0.457, + "helm_lite/OpenbookQA": 0.96, + "helm_lite/MMLU": 0.735, + "helm_lite/MATH": 0.802, + "helm_lite/GSM8K": 0.932, + "helm_lite/LegalBench": 0.713, + "helm_lite/MedQA": 0.815, + "helm_lite/WMT 2014": 0.211, "helm_mmlu/MMLU All Subjects": 0.824, "helm_mmlu/Abstract Algebra": 0.63, "helm_mmlu/Anatomy": 0.8, @@ -440,17 +450,7 @@ "helm_mmlu/Sociology": 0.93, "helm_mmlu/Virology": 0.596, "helm_mmlu/World Religions": 0.877, - "helm_mmlu/Mean win rate": 0.517, - "helm_lite/Mean win rate": 0.867, - "helm_lite/NarrativeQA": 0.768, - "helm_lite/NaturalQuestions (closed-book)": 0.457, - "helm_lite/OpenbookQA": 0.96, - "helm_lite/MMLU": 0.735, - "helm_lite/MATH": 0.802, - "helm_lite/GSM8K": 0.932, - "helm_lite/LegalBench": 0.713, - "helm_lite/MedQA": 0.815, - "helm_lite/WMT 2014": 0.211 + "helm_mmlu/Mean win rate": 0.517 } }, { @@ -513,6 +513,16 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { + "helm_lite/Mean win rate": 0.864, + "helm_lite/NarrativeQA": 0.761, + "helm_lite/NaturalQuestions (closed-book)": 0.482, + "helm_lite/OpenbookQA": 0.97, + "helm_lite/MMLU": 0.711, + "helm_lite/MATH": 0.833, + "helm_lite/GSM8K": 0.824, + "helm_lite/LegalBench": 0.727, + "helm_lite/MedQA": 0.783, + "helm_lite/WMT 2014": 0.218, "helm_mmlu/MMLU All Subjects": 0.813, "helm_mmlu/Abstract Algebra": 0.56, "helm_mmlu/Anatomy": 0.822, @@ -549,16 +559,6 @@ "helm_mmlu/Virology": 0.602, "helm_mmlu/World Religions": 0.848, "helm_mmlu/Mean win rate": 0.351, - "helm_lite/Mean win rate": 0.864, - "helm_lite/NarrativeQA": 0.761, - "helm_lite/NaturalQuestions (closed-book)": 0.482, - "helm_lite/OpenbookQA": 0.97, - "helm_lite/MMLU": 0.711, - "helm_lite/MATH": 0.833, - "helm_lite/GSM8K": 0.824, - "helm_lite/LegalBench": 0.727, - "helm_lite/MedQA": 0.783, - "helm_lite/WMT 2014": 0.218, "reward-bench/Score": 0.8395, "reward-bench/Chat": 0.9525, "reward-bench/Chat Hard": 0.7544, @@ -772,16 +772,16 @@ "helm_mmlu/Virology": 0.578, "helm_mmlu/World Religions": 0.883, "helm_mmlu/Mean win rate": 0.52, - "reward-bench/Score": 0.6493, - "reward-bench/Chat": 0.9609, - "reward-bench/Chat Hard": 0.761, - "reward-bench/Safety": 0.8619, - "reward-bench/Reasoning": 0.8661, + "reward-bench/Score": 0.8673, "reward-bench/Factuality": 0.5684, "reward-bench/Precise IF": 0.3312, "reward-bench/Math": 0.623, + "reward-bench/Safety": 0.8811, "reward-bench/Focus": 0.7293, - "reward-bench/Ties": 0.7819 + "reward-bench/Ties": 0.7819, + "reward-bench/Chat": 0.9609, + "reward-bench/Chat Hard": 0.761, + "reward-bench/Reasoning": 0.8661 } }, { @@ -877,7 +877,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 35.2 + "terminal-bench-2.0/terminal-bench-2.0": 49.6 } }, { @@ -911,9 +911,9 @@ "helm_capabilities/IFEval": 0.875, "helm_capabilities/WildBench": 0.857, "helm_capabilities/Omni-MATH": 0.647, - "livecodebenchpro/Hard Problems": 0.0423, - "livecodebenchpro/Medium Problems": 0.4085, - "livecodebenchpro/Easy Problems": 0.9014 + "livecodebenchpro/Hard Problems": 0.04225352112676056, + "livecodebenchpro/Medium Problems": 0.4084507042253521, + "livecodebenchpro/Easy Problems": 0.8873239436619719 } }, { @@ -922,7 +922,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 44.3 + "terminal-bench-2.0/terminal-bench-2.0": 43.4 } }, { @@ -931,7 +931,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 34.8 + "terminal-bench-2.0/terminal-bench-2.0": 24.0 } }, { @@ -954,7 +954,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 9.9 + "terminal-bench-2.0/terminal-bench-2.0": 11.5 } }, { @@ -986,7 +986,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 53.5 + "terminal-bench-2.0/terminal-bench-2.0": 57.8 } }, { @@ -1013,7 +1013,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 60.7 + "terminal-bench-2.0/terminal-bench-2.0": 62.9 } }, { @@ -1023,14 +1023,14 @@ "evaluator_relationship": null, "benchmark_scores": { "appworld_test_normal/appworld/test_normal": 0.0, - "browsecompplus/browsecompplus": 0.48, + "browsecompplus/browsecompplus": 0.43, "livecodebenchpro/Hard Problems": 0.1594, "livecodebenchpro/Medium Problems": 0.5211, "livecodebenchpro/Easy Problems": 0.9014, "swe-bench/swe-bench": 0.5455, "tau-bench-2_airline/tau-bench-2/airline": 0.6, - "tau-bench-2_retail/tau-bench-2/retail": 0.68, - "tau-bench-2_telecom/tau-bench-2/telecom": 0.5354 + "tau-bench-2_retail/tau-bench-2/retail": 0.73, + "tau-bench-2_telecom/tau-bench-2/telecom": 0.71 } }, { @@ -1048,7 +1048,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 64.7 + "terminal-bench-2.0/terminal-bench-2.0": 77.3 } }, { @@ -1112,7 +1112,7 @@ "livecodebenchpro/Hard Problems": 0.0, "livecodebenchpro/Medium Problems": 0.11267605633802817, "livecodebenchpro/Easy Problems": 0.6619718309859155, - "terminal-bench-2.0/terminal-bench-2.0": 14.2 + "terminal-bench-2.0/terminal-bench-2.0": 18.7 } }, { @@ -1130,7 +1130,7 @@ "livecodebenchpro/Hard Problems": 0.0, "livecodebenchpro/Medium Problems": 0.056338028169014086, "livecodebenchpro/Easy Problems": 0.5070422535211268, - "terminal-bench-2.0/terminal-bench-2.0": 3.1 + "terminal-bench-2.0/terminal-bench-2.0": 3.4 } }, { diff --git a/data/developers/openassistant.json b/data/developers/openassistant.json index ad2c9bbf5e049296e5d368971f1365bc748e1226..7e4cf042377b82fe85282ade845062094d3d9218 100644 --- a/data/developers/openassistant.json +++ b/data/developers/openassistant.json @@ -7,17 +7,17 @@ "developer": "OpenAssistant", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.615, + "reward-bench/Score": 0.2653, + "reward-bench/Chat": 0.9246, + "reward-bench/Chat Hard": 0.3728, + "reward-bench/Safety": 0.3289, + "reward-bench/Reasoning": 0.5855, + "reward-bench/Prior Sets (0.5 weight)": 0.6801, "reward-bench/Factuality": 0.3979, "reward-bench/Precise IF": 0.2875, "reward-bench/Math": 0.377, - "reward-bench/Safety": 0.5446, "reward-bench/Focus": 0.1535, - "reward-bench/Ties": 0.047, - "reward-bench/Chat": 0.9246, - "reward-bench/Chat Hard": 0.3728, - "reward-bench/Reasoning": 0.5855, - "reward-bench/Prior Sets (0.5 weight)": 0.6801 + "reward-bench/Ties": 0.047 } }, { @@ -26,17 +26,17 @@ "developer": "OpenAssistant", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.2648, - "reward-bench/Chat": 0.8855, - "reward-bench/Chat Hard": 0.4868, - "reward-bench/Safety": 0.3244, - "reward-bench/Reasoning": 0.7752, - "reward-bench/Prior Sets (0.5 weight)": 0.6533, + "reward-bench/Score": 0.6901, "reward-bench/Factuality": 0.3179, "reward-bench/Precise IF": 0.2625, "reward-bench/Math": 0.3934, + "reward-bench/Safety": 0.6311, "reward-bench/Focus": 0.2707, - "reward-bench/Ties": 0.0198 + "reward-bench/Ties": 0.0198, + "reward-bench/Chat": 0.8855, + "reward-bench/Chat Hard": 0.4868, + "reward-bench/Reasoning": 0.7752, + "reward-bench/Prior Sets (0.5 weight)": 0.6533 } }, { diff --git a/data/developers/openbmb.json b/data/developers/openbmb.json index b9795ed819d46987af7a443e489394fa08308cdd..d8dae84054074ce01b5c47fc58b69a148fdc99c0 100644 --- a/data/developers/openbmb.json +++ b/data/developers/openbmb.json @@ -21,17 +21,17 @@ "developer": "openbmb", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.5806, - "reward-bench/Chat": 0.9804, - "reward-bench/Chat Hard": 0.6557, - "reward-bench/Safety": 0.6267, - "reward-bench/Reasoning": 0.8633, - "reward-bench/Prior Sets (0.5 weight)": 0.7172, + "reward-bench/Score": 0.8159, "reward-bench/Factuality": 0.6, "reward-bench/Precise IF": 0.3438, "reward-bench/Math": 0.5683, + "reward-bench/Safety": 0.8135, "reward-bench/Focus": 0.7475, - "reward-bench/Ties": 0.5972 + "reward-bench/Ties": 0.5972, + "reward-bench/Chat": 0.9804, + "reward-bench/Chat Hard": 0.6557, + "reward-bench/Reasoning": 0.8633, + "reward-bench/Prior Sets (0.5 weight)": 0.7172 } }, { diff --git a/data/developers/pku-alignment.json b/data/developers/pku-alignment.json index 0ae80803f93980ecd8558a983e84a1646a1bd7d6..76e1f41b6171c4fd2a3d35d25df175a75a77a416 100644 --- a/data/developers/pku-alignment.json +++ b/data/developers/pku-alignment.json @@ -7,17 +7,17 @@ "developer": "PKU-Alignment", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.3332, - "reward-bench/Chat": 0.6173, - "reward-bench/Chat Hard": 0.4232, - "reward-bench/Safety": 0.7589, - "reward-bench/Reasoning": 0.5482, - "reward-bench/Prior Sets (0.5 weight)": 0.57, + "reward-bench/Score": 0.5798, "reward-bench/Factuality": 0.3263, "reward-bench/Precise IF": 0.2313, "reward-bench/Math": 0.3989, + "reward-bench/Safety": 0.7351, "reward-bench/Focus": 0.2939, - "reward-bench/Ties": -0.01 + "reward-bench/Ties": -0.01, + "reward-bench/Chat": 0.6173, + "reward-bench/Chat Hard": 0.4232, + "reward-bench/Reasoning": 0.5482, + "reward-bench/Prior Sets (0.5 weight)": 0.57 } }, { @@ -26,17 +26,17 @@ "developer": "PKU-Alignment", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.4727, + "reward-bench/Score": 0.1606, + "reward-bench/Chat": 0.8184, + "reward-bench/Chat Hard": 0.2873, + "reward-bench/Safety": 0.1422, + "reward-bench/Reasoning": 0.346, + "reward-bench/Prior Sets (0.5 weight)": 0.5993, "reward-bench/Factuality": 0.2105, "reward-bench/Precise IF": 0.2938, "reward-bench/Math": 0.2623, - "reward-bench/Safety": 0.3757, "reward-bench/Focus": 0.0646, - "reward-bench/Ties": -0.01, - "reward-bench/Chat": 0.8184, - "reward-bench/Chat Hard": 0.2873, - "reward-bench/Reasoning": 0.346, - "reward-bench/Prior Sets (0.5 weight)": 0.5993 + "reward-bench/Ties": -0.01 } }, { @@ -64,17 +64,17 @@ "developer": "PKU-Alignment", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.6366, + "reward-bench/Score": 0.2544, + "reward-bench/Chat": 0.8994, + "reward-bench/Chat Hard": 0.364, + "reward-bench/Safety": 0.3156, + "reward-bench/Reasoning": 0.6887, + "reward-bench/Prior Sets (0.5 weight)": 0.6171, "reward-bench/Factuality": 0.2168, "reward-bench/Precise IF": 0.2562, "reward-bench/Math": 0.3825, - "reward-bench/Safety": 0.6041, "reward-bench/Focus": 0.2606, - "reward-bench/Ties": 0.0944, - "reward-bench/Chat": 0.8994, - "reward-bench/Chat Hard": 0.364, - "reward-bench/Reasoning": 0.6887, - "reward-bench/Prior Sets (0.5 weight)": 0.6171 + "reward-bench/Ties": 0.0944 } } ] diff --git a/data/developers/primeintellect.json b/data/developers/primeintellect.json index 674a0e3b141480d7e0d33d0a2fe9b205b710216f..160722785b06f80d9b220dee435fad1245d45495 100644 --- a/data/developers/primeintellect.json +++ b/data/developers/primeintellect.json @@ -8,11 +8,11 @@ "evaluator_relationship": null, "benchmark_scores": { "hfopenllm_v2/IFEval": 0.1757, - "hfopenllm_v2/BBH": 0.276, + "hfopenllm_v2/BBH": 0.274, "hfopenllm_v2/MATH Level 5": 0.0, - "hfopenllm_v2/GPQA": 0.2534, - "hfopenllm_v2/MUSR": 0.3339, - "hfopenllm_v2/MMLU-PRO": 0.1123 + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3753, + "hfopenllm_v2/MMLU-PRO": 0.112 } }, { diff --git a/data/developers/princeton-nlp.json b/data/developers/princeton-nlp.json index c2ef64ee6de3b0368749e3cc226eadfed8e75e4e..10775d33e9c59e099c9bb1a2b84aa429c8a82f44 100644 --- a/data/developers/princeton-nlp.json +++ b/data/developers/princeton-nlp.json @@ -49,12 +49,12 @@ "developer": "princeton-nlp", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.5508, - "hfopenllm_v2/BBH": 0.5028, - "hfopenllm_v2/MATH Level 5": 0.0529, - "hfopenllm_v2/GPQA": 0.2861, - "hfopenllm_v2/MUSR": 0.4266, - "hfopenllm_v2/MMLU-PRO": 0.3231 + "hfopenllm_v2/IFEval": 0.3978, + "hfopenllm_v2/BBH": 0.4983, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.425, + "hfopenllm_v2/MMLU-PRO": 0.3246 } }, { diff --git a/data/developers/quazim0t0.json b/data/developers/quazim0t0.json index c70b926d08d065ebcbe4ee7b0de956265e80bd56..496082d204a36ed57a388cd540f10080c702a420 100644 --- a/data/developers/quazim0t0.json +++ b/data/developers/quazim0t0.json @@ -637,12 +637,12 @@ "developer": "Quazim0t0", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.7016, - "hfopenllm_v2/BBH": 0.6942, - "hfopenllm_v2/MATH Level 5": 0.4116, - "hfopenllm_v2/GPQA": 0.3624, - "hfopenllm_v2/MUSR": 0.4571, - "hfopenllm_v2/MMLU-PRO": 0.5411 + "hfopenllm_v2/IFEval": 0.2922, + "hfopenllm_v2/BBH": 0.6559, + "hfopenllm_v2/MATH Level 5": 0.2545, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3929, + "hfopenllm_v2/MMLU-PRO": 0.5207 } }, { diff --git a/data/developers/qwen.json b/data/developers/qwen.json index b583c293c02ad4ba7bcdda51cb7b094b097d1824..8996ee46a22c0193f6bf8577a319d9b66b681c8b 100644 --- a/data/developers/qwen.json +++ b/data/developers/qwen.json @@ -775,12 +775,12 @@ "developer": "Qwen", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.3153, - "hfopenllm_v2/BBH": 0.3322, - "hfopenllm_v2/MATH Level 5": 0.1035, - "hfopenllm_v2/GPQA": 0.2592, - "hfopenllm_v2/MUSR": 0.3342, - "hfopenllm_v2/MMLU-PRO": 0.172 + "hfopenllm_v2/IFEval": 0.3071, + "hfopenllm_v2/BBH": 0.3341, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3329, + "hfopenllm_v2/MMLU-PRO": 0.1697 } }, { @@ -906,7 +906,8 @@ "hfopenllm_v2/MATH Level 5": 0.3678, "hfopenllm_v2/GPQA": 0.2727, "hfopenllm_v2/MUSR": 0.3968, - "hfopenllm_v2/MMLU-PRO": 0.3255 + "hfopenllm_v2/MMLU-PRO": 0.3255, + "theory_of_mind/accuracy on theory_of_mind for scorer model_graded_fact": 0.78 } }, { @@ -1176,12 +1177,12 @@ "developer": "Qwen", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.6147, - "hfopenllm_v2/BBH": 0.4999, - "hfopenllm_v2/MATH Level 5": 0.031, - "hfopenllm_v2/GPQA": 0.2936, - "hfopenllm_v2/MUSR": 0.4099, - "hfopenllm_v2/MMLU-PRO": 0.3354 + "hfopenllm_v2/IFEval": 0.6101, + "hfopenllm_v2/BBH": 0.5008, + "hfopenllm_v2/MATH Level 5": 0.3716, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4073, + "hfopenllm_v2/MMLU-PRO": 0.3352 } }, { diff --git a/data/developers/ray2333.json b/data/developers/ray2333.json index 709d24161c8237750555868d00eabe667376cddb..0d33fd83045f7f1f2a0352b101b8f653545ad511 100644 --- a/data/developers/ray2333.json +++ b/data/developers/ray2333.json @@ -61,16 +61,16 @@ "developer": "Ray2333", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.8839, + "reward-bench/Score": 0.5966, + "reward-bench/Chat": 0.9302, + "reward-bench/Chat Hard": 0.7719, + "reward-bench/Safety": 0.9222, + "reward-bench/Reasoning": 0.912, "reward-bench/Factuality": 0.5305, "reward-bench/Precise IF": 0.3125, "reward-bench/Math": 0.5902, - "reward-bench/Safety": 0.9216, "reward-bench/Focus": 0.7455, - "reward-bench/Ties": 0.4788, - "reward-bench/Chat": 0.9302, - "reward-bench/Chat Hard": 0.7719, - "reward-bench/Reasoning": 0.912 + "reward-bench/Ties": 0.4788 } }, { @@ -116,17 +116,17 @@ "developer": "Ray2333", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.6089, - "reward-bench/Chat": 0.986, - "reward-bench/Chat Hard": 0.6776, - "reward-bench/Safety": 0.7867, - "reward-bench/Reasoning": 0.9229, - "reward-bench/Prior Sets (0.5 weight)": 0.7309, + "reward-bench/Score": 0.8542, "reward-bench/Factuality": 0.6189, "reward-bench/Precise IF": 0.3875, "reward-bench/Math": 0.5792, + "reward-bench/Safety": 0.8919, "reward-bench/Focus": 0.6828, - "reward-bench/Ties": 0.5981 + "reward-bench/Ties": 0.5981, + "reward-bench/Chat": 0.986, + "reward-bench/Chat Hard": 0.6776, + "reward-bench/Reasoning": 0.9229, + "reward-bench/Prior Sets (0.5 weight)": 0.7309 } }, { diff --git a/data/developers/recoilme.json b/data/developers/recoilme.json index 1e7efb22560a32fb85354296593a7cbce1d04e91..862469c1653f0d1414e3f6dabd97ef04b6d4f9e2 100644 --- a/data/developers/recoilme.json +++ b/data/developers/recoilme.json @@ -7,12 +7,12 @@ "developer": "recoilme", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.7649, - "hfopenllm_v2/BBH": 0.5974, - "hfopenllm_v2/MATH Level 5": 0.0174, - "hfopenllm_v2/GPQA": 0.3305, - "hfopenllm_v2/MUSR": 0.4245, - "hfopenllm_v2/MMLU-PRO": 0.4207 + "hfopenllm_v2/IFEval": 0.2854, + "hfopenllm_v2/BBH": 0.5984, + "hfopenllm_v2/MATH Level 5": 0.1005, + "hfopenllm_v2/GPQA": 0.3297, + "hfopenllm_v2/MUSR": 0.4607, + "hfopenllm_v2/MMLU-PRO": 0.4162 } }, { @@ -35,12 +35,12 @@ "developer": "recoilme", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.2747, - "hfopenllm_v2/BBH": 0.6031, - "hfopenllm_v2/MATH Level 5": 0.0831, - "hfopenllm_v2/GPQA": 0.3305, - "hfopenllm_v2/MUSR": 0.4686, - "hfopenllm_v2/MMLU-PRO": 0.4122 + "hfopenllm_v2/IFEval": 0.7592, + "hfopenllm_v2/BBH": 0.6026, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4099, + "hfopenllm_v2/MMLU-PRO": 0.4163 } }, { @@ -49,12 +49,12 @@ "developer": "recoilme", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.7439, - "hfopenllm_v2/BBH": 0.5993, - "hfopenllm_v2/MATH Level 5": 0.0876, - "hfopenllm_v2/GPQA": 0.3238, - "hfopenllm_v2/MUSR": 0.4204, - "hfopenllm_v2/MMLU-PRO": 0.4072 + "hfopenllm_v2/IFEval": 0.5761, + "hfopenllm_v2/BBH": 0.602, + "hfopenllm_v2/MATH Level 5": 0.1888, + "hfopenllm_v2/GPQA": 0.3372, + "hfopenllm_v2/MUSR": 0.4632, + "hfopenllm_v2/MMLU-PRO": 0.4039 } }, { diff --git a/data/developers/replete-ai.json b/data/developers/replete-ai.json index dbb06f00736a7fcddf76b24a5c7673e098ecab57..0f038b31b0c0cf28a02b6c25fb4fd9bd374c118c 100644 --- a/data/developers/replete-ai.json +++ b/data/developers/replete-ai.json @@ -91,12 +91,12 @@ "developer": "Replete-AI", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.0932, - "hfopenllm_v2/BBH": 0.2977, + "hfopenllm_v2/IFEval": 0.0905, + "hfopenllm_v2/BBH": 0.2985, "hfopenllm_v2/MATH Level 5": 0.0, - "hfopenllm_v2/GPQA": 0.2475, - "hfopenllm_v2/MUSR": 0.3941, - "hfopenllm_v2/MMLU-PRO": 0.1157 + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3848, + "hfopenllm_v2/MMLU-PRO": 0.1158 } }, { diff --git a/data/developers/sfairxc.json b/data/developers/sfairxc.json index f511504fa472f754b653d8fff6432038f2fa5642..0f83a5aa819fcf69906563e63fd5c5a7cc1f8ce0 100644 --- a/data/developers/sfairxc.json +++ b/data/developers/sfairxc.json @@ -7,17 +7,17 @@ "developer": "sfairXC", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.6292, - "reward-bench/Chat": 0.9944, - "reward-bench/Chat Hard": 0.6513, - "reward-bench/Safety": 0.7667, - "reward-bench/Reasoning": 0.8644, - "reward-bench/Prior Sets (0.5 weight)": 0.7492, + "reward-bench/Score": 0.8338, "reward-bench/Factuality": 0.5916, "reward-bench/Precise IF": 0.4188, "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.8676, "reward-bench/Focus": 0.7051, - "reward-bench/Ties": 0.6647 + "reward-bench/Ties": 0.6647, + "reward-bench/Chat": 0.9944, + "reward-bench/Chat Hard": 0.6513, + "reward-bench/Reasoning": 0.8644, + "reward-bench/Prior Sets (0.5 weight)": 0.7492 } } ] diff --git a/data/developers/skywork.json b/data/developers/skywork.json index a1886dbde5ab2f08ceeb175a3c92e5b11655f24d..310fc474921f3ff87a2e73d5f74077c21e25d2fe 100644 --- a/data/developers/skywork.json +++ b/data/developers/skywork.json @@ -47,16 +47,16 @@ "developer": "Skywork", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.938, + "reward-bench/Score": 0.7576, + "reward-bench/Chat": 0.9581, + "reward-bench/Chat Hard": 0.9145, + "reward-bench/Safety": 0.9422, + "reward-bench/Reasoning": 0.9606, "reward-bench/Factuality": 0.7368, "reward-bench/Precise IF": 0.4031, "reward-bench/Math": 0.7049, - "reward-bench/Safety": 0.9189, "reward-bench/Focus": 0.9323, - "reward-bench/Ties": 0.8261, - "reward-bench/Chat": 0.9581, - "reward-bench/Chat Hard": 0.9145, - "reward-bench/Reasoning": 0.9606 + "reward-bench/Ties": 0.8261 } }, { @@ -71,16 +71,16 @@ "hfopenllm_v2/GPQA": 0.344, "hfopenllm_v2/MUSR": 0.4231, "hfopenllm_v2/MMLU-PRO": 0.4103, - "reward-bench/Score": 0.7531, - "reward-bench/Chat": 0.9609, - "reward-bench/Chat Hard": 0.8991, - "reward-bench/Safety": 0.9689, - "reward-bench/Reasoning": 0.9807, + "reward-bench/Score": 0.9426, "reward-bench/Factuality": 0.7674, "reward-bench/Precise IF": 0.375, "reward-bench/Math": 0.6721, + "reward-bench/Safety": 0.9297, "reward-bench/Focus": 0.9172, - "reward-bench/Ties": 0.8182 + "reward-bench/Ties": 0.8182, + "reward-bench/Chat": 0.9609, + "reward-bench/Chat Hard": 0.8991, + "reward-bench/Reasoning": 0.9807 } }, { @@ -89,16 +89,16 @@ "developer": "Skywork", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.7314, - "reward-bench/Chat": 0.9581, - "reward-bench/Chat Hard": 0.8728, - "reward-bench/Safety": 0.9333, - "reward-bench/Reasoning": 0.962, + "reward-bench/Score": 0.9252, "reward-bench/Factuality": 0.6989, "reward-bench/Precise IF": 0.425, "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.9081, "reward-bench/Focus": 0.9616, - "reward-bench/Ties": 0.741 + "reward-bench/Ties": 0.741, + "reward-bench/Chat": 0.9581, + "reward-bench/Chat Hard": 0.8728, + "reward-bench/Reasoning": 0.962 } }, { @@ -230,16 +230,16 @@ "developer": "Skywork", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.9007, + "reward-bench/Score": 0.6885, + "reward-bench/Chat": 0.8994, + "reward-bench/Chat Hard": 0.875, + "reward-bench/Safety": 0.8911, + "reward-bench/Reasoning": 0.9176, "reward-bench/Factuality": 0.6063, "reward-bench/Precise IF": 0.35, "reward-bench/Math": 0.6339, - "reward-bench/Safety": 0.9108, "reward-bench/Focus": 0.8909, - "reward-bench/Ties": 0.7586, - "reward-bench/Chat": 0.8994, - "reward-bench/Chat Hard": 0.875, - "reward-bench/Reasoning": 0.9176 + "reward-bench/Ties": 0.7586 } } ] diff --git a/data/developers/snowflake.json b/data/developers/snowflake.json index ea10d8d8be19d81cae9a42ff10694a41ac772666..d36a86d899a25b9271cd05a105d08de1ef5c396f 100644 --- a/data/developers/snowflake.json +++ b/data/developers/snowflake.json @@ -7,6 +7,16 @@ "developer": "snowflake", "evaluator_relationship": null, "benchmark_scores": { + "helm_lite/Mean win rate": 0.338, + "helm_lite/NarrativeQA": 0.654, + "helm_lite/NaturalQuestions (closed-book)": 0.39, + "helm_lite/OpenbookQA": 0.828, + "helm_lite/MMLU": 0.575, + "helm_lite/MATH": 0.519, + "helm_lite/GSM8K": 0.768, + "helm_lite/LegalBench": 0.588, + "helm_lite/MedQA": 0.581, + "helm_lite/WMT 2014": 0.172, "helm_mmlu/MMLU All Subjects": 0.677, "helm_mmlu/Abstract Algebra": 0.35, "helm_mmlu/Anatomy": 0.652, @@ -42,17 +52,7 @@ "helm_mmlu/Sociology": 0.891, "helm_mmlu/Virology": 0.536, "helm_mmlu/World Religions": 0.854, - "helm_mmlu/Mean win rate": 0.565, - "helm_lite/Mean win rate": 0.338, - "helm_lite/NarrativeQA": 0.654, - "helm_lite/NaturalQuestions (closed-book)": 0.39, - "helm_lite/OpenbookQA": 0.828, - "helm_lite/MMLU": 0.575, - "helm_lite/MATH": 0.519, - "helm_lite/GSM8K": 0.768, - "helm_lite/LegalBench": 0.588, - "helm_lite/MedQA": 0.581, - "helm_lite/WMT 2014": 0.172 + "helm_mmlu/Mean win rate": 0.565 } } ] diff --git a/data/developers/ucla-agi.json b/data/developers/ucla-agi.json index 75dd11b6fe190b43556ce10dc96eff4392988245..93fb82e5b4d9ef216b964e7c543f4f7a063aa502 100644 --- a/data/developers/ucla-agi.json +++ b/data/developers/ucla-agi.json @@ -77,12 +77,12 @@ "developer": "UCLA-AGI", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.6834, - "hfopenllm_v2/BBH": 0.508, - "hfopenllm_v2/MATH Level 5": 0.0959, + "hfopenllm_v2/IFEval": 0.6703, + "hfopenllm_v2/BBH": 0.5076, + "hfopenllm_v2/MATH Level 5": 0.0718, "hfopenllm_v2/GPQA": 0.2651, - "hfopenllm_v2/MUSR": 0.3661, - "hfopenllm_v2/MMLU-PRO": 0.3644 + "hfopenllm_v2/MUSR": 0.3647, + "hfopenllm_v2/MMLU-PRO": 0.3658 } }, { diff --git a/data/developers/valiantlabs.json b/data/developers/valiantlabs.json index 28817feab6d48bbca18c53e003fe218c174dc60a..a0fc2b3d81b7ddfe31ae4c63f4f370e45c708501 100644 --- a/data/developers/valiantlabs.json +++ b/data/developers/valiantlabs.json @@ -91,12 +91,12 @@ "developer": "ValiantLabs", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.5328, - "hfopenllm_v2/BBH": 0.4613, - "hfopenllm_v2/MATH Level 5": 0.0876, - "hfopenllm_v2/GPQA": 0.2894, - "hfopenllm_v2/MUSR": 0.3367, - "hfopenllm_v2/MMLU-PRO": 0.2424 + "hfopenllm_v2/IFEval": 0.5483, + "hfopenllm_v2/BBH": 0.461, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3433, + "hfopenllm_v2/MMLU-PRO": 0.2407 } }, { @@ -105,12 +105,12 @@ "developer": "ValiantLabs", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.6496, - "hfopenllm_v2/BBH": 0.4774, - "hfopenllm_v2/MATH Level 5": 0.0566, - "hfopenllm_v2/GPQA": 0.3104, - "hfopenllm_v2/MUSR": 0.3909, - "hfopenllm_v2/MMLU-PRO": 0.3382 + "hfopenllm_v2/IFEval": 0.2678, + "hfopenllm_v2/BBH": 0.4429, + "hfopenllm_v2/MATH Level 5": 0.0521, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.3959, + "hfopenllm_v2/MMLU-PRO": 0.2927 } }, { diff --git a/data/developers/virnect.json b/data/developers/virnect.json index 42ebd864a2c9cd593e1e2721cb69ddb9031aa22d..c4d108ef590950a6696d9775575feeb937623918 100644 --- a/data/developers/virnect.json +++ b/data/developers/virnect.json @@ -7,12 +7,12 @@ "developer": "VIRNECT", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.5021, - "hfopenllm_v2/BBH": 0.4918, - "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/IFEval": 0.5058, + "hfopenllm_v2/BBH": 0.4908, + "hfopenllm_v2/MATH Level 5": 0.0929, "hfopenllm_v2/GPQA": 0.271, - "hfopenllm_v2/MUSR": 0.3648, - "hfopenllm_v2/MMLU-PRO": 0.3536 + "hfopenllm_v2/MUSR": 0.3662, + "hfopenllm_v2/MMLU-PRO": 0.3539 } }, { diff --git a/data/developers/weqweasdas.json b/data/developers/weqweasdas.json index 36aa4e34297ffab30b15be8c96f1b385a32071bf..fd061c03aa20db7f3ab1d7ab2e9b6a6249c65553 100644 --- a/data/developers/weqweasdas.json +++ b/data/developers/weqweasdas.json @@ -7,17 +7,17 @@ "developer": "weqweasdas", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.2498, - "reward-bench/Chat": 0.8184, - "reward-bench/Chat Hard": 0.3728, - "reward-bench/Safety": 0.24, - "reward-bench/Reasoning": 0.3281, - "reward-bench/Prior Sets (0.5 weight)": 0.6564, + "reward-bench/Score": 0.5027, "reward-bench/Factuality": 0.3642, "reward-bench/Precise IF": 0.275, "reward-bench/Math": 0.3497, + "reward-bench/Safety": 0.4149, "reward-bench/Focus": 0.2384, - "reward-bench/Ties": 0.0315 + "reward-bench/Ties": 0.0315, + "reward-bench/Chat": 0.8184, + "reward-bench/Chat Hard": 0.3728, + "reward-bench/Reasoning": 0.3281, + "reward-bench/Prior Sets (0.5 weight)": 0.6564 } }, { @@ -26,17 +26,17 @@ "developer": "weqweasdas", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.3057, - "reward-bench/Chat": 0.9441, - "reward-bench/Chat Hard": 0.4079, - "reward-bench/Safety": 0.3311, - "reward-bench/Reasoning": 0.7637, - "reward-bench/Prior Sets (0.5 weight)": 0.6652, + "reward-bench/Score": 0.6549, "reward-bench/Factuality": 0.3705, "reward-bench/Precise IF": 0.2812, "reward-bench/Math": 0.4317, + "reward-bench/Safety": 0.4986, "reward-bench/Focus": 0.2343, - "reward-bench/Ties": 0.1851 + "reward-bench/Ties": 0.1851, + "reward-bench/Chat": 0.9441, + "reward-bench/Chat Hard": 0.4079, + "reward-bench/Reasoning": 0.7637, + "reward-bench/Prior Sets (0.5 weight)": 0.6652 } }, { @@ -78,17 +78,17 @@ "developer": "weqweasdas", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.596, - "reward-bench/Chat": 0.9665, - "reward-bench/Chat Hard": 0.6053, - "reward-bench/Safety": 0.6911, - "reward-bench/Reasoning": 0.7736, - "reward-bench/Prior Sets (0.5 weight)": 0.753, + "reward-bench/Score": 0.7982, "reward-bench/Factuality": 0.5937, "reward-bench/Precise IF": 0.3438, "reward-bench/Math": 0.5956, + "reward-bench/Safety": 0.8703, "reward-bench/Focus": 0.7293, - "reward-bench/Ties": 0.6226 + "reward-bench/Ties": 0.6226, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.6053, + "reward-bench/Reasoning": 0.7736, + "reward-bench/Prior Sets (0.5 weight)": 0.753 } } ] diff --git a/data/developers/xai.json b/data/developers/xai.json index 4539ab8505af399ecb6f06dee4d4f2618de3e3bc..5bf50e8754b14e71e2153e4564b26c11231d3d27 100644 --- a/data/developers/xai.json +++ b/data/developers/xai.json @@ -78,7 +78,7 @@ "developer": "xAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 25.4 + "terminal-bench-2.0/terminal-bench-2.0": 23.1 } }, { @@ -120,7 +120,7 @@ "developer": "xAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 25.8 + "terminal-bench-2.0/terminal-bench-2.0": 14.2 } } ] diff --git a/data/developers/ycros.json b/data/developers/ycros.json index e95400442aba80bbdb2ffa85224e7399ce946b0d..9b83f7f872f197f98fd71538cda202af618f5659 100644 --- a/data/developers/ycros.json +++ b/data/developers/ycros.json @@ -7,12 +7,12 @@ "developer": "ycros", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.6262, - "hfopenllm_v2/BBH": 0.5142, - "hfopenllm_v2/MATH Level 5": 0.0937, - "hfopenllm_v2/GPQA": 0.3079, - "hfopenllm_v2/MUSR": 0.4138, - "hfopenllm_v2/MMLU-PRO": 0.3481 + "hfopenllm_v2/IFEval": 0.5994, + "hfopenllm_v2/BBH": 0.5159, + "hfopenllm_v2/MATH Level 5": 0.0785, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4203, + "hfopenllm_v2/MMLU-PRO": 0.3473 } } ] diff --git a/data/developers/yoyo-ai.json b/data/developers/yoyo-ai.json index 9dcbc81e86094f67bbf3f24e1917205fb4cc44f8..d0e307257898b20d6b2c21faa52d04a28d7cca51 100644 --- a/data/developers/yoyo-ai.json +++ b/data/developers/yoyo-ai.json @@ -105,12 +105,12 @@ "developer": "YOYO-AI", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.5899, - "hfopenllm_v2/BBH": 0.654, - "hfopenllm_v2/MATH Level 5": 0.4509, - "hfopenllm_v2/GPQA": 0.3834, - "hfopenllm_v2/MUSR": 0.4744, - "hfopenllm_v2/MMLU-PRO": 0.5376 + "hfopenllm_v2/IFEval": 0.7905, + "hfopenllm_v2/BBH": 0.6406, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4181, + "hfopenllm_v2/MMLU-PRO": 0.4944 } }, { diff --git a/data/models.json b/data/models.json index 1c62b750346e574615920f5e5cddd93c6e67872c..2b93623cbdf71160760101030d0eb4a96cc29224 100644 --- a/data/models.json +++ b/data/models.json @@ -1005,12 +1005,12 @@ "developer": "adriszmar", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.1685, - "hfopenllm_v2/BBH": 0.3124, - "hfopenllm_v2/MATH Level 5": 0.0015, - "hfopenllm_v2/GPQA": 0.2492, - "hfopenllm_v2/MUSR": 0.3963, - "hfopenllm_v2/MMLU-PRO": 0.1066 + "hfopenllm_v2/IFEval": 0.1746, + "hfopenllm_v2/BBH": 0.3126, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.245, + "hfopenllm_v2/MUSR": 0.4096, + "hfopenllm_v2/MMLU-PRO": 0.1087 } }, { @@ -1391,9 +1391,9 @@ "developer": "AI2", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.7008, - "reward-bench/Chat": 0.9385, - "reward-bench/Chat Hard": 0.3882, + "reward-bench/Score": 0.6924, + "reward-bench/Chat": 0.9441, + "reward-bench/Chat Hard": 0.3575, "reward-bench/Safety": 0.7757 } }, @@ -2036,12 +2036,12 @@ "developer": "akjindal53244", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.8051, - "hfopenllm_v2/BBH": 0.5189, - "hfopenllm_v2/MATH Level 5": 0.1722, - "hfopenllm_v2/GPQA": 0.3263, + "hfopenllm_v2/IFEval": 0.8033, + "hfopenllm_v2/BBH": 0.5196, + "hfopenllm_v2/MATH Level 5": 0.1624, + "hfopenllm_v2/GPQA": 0.3096, "hfopenllm_v2/MUSR": 0.4028, - "hfopenllm_v2/MMLU-PRO": 0.3803 + "hfopenllm_v2/MMLU-PRO": 0.3812 } }, { @@ -2390,17 +2390,17 @@ "developer": "allenai", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.9021, + "reward-bench/Score": 0.7606, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.8355, + "reward-bench/Safety": 0.8844, + "reward-bench/Reasoning": 0.8969, + "reward-bench/Prior Sets (0.5 weight)": 0.0, "reward-bench/Factuality": 0.8126, "reward-bench/Precise IF": 0.4188, "reward-bench/Math": 0.6995, - "reward-bench/Safety": 0.9095, "reward-bench/Focus": 0.8646, - "reward-bench/Ties": 0.8835, - "reward-bench/Chat": 0.9665, - "reward-bench/Chat Hard": 0.8355, - "reward-bench/Reasoning": 0.8969, - "reward-bench/Prior Sets (0.5 weight)": 0.0 + "reward-bench/Ties": 0.8835 } }, { @@ -2428,17 +2428,17 @@ "developer": "allenai", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.8885, + "reward-bench/Score": 0.7285, + "reward-bench/Chat": 0.9581, + "reward-bench/Chat Hard": 0.8158, + "reward-bench/Safety": 0.8956, + "reward-bench/Reasoning": 0.887, + "reward-bench/Prior Sets (0.5 weight)": 0.0, "reward-bench/Factuality": 0.7432, "reward-bench/Precise IF": 0.4437, "reward-bench/Math": 0.6175, - "reward-bench/Safety": 0.8932, "reward-bench/Focus": 0.9071, - "reward-bench/Ties": 0.7638, - "reward-bench/Chat": 0.9581, - "reward-bench/Chat Hard": 0.8158, - "reward-bench/Reasoning": 0.887, - "reward-bench/Prior Sets (0.5 weight)": 0.0 + "reward-bench/Ties": 0.7638 } }, { @@ -2447,12 +2447,12 @@ "developer": "allenai", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.8379, - "hfopenllm_v2/BBH": 0.6157, - "hfopenllm_v2/MATH Level 5": 0.3829, + "hfopenllm_v2/IFEval": 0.8291, + "hfopenllm_v2/BBH": 0.6164, + "hfopenllm_v2/MATH Level 5": 0.4502, "hfopenllm_v2/GPQA": 0.3733, - "hfopenllm_v2/MUSR": 0.4988, - "hfopenllm_v2/MMLU-PRO": 0.4656 + "hfopenllm_v2/MUSR": 0.4948, + "hfopenllm_v2/MMLU-PRO": 0.4645 } }, { @@ -2489,17 +2489,17 @@ "developer": "allenai", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.722, - "reward-bench/Chat": 0.9693, - "reward-bench/Chat Hard": 0.8268, - "reward-bench/Safety": 0.8689, - "reward-bench/Reasoning": 0.8583, - "reward-bench/Prior Sets (0.5 weight)": 0.0, + "reward-bench/Score": 0.8892, "reward-bench/Factuality": 0.8084, "reward-bench/Precise IF": 0.3688, "reward-bench/Math": 0.6776, + "reward-bench/Safety": 0.9027, "reward-bench/Focus": 0.7778, - "reward-bench/Ties": 0.8308 + "reward-bench/Ties": 0.8308, + "reward-bench/Chat": 0.9693, + "reward-bench/Chat Hard": 0.8268, + "reward-bench/Reasoning": 0.8583, + "reward-bench/Prior Sets (0.5 weight)": 0.0 } }, { @@ -2536,17 +2536,17 @@ "developer": "allenai", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.687, - "reward-bench/Chat": 0.9553, - "reward-bench/Chat Hard": 0.761, - "reward-bench/Safety": 0.86, - "reward-bench/Reasoning": 0.7898, - "reward-bench/Prior Sets (0.5 weight)": 0.0, + "reward-bench/Score": 0.8431, "reward-bench/Factuality": 0.7516, "reward-bench/Precise IF": 0.3875, "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.8662, "reward-bench/Focus": 0.8545, - "reward-bench/Ties": 0.6397 + "reward-bench/Ties": 0.6397, + "reward-bench/Chat": 0.9553, + "reward-bench/Chat Hard": 0.761, + "reward-bench/Reasoning": 0.7898, + "reward-bench/Prior Sets (0.5 weight)": 0.0 } }, { @@ -7446,12 +7446,12 @@ "developer": "Anthropic", "evaluator_relationship": null, "benchmark_scores": { - "appworld_test_normal/appworld/test_normal": 0.68, + "appworld_test_normal/appworld/test_normal": 0.7, "browsecompplus/browsecompplus": 0.61, - "swe-bench/swe-bench": 0.65, - "tau-bench-2_airline/tau-bench-2/airline": 0.72, + "swe-bench/swe-bench": 0.6061, + "tau-bench-2_airline/tau-bench-2/airline": 0.66, "tau-bench-2_retail/tau-bench-2/retail": 0.78, - "tau-bench-2_telecom/tau-bench-2/telecom": 0.76 + "tau-bench-2_telecom/tau-bench-2/telecom": 0.84 } }, { @@ -7460,7 +7460,7 @@ "developer": "Anthropic", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 38.0 + "terminal-bench-2.0/terminal-bench-2.0": 35.1 } }, { @@ -7469,7 +7469,7 @@ "developer": "Anthropic", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 59.1 + "terminal-bench-2.0/terminal-bench-2.0": 52.1 } }, { @@ -7478,7 +7478,7 @@ "developer": "Anthropic", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 58.0 + "terminal-bench-2.0/terminal-bench-2.0": 62.9 } }, { @@ -7552,7 +7552,7 @@ "developer": "Anthropic", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 43.1 + "terminal-bench-2.0/terminal-bench-2.0": 42.6 } }, { @@ -7596,8 +7596,6 @@ "developer": "Anthropic", "evaluator_relationship": null, "benchmark_scores": { - "ace/Overall Score": 0.478, - "ace/Gaming Score": 0.391, "apex-agents/Overall Pass@1": 0.184, "apex-agents/Overall Pass@8": 0.34, "apex-agents/Overall Mean Score": 0.348, @@ -7605,6 +7603,8 @@ "apex-agents/Management Consulting Pass@1": 0.132, "apex-agents/Corporate Law Pass@1": 0.202, "apex-agents/Corporate Lawyer Mean Score": 0.471, + "ace/Overall Score": 0.478, + "ace/Gaming Score": 0.391, "apex-v1/Medicine (MD) Score": 0.65 } }, @@ -12127,12 +12127,12 @@ "developer": "cognitivecomputations", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.4124, - "hfopenllm_v2/BBH": 0.6383, - "hfopenllm_v2/MATH Level 5": 0.182, - "hfopenllm_v2/GPQA": 0.3289, - "hfopenllm_v2/MUSR": 0.4349, - "hfopenllm_v2/MMLU-PRO": 0.4525 + "hfopenllm_v2/IFEval": 0.3613, + "hfopenllm_v2/BBH": 0.6123, + "hfopenllm_v2/MATH Level 5": 0.1239, + "hfopenllm_v2/GPQA": 0.328, + "hfopenllm_v2/MUSR": 0.4112, + "hfopenllm_v2/MMLU-PRO": 0.4494 } }, { @@ -12852,12 +12852,12 @@ "developer": "Columbia-NLP", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.3278, - "hfopenllm_v2/BBH": 0.392, - "hfopenllm_v2/MATH Level 5": 0.0431, - "hfopenllm_v2/GPQA": 0.2492, - "hfopenllm_v2/MUSR": 0.412, - "hfopenllm_v2/MMLU-PRO": 0.1666 + "hfopenllm_v2/IFEval": 0.3102, + "hfopenllm_v2/BBH": 0.3881, + "hfopenllm_v2/MATH Level 5": 0.0536, + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.4081, + "hfopenllm_v2/MMLU-PRO": 0.1665 } }, { @@ -13400,12 +13400,12 @@ "developer": "cpayne1303", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.1916, - "hfopenllm_v2/BBH": 0.2977, - "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/IFEval": 0.1949, + "hfopenllm_v2/BBH": 0.2965, + "hfopenllm_v2/MATH Level 5": 0.0045, "hfopenllm_v2/GPQA": 0.2685, - "hfopenllm_v2/MUSR": 0.3872, - "hfopenllm_v2/MMLU-PRO": 0.1132 + "hfopenllm_v2/MUSR": 0.3885, + "hfopenllm_v2/MMLU-PRO": 0.1111 } }, { @@ -14338,12 +14338,12 @@ "developer": "Daemontatox", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.3745, - "hfopenllm_v2/BBH": 0.6668, - "hfopenllm_v2/MATH Level 5": 0.4758, - "hfopenllm_v2/GPQA": 0.3943, - "hfopenllm_v2/MUSR": 0.4858, - "hfopenllm_v2/MMLU-PRO": 0.5593 + "hfopenllm_v2/IFEval": 0.4855, + "hfopenllm_v2/BBH": 0.6627, + "hfopenllm_v2/MATH Level 5": 0.4841, + "hfopenllm_v2/GPQA": 0.3096, + "hfopenllm_v2/MUSR": 0.4256, + "hfopenllm_v2/MMLU-PRO": 0.5542 } }, { @@ -15729,12 +15729,12 @@ "developer": "DeepMount00", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.7917, - "hfopenllm_v2/BBH": 0.5109, - "hfopenllm_v2/MATH Level 5": 0.1088, - "hfopenllm_v2/GPQA": 0.2878, - "hfopenllm_v2/MUSR": 0.4136, - "hfopenllm_v2/MMLU-PRO": 0.3876 + "hfopenllm_v2/IFEval": 0.5365, + "hfopenllm_v2/BBH": 0.517, + "hfopenllm_v2/MATH Level 5": 0.1707, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4487, + "hfopenllm_v2/MMLU-PRO": 0.396 } }, { @@ -16376,12 +16376,12 @@ "developer": "dfurman", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.2835, - "hfopenllm_v2/BBH": 0.3842, - "hfopenllm_v2/MATH Level 5": 0.0521, - "hfopenllm_v2/GPQA": 0.2609, - "hfopenllm_v2/MUSR": 0.3566, - "hfopenllm_v2/MMLU-PRO": 0.2298 + "hfopenllm_v2/IFEval": 0.3, + "hfopenllm_v2/BBH": 0.3853, + "hfopenllm_v2/MATH Level 5": 0.0415, + "hfopenllm_v2/GPQA": 0.2617, + "hfopenllm_v2/MUSR": 0.3579, + "hfopenllm_v2/MMLU-PRO": 0.2281 } }, { @@ -17020,12 +17020,12 @@ "developer": "DoppelReflEx", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.451, - "hfopenllm_v2/BBH": 0.4944, - "hfopenllm_v2/MATH Level 5": 0.1156, - "hfopenllm_v2/GPQA": 0.3196, - "hfopenllm_v2/MUSR": 0.3896, - "hfopenllm_v2/MMLU-PRO": 0.3256 + "hfopenllm_v2/IFEval": 0.436, + "hfopenllm_v2/BBH": 0.4956, + "hfopenllm_v2/MATH Level 5": 0.0589, + "hfopenllm_v2/GPQA": 0.3205, + "hfopenllm_v2/MUSR": 0.3843, + "hfopenllm_v2/MMLU-PRO": 0.3237 } }, { @@ -23519,6 +23519,7 @@ "developer": "Google", "evaluator_relationship": null, "benchmark_scores": { + "ace/Gaming Score": 0.415, "apex-agents/Overall Pass@1": 0.24, "apex-agents/Overall Pass@8": 0.367, "apex-agents/Overall Mean Score": 0.395, @@ -23526,7 +23527,6 @@ "apex-agents/Management Consulting Pass@1": 0.193, "apex-agents/Corporate Law Pass@1": 0.259, "apex-agents/Corporate Lawyer Mean Score": 0.524, - "ace/Gaming Score": 0.415, "apex-v1/Overall Score": 0.64, "apex-v1/Consulting Score": 0.64 } @@ -23537,6 +23537,8 @@ "developer": "Google", "evaluator_relationship": null, "benchmark_scores": { + "ace/Overall Score": 0.47, + "ace/Gaming Score": 0.509, "apex-agents/Overall Pass@1": 0.184, "apex-agents/Overall Pass@8": 0.373, "apex-agents/Overall Mean Score": 0.341, @@ -23544,8 +23546,6 @@ "apex-agents/Management Consulting Pass@1": 0.124, "apex-agents/Corporate Law Pass@1": 0.239, "apex-agents/Corporate Lawyer Mean Score": 0.487, - "ace/Overall Score": 0.47, - "ace/Gaming Score": 0.509, "apex-v1/Overall Score": 0.643, "apex-v1/Consulting Score": 0.64, "apex-v1/Investment Banking Score": 0.63 @@ -24103,7 +24103,7 @@ "reward-bench/Safety": 0.909, "reward-bench/Focus": 0.841, "reward-bench/Ties": 0.809, - "terminal-bench-2.0/terminal-bench-2.0": 17.1 + "terminal-bench-2.0/terminal-bench-2.0": 16.9 } }, { @@ -24241,7 +24241,7 @@ "developer": "Google", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 51.0 + "terminal-bench-2.0/terminal-bench-2.0": 47.4 } }, { @@ -24259,8 +24259,8 @@ "developer": "Google", "evaluator_relationship": null, "benchmark_scores": { - "appworld_test_normal/appworld/test_normal": 0.13, - "browsecompplus/browsecompplus": 0.48, + "appworld_test_normal/appworld/test_normal": 0.55, + "browsecompplus/browsecompplus": 0.3333, "global-mmlu-lite/Global MMLU Lite": 0.9453, "global-mmlu-lite/Culturally Sensitive": 0.9397, "global-mmlu-lite/Culturally Agnostic": 0.9509, @@ -24280,8 +24280,8 @@ "global-mmlu-lite/Yoruba": 0.9425, "global-mmlu-lite/Chinese": 0.9475, "global-mmlu-lite/Burmese": 0.9425, - "swe-bench/swe-bench": 0.7234, - "tau-bench-2_airline/tau-bench-2/airline": 0.7, + "swe-bench/swe-bench": 0.71, + "tau-bench-2_airline/tau-bench-2/airline": 0.68, "tau-bench-2_retail/tau-bench-2/retail": 0.73, "tau-bench-2_telecom/tau-bench-2/telecom": 0.73 } @@ -24408,12 +24408,12 @@ "developer": "Google", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.2018, - "hfopenllm_v2/BBH": 0.3709, - "hfopenllm_v2/MATH Level 5": 0.0302, + "hfopenllm_v2/IFEval": 0.1993, + "hfopenllm_v2/BBH": 0.3656, + "hfopenllm_v2/MATH Level 5": 0.0287, "hfopenllm_v2/GPQA": 0.2626, - "hfopenllm_v2/MUSR": 0.4219, - "hfopenllm_v2/MMLU-PRO": 0.2217 + "hfopenllm_v2/MUSR": 0.4232, + "hfopenllm_v2/MMLU-PRO": 0.218 } }, { @@ -24436,12 +24436,12 @@ "developer": "Google", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.5078, - "hfopenllm_v2/BBH": 0.4226, - "hfopenllm_v2/MATH Level 5": 0.0347, - "hfopenllm_v2/GPQA": 0.2852, - "hfopenllm_v2/MUSR": 0.3964, - "hfopenllm_v2/MMLU-PRO": 0.2578 + "hfopenllm_v2/IFEval": 0.5288, + "hfopenllm_v2/BBH": 0.4178, + "hfopenllm_v2/MATH Level 5": 0.0476, + "hfopenllm_v2/GPQA": 0.2752, + "hfopenllm_v2/MUSR": 0.3728, + "hfopenllm_v2/MMLU-PRO": 0.2467 } }, { @@ -26786,12 +26786,12 @@ "developer": "HuggingFaceTB", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.0593, - "hfopenllm_v2/BBH": 0.3135, - "hfopenllm_v2/MATH Level 5": 0.0144, - "hfopenllm_v2/GPQA": 0.2341, - "hfopenllm_v2/MUSR": 0.3871, - "hfopenllm_v2/MMLU-PRO": 0.1092 + "hfopenllm_v2/IFEval": 0.2883, + "hfopenllm_v2/BBH": 0.3124, + "hfopenllm_v2/MATH Level 5": 0.003, + "hfopenllm_v2/GPQA": 0.2357, + "hfopenllm_v2/MUSR": 0.3662, + "hfopenllm_v2/MMLU-PRO": 0.1115 } }, { @@ -28507,16 +28507,16 @@ "developer": "infly", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.7648, - "reward-bench/Chat": 0.9665, - "reward-bench/Chat Hard": 0.9101, - "reward-bench/Safety": 0.9644, - "reward-bench/Reasoning": 0.9912, + "reward-bench/Score": 0.9511, "reward-bench/Factuality": 0.7411, "reward-bench/Precise IF": 0.4188, "reward-bench/Math": 0.6995, + "reward-bench/Safety": 0.9365, "reward-bench/Focus": 0.903, - "reward-bench/Ties": 0.8622 + "reward-bench/Ties": 0.8622, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.9101, + "reward-bench/Reasoning": 0.9912 } }, { @@ -28701,16 +28701,16 @@ "developer": "internlm", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.8759, + "reward-bench/Score": 0.5335, + "reward-bench/Chat": 0.9916, + "reward-bench/Chat Hard": 0.6952, + "reward-bench/Safety": 0.5956, + "reward-bench/Reasoning": 0.9453, "reward-bench/Factuality": 0.4211, "reward-bench/Precise IF": 0.4, "reward-bench/Math": 0.5628, - "reward-bench/Safety": 0.8716, "reward-bench/Focus": 0.7051, - "reward-bench/Ties": 0.5164, - "reward-bench/Chat": 0.9916, - "reward-bench/Chat Hard": 0.6952, - "reward-bench/Reasoning": 0.9453 + "reward-bench/Ties": 0.5164 } }, { @@ -30623,12 +30623,12 @@ "developer": "jaspionjader", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.4345, - "hfopenllm_v2/BBH": 0.5419, - "hfopenllm_v2/MATH Level 5": 0.1292, - "hfopenllm_v2/GPQA": 0.3087, + "hfopenllm_v2/IFEval": 0.4418, + "hfopenllm_v2/BBH": 0.5406, + "hfopenllm_v2/MATH Level 5": 0.1352, + "hfopenllm_v2/GPQA": 0.3062, "hfopenllm_v2/MUSR": 0.4277, - "hfopenllm_v2/MMLU-PRO": 0.3854 + "hfopenllm_v2/MMLU-PRO": 0.386 } }, { @@ -38012,12 +38012,12 @@ "developer": "LeroyDyer", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.3579, - "hfopenllm_v2/BBH": 0.4477, - "hfopenllm_v2/MATH Level 5": 0.0423, - "hfopenllm_v2/GPQA": 0.3096, - "hfopenllm_v2/MUSR": 0.4134, - "hfopenllm_v2/MMLU-PRO": 0.2376 + "hfopenllm_v2/IFEval": 0.3798, + "hfopenllm_v2/BBH": 0.4483, + "hfopenllm_v2/MATH Level 5": 0.04, + "hfopenllm_v2/GPQA": 0.3129, + "hfopenllm_v2/MUSR": 0.4148, + "hfopenllm_v2/MMLU-PRO": 0.2389 } }, { @@ -38936,12 +38936,12 @@ "developer": "llmat", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.364, - "hfopenllm_v2/BBH": 0.4005, - "hfopenllm_v2/MATH Level 5": 0.0015, - "hfopenllm_v2/GPQA": 0.2693, - "hfopenllm_v2/MUSR": 0.3529, - "hfopenllm_v2/MMLU-PRO": 0.2301 + "hfopenllm_v2/IFEval": 0.377, + "hfopenllm_v2/BBH": 0.3978, + "hfopenllm_v2/MATH Level 5": 0.0242, + "hfopenllm_v2/GPQA": 0.2668, + "hfopenllm_v2/MUSR": 0.3555, + "hfopenllm_v2/MMLU-PRO": 0.2278 } }, { @@ -39569,16 +39569,16 @@ "developer": "LxzGordon", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.9294, + "reward-bench/Score": 0.7394, + "reward-bench/Chat": 0.9553, + "reward-bench/Chat Hard": 0.8816, + "reward-bench/Safety": 0.9178, + "reward-bench/Reasoning": 0.9698, "reward-bench/Factuality": 0.6884, "reward-bench/Precise IF": 0.45, "reward-bench/Math": 0.6393, - "reward-bench/Safety": 0.9108, "reward-bench/Focus": 0.9758, - "reward-bench/Ties": 0.7653, - "reward-bench/Chat": 0.9553, - "reward-bench/Chat Hard": 0.8816, - "reward-bench/Reasoning": 0.9698 + "reward-bench/Ties": 0.7653 } }, { @@ -42623,6 +42623,16 @@ "helm_capabilities/IFEval": 0.743, "helm_capabilities/WildBench": 0.686, "helm_capabilities/Omni-MATH": 0.137, + "helm_lite/Mean win rate": 0.303, + "helm_lite/NarrativeQA": 0.756, + "helm_lite/NaturalQuestions (closed-book)": 0.209, + "helm_lite/OpenbookQA": 0.74, + "helm_lite/MMLU": 0.5, + "helm_lite/MATH": 0.703, + "helm_lite/GSM8K": 0.798, + "helm_lite/LegalBench": 0.342, + "helm_lite/MedQA": 0.245, + "helm_lite/WMT 2014": 0.181, "helm_mmlu/MMLU All Subjects": 0.561, "helm_mmlu/Abstract Algebra": 0.26, "helm_mmlu/Anatomy": 0.459, @@ -42658,17 +42668,7 @@ "helm_mmlu/Sociology": 0.701, "helm_mmlu/Virology": 0.446, "helm_mmlu/World Religions": 0.789, - "helm_mmlu/Mean win rate": 0.475, - "helm_lite/Mean win rate": 0.303, - "helm_lite/NarrativeQA": 0.756, - "helm_lite/NaturalQuestions (closed-book)": 0.209, - "helm_lite/OpenbookQA": 0.74, - "helm_lite/MMLU": 0.5, - "helm_lite/MATH": 0.703, - "helm_lite/GSM8K": 0.798, - "helm_lite/LegalBench": 0.342, - "helm_lite/MedQA": 0.245, - "helm_lite/WMT 2014": 0.181 + "helm_mmlu/Mean win rate": 0.475 } }, { @@ -42731,6 +42731,16 @@ "developer": "Meta", "evaluator_relationship": null, "benchmark_scores": { + "helm_lite/Mean win rate": 0.819, + "helm_lite/NarrativeQA": 0.777, + "helm_lite/NaturalQuestions (closed-book)": 0.457, + "helm_lite/OpenbookQA": 0.942, + "helm_lite/MMLU": 0.703, + "helm_lite/MATH": 0.791, + "helm_lite/GSM8K": 0.936, + "helm_lite/LegalBench": 0.68, + "helm_lite/MedQA": 0.769, + "helm_lite/WMT 2014": 0.224, "helm_mmlu/MMLU All Subjects": 0.803, "helm_mmlu/Abstract Algebra": 0.52, "helm_mmlu/Anatomy": 0.8, @@ -42766,17 +42776,7 @@ "helm_mmlu/Sociology": 0.92, "helm_mmlu/Virology": 0.584, "helm_mmlu/World Religions": 0.901, - "helm_mmlu/Mean win rate": 0.773, - "helm_lite/Mean win rate": 0.819, - "helm_lite/NarrativeQA": 0.777, - "helm_lite/NaturalQuestions (closed-book)": 0.457, - "helm_lite/OpenbookQA": 0.942, - "helm_lite/MMLU": 0.703, - "helm_lite/MATH": 0.791, - "helm_lite/GSM8K": 0.936, - "helm_lite/LegalBench": 0.68, - "helm_lite/MedQA": 0.769, - "helm_lite/WMT 2014": 0.224 + "helm_mmlu/Mean win rate": 0.773 } }, { @@ -43667,7 +43667,7 @@ "developer": "MiniMax", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 29.2 + "terminal-bench-2.0/terminal-bench-2.0": 36.6 } }, { @@ -43803,6 +43803,16 @@ "helm_capabilities/IFEval": 0.567, "helm_capabilities/WildBench": 0.66, "helm_capabilities/Omni-MATH": 0.072, + "helm_lite/Mean win rate": 0.196, + "helm_lite/NarrativeQA": 0.716, + "helm_lite/NaturalQuestions (closed-book)": 0.253, + "helm_lite/OpenbookQA": 0.79, + "helm_lite/MMLU": 0.51, + "helm_lite/MATH": 0.289, + "helm_lite/GSM8K": 0.538, + "helm_lite/LegalBench": 0.331, + "helm_lite/MedQA": 0.517, + "helm_lite/WMT 2014": 0.142, "helm_mmlu/MMLU All Subjects": 0.599, "helm_mmlu/Abstract Algebra": 0.27, "helm_mmlu/Anatomy": 0.585, @@ -43839,16 +43849,6 @@ "helm_mmlu/Virology": 0.47, "helm_mmlu/World Religions": 0.825, "helm_mmlu/Mean win rate": 0.509, - "helm_lite/Mean win rate": 0.196, - "helm_lite/NarrativeQA": 0.716, - "helm_lite/NaturalQuestions (closed-book)": 0.253, - "helm_lite/OpenbookQA": 0.79, - "helm_lite/MMLU": 0.51, - "helm_lite/MATH": 0.289, - "helm_lite/GSM8K": 0.538, - "helm_lite/LegalBench": 0.331, - "helm_lite/MedQA": 0.517, - "helm_lite/WMT 2014": 0.142, "hfopenllm_v2/IFEval": 0.5465, "hfopenllm_v2/BBH": 0.4722, "hfopenllm_v2/MATH Level 5": 0.0385, @@ -44452,12 +44452,12 @@ "developer": "mistralai", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.2326, - "hfopenllm_v2/BBH": 0.5098, - "hfopenllm_v2/MATH Level 5": 0.0937, - "hfopenllm_v2/GPQA": 0.3205, - "hfopenllm_v2/MUSR": 0.4413, - "hfopenllm_v2/MMLU-PRO": 0.3871 + "hfopenllm_v2/IFEval": 0.2415, + "hfopenllm_v2/BBH": 0.5087, + "hfopenllm_v2/MATH Level 5": 0.102, + "hfopenllm_v2/GPQA": 0.3138, + "hfopenllm_v2/MUSR": 0.4321, + "hfopenllm_v2/MMLU-PRO": 0.385 } }, { @@ -44758,12 +44758,12 @@ "developer": "mlabonne", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.4162, - "hfopenllm_v2/BBH": 0.5124, - "hfopenllm_v2/MATH Level 5": 0.0853, - "hfopenllm_v2/GPQA": 0.3029, - "hfopenllm_v2/MUSR": 0.415, - "hfopenllm_v2/MMLU-PRO": 0.3802 + "hfopenllm_v2/IFEval": 0.7561, + "hfopenllm_v2/BBH": 0.5111, + "hfopenllm_v2/MATH Level 5": 0.0906, + "hfopenllm_v2/GPQA": 0.3062, + "hfopenllm_v2/MUSR": 0.4019, + "hfopenllm_v2/MMLU-PRO": 0.3841 } }, { @@ -44996,7 +44996,7 @@ "developer": "Moonshot AI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 26.7 + "terminal-bench-2.0/terminal-bench-2.0": 27.8 } }, { @@ -45317,7 +45317,7 @@ "developer": "Multiple", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 71.0 + "terminal-bench-2.0/terminal-bench-2.0": 72.4 } }, { @@ -45633,12 +45633,12 @@ "developer": "nazimali", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.4964, - "hfopenllm_v2/BBH": 0.4699, - "hfopenllm_v2/MATH Level 5": 0.0045, - "hfopenllm_v2/GPQA": 0.2827, - "hfopenllm_v2/MUSR": 0.3979, - "hfopenllm_v2/MMLU-PRO": 0.3063 + "hfopenllm_v2/IFEval": 0.486, + "hfopenllm_v2/BBH": 0.4721, + "hfopenllm_v2/MATH Level 5": 0.0846, + "hfopenllm_v2/GPQA": 0.2844, + "hfopenllm_v2/MUSR": 0.4006, + "hfopenllm_v2/MMLU-PRO": 0.3087 } }, { @@ -48273,16 +48273,16 @@ "developer": "nicolinho", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.9444, + "reward-bench/Score": 0.7667, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.9013, + "reward-bench/Safety": 0.9578, + "reward-bench/Reasoning": 0.9826, "reward-bench/Factuality": 0.7853, "reward-bench/Precise IF": 0.3719, "reward-bench/Math": 0.6995, - "reward-bench/Safety": 0.927, "reward-bench/Focus": 0.9535, - "reward-bench/Ties": 0.8321, - "reward-bench/Chat": 0.9665, - "reward-bench/Chat Hard": 0.9013, - "reward-bench/Reasoning": 0.9826 + "reward-bench/Ties": 0.8321 } }, { @@ -48317,16 +48317,16 @@ "developer": "nicolinho", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.9314, + "reward-bench/Score": 0.7074, + "reward-bench/Chat": 0.9637, + "reward-bench/Chat Hard": 0.8684, + "reward-bench/Safety": 0.9467, + "reward-bench/Reasoning": 0.9677, "reward-bench/Factuality": 0.6653, "reward-bench/Precise IF": 0.4062, "reward-bench/Math": 0.612, - "reward-bench/Safety": 0.9257, "reward-bench/Focus": 0.8909, - "reward-bench/Ties": 0.7234, - "reward-bench/Chat": 0.9637, - "reward-bench/Chat Hard": 0.8684, - "reward-bench/Reasoning": 0.9677 + "reward-bench/Ties": 0.7234 } }, { @@ -48447,12 +48447,12 @@ "developer": "nisten", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.3914, - "hfopenllm_v2/BBH": 0.6591, - "hfopenllm_v2/MATH Level 5": 0.3044, - "hfopenllm_v2/GPQA": 0.3591, - "hfopenllm_v2/MUSR": 0.4681, - "hfopenllm_v2/MMLU-PRO": 0.5611 + "hfopenllm_v2/IFEval": 0.3799, + "hfopenllm_v2/BBH": 0.6647, + "hfopenllm_v2/MATH Level 5": 0.3406, + "hfopenllm_v2/GPQA": 0.4035, + "hfopenllm_v2/MUSR": 0.494, + "hfopenllm_v2/MMLU-PRO": 0.5731 } }, { @@ -49326,20 +49326,6 @@ "hfopenllm_v2/MMLU-PRO": 0.232 } }, - { - "id": "NousResearch/Yarn-Llama-2-7b-128k", - "name": "Yarn-Llama-2-7b-128k", - "developer": "NousResearch", - "evaluator_relationship": null, - "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.1485, - "hfopenllm_v2/BBH": 0.3248, - "hfopenllm_v2/MATH Level 5": 0.0151, - "hfopenllm_v2/GPQA": 0.2601, - "hfopenllm_v2/MUSR": 0.3967, - "hfopenllm_v2/MMLU-PRO": 0.1791 - } - }, { "id": "NousResearch/Yarn-Llama-2-7b-64k", "name": "Yarn-Llama-2-7b-64k", @@ -50085,12 +50071,12 @@ "developer": "Omkar1102", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.2254, - "hfopenllm_v2/BBH": 0.275, + "hfopenllm_v2/IFEval": 0.2148, + "hfopenllm_v2/BBH": 0.276, "hfopenllm_v2/MATH Level 5": 0.0, - "hfopenllm_v2/GPQA": 0.2576, - "hfopenllm_v2/MUSR": 0.3762, - "hfopenllm_v2/MMLU-PRO": 0.1123 + "hfopenllm_v2/GPQA": 0.2508, + "hfopenllm_v2/MUSR": 0.3802, + "hfopenllm_v2/MMLU-PRO": 0.1126 } }, { @@ -51011,16 +50997,16 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { + "ace/Overall Score": 0.515, + "ace/Food Score": 0.65, + "ace/Gaming Score": 0.578, "apex-agents/Overall Pass@1": 0.23, "apex-agents/Overall Pass@8": 0.4, "apex-agents/Overall Mean Score": 0.387, "apex-agents/Investment Banking Pass@1": 0.273, "apex-agents/Management Consulting Pass@1": 0.227, "apex-agents/Corporate Law Pass@1": 0.189, - "apex-agents/Corporate Lawyer Mean Score": 0.443, - "ace/Overall Score": 0.515, - "ace/Food Score": 0.65, - "ace/Gaming Score": 0.578 + "apex-agents/Corporate Lawyer Mean Score": 0.443 } }, { @@ -51148,13 +51134,6 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "helm_instruct/Mean win rate": 0.689, - "helm_instruct/Anthropic RLHF dataset": 4.964, - "helm_instruct/Best ChatGPT Prompts": 4.986, - "helm_instruct/Koala test dataset": 4.987, - "helm_instruct/Open Assistant": 4.987, - "helm_instruct/Self Instruct": 4.99, - "helm_instruct/Vicuna": 4.992, "helm_classic/Mean win rate": 0.783, "helm_classic/MMLU": 0.391, "helm_classic/BoolQ": 0.87, @@ -51170,6 +51149,13 @@ "helm_classic/IMDB": 0.943, "helm_classic/CivilComments": 0.696, "helm_classic/RAFT": 0.748, + "helm_instruct/Mean win rate": 0.689, + "helm_instruct/Anthropic RLHF dataset": 4.964, + "helm_instruct/Best ChatGPT Prompts": 4.986, + "helm_instruct/Koala test dataset": 4.987, + "helm_instruct/Open Assistant": 4.987, + "helm_instruct/Self Instruct": 4.99, + "helm_instruct/Vicuna": 4.992, "helm_lite/Mean win rate": 0.358, "helm_lite/NarrativeQA": 0.655, "helm_lite/NaturalQuestions (closed-book)": 0.335, @@ -51253,6 +51239,16 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { + "helm_lite/Mean win rate": 0.867, + "helm_lite/NarrativeQA": 0.768, + "helm_lite/NaturalQuestions (closed-book)": 0.457, + "helm_lite/OpenbookQA": 0.96, + "helm_lite/MMLU": 0.735, + "helm_lite/MATH": 0.802, + "helm_lite/GSM8K": 0.932, + "helm_lite/LegalBench": 0.713, + "helm_lite/MedQA": 0.815, + "helm_lite/WMT 2014": 0.211, "helm_mmlu/MMLU All Subjects": 0.824, "helm_mmlu/Abstract Algebra": 0.63, "helm_mmlu/Anatomy": 0.8, @@ -51288,17 +51284,7 @@ "helm_mmlu/Sociology": 0.93, "helm_mmlu/Virology": 0.596, "helm_mmlu/World Religions": 0.877, - "helm_mmlu/Mean win rate": 0.517, - "helm_lite/Mean win rate": 0.867, - "helm_lite/NarrativeQA": 0.768, - "helm_lite/NaturalQuestions (closed-book)": 0.457, - "helm_lite/OpenbookQA": 0.96, - "helm_lite/MMLU": 0.735, - "helm_lite/MATH": 0.802, - "helm_lite/GSM8K": 0.932, - "helm_lite/LegalBench": 0.713, - "helm_lite/MedQA": 0.815, - "helm_lite/WMT 2014": 0.211 + "helm_mmlu/Mean win rate": 0.517 } }, { @@ -51361,6 +51347,16 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { + "helm_lite/Mean win rate": 0.864, + "helm_lite/NarrativeQA": 0.761, + "helm_lite/NaturalQuestions (closed-book)": 0.482, + "helm_lite/OpenbookQA": 0.97, + "helm_lite/MMLU": 0.711, + "helm_lite/MATH": 0.833, + "helm_lite/GSM8K": 0.824, + "helm_lite/LegalBench": 0.727, + "helm_lite/MedQA": 0.783, + "helm_lite/WMT 2014": 0.218, "helm_mmlu/MMLU All Subjects": 0.813, "helm_mmlu/Abstract Algebra": 0.56, "helm_mmlu/Anatomy": 0.822, @@ -51397,16 +51393,6 @@ "helm_mmlu/Virology": 0.602, "helm_mmlu/World Religions": 0.848, "helm_mmlu/Mean win rate": 0.351, - "helm_lite/Mean win rate": 0.864, - "helm_lite/NarrativeQA": 0.761, - "helm_lite/NaturalQuestions (closed-book)": 0.482, - "helm_lite/OpenbookQA": 0.97, - "helm_lite/MMLU": 0.711, - "helm_lite/MATH": 0.833, - "helm_lite/GSM8K": 0.824, - "helm_lite/LegalBench": 0.727, - "helm_lite/MedQA": 0.783, - "helm_lite/WMT 2014": 0.218, "reward-bench/Score": 0.8395, "reward-bench/Chat": 0.9525, "reward-bench/Chat Hard": 0.7544, @@ -51620,16 +51606,16 @@ "helm_mmlu/Virology": 0.578, "helm_mmlu/World Religions": 0.883, "helm_mmlu/Mean win rate": 0.52, - "reward-bench/Score": 0.6493, - "reward-bench/Chat": 0.9609, - "reward-bench/Chat Hard": 0.761, - "reward-bench/Safety": 0.8619, - "reward-bench/Reasoning": 0.8661, + "reward-bench/Score": 0.8673, "reward-bench/Factuality": 0.5684, "reward-bench/Precise IF": 0.3312, "reward-bench/Math": 0.623, + "reward-bench/Safety": 0.8811, "reward-bench/Focus": 0.7293, - "reward-bench/Ties": 0.7819 + "reward-bench/Ties": 0.7819, + "reward-bench/Chat": 0.9609, + "reward-bench/Chat Hard": 0.761, + "reward-bench/Reasoning": 0.8661 } }, { @@ -51725,7 +51711,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 35.2 + "terminal-bench-2.0/terminal-bench-2.0": 49.6 } }, { @@ -51759,9 +51745,9 @@ "helm_capabilities/IFEval": 0.875, "helm_capabilities/WildBench": 0.857, "helm_capabilities/Omni-MATH": 0.647, - "livecodebenchpro/Hard Problems": 0.0423, - "livecodebenchpro/Medium Problems": 0.4085, - "livecodebenchpro/Easy Problems": 0.9014 + "livecodebenchpro/Hard Problems": 0.04225352112676056, + "livecodebenchpro/Medium Problems": 0.4084507042253521, + "livecodebenchpro/Easy Problems": 0.8873239436619719 } }, { @@ -51770,7 +51756,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 44.3 + "terminal-bench-2.0/terminal-bench-2.0": 43.4 } }, { @@ -51779,7 +51765,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 34.8 + "terminal-bench-2.0/terminal-bench-2.0": 24.0 } }, { @@ -51802,7 +51788,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 9.9 + "terminal-bench-2.0/terminal-bench-2.0": 11.5 } }, { @@ -51834,7 +51820,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 53.5 + "terminal-bench-2.0/terminal-bench-2.0": 57.8 } }, { @@ -51861,7 +51847,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 60.7 + "terminal-bench-2.0/terminal-bench-2.0": 62.9 } }, { @@ -51871,14 +51857,14 @@ "evaluator_relationship": null, "benchmark_scores": { "appworld_test_normal/appworld/test_normal": 0.0, - "browsecompplus/browsecompplus": 0.48, + "browsecompplus/browsecompplus": 0.43, "livecodebenchpro/Hard Problems": 0.1594, "livecodebenchpro/Medium Problems": 0.5211, "livecodebenchpro/Easy Problems": 0.9014, "swe-bench/swe-bench": 0.5455, "tau-bench-2_airline/tau-bench-2/airline": 0.6, - "tau-bench-2_retail/tau-bench-2/retail": 0.68, - "tau-bench-2_telecom/tau-bench-2/telecom": 0.5354 + "tau-bench-2_retail/tau-bench-2/retail": 0.73, + "tau-bench-2_telecom/tau-bench-2/telecom": 0.71 } }, { @@ -51896,7 +51882,7 @@ "developer": "OpenAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 64.7 + "terminal-bench-2.0/terminal-bench-2.0": 77.3 } }, { @@ -51960,7 +51946,7 @@ "livecodebenchpro/Hard Problems": 0.0, "livecodebenchpro/Medium Problems": 0.11267605633802817, "livecodebenchpro/Easy Problems": 0.6619718309859155, - "terminal-bench-2.0/terminal-bench-2.0": 14.2 + "terminal-bench-2.0/terminal-bench-2.0": 18.7 } }, { @@ -51978,7 +51964,7 @@ "livecodebenchpro/Hard Problems": 0.0, "livecodebenchpro/Medium Problems": 0.056338028169014086, "livecodebenchpro/Easy Problems": 0.5070422535211268, - "terminal-bench-2.0/terminal-bench-2.0": 3.1 + "terminal-bench-2.0/terminal-bench-2.0": 3.4 } }, { @@ -52227,17 +52213,17 @@ "developer": "OpenAssistant", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.615, + "reward-bench/Score": 0.2653, + "reward-bench/Chat": 0.9246, + "reward-bench/Chat Hard": 0.3728, + "reward-bench/Safety": 0.3289, + "reward-bench/Reasoning": 0.5855, + "reward-bench/Prior Sets (0.5 weight)": 0.6801, "reward-bench/Factuality": 0.3979, "reward-bench/Precise IF": 0.2875, "reward-bench/Math": 0.377, - "reward-bench/Safety": 0.5446, "reward-bench/Focus": 0.1535, - "reward-bench/Ties": 0.047, - "reward-bench/Chat": 0.9246, - "reward-bench/Chat Hard": 0.3728, - "reward-bench/Reasoning": 0.5855, - "reward-bench/Prior Sets (0.5 weight)": 0.6801 + "reward-bench/Ties": 0.047 } }, { @@ -52246,17 +52232,17 @@ "developer": "OpenAssistant", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.2648, - "reward-bench/Chat": 0.8855, - "reward-bench/Chat Hard": 0.4868, - "reward-bench/Safety": 0.3244, - "reward-bench/Reasoning": 0.7752, - "reward-bench/Prior Sets (0.5 weight)": 0.6533, + "reward-bench/Score": 0.6901, "reward-bench/Factuality": 0.3179, "reward-bench/Precise IF": 0.2625, "reward-bench/Math": 0.3934, + "reward-bench/Safety": 0.6311, "reward-bench/Focus": 0.2707, - "reward-bench/Ties": 0.0198 + "reward-bench/Ties": 0.0198, + "reward-bench/Chat": 0.8855, + "reward-bench/Chat Hard": 0.4868, + "reward-bench/Reasoning": 0.7752, + "reward-bench/Prior Sets (0.5 weight)": 0.6533 } }, { @@ -52312,17 +52298,17 @@ "developer": "openbmb", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.5806, - "reward-bench/Chat": 0.9804, - "reward-bench/Chat Hard": 0.6557, - "reward-bench/Safety": 0.6267, - "reward-bench/Reasoning": 0.8633, - "reward-bench/Prior Sets (0.5 weight)": 0.7172, + "reward-bench/Score": 0.8159, "reward-bench/Factuality": 0.6, "reward-bench/Precise IF": 0.3438, "reward-bench/Math": 0.5683, + "reward-bench/Safety": 0.8135, "reward-bench/Focus": 0.7475, - "reward-bench/Ties": 0.5972 + "reward-bench/Ties": 0.5972, + "reward-bench/Chat": 0.9804, + "reward-bench/Chat Hard": 0.6557, + "reward-bench/Reasoning": 0.8633, + "reward-bench/Prior Sets (0.5 weight)": 0.7172 } }, { @@ -53830,17 +53816,17 @@ "developer": "PKU-Alignment", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.3332, - "reward-bench/Chat": 0.6173, - "reward-bench/Chat Hard": 0.4232, - "reward-bench/Safety": 0.7589, - "reward-bench/Reasoning": 0.5482, - "reward-bench/Prior Sets (0.5 weight)": 0.57, + "reward-bench/Score": 0.5798, "reward-bench/Factuality": 0.3263, "reward-bench/Precise IF": 0.2313, "reward-bench/Math": 0.3989, + "reward-bench/Safety": 0.7351, "reward-bench/Focus": 0.2939, - "reward-bench/Ties": -0.01 + "reward-bench/Ties": -0.01, + "reward-bench/Chat": 0.6173, + "reward-bench/Chat Hard": 0.4232, + "reward-bench/Reasoning": 0.5482, + "reward-bench/Prior Sets (0.5 weight)": 0.57 } }, { @@ -53849,17 +53835,17 @@ "developer": "PKU-Alignment", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.4727, + "reward-bench/Score": 0.1606, + "reward-bench/Chat": 0.8184, + "reward-bench/Chat Hard": 0.2873, + "reward-bench/Safety": 0.1422, + "reward-bench/Reasoning": 0.346, + "reward-bench/Prior Sets (0.5 weight)": 0.5993, "reward-bench/Factuality": 0.2105, "reward-bench/Precise IF": 0.2938, "reward-bench/Math": 0.2623, - "reward-bench/Safety": 0.3757, "reward-bench/Focus": 0.0646, - "reward-bench/Ties": -0.01, - "reward-bench/Chat": 0.8184, - "reward-bench/Chat Hard": 0.2873, - "reward-bench/Reasoning": 0.346, - "reward-bench/Prior Sets (0.5 weight)": 0.5993 + "reward-bench/Ties": -0.01 } }, { @@ -53887,17 +53873,17 @@ "developer": "PKU-Alignment", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.6366, + "reward-bench/Score": 0.2544, + "reward-bench/Chat": 0.8994, + "reward-bench/Chat Hard": 0.364, + "reward-bench/Safety": 0.3156, + "reward-bench/Reasoning": 0.6887, + "reward-bench/Prior Sets (0.5 weight)": 0.6171, "reward-bench/Factuality": 0.2168, "reward-bench/Precise IF": 0.2562, "reward-bench/Math": 0.3825, - "reward-bench/Safety": 0.6041, "reward-bench/Focus": 0.2606, - "reward-bench/Ties": 0.0944, - "reward-bench/Chat": 0.8994, - "reward-bench/Chat Hard": 0.364, - "reward-bench/Reasoning": 0.6887, - "reward-bench/Prior Sets (0.5 weight)": 0.6171 + "reward-bench/Ties": 0.0944 } }, { @@ -54172,11 +54158,11 @@ "evaluator_relationship": null, "benchmark_scores": { "hfopenllm_v2/IFEval": 0.1757, - "hfopenllm_v2/BBH": 0.276, + "hfopenllm_v2/BBH": 0.274, "hfopenllm_v2/MATH Level 5": 0.0, - "hfopenllm_v2/GPQA": 0.2534, - "hfopenllm_v2/MUSR": 0.3339, - "hfopenllm_v2/MMLU-PRO": 0.1123 + "hfopenllm_v2/GPQA": 0.25, + "hfopenllm_v2/MUSR": 0.3753, + "hfopenllm_v2/MMLU-PRO": 0.112 } }, { @@ -54255,12 +54241,12 @@ "developer": "princeton-nlp", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.5508, - "hfopenllm_v2/BBH": 0.5028, - "hfopenllm_v2/MATH Level 5": 0.0529, - "hfopenllm_v2/GPQA": 0.2861, - "hfopenllm_v2/MUSR": 0.4266, - "hfopenllm_v2/MMLU-PRO": 0.3231 + "hfopenllm_v2/IFEval": 0.3978, + "hfopenllm_v2/BBH": 0.4983, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.281, + "hfopenllm_v2/MUSR": 0.425, + "hfopenllm_v2/MMLU-PRO": 0.3246 } }, { @@ -57543,12 +57529,12 @@ "developer": "Quazim0t0", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.7016, - "hfopenllm_v2/BBH": 0.6942, - "hfopenllm_v2/MATH Level 5": 0.4116, - "hfopenllm_v2/GPQA": 0.3624, - "hfopenllm_v2/MUSR": 0.4571, - "hfopenllm_v2/MMLU-PRO": 0.5411 + "hfopenllm_v2/IFEval": 0.2922, + "hfopenllm_v2/BBH": 0.6559, + "hfopenllm_v2/MATH Level 5": 0.2545, + "hfopenllm_v2/GPQA": 0.2659, + "hfopenllm_v2/MUSR": 0.3929, + "hfopenllm_v2/MMLU-PRO": 0.5207 } }, { @@ -58647,12 +58633,12 @@ "developer": "Qwen", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.3153, - "hfopenllm_v2/BBH": 0.3322, - "hfopenllm_v2/MATH Level 5": 0.1035, - "hfopenllm_v2/GPQA": 0.2592, - "hfopenllm_v2/MUSR": 0.3342, - "hfopenllm_v2/MMLU-PRO": 0.172 + "hfopenllm_v2/IFEval": 0.3071, + "hfopenllm_v2/BBH": 0.3341, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.2576, + "hfopenllm_v2/MUSR": 0.3329, + "hfopenllm_v2/MMLU-PRO": 0.1697 } }, { @@ -58778,7 +58764,8 @@ "hfopenllm_v2/MATH Level 5": 0.3678, "hfopenllm_v2/GPQA": 0.2727, "hfopenllm_v2/MUSR": 0.3968, - "hfopenllm_v2/MMLU-PRO": 0.3255 + "hfopenllm_v2/MMLU-PRO": 0.3255, + "theory_of_mind/accuracy on theory_of_mind for scorer model_graded_fact": 0.78 } }, { @@ -59048,12 +59035,12 @@ "developer": "Qwen", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.6147, - "hfopenllm_v2/BBH": 0.4999, - "hfopenllm_v2/MATH Level 5": 0.031, - "hfopenllm_v2/GPQA": 0.2936, - "hfopenllm_v2/MUSR": 0.4099, - "hfopenllm_v2/MMLU-PRO": 0.3354 + "hfopenllm_v2/IFEval": 0.6101, + "hfopenllm_v2/BBH": 0.5008, + "hfopenllm_v2/MATH Level 5": 0.3716, + "hfopenllm_v2/GPQA": 0.2919, + "hfopenllm_v2/MUSR": 0.4073, + "hfopenllm_v2/MMLU-PRO": 0.3352 } }, { @@ -59381,16 +59368,16 @@ "developer": "Ray2333", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.8839, + "reward-bench/Score": 0.5966, + "reward-bench/Chat": 0.9302, + "reward-bench/Chat Hard": 0.7719, + "reward-bench/Safety": 0.9222, + "reward-bench/Reasoning": 0.912, "reward-bench/Factuality": 0.5305, "reward-bench/Precise IF": 0.3125, "reward-bench/Math": 0.5902, - "reward-bench/Safety": 0.9216, "reward-bench/Focus": 0.7455, - "reward-bench/Ties": 0.4788, - "reward-bench/Chat": 0.9302, - "reward-bench/Chat Hard": 0.7719, - "reward-bench/Reasoning": 0.912 + "reward-bench/Ties": 0.4788 } }, { @@ -59436,17 +59423,17 @@ "developer": "Ray2333", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.6089, - "reward-bench/Chat": 0.986, - "reward-bench/Chat Hard": 0.6776, - "reward-bench/Safety": 0.7867, - "reward-bench/Reasoning": 0.9229, - "reward-bench/Prior Sets (0.5 weight)": 0.7309, + "reward-bench/Score": 0.8542, "reward-bench/Factuality": 0.6189, "reward-bench/Precise IF": 0.3875, "reward-bench/Math": 0.5792, + "reward-bench/Safety": 0.8919, "reward-bench/Focus": 0.6828, - "reward-bench/Ties": 0.5981 + "reward-bench/Ties": 0.5981, + "reward-bench/Chat": 0.986, + "reward-bench/Chat Hard": 0.6776, + "reward-bench/Reasoning": 0.9229, + "reward-bench/Prior Sets (0.5 weight)": 0.7309 } }, { @@ -59510,12 +59497,12 @@ "developer": "recoilme", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.7649, - "hfopenllm_v2/BBH": 0.5974, - "hfopenllm_v2/MATH Level 5": 0.0174, - "hfopenllm_v2/GPQA": 0.3305, - "hfopenllm_v2/MUSR": 0.4245, - "hfopenllm_v2/MMLU-PRO": 0.4207 + "hfopenllm_v2/IFEval": 0.2854, + "hfopenllm_v2/BBH": 0.5984, + "hfopenllm_v2/MATH Level 5": 0.1005, + "hfopenllm_v2/GPQA": 0.3297, + "hfopenllm_v2/MUSR": 0.4607, + "hfopenllm_v2/MMLU-PRO": 0.4162 } }, { @@ -59538,12 +59525,12 @@ "developer": "recoilme", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.2747, - "hfopenllm_v2/BBH": 0.6031, - "hfopenllm_v2/MATH Level 5": 0.0831, - "hfopenllm_v2/GPQA": 0.3305, - "hfopenllm_v2/MUSR": 0.4686, - "hfopenllm_v2/MMLU-PRO": 0.4122 + "hfopenllm_v2/IFEval": 0.7592, + "hfopenllm_v2/BBH": 0.6026, + "hfopenllm_v2/MATH Level 5": 0.0529, + "hfopenllm_v2/GPQA": 0.3289, + "hfopenllm_v2/MUSR": 0.4099, + "hfopenllm_v2/MMLU-PRO": 0.4163 } }, { @@ -59552,12 +59539,12 @@ "developer": "recoilme", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.7439, - "hfopenllm_v2/BBH": 0.5993, - "hfopenllm_v2/MATH Level 5": 0.0876, - "hfopenllm_v2/GPQA": 0.3238, - "hfopenllm_v2/MUSR": 0.4204, - "hfopenllm_v2/MMLU-PRO": 0.4072 + "hfopenllm_v2/IFEval": 0.5761, + "hfopenllm_v2/BBH": 0.602, + "hfopenllm_v2/MATH Level 5": 0.1888, + "hfopenllm_v2/GPQA": 0.3372, + "hfopenllm_v2/MUSR": 0.4632, + "hfopenllm_v2/MMLU-PRO": 0.4039 } }, { @@ -59720,12 +59707,12 @@ "developer": "Replete-AI", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.0932, - "hfopenllm_v2/BBH": 0.2977, + "hfopenllm_v2/IFEval": 0.0905, + "hfopenllm_v2/BBH": 0.2985, "hfopenllm_v2/MATH Level 5": 0.0, - "hfopenllm_v2/GPQA": 0.2475, - "hfopenllm_v2/MUSR": 0.3941, - "hfopenllm_v2/MMLU-PRO": 0.1157 + "hfopenllm_v2/GPQA": 0.2534, + "hfopenllm_v2/MUSR": 0.3848, + "hfopenllm_v2/MMLU-PRO": 0.1158 } }, { @@ -62491,17 +62478,17 @@ "developer": "sfairXC", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.6292, - "reward-bench/Chat": 0.9944, - "reward-bench/Chat Hard": 0.6513, - "reward-bench/Safety": 0.7667, - "reward-bench/Reasoning": 0.8644, - "reward-bench/Prior Sets (0.5 weight)": 0.7492, + "reward-bench/Score": 0.8338, "reward-bench/Factuality": 0.5916, "reward-bench/Precise IF": 0.4188, "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.8676, "reward-bench/Focus": 0.7051, - "reward-bench/Ties": 0.6647 + "reward-bench/Ties": 0.6647, + "reward-bench/Chat": 0.9944, + "reward-bench/Chat Hard": 0.6513, + "reward-bench/Reasoning": 0.8644, + "reward-bench/Prior Sets (0.5 weight)": 0.7492 } }, { @@ -63282,16 +63269,16 @@ "developer": "Skywork", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.938, + "reward-bench/Score": 0.7576, + "reward-bench/Chat": 0.9581, + "reward-bench/Chat Hard": 0.9145, + "reward-bench/Safety": 0.9422, + "reward-bench/Reasoning": 0.9606, "reward-bench/Factuality": 0.7368, "reward-bench/Precise IF": 0.4031, "reward-bench/Math": 0.7049, - "reward-bench/Safety": 0.9189, "reward-bench/Focus": 0.9323, - "reward-bench/Ties": 0.8261, - "reward-bench/Chat": 0.9581, - "reward-bench/Chat Hard": 0.9145, - "reward-bench/Reasoning": 0.9606 + "reward-bench/Ties": 0.8261 } }, { @@ -63306,16 +63293,16 @@ "hfopenllm_v2/GPQA": 0.344, "hfopenllm_v2/MUSR": 0.4231, "hfopenllm_v2/MMLU-PRO": 0.4103, - "reward-bench/Score": 0.7531, - "reward-bench/Chat": 0.9609, - "reward-bench/Chat Hard": 0.8991, - "reward-bench/Safety": 0.9689, - "reward-bench/Reasoning": 0.9807, + "reward-bench/Score": 0.9426, "reward-bench/Factuality": 0.7674, "reward-bench/Precise IF": 0.375, "reward-bench/Math": 0.6721, + "reward-bench/Safety": 0.9297, "reward-bench/Focus": 0.9172, - "reward-bench/Ties": 0.8182 + "reward-bench/Ties": 0.8182, + "reward-bench/Chat": 0.9609, + "reward-bench/Chat Hard": 0.8991, + "reward-bench/Reasoning": 0.9807 } }, { @@ -63324,16 +63311,16 @@ "developer": "Skywork", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.7314, - "reward-bench/Chat": 0.9581, - "reward-bench/Chat Hard": 0.8728, - "reward-bench/Safety": 0.9333, - "reward-bench/Reasoning": 0.962, + "reward-bench/Score": 0.9252, "reward-bench/Factuality": 0.6989, "reward-bench/Precise IF": 0.425, "reward-bench/Math": 0.6284, + "reward-bench/Safety": 0.9081, "reward-bench/Focus": 0.9616, - "reward-bench/Ties": 0.741 + "reward-bench/Ties": 0.741, + "reward-bench/Chat": 0.9581, + "reward-bench/Chat Hard": 0.8728, + "reward-bench/Reasoning": 0.962 } }, { @@ -63465,16 +63452,16 @@ "developer": "Skywork", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.9007, + "reward-bench/Score": 0.6885, + "reward-bench/Chat": 0.8994, + "reward-bench/Chat Hard": 0.875, + "reward-bench/Safety": 0.8911, + "reward-bench/Reasoning": 0.9176, "reward-bench/Factuality": 0.6063, "reward-bench/Precise IF": 0.35, "reward-bench/Math": 0.6339, - "reward-bench/Safety": 0.9108, "reward-bench/Focus": 0.8909, - "reward-bench/Ties": 0.7586, - "reward-bench/Chat": 0.8994, - "reward-bench/Chat Hard": 0.875, - "reward-bench/Reasoning": 0.9176 + "reward-bench/Ties": 0.7586 } }, { @@ -63483,6 +63470,16 @@ "developer": "snowflake", "evaluator_relationship": null, "benchmark_scores": { + "helm_lite/Mean win rate": 0.338, + "helm_lite/NarrativeQA": 0.654, + "helm_lite/NaturalQuestions (closed-book)": 0.39, + "helm_lite/OpenbookQA": 0.828, + "helm_lite/MMLU": 0.575, + "helm_lite/MATH": 0.519, + "helm_lite/GSM8K": 0.768, + "helm_lite/LegalBench": 0.588, + "helm_lite/MedQA": 0.581, + "helm_lite/WMT 2014": 0.172, "helm_mmlu/MMLU All Subjects": 0.677, "helm_mmlu/Abstract Algebra": 0.35, "helm_mmlu/Anatomy": 0.652, @@ -63518,17 +63515,7 @@ "helm_mmlu/Sociology": 0.891, "helm_mmlu/Virology": 0.536, "helm_mmlu/World Religions": 0.854, - "helm_mmlu/Mean win rate": 0.565, - "helm_lite/Mean win rate": 0.338, - "helm_lite/NarrativeQA": 0.654, - "helm_lite/NaturalQuestions (closed-book)": 0.39, - "helm_lite/OpenbookQA": 0.828, - "helm_lite/MMLU": 0.575, - "helm_lite/MATH": 0.519, - "helm_lite/GSM8K": 0.768, - "helm_lite/LegalBench": 0.588, - "helm_lite/MedQA": 0.581, - "helm_lite/WMT 2014": 0.172 + "helm_mmlu/Mean win rate": 0.565 } }, { @@ -70093,12 +70080,12 @@ "developer": "UCLA-AGI", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.6834, - "hfopenllm_v2/BBH": 0.508, - "hfopenllm_v2/MATH Level 5": 0.0959, + "hfopenllm_v2/IFEval": 0.6703, + "hfopenllm_v2/BBH": 0.5076, + "hfopenllm_v2/MATH Level 5": 0.0718, "hfopenllm_v2/GPQA": 0.2651, - "hfopenllm_v2/MUSR": 0.3661, - "hfopenllm_v2/MMLU-PRO": 0.3644 + "hfopenllm_v2/MUSR": 0.3647, + "hfopenllm_v2/MMLU-PRO": 0.3658 } }, { @@ -71041,12 +71028,12 @@ "developer": "ValiantLabs", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.5328, - "hfopenllm_v2/BBH": 0.4613, - "hfopenllm_v2/MATH Level 5": 0.0876, - "hfopenllm_v2/GPQA": 0.2894, - "hfopenllm_v2/MUSR": 0.3367, - "hfopenllm_v2/MMLU-PRO": 0.2424 + "hfopenllm_v2/IFEval": 0.5483, + "hfopenllm_v2/BBH": 0.461, + "hfopenllm_v2/MATH Level 5": 0.0582, + "hfopenllm_v2/GPQA": 0.2886, + "hfopenllm_v2/MUSR": 0.3433, + "hfopenllm_v2/MMLU-PRO": 0.2407 } }, { @@ -71055,12 +71042,12 @@ "developer": "ValiantLabs", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.6496, - "hfopenllm_v2/BBH": 0.4774, - "hfopenllm_v2/MATH Level 5": 0.0566, - "hfopenllm_v2/GPQA": 0.3104, - "hfopenllm_v2/MUSR": 0.3909, - "hfopenllm_v2/MMLU-PRO": 0.3382 + "hfopenllm_v2/IFEval": 0.2678, + "hfopenllm_v2/BBH": 0.4429, + "hfopenllm_v2/MATH Level 5": 0.0521, + "hfopenllm_v2/GPQA": 0.302, + "hfopenllm_v2/MUSR": 0.3959, + "hfopenllm_v2/MMLU-PRO": 0.2927 } }, { @@ -71377,12 +71364,12 @@ "developer": "VIRNECT", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.5021, - "hfopenllm_v2/BBH": 0.4918, - "hfopenllm_v2/MATH Level 5": 0.108, + "hfopenllm_v2/IFEval": 0.5058, + "hfopenllm_v2/BBH": 0.4908, + "hfopenllm_v2/MATH Level 5": 0.0929, "hfopenllm_v2/GPQA": 0.271, - "hfopenllm_v2/MUSR": 0.3648, - "hfopenllm_v2/MMLU-PRO": 0.3536 + "hfopenllm_v2/MUSR": 0.3662, + "hfopenllm_v2/MMLU-PRO": 0.3539 } }, { @@ -71685,17 +71672,17 @@ "developer": "weqweasdas", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.2498, - "reward-bench/Chat": 0.8184, - "reward-bench/Chat Hard": 0.3728, - "reward-bench/Safety": 0.24, - "reward-bench/Reasoning": 0.3281, - "reward-bench/Prior Sets (0.5 weight)": 0.6564, + "reward-bench/Score": 0.5027, "reward-bench/Factuality": 0.3642, "reward-bench/Precise IF": 0.275, "reward-bench/Math": 0.3497, + "reward-bench/Safety": 0.4149, "reward-bench/Focus": 0.2384, - "reward-bench/Ties": 0.0315 + "reward-bench/Ties": 0.0315, + "reward-bench/Chat": 0.8184, + "reward-bench/Chat Hard": 0.3728, + "reward-bench/Reasoning": 0.3281, + "reward-bench/Prior Sets (0.5 weight)": 0.6564 } }, { @@ -71704,17 +71691,17 @@ "developer": "weqweasdas", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.3057, - "reward-bench/Chat": 0.9441, - "reward-bench/Chat Hard": 0.4079, - "reward-bench/Safety": 0.3311, - "reward-bench/Reasoning": 0.7637, - "reward-bench/Prior Sets (0.5 weight)": 0.6652, + "reward-bench/Score": 0.6549, "reward-bench/Factuality": 0.3705, "reward-bench/Precise IF": 0.2812, "reward-bench/Math": 0.4317, + "reward-bench/Safety": 0.4986, "reward-bench/Focus": 0.2343, - "reward-bench/Ties": 0.1851 + "reward-bench/Ties": 0.1851, + "reward-bench/Chat": 0.9441, + "reward-bench/Chat Hard": 0.4079, + "reward-bench/Reasoning": 0.7637, + "reward-bench/Prior Sets (0.5 weight)": 0.6652 } }, { @@ -71756,17 +71743,17 @@ "developer": "weqweasdas", "evaluator_relationship": null, "benchmark_scores": { - "reward-bench/Score": 0.596, - "reward-bench/Chat": 0.9665, - "reward-bench/Chat Hard": 0.6053, - "reward-bench/Safety": 0.6911, - "reward-bench/Reasoning": 0.7736, - "reward-bench/Prior Sets (0.5 weight)": 0.753, + "reward-bench/Score": 0.7982, "reward-bench/Factuality": 0.5937, "reward-bench/Precise IF": 0.3438, "reward-bench/Math": 0.5956, + "reward-bench/Safety": 0.8703, "reward-bench/Focus": 0.7293, - "reward-bench/Ties": 0.6226 + "reward-bench/Ties": 0.6226, + "reward-bench/Chat": 0.9665, + "reward-bench/Chat Hard": 0.6053, + "reward-bench/Reasoning": 0.7736, + "reward-bench/Prior Sets (0.5 weight)": 0.753 } }, { @@ -72393,7 +72380,7 @@ "developer": "xAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 25.4 + "terminal-bench-2.0/terminal-bench-2.0": 23.1 } }, { @@ -72435,7 +72422,7 @@ "developer": "xAI", "evaluator_relationship": null, "benchmark_scores": { - "terminal-bench-2.0/terminal-bench-2.0": 25.8 + "terminal-bench-2.0/terminal-bench-2.0": 14.2 } }, { @@ -73139,12 +73126,12 @@ "developer": "ycros", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.6262, - "hfopenllm_v2/BBH": 0.5142, - "hfopenllm_v2/MATH Level 5": 0.0937, - "hfopenllm_v2/GPQA": 0.3079, - "hfopenllm_v2/MUSR": 0.4138, - "hfopenllm_v2/MMLU-PRO": 0.3481 + "hfopenllm_v2/IFEval": 0.5994, + "hfopenllm_v2/BBH": 0.5159, + "hfopenllm_v2/MATH Level 5": 0.0785, + "hfopenllm_v2/GPQA": 0.3045, + "hfopenllm_v2/MUSR": 0.4203, + "hfopenllm_v2/MMLU-PRO": 0.3473 } }, { @@ -73825,12 +73812,12 @@ "developer": "YOYO-AI", "evaluator_relationship": null, "benchmark_scores": { - "hfopenllm_v2/IFEval": 0.5899, - "hfopenllm_v2/BBH": 0.654, - "hfopenllm_v2/MATH Level 5": 0.4509, - "hfopenllm_v2/GPQA": 0.3834, - "hfopenllm_v2/MUSR": 0.4744, - "hfopenllm_v2/MMLU-PRO": 0.5376 + "hfopenllm_v2/IFEval": 0.7905, + "hfopenllm_v2/BBH": 0.6406, + "hfopenllm_v2/MATH Level 5": 0.0, + "hfopenllm_v2/GPQA": 0.3163, + "hfopenllm_v2/MUSR": 0.4181, + "hfopenllm_v2/MMLU-PRO": 0.4944 } }, { diff --git a/data/models/adriszmar_qaimath-qwen2.5-7b-ties.json b/data/models/adriszmar_qaimath-qwen2.5-7b-ties.json index a8a69697e53329e56da6b7741229b84b4f2eeb85..776a7c662b5afb276bf059bb90896cee5cd968ec 100644 --- a/data/models/adriszmar_qaimath-qwen2.5-7b-ties.json +++ b/data/models/adriszmar_qaimath-qwen2.5-7b-ties.json @@ -5,7 +5,7 @@ "developer": "adriszmar", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "Qwen2ForCausalLM", "params_billions": "7.616" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1746 + "score": 0.1685 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3126 + "score": 0.3124 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.0015 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.245 + "score": 0.2492 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4096 + "score": 0.3963 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1087 + "score": 0.1066 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1685 + "score": 0.1746 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3124 + "score": 0.3126 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0015 + "score": 0.0 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2492 + "score": 0.245 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3963 + "score": 0.4096 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1066 + "score": 0.1087 } } ], diff --git a/data/models/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check....json b/data/models/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check....json index fe877c718ccc5b06d1fe6c011897398e18bfe6e7..f109864a284814ea86e7b02eb2f140eae70857b8 100644 --- a/data/models/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check....json +++ b/data/models/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check....json @@ -326,7 +326,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7058 + "score": 0.7008 }, "source_data": { "dataset_name": "RewardBench", @@ -344,7 +344,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.9525 + "score": 0.9385 }, "source_data": { "dataset_name": "RewardBench", @@ -362,7 +362,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3947 + "score": 0.3882 }, "source_data": { "dataset_name": "RewardBench", @@ -380,7 +380,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7703 + "score": 0.7757 }, "source_data": { "dataset_name": "RewardBench", @@ -422,7 +422,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6808 + "score": 0.6895 }, "source_data": { "dataset_name": "RewardBench", @@ -440,7 +440,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.9302 + "score": 0.9385 }, "source_data": { "dataset_name": "RewardBench", @@ -458,7 +458,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3596 + "score": 0.3706 }, "source_data": { "dataset_name": "RewardBench", @@ -476,7 +476,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7527 + "score": 0.7595 }, "source_data": { "dataset_name": "RewardBench", @@ -518,7 +518,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6924 + "score": 0.6808 }, "source_data": { "dataset_name": "RewardBench", @@ -536,7 +536,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.9441 + "score": 0.9302 }, "source_data": { "dataset_name": "RewardBench", @@ -554,7 +554,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3575 + "score": 0.3596 }, "source_data": { "dataset_name": "RewardBench", @@ -572,7 +572,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7757 + "score": 0.7527 }, "source_data": { "dataset_name": "RewardBench", @@ -710,7 +710,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6895 + "score": 0.7058 }, "source_data": { "dataset_name": "RewardBench", @@ -728,7 +728,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.9385 + "score": 0.9525 }, "source_data": { "dataset_name": "RewardBench", @@ -746,7 +746,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3706 + "score": 0.3947 }, "source_data": { "dataset_name": "RewardBench", @@ -764,7 +764,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7595 + "score": 0.7703 }, "source_data": { "dataset_name": "RewardBench", @@ -806,7 +806,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7008 + "score": 0.6924 }, "source_data": { "dataset_name": "RewardBench", @@ -824,7 +824,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.9385 + "score": 0.9441 }, "source_data": { "dataset_name": "RewardBench", @@ -842,7 +842,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3882 + "score": 0.3575 }, "source_data": { "dataset_name": "RewardBench", diff --git a/data/models/akjindal53244_llama-3.1-storm-8b.json b/data/models/akjindal53244_llama-3.1-storm-8b.json index 10080988b150f205c06f956e608326efa2dd3fb0..cd06cbbbbe2ccc306c529154ea621a16fb9ebcf6 100644 --- a/data/models/akjindal53244_llama-3.1-storm-8b.json +++ b/data/models/akjindal53244_llama-3.1-storm-8b.json @@ -5,7 +5,7 @@ "developer": "akjindal53244", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "LlamaForCausalLM", "params_billions": "8.03" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.8033 + "score": 0.8051 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5196 + "score": 0.5189 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1624 + "score": 0.1722 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3096 + "score": 0.3263 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3812 + "score": 0.3803 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.8051 + "score": 0.8033 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5189 + "score": 0.5196 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1722 + "score": 0.1624 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3263 + "score": 0.3096 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3803 + "score": 0.3812 } } ], diff --git a/data/models/alibaba_qwen-3-coder-480b.json b/data/models/alibaba_qwen-3-coder-480b.json index 76736ecf57c8a847549ed4d201c4f8542bcb4b8e..4f06f3ef146c1e8e47df3955820bbce15d9ad09d 100644 --- a/data/models/alibaba_qwen-3-coder-480b.json +++ b/data/models/alibaba_qwen-3-coder-480b.json @@ -4,13 +4,13 @@ "id": "alibaba/qwen-3-coder-480b", "developer": "Alibaba", "additional_details": { - "agent_name": "Terminus 2", - "agent_organization": "Terminal Bench" + "agent_name": "OpenHands", + "agent_organization": "OpenHands" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/terminus-2__qwen-3-coder-480b/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/openhands__qwen-3-coder-480b/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-01", + "evaluation_timestamp": "2025-11-02", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 23.9, + "score": 25.4, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.6 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Qwen 3 Coder 480B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Qwen 3 Coder 480B\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Qwen 3 Coder 480B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Qwen 3 Coder 480B\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/openhands__qwen-3-coder-480b/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__qwen-3-coder-480b/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-02", + "evaluation_timestamp": "2025-11-01", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 25.4, + "score": 23.9, "uncertainty": { "standard_error": { - "value": 2.6 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Qwen 3 Coder 480B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Qwen 3 Coder 480B\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Qwen 3 Coder 480B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Qwen 3 Coder 480B\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/alibaba_qwen3-235b-a22b-instruct-2507.json b/data/models/alibaba_qwen3-235b-a22b-instruct-2507.json index 18e9257ee4a295645c5d70b31ea729844e529c44..73411ab4fcdf3bfd9a55ece3006bf091682be221 100644 --- a/data/models/alibaba_qwen3-235b-a22b-instruct-2507.json +++ b/data/models/alibaba_qwen3-235b-a22b-instruct-2507.json @@ -10,8 +10,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/alibaba_qwen3-235b-a22b-instruct-2507/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/alibaba_qwen3-235b-a22b-instruct-2507/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -525,8 +525,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/alibaba_qwen3-235b-a22b-instruct-2507/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/alibaba_qwen3-235b-a22b-instruct-2507/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/models/allenai_llama-3.1-70b-instruct-rm-rb2.json b/data/models/allenai_llama-3.1-70b-instruct-rm-rb2.json index 6b82bed55df1bdb62067f41e1f2f588c7fd5e772..3edb4e861c9c885f1d8b857f90568f09968d9fbc 100644 --- a/data/models/allenai_llama-3.1-70b-instruct-rm-rb2.json +++ b/data/models/allenai_llama-3.1-70b-instruct-rm-rb2.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816", + "evaluation_id": "reward-bench/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,127 +31,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7606 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8126 + "score": 0.9021 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4188 + "score": 0.9665 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6995 + "score": 0.8355 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8844 + "score": 0.9095 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8646 + "score": 0.8969 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8835 + "score": 0.0 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], @@ -159,10 +141,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816", + "evaluation_id": "reward-bench-2/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -181,109 +163,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9021 + "score": 0.7606 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9665 + "score": 0.8126 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8355 + "score": 0.4188 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6995 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9095 + "score": 0.8844 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8969 + "score": 0.8646 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.8835 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/models/allenai_llama-3.1-8b-instruct-rm-rb2.json b/data/models/allenai_llama-3.1-8b-instruct-rm-rb2.json index 60d204122c692e04be6211b26f2d79d76e83f431..2b424dce10a93f404c9945de99ddd768647b6184 100644 --- a/data/models/allenai_llama-3.1-8b-instruct-rm-rb2.json +++ b/data/models/allenai_llama-3.1-8b-instruct-rm-rb2.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-8B-Instruct-RM-RB2/1766412838.146816", + "evaluation_id": "reward-bench/allenai_Llama-3.1-8B-Instruct-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,127 +31,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7285 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7432 + "score": 0.8885 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4437 + "score": 0.9581 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6175 + "score": 0.8158 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8956 + "score": 0.8932 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9071 + "score": 0.887 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7638 + "score": 0.0 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], @@ -159,10 +141,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench/allenai_Llama-3.1-8B-Instruct-RM-RB2/1766412838.146816", + "evaluation_id": "reward-bench-2/allenai_Llama-3.1-8B-Instruct-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -181,109 +163,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8885 + "score": 0.7285 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9581 + "score": 0.7432 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8158 + "score": 0.4437 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6175 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8932 + "score": 0.8956 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.887 + "score": 0.9071 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.7638 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/models/allenai_llama-3.1-tulu-3-70b-sft-rm-rb2.json b/data/models/allenai_llama-3.1-tulu-3-70b-sft-rm-rb2.json index 3c470e6abd70f5f2bc9a8be57cc36f939ca7753c..56c7eedcfadd2447274047e1824ef99e14cf4e28 100644 --- a/data/models/allenai_llama-3.1-tulu-3-70b-sft-rm-rb2.json +++ b/data/models/allenai_llama-3.1-tulu-3-70b-sft-rm-rb2.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2/1766412838.146816", + "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8892 + "score": 0.722 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9693 + "score": 0.8084 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8268 + "score": 0.3688 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6776 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9027 + "score": 0.8689 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8583 + "score": 0.7778 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.8308 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2/1766412838.146816", + "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8084 + "score": 0.8892 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3688 + "score": 0.9693 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6776 + "score": 0.8268 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8689 + "score": 0.9027 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7778 + "score": 0.8583 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8308 + "score": 0.0 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/models/allenai_llama-3.1-tulu-3-70b.json b/data/models/allenai_llama-3.1-tulu-3-70b.json index 1db7ede127ee803b7ffd5960ed83ddaa8fe5c58c..dedd220cd13dda3e92225bd87465795dab977b4c 100644 --- a/data/models/allenai_llama-3.1-tulu-3-70b.json +++ b/data/models/allenai_llama-3.1-tulu-3-70b.json @@ -5,7 +5,7 @@ "developer": "allenai", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "LlamaForCausalLM", "params_billions": "70.554" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.8291 + "score": 0.8379 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6164 + "score": 0.6157 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4502 + "score": 0.3829 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4948 + "score": 0.4988 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4645 + "score": 0.4656 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.8379 + "score": 0.8291 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6157 + "score": 0.6164 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3829 + "score": 0.4502 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4988 + "score": 0.4948 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4656 + "score": 0.4645 } } ], diff --git a/data/models/allenai_llama-3.1-tulu-3-8b-dpo-rm-rb2.json b/data/models/allenai_llama-3.1-tulu-3-8b-dpo-rm-rb2.json index 408cdcea425aaae910c976c1f57587d29fcf9c63..52eed39924ec31b815aba5be343333e3d61411ae 100644 --- a/data/models/allenai_llama-3.1-tulu-3-8b-dpo-rm-rb2.json +++ b/data/models/allenai_llama-3.1-tulu-3-8b-dpo-rm-rb2.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816", + "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8431 + "score": 0.687 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9553 + "score": 0.7516 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.761 + "score": 0.3875 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8662 + "score": 0.86 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7898 + "score": 0.8545 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.6397 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816", + "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.687 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7516 + "score": 0.8431 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3875 + "score": 0.9553 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6284 + "score": 0.761 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.86 + "score": 0.8662 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8545 + "score": 0.7898 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6397 + "score": 0.0 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/models/anthropic_claude-haiku-4.5.json b/data/models/anthropic_claude-haiku-4.5.json index f3552167d1969c3e644bdda5d3cf5d022313285c..a51d1ecfb03be58e1a56ba7e5711a9bf62822a49 100644 --- a/data/models/anthropic_claude-haiku-4.5.json +++ b/data/models/anthropic_claude-haiku-4.5.json @@ -4,13 +4,13 @@ "id": "anthropic/claude-haiku-4.5", "developer": "Anthropic", "additional_details": { - "agent_name": "Claude Code", - "agent_organization": "Anthropic" + "agent_name": "Mini-SWE-Agent", + "agent_organization": "Princeton" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/claude-code__claude-haiku-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__claude-haiku-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-04", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 27.5, + "score": 29.8, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Haiku 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Haiku 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Haiku 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Haiku 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/goose__claude-haiku-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__claude-haiku-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-11", + "evaluation_timestamp": "2025-10-31", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,7 +117,7 @@ "max_score": 100.0 }, "score_details": { - "score": 35.5, + "score": 28.3, "uncertainty": { "standard_error": { "value": 2.9 @@ -127,7 +127,7 @@ }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Haiku 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Haiku 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Haiku 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Haiku 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__claude-haiku-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/claude-code__claude-haiku-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-11-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 29.8, + "score": 27.5, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Haiku 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Haiku 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Haiku 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Haiku 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__claude-haiku-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/goose__claude-haiku-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-10-31", + "evaluation_timestamp": "2025-12-11", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,7 +265,7 @@ "max_score": 100.0 }, "score_details": { - "score": 28.3, + "score": 35.5, "uncertainty": { "standard_error": { "value": 2.9 @@ -275,7 +275,7 @@ }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Haiku 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Haiku 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +292,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Haiku 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Haiku 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/anthropic_claude-opus-4-1-20250805.json b/data/models/anthropic_claude-opus-4-1-20250805.json index 97af67f7a9590f836aeb06cb6e8ac5e127131091..d8d76da387da094a25675bc9e4c49b25e5dc839b 100644 --- a/data/models/anthropic_claude-opus-4-1-20250805.json +++ b/data/models/anthropic_claude-opus-4-1-20250805.json @@ -10,8 +10,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/anthropic_claude-opus-4-1-20250805/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/anthropic_claude-opus-4-1-20250805/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -525,8 +525,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/anthropic_claude-opus-4-1-20250805/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/anthropic_claude-opus-4-1-20250805/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/models/anthropic_claude-opus-4-5.json b/data/models/anthropic_claude-opus-4-5.json index b56ddf35bf238ee9affa206d56038693a0cc3a91..1de3c90261aa34fd7e316493f0ec72792c7a6a7a 100644 --- a/data/models/anthropic_claude-opus-4-5.json +++ b/data/models/anthropic_claude-opus-4-5.json @@ -4,13 +4,13 @@ "id": "anthropic/claude-opus-4-5", "developer": "Anthropic", "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } }, "evaluations": [ { - "evaluation_id": "appworld/test_normal/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "appworld/test_normal/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -42,23 +42,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7, + "score": 0.68, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "5.59", - "total_run_cost": "558.51", - "average_steps": "41.07", - "percent_finished": "0.82" + "average_agent_cost": "22.76", + "total_run_cost": "2276.48", + "average_steps": "47.65", + "percent_finished": "0.77" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -70,15 +70,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "appworld/test_normal/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "browsecompplus/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -91,42 +91,42 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "appworld_test_normal", + "benchmark": "browsecompplus", "evaluation_results": [ { - "evaluation_name": "appworld/test_normal", + "evaluation_name": "browsecompplus", "source_data": { - "dataset_name": "appworld/test_normal", + "dataset_name": "browsecompplus", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "AppWorld benchmark evaluation (test_normal subset)", + "evaluation_description": "BrowseCompPlus benchmark evaluation", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.66, + "score": 0.61, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "13.08", - "total_run_cost": "1308.38", - "average_steps": "49.69", - "percent_finished": "0.74" + "average_agent_cost": "7.59", + "total_run_cost": "759.44", + "average_steps": "27.18", + "percent_finished": "1.0" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -138,15 +138,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "appworld/test_normal/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "browsecompplus/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -159,42 +159,42 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "appworld_test_normal", + "benchmark": "browsecompplus", "evaluation_results": [ { - "evaluation_name": "appworld/test_normal", + "evaluation_name": "browsecompplus", "source_data": { - "dataset_name": "appworld/test_normal", + "dataset_name": "browsecompplus", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "AppWorld benchmark evaluation (test_normal subset)", + "evaluation_description": "BrowseCompPlus benchmark evaluation", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.64, + "score": 0.49, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "3.43", - "total_run_cost": "343.32", - "average_steps": "20.06", - "percent_finished": "0.82" + "average_agent_cost": "7.09", + "total_run_cost": "709.54", + "average_steps": "21.66", + "percent_finished": "0.93" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -206,8 +206,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -282,7 +282,7 @@ } }, { - "evaluation_id": "appworld/test_normal/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "appworld/test_normal/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -314,23 +314,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.68, + "score": 0.66, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "22.76", - "total_run_cost": "2276.48", - "average_steps": "47.65", - "percent_finished": "0.77" + "average_agent_cost": "13.08", + "total_run_cost": "1308.38", + "average_steps": "49.69", + "percent_finished": "0.74" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -342,15 +342,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "browsecompplus/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "appworld/test_normal/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -363,42 +363,42 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "browsecompplus", + "benchmark": "appworld_test_normal", "evaluation_results": [ { - "evaluation_name": "browsecompplus", + "evaluation_name": "appworld/test_normal", "source_data": { - "dataset_name": "browsecompplus", + "dataset_name": "appworld/test_normal", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "BrowseCompPlus benchmark evaluation", + "evaluation_description": "AppWorld benchmark evaluation (test_normal subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.49, + "score": 0.64, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "7.09", - "total_run_cost": "709.54", - "average_steps": "21.66", - "percent_finished": "0.93" + "average_agent_cost": "3.43", + "total_run_cost": "343.32", + "average_steps": "20.06", + "percent_finished": "0.82" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -410,8 +410,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -486,7 +486,7 @@ } }, { - "evaluation_id": "browsecompplus/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "appworld/test_normal/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -499,42 +499,42 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "browsecompplus", + "benchmark": "appworld_test_normal", "evaluation_results": [ { - "evaluation_name": "browsecompplus", + "evaluation_name": "appworld/test_normal", "source_data": { - "dataset_name": "browsecompplus", + "dataset_name": "appworld/test_normal", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "BrowseCompPlus benchmark evaluation", + "evaluation_description": "AppWorld benchmark evaluation (test_normal subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.61, + "score": 0.7, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "7.59", - "total_run_cost": "759.44", - "average_steps": "27.18", - "percent_finished": "1.0" + "average_agent_cost": "5.59", + "total_run_cost": "558.51", + "average_steps": "41.07", + "percent_finished": "0.82" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -546,8 +546,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -690,7 +690,7 @@ } }, { - "evaluation_id": "swe-bench/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "swe-bench/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -722,14 +722,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.8072, + "score": 0.7423, "uncertainty": { - "num_samples": 83 + "num_samples": 97 }, "details": { - "average_agent_cost": "2.96", - "total_run_cost": "245.78", - "average_steps": "34.1", + "average_agent_cost": "5.6", + "total_run_cost": "543.62", + "average_steps": "31.76", "percent_finished": "1.0" } }, @@ -737,8 +737,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -750,15 +750,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "swe-bench/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "swe-bench/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -805,8 +805,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -818,15 +818,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "swe-bench/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "swe-bench/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -858,14 +858,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6061, + "score": 0.8072, "uncertainty": { - "num_samples": 99 + "num_samples": 83 }, "details": { - "average_agent_cost": "3.97", - "total_run_cost": "393.16", - "average_steps": "43.44", + "average_agent_cost": "2.96", + "total_run_cost": "245.78", + "average_steps": "34.1", "percent_finished": "1.0" } }, @@ -873,8 +873,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -886,15 +886,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "swe-bench/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "swe-bench/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -926,14 +926,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7423, + "score": 0.65, "uncertainty": { - "num_samples": 97 + "num_samples": 100 }, "details": { - "average_agent_cost": "5.6", - "total_run_cost": "543.62", - "average_steps": "31.76", + "average_agent_cost": "4.85", + "total_run_cost": "485.22", + "average_steps": "39.13", "percent_finished": "1.0" } }, @@ -941,8 +941,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -954,15 +954,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "swe-bench/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "swe-bench/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -994,14 +994,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.65, + "score": 0.6061, "uncertainty": { - "num_samples": 100 + "num_samples": 99 }, "details": { - "average_agent_cost": "4.85", - "total_run_cost": "485.22", - "average_steps": "39.13", + "average_agent_cost": "3.97", + "total_run_cost": "393.16", + "average_steps": "43.44", "percent_finished": "1.0" } }, @@ -1009,8 +1009,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -1022,15 +1022,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "tau-bench-2/airline/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1062,14 +1062,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.66, + "score": 0.72, "uncertainty": { "num_samples": 50 }, "details": { - "average_agent_cost": "0.47", - "total_run_cost": "24.23", - "average_steps": "10.0", + "average_agent_cost": "0.78", + "total_run_cost": "39.67", + "average_steps": "11.88", "percent_finished": "1.0" } }, @@ -1077,8 +1077,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -1090,8 +1090,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -1302,7 +1302,7 @@ } }, { - "evaluation_id": "tau-bench-2/airline/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1315,33 +1315,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_airline", + "benchmark": "tau-bench-2_retail", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/airline", + "evaluation_name": "tau-bench-2/retail", "source_data": { - "dataset_name": "tau-bench-2/airline", + "dataset_name": "tau-bench-2/retail", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", + "evaluation_description": "Tau Bench 2 benchmark evaluation (retail subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.72, + "score": 0.78, "uncertainty": { - "num_samples": 50 + "num_samples": 100 }, "details": { - "average_agent_cost": "0.78", - "total_run_cost": "39.67", - "average_steps": "11.88", + "average_agent_cost": "0.47", + "total_run_cost": "48.01", + "average_steps": "11.33", "percent_finished": "1.0" } }, @@ -1349,8 +1349,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -1362,15 +1362,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "tau-bench-2/retail/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1402,14 +1402,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.78, + "score": 0.83, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.47", - "total_run_cost": "48.01", - "average_steps": "11.33", + "average_agent_cost": "1.6", + "total_run_cost": "161.14", + "average_steps": "12.54", "percent_finished": "1.0" } }, @@ -1417,8 +1417,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -1430,15 +1430,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "tau-bench-2/retail/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1470,14 +1470,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.83, + "score": 0.78, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "1.6", - "total_run_cost": "161.14", - "average_steps": "12.54", + "average_agent_cost": "0.67", + "total_run_cost": "68.24", + "average_steps": "11.71", "percent_finished": "1.0" } }, @@ -1485,8 +1485,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -1498,15 +1498,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "tau-bench-2/retail/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1519,33 +1519,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_retail", + "benchmark": "tau-bench-2_airline", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/retail", + "evaluation_name": "tau-bench-2/airline", "source_data": { - "dataset_name": "tau-bench-2/retail", + "dataset_name": "tau-bench-2/airline", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (retail subset)", + "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.85, + "score": 0.66, "uncertainty": { - "num_samples": 100 + "num_samples": 50 }, "details": { - "average_agent_cost": "0.55", - "total_run_cost": "56.18", - "average_steps": "12.54", + "average_agent_cost": "0.47", + "total_run_cost": "24.23", + "average_steps": "10.0", "percent_finished": "1.0" } }, @@ -1553,8 +1553,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -1566,15 +1566,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "tau-bench-2/retail/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1606,14 +1606,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.78, + "score": 0.85, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.47", - "total_run_cost": "48.01", - "average_steps": "11.33", + "average_agent_cost": "0.55", + "total_run_cost": "56.18", + "average_steps": "12.54", "percent_finished": "1.0" } }, @@ -1621,8 +1621,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -1634,15 +1634,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "tau-bench-2/retail/smolagents-code__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1679,9 +1679,9 @@ "num_samples": 100 }, "details": { - "average_agent_cost": "0.67", - "total_run_cost": "68.24", - "average_steps": "11.71", + "average_agent_cost": "0.47", + "total_run_cost": "48.01", + "average_steps": "11.33", "percent_finished": "1.0" } }, @@ -1689,8 +1689,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -1702,15 +1702,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "tau-bench-2/telecom/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1747,9 +1747,9 @@ "num_samples": 100 }, "details": { - "average_agent_cost": "2.45", - "total_run_cost": "255.97", - "average_steps": "18.71", + "average_agent_cost": "0.92", + "total_run_cost": "102.01", + "average_steps": "17.22", "percent_finished": "1.0" } }, @@ -1757,8 +1757,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -1770,8 +1770,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -1846,7 +1846,7 @@ } }, { - "evaluation_id": "tau-bench-2/telecom/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/claude-code-cli__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1878,14 +1878,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.84, + "score": 0.76, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "1.25", - "total_run_cost": "136.84", - "average_steps": "17.15", + "average_agent_cost": "2.45", + "total_run_cost": "255.97", + "average_steps": "18.71", "percent_finished": "1.0" } }, @@ -1893,8 +1893,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -1906,15 +1906,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling-with-shortlisting__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1961,8 +1961,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -1974,15 +1974,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__anthropic_claude-opus-4-5/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/openai-solo__anthropic_claude-opus-4-5/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2014,14 +2014,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.76, + "score": 0.84, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.92", - "total_run_cost": "102.01", - "average_steps": "17.22", + "average_agent_cost": "1.25", + "total_run_cost": "136.84", + "average_steps": "17.15", "percent_finished": "1.0" } }, @@ -2029,8 +2029,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -2042,8 +2042,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } diff --git a/data/models/anthropic_claude-opus-4.1.json b/data/models/anthropic_claude-opus-4.1.json index e9a568bd310f72259de9371fc544155b5b7255f5..b51256b4d5a95f9160a82aa03d45fc55ec0aba09 100644 --- a/data/models/anthropic_claude-opus-4.1.json +++ b/data/models/anthropic_claude-opus-4.1.json @@ -4,13 +4,13 @@ "id": "anthropic/claude-opus-4.1", "developer": "Anthropic", "additional_details": { - "agent_name": "Claude Code", - "agent_organization": "Anthropic" + "agent_name": "OpenHands", + "agent_organization": "OpenHands" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.1/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/openhands__claude-opus-4.1/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-04", + "evaluation_timestamp": "2025-11-02", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 34.8, + "score": 36.9, "uncertainty": { "standard_error": { - "value": 2.9 + "value": 2.7 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Opus 4.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Opus 4.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__claude-opus-4.1/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__claude-opus-4.1/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-10-31", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 35.1, + "score": 38.0, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.6 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Opus 4.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Opus 4.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/openhands__claude-opus-4.1/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.1/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-02", + "evaluation_timestamp": "2025-11-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 36.9, + "score": 34.8, "uncertainty": { "standard_error": { - "value": 2.7 + "value": 2.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Opus 4.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Opus 4.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__claude-opus-4.1/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__claude-opus-4.1/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-10-31", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,17 +265,17 @@ "max_score": 100.0 }, "score_details": { - "score": 38.0, + "score": 35.1, "uncertainty": { "standard_error": { - "value": 2.6 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Opus 4.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +292,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Opus 4.1\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/anthropic_claude-opus-4.5.json b/data/models/anthropic_claude-opus-4.5.json index 67b02f7c77893feb7214e9d8713a329f346b20ae..14b1dc3ce90d80ab30d23bbb2e060321876d554a 100644 --- a/data/models/anthropic_claude-opus-4.5.json +++ b/data/models/anthropic_claude-opus-4.5.json @@ -4,13 +4,13 @@ "id": "anthropic/claude-opus-4.5", "developer": "Anthropic", "additional_details": { - "agent_name": "Terminus 2", - "agent_organization": "Terminal Bench" + "agent_name": "OpenCode", + "agent_organization": "Anomaly Innovations" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/terminus-2__claude-opus-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/opencode__claude-opus-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-22", + "evaluation_timestamp": "2026-01-12", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,11 @@ "max_score": 100.0 }, "score_details": { - "score": 57.8, - "uncertainty": { - "standard_error": { - "value": 2.5 - }, - "num_samples": 435 - } + "score": 51.7 }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenCode\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +64,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenCode\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +78,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/mux__claude-opus-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__claude-opus-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +102,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-01-17", + "evaluation_timestamp": "2025-11-22", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,11 +111,17 @@ "max_score": 100.0 }, "score_details": { - "score": 58.4 + "score": 57.8, + "uncertainty": { + "standard_error": { + "value": 2.5 + }, + "num_samples": 435 + } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -138,7 +138,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -152,7 +152,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/opencode__claude-opus-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/goose__claude-opus-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -176,7 +176,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-01-12", + "evaluation_timestamp": "2025-12-11", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -185,11 +185,17 @@ "max_score": 100.0 }, "score_details": { - "score": 51.7 + "score": 54.3, + "uncertainty": { + "standard_error": { + "value": 2.6 + }, + "num_samples": 435 + } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenCode\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -206,7 +212,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenCode\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -294,7 +300,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/goose__claude-opus-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mux__claude-opus-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -318,7 +324,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-11", + "evaluation_timestamp": "2026-01-17", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -327,17 +333,11 @@ "max_score": 100.0 }, "score_details": { - "score": 54.3, - "uncertainty": { - "standard_error": { - "value": 2.6 - }, - "num_samples": 435 - } + "score": 58.4 }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -354,7 +354,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -442,7 +442,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/letta-code__claude-opus-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -466,7 +466,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-18", + "evaluation_timestamp": "2025-12-17", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -475,17 +475,17 @@ "max_score": 100.0 }, "score_details": { - "score": 52.1, + "score": 59.1, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.4 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -502,7 +502,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -516,7 +516,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/letta-code__claude-opus-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -540,7 +540,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-17", + "evaluation_timestamp": "2025-12-18", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -549,17 +549,17 @@ "max_score": 100.0 }, "score_details": { - "score": 59.1, + "score": 52.1, "uncertainty": { "standard_error": { - "value": 2.4 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -576,7 +576,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"Claude Opus 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/anthropic_claude-opus-4.6.json b/data/models/anthropic_claude-opus-4.6.json index bc72df0490b3d006ed2917d9f7d68277e00092b2..048dae2de3da40230747c994f11a1351dfec5a14 100644 --- a/data/models/anthropic_claude-opus-4.6.json +++ b/data/models/anthropic_claude-opus-4.6.json @@ -4,13 +4,13 @@ "id": "anthropic/claude-opus-4.6", "developer": "Anthropic", "additional_details": { - "agent_name": "Mux", - "agent_organization": "Coder" + "agent_name": "Terminus-KIRA", + "agent_organization": "KRAFTON AI" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/mux__claude-opus-4.6/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-kira__claude-opus-4.6/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-13", + "evaluation_timestamp": "2026-02-22", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 66.5, + "score": 74.7, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.6 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus-KIRA\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus-KIRA\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-kira__claude-opus-4.6/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/droid__claude-opus-4.6/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-22", + "evaluation_timestamp": "2026-02-05", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 74.7, + "score": 69.9, "uncertainty": { "standard_error": { - "value": 2.6 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus-KIRA\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus-KIRA\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/crux__claude-opus-4.6/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mux__claude-opus-4.6/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-23", + "evaluation_timestamp": "2026-02-13", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,11 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 66.9 + "score": 66.5, + "uncertainty": { + "standard_error": { + "value": 2.5 + }, + "num_samples": 435 + } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -212,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -226,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/droid__claude-opus-4.6/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/crux__claude-opus-4.6/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -250,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-05", + "evaluation_timestamp": "2026-02-23", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -259,17 +265,11 @@ "max_score": 100.0 }, "score_details": { - "score": 69.9, - "uncertainty": { - "standard_error": { - "value": 2.5 - }, - "num_samples": 435 - } + "score": 66.9 }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -286,7 +286,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -300,7 +300,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/tongagents__claude-opus-4.6/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.6/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -324,7 +324,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-22", + "evaluation_timestamp": "2026-02-07", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -333,17 +333,17 @@ "max_score": 100.0 }, "score_details": { - "score": 71.9, + "score": 58.0, "uncertainty": { "standard_error": { - "value": 2.7 + "value": 2.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"TongAgents\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -360,7 +360,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"TongAgents\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -374,7 +374,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__claude-opus-4.6/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/tongagents__claude-opus-4.6/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -398,7 +398,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-06", + "evaluation_timestamp": "2026-02-22", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -407,7 +407,7 @@ "max_score": 100.0 }, "score_details": { - "score": 62.9, + "score": 71.9, "uncertainty": { "standard_error": { "value": 2.7 @@ -417,7 +417,7 @@ }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"TongAgents\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -434,7 +434,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"TongAgents\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -448,7 +448,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/claude-code__claude-opus-4.6/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__claude-opus-4.6/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -472,7 +472,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-07", + "evaluation_timestamp": "2026-02-06", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -481,17 +481,17 @@ "max_score": 100.0 }, "score_details": { - "score": 58.0, + "score": 62.9, "uncertainty": { "standard_error": { - "value": 2.9 + "value": 2.7 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -508,7 +508,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Opus 4.6\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Opus 4.6\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/anthropic_claude-sonnet-4-20250514.json b/data/models/anthropic_claude-sonnet-4-20250514.json index 17485295f0940239338e7d9ae5edbc1db77fa9eb..a43572d10d77d034b8cc0b4e9e80cb595c19f907 100644 --- a/data/models/anthropic_claude-sonnet-4-20250514.json +++ b/data/models/anthropic_claude-sonnet-4-20250514.json @@ -10,8 +10,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/anthropic_claude-sonnet-4-20250514/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/anthropic_claude-sonnet-4-20250514/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -525,8 +525,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/anthropic_claude-sonnet-4-20250514/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/anthropic_claude-sonnet-4-20250514/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/models/anthropic_claude-sonnet-4.5.json b/data/models/anthropic_claude-sonnet-4.5.json index 9a3fab7ce187c0a2db0a367672e1fd3359d83a6f..2721b6a2b6511249d3161ee660ebb342753596b5 100644 --- a/data/models/anthropic_claude-sonnet-4.5.json +++ b/data/models/anthropic_claude-sonnet-4.5.json @@ -4,13 +4,13 @@ "id": "anthropic/claude-sonnet-4.5", "developer": "Anthropic", "additional_details": { - "agent_name": "CAMEL-AI", - "agent_organization": "CAMEL-AI" + "agent_name": "Goose", + "agent_organization": "Block" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/camel-ai__claude-sonnet-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/goose__claude-sonnet-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-24", + "evaluation_timestamp": "2025-12-11", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 46.5, + "score": 43.1, "uncertainty": { "standard_error": { - "value": 2.4 + "value": 2.6 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"CAMEL-AI\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"CAMEL-AI\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/openhands__claude-sonnet-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__claude-sonnet-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-02", + "evaluation_timestamp": "2025-10-31", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,7 +117,7 @@ "max_score": 100.0 }, "score_details": { - "score": 42.6, + "score": 42.8, "uncertainty": { "standard_error": { "value": 2.8 @@ -127,7 +127,7 @@ }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__claude-sonnet-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/claude-code__claude-sonnet-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-10-31", + "evaluation_timestamp": "2025-11-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 42.8, + "score": 40.1, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__claude-sonnet-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/maya__claude-sonnet-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2026-01-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,17 +265,11 @@ "max_score": 100.0 }, "score_details": { - "score": 42.5, - "uncertainty": { - "standard_error": { - "value": 2.8 - }, - "num_samples": 435 - } + "score": 42.7 }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"MAYA\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +286,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"MAYA\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -306,7 +300,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/claude-code__claude-sonnet-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__claude-sonnet-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -330,7 +324,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-04", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -339,17 +333,17 @@ "max_score": 100.0 }, "score_details": { - "score": 40.1, + "score": 42.5, "uncertainty": { "standard_error": { - "value": 2.9 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -366,7 +360,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Claude Code\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -380,7 +374,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/maya__claude-sonnet-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/camel-ai__claude-sonnet-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -404,7 +398,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-01-04", + "evaluation_timestamp": "2025-12-24", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -413,11 +407,17 @@ "max_score": 100.0 }, "score_details": { - "score": 42.7 + "score": 46.5, + "uncertainty": { + "standard_error": { + "value": 2.4 + }, + "num_samples": 435 + } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"MAYA\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"CAMEL-AI\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -434,7 +434,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"MAYA\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"CAMEL-AI\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -448,7 +448,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/goose__claude-sonnet-4.5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/openhands__claude-sonnet-4.5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -472,7 +472,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-11", + "evaluation_timestamp": "2025-11-02", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -481,17 +481,17 @@ "max_score": 100.0 }, "score_details": { - "score": 43.1, + "score": 42.6, "uncertainty": { "standard_error": { - "value": 2.6 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -508,7 +508,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Goose\" -m \"Claude Sonnet 4.5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Claude Sonnet 4.5\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/anthropic_opus_4.5.json b/data/models/anthropic_opus_4.5.json index 0952f13ca0e6f15c8598404fd4db3674326651d7..04269bfd22dff1677abd1f4b2e4e3d614c500f72 100644 --- a/data/models/anthropic_opus_4.5.json +++ b/data/models/anthropic_opus_4.5.json @@ -6,76 +6,6 @@ "inference_platform": "unknown" }, "evaluations": [ - { - "evaluation_id": "ace/anthropic_opus-4.5/1773260200", - "retrieved_timestamp": "1773260200", - "source_metadata": { - "source_name": "Mercor ACE Leaderboard", - "source_type": "evaluation_run", - "source_organization_name": "Mercor", - "source_organization_url": "https://www.mercor.com", - "evaluator_relationship": "first_party" - }, - "eval_library": { - "name": "archipelago", - "version": "1.0.0" - }, - "benchmark": "ace", - "evaluation_results": [ - { - "evaluation_name": "Overall Score", - "source_data": { - "dataset_name": "ace", - "source_type": "hf_dataset", - "hf_repo": "Mercor/ACE" - }, - "metric_config": { - "evaluation_description": "Overall ACE score (paper snapshot).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.478 - }, - "generation_config": { - "additional_details": { - "run_setting": "On" - } - } - }, - { - "evaluation_name": "Gaming Score", - "source_data": { - "dataset_name": "ace", - "source_type": "hf_dataset", - "hf_repo": "Mercor/ACE" - }, - "metric_config": { - "evaluation_description": "Gaming domain score.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.391 - }, - "generation_config": { - "additional_details": { - "run_setting": "On" - } - } - } - ], - "detailed_evaluation_results": null, - "generation_config": { - "additional_details": { - "run_setting": "On" - } - } - }, { "evaluation_id": "apex-agents/anthropic_opus-4.5/1773260200", "retrieved_timestamp": "1773260200", @@ -275,6 +205,76 @@ } } }, + { + "evaluation_id": "ace/anthropic_opus-4.5/1773260200", + "retrieved_timestamp": "1773260200", + "source_metadata": { + "source_name": "Mercor ACE Leaderboard", + "source_type": "evaluation_run", + "source_organization_name": "Mercor", + "source_organization_url": "https://www.mercor.com", + "evaluator_relationship": "first_party" + }, + "eval_library": { + "name": "archipelago", + "version": "1.0.0" + }, + "benchmark": "ace", + "evaluation_results": [ + { + "evaluation_name": "Overall Score", + "source_data": { + "dataset_name": "ace", + "source_type": "hf_dataset", + "hf_repo": "Mercor/ACE" + }, + "metric_config": { + "evaluation_description": "Overall ACE score (paper snapshot).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0, + "max_score": 1 + }, + "score_details": { + "score": 0.478 + }, + "generation_config": { + "additional_details": { + "run_setting": "On" + } + } + }, + { + "evaluation_name": "Gaming Score", + "source_data": { + "dataset_name": "ace", + "source_type": "hf_dataset", + "hf_repo": "Mercor/ACE" + }, + "metric_config": { + "evaluation_description": "Gaming domain score.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0, + "max_score": 1 + }, + "score_details": { + "score": 0.391 + }, + "generation_config": { + "additional_details": { + "run_setting": "On" + } + } + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": { + "run_setting": "On" + } + } + }, { "evaluation_id": "apex-v1/anthropic_opus-4.5/1773260200", "retrieved_timestamp": "1773260200", diff --git a/data/models/cognitivecomputations_dolphin-2.9.2-phi-3-medium-abliterated.json b/data/models/cognitivecomputations_dolphin-2.9.2-phi-3-medium-abliterated.json index 9b7cc35f188c98dba3f0427de09e77d4b26cfc40..62d09c3db567da9932caf7141d7a1389c1aafea9 100644 --- a/data/models/cognitivecomputations_dolphin-2.9.2-phi-3-medium-abliterated.json +++ b/data/models/cognitivecomputations_dolphin-2.9.2-phi-3-medium-abliterated.json @@ -5,7 +5,7 @@ "developer": "cognitivecomputations", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "MistralForCausalLM", "params_billions": "13.96" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3613 + "score": 0.4124 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6123 + "score": 0.6383 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1239 + "score": 0.182 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.328 + "score": 0.3289 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4112 + "score": 0.4349 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4494 + "score": 0.4525 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4124 + "score": 0.3613 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6383 + "score": 0.6123 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.182 + "score": 0.1239 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3289 + "score": 0.328 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4349 + "score": 0.4112 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4525 + "score": 0.4494 } } ], diff --git a/data/models/columbia-nlp_lion-gemma-2b-dpo-v1.0.json b/data/models/columbia-nlp_lion-gemma-2b-dpo-v1.0.json index fcba7390760a9757eb868fa82619eef91d551f86..e1500a121f56ad89301900472e223d7937764cb0 100644 --- a/data/models/columbia-nlp_lion-gemma-2b-dpo-v1.0.json +++ b/data/models/columbia-nlp_lion-gemma-2b-dpo-v1.0.json @@ -5,7 +5,7 @@ "developer": "Columbia-NLP", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "GemmaForCausalLM", "params_billions": "2.506" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3102 + "score": 0.3278 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3881 + "score": 0.392 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0536 + "score": 0.0431 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2534 + "score": 0.2492 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4081 + "score": 0.412 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1665 + "score": 0.1666 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3278 + "score": 0.3102 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.392 + "score": 0.3881 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0431 + "score": 0.0536 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2492 + "score": 0.2534 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.412 + "score": 0.4081 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1666 + "score": 0.1665 } } ], diff --git a/data/models/cpayne1303_llama-43m-beta.json b/data/models/cpayne1303_llama-43m-beta.json index 0f0d3430f35b22a1aef8ac071591532d594d6d4d..8e62d1be9edc1ccaeda7702d1a793fd5db4d4150 100644 --- a/data/models/cpayne1303_llama-43m-beta.json +++ b/data/models/cpayne1303_llama-43m-beta.json @@ -5,7 +5,7 @@ "developer": "cpayne1303", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": "0.043" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1949 + "score": 0.1916 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2965 + "score": 0.2977 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0045 + "score": 0.0 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3885 + "score": 0.3872 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1111 + "score": 0.1132 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1916 + "score": 0.1949 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2977 + "score": 0.2965 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.0045 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3872 + "score": 0.3885 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1132 + "score": 0.1111 } } ], diff --git a/data/models/daemontatox_pathfinderai.json b/data/models/daemontatox_pathfinderai.json index 7a5f7d25c7278e2df08548a48abfe0b0ee9b4f2a..8b13e2aabf79ff36b82f8d8bd4c0bcdf2b41e385 100644 --- a/data/models/daemontatox_pathfinderai.json +++ b/data/models/daemontatox_pathfinderai.json @@ -5,7 +5,7 @@ "developer": "Daemontatox", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "Qwen2ForCausalLM", "params_billions": "32.764" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4855 + "score": 0.3745 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6627 + "score": 0.6668 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4841 + "score": 0.4758 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3096 + "score": 0.3943 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4256 + "score": 0.4858 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5542 + "score": 0.5593 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3745 + "score": 0.4855 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6668 + "score": 0.6627 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4758 + "score": 0.4841 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3943 + "score": 0.3096 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4858 + "score": 0.4256 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5593 + "score": 0.5542 } } ], diff --git a/data/models/deepmount00_llama-3.1-8b-ita.json b/data/models/deepmount00_llama-3.1-8b-ita.json index be94466036e753700b8de72e15b468fc6edebdee..1fef7ca0b3d471692e7379016093b45312ecc7df 100644 --- a/data/models/deepmount00_llama-3.1-8b-ita.json +++ b/data/models/deepmount00_llama-3.1-8b-ita.json @@ -6,8 +6,8 @@ "inference_platform": "unknown", "additional_details": { "precision": "bfloat16", - "architecture": "Unknown", - "params_billions": "0.0", + "architecture": "LlamaForCausalLM", + "params_billions": "8.03", "model_id_aliases": [ "DeepMount00/Llama-3.1-8b-Ita" ] @@ -15,7 +15,7 @@ }, "evaluations": [ { - "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3.1-8b-Ita/1773936498.240187", + "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3.1-8b-ITA/1773936498.240187", "retrieved_timestamp": "1773936498.240187", "source_metadata": { "source_name": "HF Open LLM v2", @@ -47,7 +47,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5365 + "score": 0.7917 } }, { @@ -65,7 +65,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.517 + "score": 0.5109 } }, { @@ -83,7 +83,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1707 + "score": 0.1088 } }, { @@ -101,7 +101,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3062 + "score": 0.2878 } }, { @@ -119,7 +119,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4487 + "score": 0.4136 } }, { @@ -137,7 +137,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.396 + "score": 0.3876 } } ], @@ -145,7 +145,7 @@ "generation_config": null }, { - "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3.1-8b-ITA/1773936498.240187", + "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3.1-8b-Ita/1773936498.240187", "retrieved_timestamp": "1773936498.240187", "source_metadata": { "source_name": "HF Open LLM v2", @@ -177,7 +177,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7917 + "score": 0.5365 } }, { @@ -195,7 +195,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5109 + "score": 0.517 } }, { @@ -213,7 +213,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1088 + "score": 0.1707 } }, { @@ -231,7 +231,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2878 + "score": 0.3062 } }, { @@ -249,7 +249,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4136 + "score": 0.4487 } }, { @@ -267,7 +267,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3876 + "score": 0.396 } } ], diff --git a/data/models/dfurman_llama-3-8b-orpo-v0.1.json b/data/models/dfurman_llama-3-8b-orpo-v0.1.json index 987b3112f03b34626e0421787b1c9cb2e2b55a46..a2dcf54fc8dfd61d5182710e882025f5ed46cdab 100644 --- a/data/models/dfurman_llama-3-8b-orpo-v0.1.json +++ b/data/models/dfurman_llama-3-8b-orpo-v0.1.json @@ -5,8 +5,8 @@ "developer": "dfurman", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", + "precision": "float16", + "architecture": "?", "params_billions": "8.03" } }, @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3 + "score": 0.2835 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3853 + "score": 0.3842 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0415 + "score": 0.0521 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2617 + "score": 0.2609 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3579 + "score": 0.3566 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2281 + "score": 0.2298 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2835 + "score": 0.3 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3842 + "score": 0.3853 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0521 + "score": 0.0415 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2609 + "score": 0.2617 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3566 + "score": 0.3579 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2298 + "score": 0.2281 } } ], diff --git a/data/models/doppelreflex_mn-12b-lilithframe.json b/data/models/doppelreflex_mn-12b-lilithframe.json index 2532c65fe82e1d06f8d68b438dbffc94e3e4e4bb..720fbe0846130d5605f8d9f0e7d4c1731ec8e47c 100644 --- a/data/models/doppelreflex_mn-12b-lilithframe.json +++ b/data/models/doppelreflex_mn-12b-lilithframe.json @@ -5,7 +5,7 @@ "developer": "DoppelReflEx", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "MistralForCausalLM", "params_billions": "12.248" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.436 + "score": 0.451 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4956 + "score": 0.4944 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0589 + "score": 0.1156 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3205 + "score": 0.3196 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3843 + "score": 0.3896 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3237 + "score": 0.3256 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.451 + "score": 0.436 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4944 + "score": 0.4956 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1156 + "score": 0.0589 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3196 + "score": 0.3205 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3896 + "score": 0.3843 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3256 + "score": 0.3237 } } ], diff --git a/data/models/google_gemini-2.5-flash.json b/data/models/google_gemini-2.5-flash.json index fed61d14fcef2477959a8fb49c1907eead45e035..a2e65a21393c95ddfc5f4fe71d47a11fac033db8 100644 --- a/data/models/google_gemini-2.5-flash.json +++ b/data/models/google_gemini-2.5-flash.json @@ -1269,7 +1269,7 @@ "generation_config": null }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-2.5-flash/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gemini-2.5-flash/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -1293,7 +1293,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-10-31", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -1302,17 +1302,17 @@ "max_score": 100.0 }, "score_details": { - "score": 16.9, + "score": 17.1, "uncertainty": { "standard_error": { - "value": 2.4 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 2.5 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Gemini 2.5 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1329,7 +1329,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 2.5 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Gemini 2.5 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1343,7 +1343,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-2.5-flash/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/openhands__gemini-2.5-flash/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -1367,7 +1367,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-04", + "evaluation_timestamp": "2025-11-02", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -1376,17 +1376,17 @@ "max_score": 100.0 }, "score_details": { - "score": 15.4, + "score": 16.4, "uncertainty": { "standard_error": { - "value": 2.3 + "value": 2.4 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Gemini 2.5 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1403,7 +1403,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Gemini 2.5 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1417,7 +1417,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/openhands__gemini-2.5-flash/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-2.5-flash/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -1441,7 +1441,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-02", + "evaluation_timestamp": "2025-11-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -1450,17 +1450,17 @@ "max_score": 100.0 }, "score_details": { - "score": 16.4, + "score": 15.4, "uncertainty": { "standard_error": { - "value": 2.4 + "value": 2.3 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Gemini 2.5 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1477,7 +1477,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Gemini 2.5 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1491,7 +1491,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gemini-2.5-flash/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-2.5-flash/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -1515,7 +1515,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-10-31", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -1524,17 +1524,17 @@ "max_score": 100.0 }, "score_details": { - "score": 17.1, + "score": 16.9, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.4 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Gemini 2.5 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 2.5 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1551,7 +1551,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Gemini 2.5 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 2.5 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/google_gemini-2.5-pro.json b/data/models/google_gemini-2.5-pro.json index 9276c148869675ec0a23b9d18706b87bdb718f59..64bf8e5b0bbd8066e28305bcfb70c0755706abd7 100644 --- a/data/models/google_gemini-2.5-pro.json +++ b/data/models/google_gemini-2.5-pro.json @@ -1343,7 +1343,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-2.5-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-2.5-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -1367,7 +1367,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-10-31", + "evaluation_timestamp": "2025-11-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -1376,17 +1376,17 @@ "max_score": 100.0 }, "score_details": { - "score": 32.6, + "score": 19.6, "uncertainty": { "standard_error": { - "value": 3.0 + "value": 2.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 2.5 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1403,7 +1403,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 2.5 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1417,7 +1417,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-2.5-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-2.5-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -1441,7 +1441,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-04", + "evaluation_timestamp": "2025-10-31", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -1450,17 +1450,17 @@ "max_score": 100.0 }, "score_details": { - "score": 19.6, + "score": 32.6, "uncertainty": { "standard_error": { - "value": 2.9 + "value": 3.0 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 2.5 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -1477,7 +1477,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 2.5 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 2.5 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/google_gemini-3-flash.json b/data/models/google_gemini-3-flash.json index 4ea354a43eede049751e0c22c3ef39159edfde83..96d81804f9403e5af725c031c3f055444b4c7604 100644 --- a/data/models/google_gemini-3-flash.json +++ b/data/models/google_gemini-3-flash.json @@ -4,13 +4,13 @@ "id": "google/gemini-3-flash", "developer": "Google", "additional_details": { - "agent_name": "Junie CLI", - "agent_organization": "JetBrains" + "agent_name": "Gemini CLI", + "agent_organization": "Google" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/junie-cli__gemini-3-flash/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-3-flash/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 64.3, + "score": 51.0, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 3.0 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Gemini 3 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 3 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Gemini 3 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 3 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/gemini-cli__gemini-3-flash/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-3-flash/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-03-06", + "evaluation_timestamp": "2026-01-07", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 47.4, + "score": 51.7, "uncertainty": { "standard_error": { - "value": 3.0 + "value": 3.1 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 3 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Gemini CLI\" -m \"Gemini 3 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-3-flash/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/junie-cli__gemini-3-flash/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-01-07", + "evaluation_timestamp": "2025-12-23", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 51.7, + "score": 64.3, "uncertainty": { "standard_error": { - "value": 3.1 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Gemini 3 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Flash\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Gemini 3 Flash\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-23", + "evaluation_timestamp": "2026-03-06", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,7 +265,7 @@ "max_score": 100.0 }, "score_details": { - "score": 51.0, + "score": 47.4, "uncertainty": { "standard_error": { "value": 3.0 diff --git a/data/models/google_gemini-3-pro-preview.json b/data/models/google_gemini-3-pro-preview.json index f42cfe1c80eb9fdb58bb474a8d6a4066dfd5628d..7edffa00c381bc6c2a68a0a203fbe30120c52254 100644 --- a/data/models/google_gemini-3-pro-preview.json +++ b/data/models/google_gemini-3-pro-preview.json @@ -4,13 +4,13 @@ "id": "google/gemini-3-pro-preview", "developer": "Google", "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } }, "evaluations": [ { - "evaluation_id": "appworld/test_normal/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "appworld/test_normal/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -42,23 +42,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.55, + "score": 0.582, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "1.3", - "total_run_cost": "130.49", - "average_steps": "22.59", - "percent_finished": "1.0" + "average_agent_cost": "8.7", + "total_run_cost": "869.55", + "average_steps": "33.49", + "percent_finished": "0.98" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -70,15 +70,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "appworld/test_normal/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "browsecompplus/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -91,42 +91,42 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "appworld_test_normal", + "benchmark": "browsecompplus", "evaluation_results": [ { - "evaluation_name": "appworld/test_normal", + "evaluation_name": "browsecompplus", "source_data": { - "dataset_name": "appworld/test_normal", + "dataset_name": "browsecompplus", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "AppWorld benchmark evaluation (test_normal subset)", + "evaluation_description": "BrowseCompPlus benchmark evaluation", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.505, + "score": 0.51, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "1.88", - "total_run_cost": "188.19", - "average_steps": "21.76", - "percent_finished": "0.99" + "average_agent_cost": "2.85", + "total_run_cost": "284.68", + "average_steps": "22.88", + "percent_finished": "0.7" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -138,15 +138,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "browsecompplus/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "appworld/test_normal/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -159,42 +159,42 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "browsecompplus", + "benchmark": "appworld_test_normal", "evaluation_results": [ { - "evaluation_name": "browsecompplus", + "evaluation_name": "appworld/test_normal", "source_data": { - "dataset_name": "browsecompplus", + "dataset_name": "appworld/test_normal", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "BrowseCompPlus benchmark evaluation", + "evaluation_description": "AppWorld benchmark evaluation (test_normal subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.57, + "score": 0.505, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "2.39", - "total_run_cost": "239.0", - "average_steps": "29.63", - "percent_finished": "0.69" + "average_agent_cost": "1.88", + "total_run_cost": "188.19", + "average_steps": "21.76", + "percent_finished": "0.99" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -206,15 +206,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "appworld/test_normal/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "appworld/test_normal/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -246,23 +246,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.582, + "score": 0.36, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "8.7", - "total_run_cost": "869.55", - "average_steps": "33.49", - "percent_finished": "0.98" + "average_agent_cost": "3.11", + "total_run_cost": "310.55", + "average_steps": "38.01", + "percent_finished": "0.86" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -274,15 +274,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "appworld/test_normal/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "appworld/test_normal/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -314,23 +314,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.36, + "score": 0.13, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "3.11", - "total_run_cost": "310.55", - "average_steps": "38.01", - "percent_finished": "0.86" + "average_agent_cost": "2.54", + "total_run_cost": "254.25", + "average_steps": "49.13", + "percent_finished": "0.71" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -342,15 +342,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "appworld/test_normal/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "appworld/test_normal/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -382,23 +382,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.13, + "score": 0.55, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "2.54", - "total_run_cost": "254.25", - "average_steps": "49.13", - "percent_finished": "0.71" + "average_agent_cost": "1.3", + "total_run_cost": "130.49", + "average_steps": "22.59", + "percent_finished": "1.0" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -410,15 +410,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "browsecompplus/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "browsecompplus/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -450,23 +450,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.51, + "score": 0.57, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "2.85", - "total_run_cost": "284.68", - "average_steps": "22.88", - "percent_finished": "0.7" + "average_agent_cost": "2.39", + "total_run_cost": "239.0", + "average_steps": "29.63", + "percent_finished": "0.69" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -478,15 +478,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "browsecompplus/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "browsecompplus/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -518,23 +518,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3333, + "score": 0.48, "uncertainty": { - "num_samples": 99 + "num_samples": 100 }, "details": { - "average_agent_cost": "0.64", - "total_run_cost": "63.79", - "average_steps": "8.45", - "percent_finished": "0.6061" + "average_agent_cost": "0.44", + "total_run_cost": "44.18", + "average_steps": "7.85", + "percent_finished": "0.99" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -546,8 +546,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -622,7 +622,7 @@ } }, { - "evaluation_id": "browsecompplus/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "browsecompplus/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -654,23 +654,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.48, + "score": 0.3333, "uncertainty": { - "num_samples": 100 + "num_samples": 99 }, "details": { - "average_agent_cost": "0.44", - "total_run_cost": "44.18", - "average_steps": "7.85", - "percent_finished": "0.99" + "average_agent_cost": "0.64", + "total_run_cost": "63.79", + "average_steps": "8.45", + "percent_finished": "0.6061" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -682,8 +682,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -1720,7 +1720,7 @@ "generation_config": null }, { - "evaluation_id": "swe-bench/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "swe-bench/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1767,8 +1767,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -1780,15 +1780,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "swe-bench/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "swe-bench/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1820,14 +1820,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.67, + "score": 0.7234, "uncertainty": { - "num_samples": 100 + "num_samples": 94 }, "details": { - "average_agent_cost": "3.68", - "total_run_cost": "367.97", - "average_steps": "43.72", + "average_agent_cost": "1.58", + "total_run_cost": "148.44", + "average_steps": "32.36", "percent_finished": "1.0" } }, @@ -1835,8 +1835,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -1848,8 +1848,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -1924,7 +1924,7 @@ } }, { - "evaluation_id": "swe-bench/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "swe-bench/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1956,14 +1956,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.71, + "score": 0.67, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.7", - "total_run_cost": "69.56", - "average_steps": "32.55", + "average_agent_cost": "3.68", + "total_run_cost": "367.97", + "average_steps": "43.72", "percent_finished": "1.0" } }, @@ -1971,8 +1971,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -1984,15 +1984,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "swe-bench/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "swe-bench/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2024,14 +2024,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7234, + "score": 0.71, "uncertainty": { - "num_samples": 94 + "num_samples": 100 }, "details": { - "average_agent_cost": "1.58", - "total_run_cost": "148.44", - "average_steps": "32.36", + "average_agent_cost": "0.7", + "total_run_cost": "69.56", + "average_steps": "32.55", "percent_finished": "1.0" } }, @@ -2039,8 +2039,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -2052,8 +2052,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -2127,74 +2127,6 @@ } } }, - { - "evaluation_id": "tau-bench-2/airline/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", - "retrieved_timestamp": "1774263615.0201504", - "source_metadata": { - "source_name": "Exgentic Open Agent Leaderboard", - "source_type": "evaluation_run", - "source_organization_name": "Exgentic", - "source_organization_url": "https://github.com/Exgentic", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "exgentic", - "version": "0.1.0" - }, - "benchmark": "tau-bench-2_airline", - "evaluation_results": [ - { - "evaluation_name": "tau-bench-2/airline", - "source_data": { - "dataset_name": "tau-bench-2/airline", - "source_type": "url", - "url": [ - "https://github.com/Exgentic/exgentic" - ] - }, - "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "uncertainty": { - "num_samples": 50 - }, - "details": { - "average_agent_cost": "0.16", - "total_run_cost": "8.48", - "average_steps": "10.14", - "percent_finished": "1.0" - } - }, - "generation_config": { - "generation_args": { - "agentic_eval_config": { - "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" - } - } - } - } - } - ], - "detailed_evaluation_results": null, - "generation_config": { - "generation_args": { - "agentic_eval_config": { - "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" - } - } - } - } - }, { "evaluation_id": "tau-bench-2/airline/openai-solo__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", @@ -2264,7 +2196,7 @@ } }, { - "evaluation_id": "tau-bench-2/airline/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2296,14 +2228,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.68, + "score": 0.7, "uncertainty": { "num_samples": 50 }, "details": { - "average_agent_cost": "0.2", - "total_run_cost": "10.29", - "average_steps": "12.28", + "average_agent_cost": "0.16", + "total_run_cost": "8.48", + "average_steps": "10.14", "percent_finished": "1.0" } }, @@ -2311,8 +2243,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -2324,15 +2256,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "tau-bench-2/airline/litellm-tool-calling-with-shortlisting__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2379,8 +2311,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -2392,15 +2324,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "tau-bench-2/retail/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2432,14 +2364,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7805, + "score": 0.7576, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.19", - "total_run_cost": "19.38", - "average_steps": "11.18", + "average_agent_cost": "0.21", + "total_run_cost": "21.43", + "average_steps": "11.3", "percent_finished": "1.0" } }, @@ -2447,8 +2379,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -2460,8 +2392,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -2536,7 +2468,7 @@ } }, { - "evaluation_id": "tau-bench-2/retail/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/claude-code-cli__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2568,14 +2500,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7576, + "score": 0.7805, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.21", - "total_run_cost": "21.43", - "average_steps": "11.3", + "average_agent_cost": "0.19", + "total_run_cost": "19.38", + "average_steps": "11.18", "percent_finished": "1.0" } }, @@ -2583,8 +2515,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -2596,8 +2528,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -2740,7 +2672,7 @@ } }, { - "evaluation_id": "tau-bench-2/telecom/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2753,33 +2685,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_telecom", + "benchmark": "tau-bench-2_airline", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/telecom", + "evaluation_name": "tau-bench-2/airline", "source_data": { - "dataset_name": "tau-bench-2/telecom", + "dataset_name": "tau-bench-2/airline", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (telecom subset)", + "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.88, + "score": 0.68, "uncertainty": { - "num_samples": 100 + "num_samples": 50 }, "details": { - "average_agent_cost": "0.35", - "total_run_cost": "40.25", - "average_steps": "12.71", + "average_agent_cost": "0.2", + "total_run_cost": "10.29", + "average_steps": "12.28", "percent_finished": "1.0" } }, @@ -2876,7 +2808,7 @@ } }, { - "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/smolagents-code__google_gemini-3-pro-preview/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2908,14 +2840,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.73, + "score": 0.88, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.3", - "total_run_cost": "36.75", - "average_steps": "14.84", + "average_agent_cost": "0.35", + "total_run_cost": "40.25", + "average_steps": "12.71", "percent_finished": "1.0" } }, @@ -2923,8 +2855,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -2936,8 +2868,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -3078,6 +3010,74 @@ } } } + }, + { + "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__google_gemini-3-pro-preview/1774263615.0201504", + "retrieved_timestamp": "1774263615.0201504", + "source_metadata": { + "source_name": "Exgentic Open Agent Leaderboard", + "source_type": "evaluation_run", + "source_organization_name": "Exgentic", + "source_organization_url": "https://github.com/Exgentic", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "exgentic", + "version": "0.1.0" + }, + "benchmark": "tau-bench-2_telecom", + "evaluation_results": [ + { + "evaluation_name": "tau-bench-2/telecom", + "source_data": { + "dataset_name": "tau-bench-2/telecom", + "source_type": "url", + "url": [ + "https://github.com/Exgentic/exgentic" + ] + }, + "metric_config": { + "evaluation_description": "Tau Bench 2 benchmark evaluation (telecom subset)", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.73, + "uncertainty": { + "num_samples": 100 + }, + "details": { + "average_agent_cost": "0.3", + "total_run_cost": "36.75", + "average_steps": "14.84", + "percent_finished": "1.0" + } + }, + "generation_config": { + "generation_args": { + "agentic_eval_config": { + "additional_details": { + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" + } + } + } + } + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "generation_args": { + "agentic_eval_config": { + "additional_details": { + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" + } + } + } + } } ] } \ No newline at end of file diff --git a/data/models/google_gemini-3-pro.json b/data/models/google_gemini-3-pro.json index 459b730e51fcbb9b4826bc74bd506760f8199202..01e84795b0cee23dd2bc66c9e2c84da542f9aae9 100644 --- a/data/models/google_gemini-3-pro.json +++ b/data/models/google_gemini-3-pro.json @@ -4,13 +4,13 @@ "id": "google/gemini-3-pro", "developer": "Google", "additional_details": { - "agent_name": "Droid", - "agent_organization": "Factory" + "agent_name": "SageAgent", + "agent_organization": "OpenSage" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/droid__gemini-3-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/sageagent__gemini-3-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-24", + "evaluation_timestamp": "2026-02-23", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 61.1, + "score": 65.2, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.1 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"SageAgent\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"SageAgent\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/sageagent__gemini-3-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-3-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-23", + "evaluation_timestamp": "2025-11-21", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 65.2, + "score": 56.9, "uncertainty": { "standard_error": { - "value": 2.1 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"SageAgent\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"SageAgent\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/ante__gemini-3-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/ii-agent__gemini-3-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-01-06", + "evaluation_timestamp": "2025-12-23", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,17 +265,17 @@ "max_score": 100.0 }, "score_details": { - "score": 69.4, + "score": 61.8, "uncertainty": { "standard_error": { - "value": 2.1 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Ante\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"II-Agent\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +292,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Ante\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"II-Agent\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -306,7 +306,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/ii-agent__gemini-3-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/ante__gemini-3-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -330,7 +330,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-23", + "evaluation_timestamp": "2026-01-06", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -339,17 +339,17 @@ "max_score": 100.0 }, "score_details": { - "score": 61.8, + "score": 69.4, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.1 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"II-Agent\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Ante\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -366,7 +366,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"II-Agent\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Ante\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -380,7 +380,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gemini-3-pro/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/droid__gemini-3-pro/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -404,7 +404,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-21", + "evaluation_timestamp": "2025-12-24", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -413,17 +413,17 @@ "max_score": 100.0 }, "score_details": { - "score": 56.9, + "score": 61.1, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -440,7 +440,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Gemini 3 Pro\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"Gemini 3 Pro\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/google_gemini_3_flash.json b/data/models/google_gemini_3_flash.json index 6beb736773081d366f65edf05d1bf54bd7e07ce2..3a911939d887cdc9433884fdde5ee11c97c277a4 100644 --- a/data/models/google_gemini_3_flash.json +++ b/data/models/google_gemini_3_flash.json @@ -6,6 +6,53 @@ "inference_platform": "unknown" }, "evaluations": [ + { + "evaluation_id": "ace/google_gemini-3-flash/1773260200", + "retrieved_timestamp": "1773260200", + "source_metadata": { + "source_name": "Mercor ACE Leaderboard", + "source_type": "evaluation_run", + "source_organization_name": "Mercor", + "source_organization_url": "https://www.mercor.com", + "evaluator_relationship": "first_party" + }, + "eval_library": { + "name": "archipelago", + "version": "1.0.0" + }, + "benchmark": "ace", + "evaluation_results": [ + { + "evaluation_name": "Gaming Score", + "source_data": { + "dataset_name": "ace", + "source_type": "hf_dataset", + "hf_repo": "Mercor/ACE" + }, + "metric_config": { + "evaluation_description": "Gaming domain score.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0, + "max_score": 1 + }, + "score_details": { + "score": 0.415 + }, + "generation_config": { + "additional_details": { + "run_setting": "High" + } + } + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": { + "run_setting": "High" + } + } + }, { "evaluation_id": "apex-agents/google_gemini-3-flash/1773260200", "retrieved_timestamp": "1773260200", @@ -205,53 +252,6 @@ } } }, - { - "evaluation_id": "ace/google_gemini-3-flash/1773260200", - "retrieved_timestamp": "1773260200", - "source_metadata": { - "source_name": "Mercor ACE Leaderboard", - "source_type": "evaluation_run", - "source_organization_name": "Mercor", - "source_organization_url": "https://www.mercor.com", - "evaluator_relationship": "first_party" - }, - "eval_library": { - "name": "archipelago", - "version": "1.0.0" - }, - "benchmark": "ace", - "evaluation_results": [ - { - "evaluation_name": "Gaming Score", - "source_data": { - "dataset_name": "ace", - "source_type": "hf_dataset", - "hf_repo": "Mercor/ACE" - }, - "metric_config": { - "evaluation_description": "Gaming domain score.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.415 - }, - "generation_config": { - "additional_details": { - "run_setting": "High" - } - } - } - ], - "detailed_evaluation_results": null, - "generation_config": { - "additional_details": { - "run_setting": "High" - } - } - }, { "evaluation_id": "apex-v1/google_gemini-3-flash/1773260200", "retrieved_timestamp": "1773260200", diff --git a/data/models/google_gemini_3_pro.json b/data/models/google_gemini_3_pro.json index 104ce8e17e340df7c6fe6dc4bc1f7866f9c1cc71..8f4f634348fdc10ff492053d2a6241085b80ddd7 100644 --- a/data/models/google_gemini_3_pro.json +++ b/data/models/google_gemini_3_pro.json @@ -6,6 +6,78 @@ "inference_platform": "unknown" }, "evaluations": [ + { + "evaluation_id": "ace/google_gemini-3-pro/1773260200", + "retrieved_timestamp": "1773260200", + "source_metadata": { + "source_name": "Mercor ACE Leaderboard", + "source_type": "evaluation_run", + "source_organization_name": "Mercor", + "source_organization_url": "https://www.mercor.com", + "evaluator_relationship": "first_party" + }, + "eval_library": { + "name": "archipelago", + "version": "1.0.0" + }, + "benchmark": "ace", + "evaluation_results": [ + { + "evaluation_name": "Overall Score", + "source_data": { + "dataset_name": "ace", + "source_type": "hf_dataset", + "hf_repo": "Mercor/ACE" + }, + "metric_config": { + "evaluation_description": "Overall ACE score (paper snapshot, approximate).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0, + "max_score": 1 + }, + "score_details": { + "score": 0.47 + }, + "generation_config": { + "additional_details": { + "run_setting": "High", + "value_quality": "approximate" + } + } + }, + { + "evaluation_name": "Gaming Score", + "source_data": { + "dataset_name": "ace", + "source_type": "hf_dataset", + "hf_repo": "Mercor/ACE" + }, + "metric_config": { + "evaluation_description": "Gaming domain score.", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0, + "max_score": 1 + }, + "score_details": { + "score": 0.509 + }, + "generation_config": { + "additional_details": { + "run_setting": "High" + } + } + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": { + "run_setting": "High", + "value_quality": "approximate" + } + } + }, { "evaluation_id": "apex-agents/google_gemini-3-pro/1773260200", "retrieved_timestamp": "1773260200", @@ -205,78 +277,6 @@ } } }, - { - "evaluation_id": "ace/google_gemini-3-pro/1773260200", - "retrieved_timestamp": "1773260200", - "source_metadata": { - "source_name": "Mercor ACE Leaderboard", - "source_type": "evaluation_run", - "source_organization_name": "Mercor", - "source_organization_url": "https://www.mercor.com", - "evaluator_relationship": "first_party" - }, - "eval_library": { - "name": "archipelago", - "version": "1.0.0" - }, - "benchmark": "ace", - "evaluation_results": [ - { - "evaluation_name": "Overall Score", - "source_data": { - "dataset_name": "ace", - "source_type": "hf_dataset", - "hf_repo": "Mercor/ACE" - }, - "metric_config": { - "evaluation_description": "Overall ACE score (paper snapshot, approximate).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.47 - }, - "generation_config": { - "additional_details": { - "run_setting": "High", - "value_quality": "approximate" - } - } - }, - { - "evaluation_name": "Gaming Score", - "source_data": { - "dataset_name": "ace", - "source_type": "hf_dataset", - "hf_repo": "Mercor/ACE" - }, - "metric_config": { - "evaluation_description": "Gaming domain score.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.509 - }, - "generation_config": { - "additional_details": { - "run_setting": "High" - } - } - } - ], - "detailed_evaluation_results": null, - "generation_config": { - "additional_details": { - "run_setting": "High", - "value_quality": "approximate" - } - } - }, { "evaluation_id": "apex-v1/google_gemini-3-pro/1773260200", "retrieved_timestamp": "1773260200", diff --git a/data/models/google_gemma-2-2b-jpn-it.json b/data/models/google_gemma-2-2b-jpn-it.json index ad8eb46cb13cde4ab2a8a3521d4c0b618fb71ee6..208075f00df888593e36a118e84f64272df933ff 100644 --- a/data/models/google_gemma-2-2b-jpn-it.json +++ b/data/models/google_gemma-2-2b-jpn-it.json @@ -5,7 +5,7 @@ "developer": "Google", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "Gemma2ForCausalLM", "params_billions": "2.614" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5288 + "score": 0.5078 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4178 + "score": 0.4226 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0476 + "score": 0.0347 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2752 + "score": 0.2852 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3728 + "score": 0.3964 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2467 + "score": 0.2578 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5078 + "score": 0.5288 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4226 + "score": 0.4178 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0347 + "score": 0.0476 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2852 + "score": 0.2752 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3964 + "score": 0.3728 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2578 + "score": 0.2467 } } ], diff --git a/data/models/google_gemma-2-2b.json b/data/models/google_gemma-2-2b.json index 853dbf9408f0d97b5b090784ef420b0479e50847..3875c2305ff98601b733940bab48bde956384743 100644 --- a/data/models/google_gemma-2-2b.json +++ b/data/models/google_gemma-2-2b.json @@ -5,7 +5,7 @@ "developer": "Google", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "InternLM2ForCausalLM", "params_billions": "2.614" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1993 + "score": 0.2018 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3656 + "score": 0.3709 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0287 + "score": 0.0302 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4232 + "score": 0.4219 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.218 + "score": 0.2217 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2018 + "score": 0.1993 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3709 + "score": 0.3656 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0302 + "score": 0.0287 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4219 + "score": 0.4232 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2217 + "score": 0.218 } } ], diff --git a/data/models/google_gemma-3-27b-it.json b/data/models/google_gemma-3-27b-it.json index 31e90c4548397bec1dec70a558a4830dc0c4f7c9..0d22aa7a55f613493f23d93430a44590b7aa715d 100644 --- a/data/models/google_gemma-3-27b-it.json +++ b/data/models/google_gemma-3-27b-it.json @@ -10,8 +10,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/google_gemma-3-27b-it/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/google_gemma-3-27b-it/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -525,8 +525,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/google_gemma-3-27b-it/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/google_gemma-3-27b-it/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/models/huggingfacetb_smollm2-135m-instruct.json b/data/models/huggingfacetb_smollm2-135m-instruct.json index 4930fd119b327af5dcaff7489655e3b5a735f796..7413ccc13689271f5c637f0314f401eea3acbcec 100644 --- a/data/models/huggingfacetb_smollm2-135m-instruct.json +++ b/data/models/huggingfacetb_smollm2-135m-instruct.json @@ -5,7 +5,7 @@ "developer": "HuggingFaceTB", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "LlamaForCausalLM", "params_billions": "0.135" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2883 + "score": 0.0593 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3124 + "score": 0.3135 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.003 + "score": 0.0144 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2357 + "score": 0.2341 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3662 + "score": 0.3871 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1115 + "score": 0.1092 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0593 + "score": 0.2883 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3135 + "score": 0.3124 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0144 + "score": 0.003 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2341 + "score": 0.2357 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3871 + "score": 0.3662 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1092 + "score": 0.1115 } } ], diff --git a/data/models/infly_inf-orm-llama3.1-70b.json b/data/models/infly_inf-orm-llama3.1-70b.json index 82e76ad6cd43b3a105b801967d5f616a3924844a..e7947ee940015eb0652da9a52891a9ab47739595 100644 --- a/data/models/infly_inf-orm-llama3.1-70b.json +++ b/data/models/infly_inf-orm-llama3.1-70b.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/infly_INF-ORM-Llama3.1-70B/1766412838.146816", + "evaluation_id": "reward-bench-2/infly_INF-ORM-Llama3.1-70B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,128 +31,104 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9511 + "score": 0.7648 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9665 + "score": 0.7411 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9101 + "score": 0.4188 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9365 + "score": 0.6995 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9912 + "score": 0.9644 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench-2/infly_INF-ORM-Llama3.1-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7648 + "score": 0.903 }, "source_data": { "dataset_name": "RewardBench 2", @@ -161,111 +137,135 @@ } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7411 + "score": 0.8622 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench/infly_INF-ORM-Llama3.1-70B/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Precise IF", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4188 + "score": 0.9511 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6995 + "score": 0.9665 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9644 + "score": 0.9101 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.903 + "score": 0.9365 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8622 + "score": 0.9912 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/models/internlm_internlm2-7b-reward.json b/data/models/internlm_internlm2-7b-reward.json index 850f56827380f314c355f2c532b41bbbb69062e8..5907aad77cdf2d42a7b25e1a1a35520112be4497 100644 --- a/data/models/internlm_internlm2-7b-reward.json +++ b/data/models/internlm_internlm2-7b-reward.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/internlm_internlm2-7b-reward/1766412838.146816", + "evaluation_id": "reward-bench/internlm_internlm2-7b-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,104 +31,128 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5335 + "score": 0.8759 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4211 + "score": 0.9916 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4 + "score": 0.6952 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5628 + "score": 0.8716 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5956 + "score": 0.9453 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench-2/internlm_internlm2-7b-reward/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench 2", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Focus", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7051 + "score": 0.5335 }, "source_data": { "dataset_name": "RewardBench 2", @@ -137,135 +161,111 @@ } }, { - "evaluation_name": "Ties", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5164 + "score": 0.4211 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench/internlm_internlm2-7b-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8759 + "score": 0.4 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9916 + "score": 0.5628 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6952 + "score": 0.5956 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8716 + "score": 0.7051 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9453 + "score": 0.5164 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/models/jaspionjader_kosmos-evaa-fusion-8b.json b/data/models/jaspionjader_kosmos-evaa-fusion-8b.json index 88cf241ec28671fc887627a1e7bb1af09c15f9cf..1160e36435338576bea29bef2eb39f295798e22d 100644 --- a/data/models/jaspionjader_kosmos-evaa-fusion-8b.json +++ b/data/models/jaspionjader_kosmos-evaa-fusion-8b.json @@ -5,7 +5,7 @@ "developer": "jaspionjader", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": "8.03" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4418 + "score": 0.4345 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5406 + "score": 0.5419 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1352 + "score": 0.1292 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3062 + "score": 0.3087 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.386 + "score": 0.3854 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4345 + "score": 0.4418 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5419 + "score": 0.5406 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1292 + "score": 0.1352 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3087 + "score": 0.3062 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3854 + "score": 0.386 } } ], diff --git a/data/models/leroydyer_spydazweb_ai_humanai_012_instruct_xa.json b/data/models/leroydyer_spydazweb_ai_humanai_012_instruct_xa.json index 199a2574ee197c6be6eb6b38849e4c63ab43b085..90656456ac84eb015314278cafdaa288005eb1ba 100644 --- a/data/models/leroydyer_spydazweb_ai_humanai_012_instruct_xa.json +++ b/data/models/leroydyer_spydazweb_ai_humanai_012_instruct_xa.json @@ -5,7 +5,7 @@ "developer": "LeroyDyer", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "MistralForCausalLM", "params_billions": "7.242" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3798 + "score": 0.3579 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4483 + "score": 0.4477 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.04 + "score": 0.0423 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3129 + "score": 0.3096 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4148 + "score": 0.4134 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2389 + "score": 0.2376 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3579 + "score": 0.3798 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4477 + "score": 0.4483 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0423 + "score": 0.04 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3096 + "score": 0.3129 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4134 + "score": 0.4148 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2376 + "score": 0.2389 } } ], diff --git a/data/models/llmat_mistral-v0.3-7b-orpo.json b/data/models/llmat_mistral-v0.3-7b-orpo.json index c2c9120f6bd010e2ff566effb35617377de554ee..3a1b947d84c5d76bb2423237a20cbf150415592e 100644 --- a/data/models/llmat_mistral-v0.3-7b-orpo.json +++ b/data/models/llmat_mistral-v0.3-7b-orpo.json @@ -5,7 +5,7 @@ "developer": "llmat", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "MistralForCausalLM", "params_billions": "7.248" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.377 + "score": 0.364 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3978 + "score": 0.4005 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0242 + "score": 0.0015 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2668 + "score": 0.2693 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3555 + "score": 0.3529 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2278 + "score": 0.2301 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.364 + "score": 0.377 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4005 + "score": 0.3978 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0015 + "score": 0.0242 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2693 + "score": 0.2668 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3529 + "score": 0.3555 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2301 + "score": 0.2278 } } ], diff --git a/data/models/lxzgordon_urm-llama-3.1-8b.json b/data/models/lxzgordon_urm-llama-3.1-8b.json index 7f03035f2bd809fb14be27130e313f5b86a26f9a..2ce56c90ce0d5fb5788660f5b3f1f1e179701786 100644 --- a/data/models/lxzgordon_urm-llama-3.1-8b.json +++ b/data/models/lxzgordon_urm-llama-3.1-8b.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816", + "evaluation_id": "reward-bench/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,104 +31,128 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7394 + "score": 0.9294 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6884 + "score": 0.9553 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.45 + "score": 0.8816 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6393 + "score": 0.9108 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9178 + "score": 0.9698 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench-2/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench 2", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Focus", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9758 + "score": 0.7394 }, "source_data": { "dataset_name": "RewardBench 2", @@ -137,135 +161,111 @@ } }, { - "evaluation_name": "Ties", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7653 + "score": 0.6884 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9294 + "score": 0.45 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9553 + "score": 0.6393 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8816 + "score": 0.9178 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9108 + "score": 0.9758 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9698 + "score": 0.7653 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/models/meta_llama-3.1-8b-instruct-turbo.json b/data/models/meta_llama-3.1-8b-instruct-turbo.json index db79a2433c0e2503859f110928ebb7ec1a96138c..511e540608c036dc4147fb2f225109769aa099a1 100644 --- a/data/models/meta_llama-3.1-8b-instruct-turbo.json +++ b/data/models/meta_llama-3.1-8b-instruct-turbo.json @@ -231,10 +231,10 @@ } }, { - "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1774096312.00548", - "retrieved_timestamp": "1774096312.00548", + "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1774096306.427425", + "retrieved_timestamp": "1774096306.427425", "source_metadata": { - "source_name": "helm_mmlu", + "source_name": "helm_lite", "source_type": "documentation", "source_organization_name": "crfm", "evaluator_relationship": "third_party" @@ -243,438 +243,382 @@ "name": "helm", "version": "unknown" }, - "benchmark": "helm_mmlu", + "benchmark": "helm_lite", "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects", + "evaluation_name": "Mean win rate", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "helm_lite", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", + "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.561, + "score": 0.303, "details": { - "description": "min=0.26, mean=0.561, max=0.865, sum=63.912 (114)", + "description": "", "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.202, mean=0.56, max=1.485, sum=63.854 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.5601251981506405\"}", - "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}", - "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)\", \"tab\": \"General information\", \"score\": \"614.6193817308517\"}", - "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.5896504369538077\"}", + "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}" } }, "generation_config": { - "additional_details": { - "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" - } + "additional_details": {} } }, { - "evaluation_name": "Abstract Algebra", + "evaluation_name": "NarrativeQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "NarrativeQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Abstract Algebra", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.26, + "score": 0.756, "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", + "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)", "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.284, mean=0.284, max=0.284, sum=0.568 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.28381933450698854\"}", - "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Abstract Algebra - # prompt tokens": "{\"description\": \"min=373.43, mean=373.43, max=373.43, sum=746.86 (2)\", \"tab\": \"General information\", \"score\": \"373.43\"}", - "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.581, mean=0.581, max=0.581, sum=0.581 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5813529316808136\"}", + "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", + "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NarrativeQA - # prompt tokens": "{\"description\": \"min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)\", \"tab\": \"General information\", \"score\": \"3484.2676056338028\"}", + "NarrativeQA - # output tokens": "{\"description\": \"min=7.287, mean=7.287, max=7.287, sum=7.287 (1)\", \"tab\": \"General information\", \"score\": \"7.2873239436619714\"}" } }, "generation_config": { - "additional_details": { - "subject": "\"abstract_algebra\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_abstract_algebra\"" - } + "additional_details": {} } }, { - "evaluation_name": "Anatomy", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Anatomy", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.459, + "score": 0.209, "details": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.919 (2)", + "description": "min=0.209, mean=0.209, max=0.209, sum=0.209 (1)", "tab": "Accuracy", - "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.323, mean=0.323, max=0.323, sum=0.646 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3231998196354619\"}", - "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}", - "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Anatomy - # prompt tokens": "{\"description\": \"min=353.874, mean=353.874, max=353.874, sum=707.748 (2)\", \"tab\": \"General information\", \"score\": \"353.8740740740741\"}", - "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.544, mean=0.544, max=0.544, sum=0.544 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5441543731689453\"}", + "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.752, mean=0.752, max=0.752, sum=0.752 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.751717613697052\"}", + "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (open-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1716.78, mean=1716.78, max=1716.78, sum=1716.78 (1)\", \"tab\": \"General information\", \"score\": \"1716.78\"}", + "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=8.736, mean=8.736, max=8.736, sum=8.736 (1)\", \"tab\": \"General information\", \"score\": \"8.736\"}", + "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=129.12, mean=129.12, max=129.12, sum=129.12 (1)\", \"tab\": \"General information\", \"score\": \"129.12\"}", + "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=11.732, mean=11.732, max=11.732, sum=11.732 (1)\", \"tab\": \"General information\", \"score\": \"11.732\"}" } }, "generation_config": { "additional_details": { - "subject": "\"anatomy\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_anatomy\"" + "mode": "\"closedbook\"" } } }, { - "evaluation_name": "College Physics", + "evaluation_name": "OpenbookQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "OpenbookQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on College Physics", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.363, + "score": 0.74, "details": { - "description": "min=0.363, mean=0.363, max=0.363, sum=0.725 (2)", + "description": "min=0.74, mean=0.74, max=0.74, sum=0.74 (1)", "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.431, mean=0.431, max=0.431, sum=0.862 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43078258752822873\"}", - "College Biology - Observed inference time (s)": "{\"description\": \"min=0.426, mean=0.426, max=0.426, sum=0.853 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42637243535783553\"}", - "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.562, mean=0.562, max=0.562, sum=1.125 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5623248195648194\"}", - "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.371, mean=0.371, max=0.371, sum=0.742 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3709776735305786\"}", - "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.395, mean=0.395, max=0.395, sum=0.79 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3948341918129452\"}", - "College Physics - Observed inference time (s)": "{\"description\": \"min=0.395, mean=0.395, max=0.395, sum=0.789 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39474552051693784\"}", - "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Chemistry - # prompt tokens": "{\"description\": \"min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)\", \"tab\": \"General information\", \"score\": \"549.28\"}", - "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}", - "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Biology - # prompt tokens": "{\"description\": \"min=473.875, mean=473.875, max=473.875, sum=947.75 (2)\", \"tab\": \"General information\", \"score\": \"473.875\"}", - "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Computer Science - # prompt tokens": "{\"description\": \"min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)\", \"tab\": \"General information\", \"score\": \"828.29\"}", - "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Mathematics - # prompt tokens": "{\"description\": \"min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)\", \"tab\": \"General information\", \"score\": \"594.51\"}", - "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}", - "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Medicine - # prompt tokens": "{\"description\": \"min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)\", \"tab\": \"General information\", \"score\": \"502.70520231213874\"}", - "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}", - "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Physics - # prompt tokens": "{\"description\": \"min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)\", \"tab\": \"General information\", \"score\": \"503.5686274509804\"}", - "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=2.937, mean=2.937, max=2.937, sum=2.937 (1)\", \"tab\": \"Efficiency\", \"score\": \"2.9374450149536133\"}", + "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}", + "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "OpenbookQA - # prompt tokens": "{\"description\": \"min=249.776, mean=249.776, max=249.776, sum=249.776 (1)\", \"tab\": \"General information\", \"score\": \"249.776\"}", + "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"college_physics\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_college_physics\"" + "dataset": "\"openbookqa\"", + "method": "\"multiple_choice_joint\"" } } }, { - "evaluation_name": "Computer Security", + "evaluation_name": "MMLU", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MMLU", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Computer Security", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.71, + "score": 0.5, "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", + "description": "min=0.26, mean=0.5, max=0.79, sum=2.501 (5)", "tab": "Accuracy", - "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.434, mean=0.434, max=0.434, sum=0.867 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43369229555130007\"}", - "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Computer Security - # prompt tokens": "{\"description\": \"min=378.51, mean=378.51, max=378.51, sum=757.02 (2)\", \"tab\": \"General information\", \"score\": \"378.51\"}", - "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MMLU - Observed inference time (s)": "{\"description\": \"min=0.284, mean=0.417, max=0.567, sum=2.086 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.41729471965421716\"}", + "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", + "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU - # prompt tokens": "{\"description\": \"min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)\", \"tab\": \"General information\", \"score\": \"467.6862105263158\"}", + "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"computer_security\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_computer_security\"" + "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]", + "method": "\"multiple_choice_joint\"" } } }, { - "evaluation_name": "Econometrics", + "evaluation_name": "MATH", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MATH", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Econometrics", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.351, + "score": 0.703, "details": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", + "description": "min=0.509, mean=0.703, max=0.849, sum=4.92 (7)", "tab": "Accuracy", - "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.371, mean=0.371, max=0.371, sum=0.742 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3707838414008157\"}", - "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}", - "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Econometrics - # prompt tokens": "{\"description\": \"min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)\", \"tab\": \"General information\", \"score\": \"614.421052631579\"}", - "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MATH - Observed inference time (s)": "{\"description\": \"min=1.617, mean=1.927, max=2.175, sum=13.492 (7)\", \"tab\": \"Efficiency\", \"score\": \"1.9274194573191807\"}", + "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}", + "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}", + "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MATH - # prompt tokens": "{\"description\": \"min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)\", \"tab\": \"General information\", \"score\": \"1262.9092130545007\"}", + "MATH - # output tokens": "{\"description\": \"min=203.384, mean=253.982, max=288.596, sum=1777.872 (7)\", \"tab\": \"General information\", \"score\": \"253.98170179473732\"}" } }, "generation_config": { "additional_details": { - "subject": "\"econometrics\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_econometrics\"" + "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]", + "level": "\"1\"", + "use_official_examples": "\"False\"", + "use_chain_of_thought": "\"True\"" } } }, { - "evaluation_name": "Global Facts", + "evaluation_name": "GSM8K", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "GSM8K", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Global Facts", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.26, + "score": 0.798, "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", + "description": "min=0.798, mean=0.798, max=0.798, sum=0.798 (1)", "tab": "Accuracy", - "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.202, mean=0.202, max=0.202, sum=0.403 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2015515398979187\"}", - "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Global Facts - # prompt tokens": "{\"description\": \"min=399.71, mean=399.71, max=399.71, sum=799.42 (2)\", \"tab\": \"General information\", \"score\": \"399.71\"}", - "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "GSM8K - Observed inference time (s)": "{\"description\": \"min=2.109, mean=2.109, max=2.109, sum=2.109 (1)\", \"tab\": \"Efficiency\", \"score\": \"2.108796592712402\"}", + "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "GSM8K - # prompt tokens": "{\"description\": \"min=959.032, mean=959.032, max=959.032, sum=959.032 (1)\", \"tab\": \"General information\", \"score\": \"959.032\"}", + "GSM8K - # output tokens": "{\"description\": \"min=150.02, mean=150.02, max=150.02, sum=150.02 (1)\", \"tab\": \"General information\", \"score\": \"150.02\"}" } }, "generation_config": { "additional_details": { - "subject": "\"global_facts\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_global_facts\"" + "stop": "\"none\"" } } }, { - "evaluation_name": "Jurisprudence", + "evaluation_name": "LegalBench", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "LegalBench", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Jurisprudence", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.731, + "score": 0.342, "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.463 (2)", + "description": "min=0, mean=0.342, max=0.8, sum=1.71 (5)", "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=1.035, mean=1.035, max=1.035, sum=2.07 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.0347525963076838\"}", - "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}", - "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Jurisprudence - # prompt tokens": "{\"description\": \"min=394.63, mean=394.63, max=394.63, sum=789.259 (2)\", \"tab\": \"General information\", \"score\": \"394.6296296296296\"}", - "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.409, mean=0.481, max=0.626, sum=2.407 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.4814103188942614\"}", + "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}", + "LegalBench - # train": "{\"description\": \"min=4, mean=4.8, max=5, sum=24 (5)\", \"tab\": \"General information\", \"score\": \"4.8\"}", + "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "LegalBench - # prompt tokens": "{\"description\": \"min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)\", \"tab\": \"General information\", \"score\": \"1513.8824197238912\"}", + "LegalBench - # output tokens": "{\"description\": \"min=2.032, mean=6.824, max=10.886, sum=34.118 (5)\", \"tab\": \"General information\", \"score\": \"6.823557876005701\"}" } }, "generation_config": { "additional_details": { - "subject": "\"jurisprudence\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_jurisprudence\"" + "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]" } } }, { - "evaluation_name": "Philosophy", + "evaluation_name": "MedQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MedQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Philosophy", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.64, + "score": 0.245, "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.28 (2)", + "description": "min=0.245, mean=0.245, max=0.245, sum=0.245 (1)", "tab": "Accuracy", - "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.681, mean=0.681, max=0.681, sum=1.363 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6814629341628391\"}", - "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}", - "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Philosophy - # prompt tokens": "{\"description\": \"min=329.084, mean=329.084, max=329.084, sum=658.167 (2)\", \"tab\": \"General information\", \"score\": \"329.08360128617363\"}", - "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MedQA - Observed inference time (s)": "{\"description\": \"min=0.743, mean=0.743, max=0.743, sum=0.743 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.742541556803891\"}", + "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}", + "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MedQA - # prompt tokens": "{\"description\": \"min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)\", \"tab\": \"General information\", \"score\": \"1025.2743538767395\"}", + "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": { - "subject": "\"philosophy\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_philosophy\"" - } + "additional_details": {} } }, { - "evaluation_name": "Professional Psychology", + "evaluation_name": "WMT 2014", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "WMT 2014", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Professional Psychology", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.649, + "score": 0.181, "details": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.297 (2)", + "description": "min=0.132, mean=0.181, max=0.219, sum=0.907 (5)", "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.546, mean=0.546, max=0.546, sum=1.091 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5456299475010704\"}", - "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.538, mean=0.538, max=0.538, sum=1.077 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5383730044601657\"}", - "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.881, mean=0.881, max=0.881, sum=1.762 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8808572895368355\"}", - "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.694, mean=0.694, max=0.694, sum=1.388 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6941978611977272\"}", - "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}", - "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Medicine - # prompt tokens": "{\"description\": \"min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)\", \"tab\": \"General information\", \"score\": \"1094.4889705882354\"}", - "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}", - "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Accounting - # prompt tokens": "{\"description\": \"min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)\", \"tab\": \"General information\", \"score\": \"658.5851063829788\"}", - "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}", - "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Law - # prompt tokens": "{\"description\": \"min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)\", \"tab\": \"General information\", \"score\": \"1637.6010430247718\"}", - "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}", - "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Psychology - # prompt tokens": "{\"description\": \"min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)\", \"tab\": \"General information\", \"score\": \"575.0980392156863\"}", - "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=0.439, mean=0.565, max=0.727, sum=2.826 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.5651802479746801\"}", + "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}", + "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "WMT 2014 - # prompt tokens": "{\"description\": \"min=101.139, mean=120.712, max=141.117, sum=603.559 (5)\", \"tab\": \"General information\", \"score\": \"120.71178123566294\"}", + "WMT 2014 - # output tokens": "{\"description\": \"min=24.354, mean=25.779, max=26.833, sum=128.893 (5)\", \"tab\": \"General information\", \"score\": \"25.778561802263347\"}" } }, "generation_config": { "additional_details": { - "subject": "\"professional_psychology\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_professional_psychology\"" + "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]" } } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1774096312.00548", + "retrieved_timestamp": "1774096312.00548", + "source_metadata": { + "source_name": "helm_mmlu", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "helm", + "version": "unknown" + }, + "benchmark": "helm_mmlu", + "evaluation_results": [ { - "evaluation_name": "Us Foreign Policy", + "evaluation_name": "MMLU All Subjects", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -683,36 +627,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.79, + "score": 0.561, "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", + "description": "min=0.26, mean=0.561, max=0.865, sum=63.912 (114)", "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.567, mean=0.567, max=0.567, sum=1.135 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5673955392837524\"}", - "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=422.79, mean=422.79, max=422.79, sum=845.58 (2)\", \"tab\": \"General information\", \"score\": \"422.79\"}", - "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.202, mean=0.56, max=1.485, sum=63.854 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.5601251981506405\"}", + "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}", + "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)\", \"tab\": \"General information\", \"score\": \"614.6193817308517\"}", + "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"us_foreign_policy\"", + "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_us_foreign_policy\"" + "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" } } }, { - "evaluation_name": "Astronomy", + "evaluation_name": "Abstract Algebra", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -721,36 +665,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Astronomy", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.645, + "score": 0.26, "details": { - "description": "min=0.645, mean=0.645, max=0.645, sum=1.289 (2)", + "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", "tab": "Accuracy", - "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.317, mean=0.317, max=0.317, sum=0.634 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3168644199245854\"}", - "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}", - "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Astronomy - # prompt tokens": "{\"description\": \"min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)\", \"tab\": \"General information\", \"score\": \"579.6842105263158\"}", - "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.284, mean=0.284, max=0.284, sum=0.568 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.28381933450698854\"}", + "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Abstract Algebra - # prompt tokens": "{\"description\": \"min=373.43, mean=373.43, max=373.43, sum=746.86 (2)\", \"tab\": \"General information\", \"score\": \"373.43\"}", + "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"astronomy\"", + "subject": "\"abstract_algebra\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_astronomy\"" + "groups": "\"mmlu_abstract_algebra\"" } } }, { - "evaluation_name": "Business Ethics", + "evaluation_name": "Anatomy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -759,36 +703,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Business Ethics", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.65, + "score": 0.459, "details": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)", + "description": "min=0.459, mean=0.459, max=0.459, sum=0.919 (2)", "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.444, mean=0.444, max=0.444, sum=0.888 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44396358251571655\"}", - "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Business Ethics - # prompt tokens": "{\"description\": \"min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)\", \"tab\": \"General information\", \"score\": \"569.52\"}", - "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.323, mean=0.323, max=0.323, sum=0.646 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3231998196354619\"}", + "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}", + "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Anatomy - # prompt tokens": "{\"description\": \"min=353.874, mean=353.874, max=353.874, sum=707.748 (2)\", \"tab\": \"General information\", \"score\": \"353.8740740740741\"}", + "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"business_ethics\"", + "subject": "\"anatomy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_business_ethics\"" + "groups": "\"mmlu_anatomy\"" } } }, { - "evaluation_name": "Clinical Knowledge", + "evaluation_name": "College Physics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -797,36 +741,66 @@ ] }, "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.615, + "score": 0.363, "details": { - "description": "min=0.615, mean=0.615, max=0.615, sum=1.23 (2)", + "description": "min=0.363, mean=0.363, max=0.363, sum=0.725 (2)", "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.369, mean=0.369, max=0.369, sum=0.738 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3692442273193935\"}", - "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}", - "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=397.928, mean=397.928, max=397.928, sum=795.857 (2)\", \"tab\": \"General information\", \"score\": \"397.92830188679244\"}", - "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.431, mean=0.431, max=0.431, sum=0.862 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43078258752822873\"}", + "College Biology - Observed inference time (s)": "{\"description\": \"min=0.426, mean=0.426, max=0.426, sum=0.853 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42637243535783553\"}", + "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.562, mean=0.562, max=0.562, sum=1.125 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5623248195648194\"}", + "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.371, mean=0.371, max=0.371, sum=0.742 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3709776735305786\"}", + "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.395, mean=0.395, max=0.395, sum=0.79 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3948341918129452\"}", + "College Physics - Observed inference time (s)": "{\"description\": \"min=0.395, mean=0.395, max=0.395, sum=0.789 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39474552051693784\"}", + "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Chemistry - # prompt tokens": "{\"description\": \"min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)\", \"tab\": \"General information\", \"score\": \"549.28\"}", + "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}", + "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Biology - # prompt tokens": "{\"description\": \"min=473.875, mean=473.875, max=473.875, sum=947.75 (2)\", \"tab\": \"General information\", \"score\": \"473.875\"}", + "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Computer Science - # prompt tokens": "{\"description\": \"min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)\", \"tab\": \"General information\", \"score\": \"828.29\"}", + "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Mathematics - # prompt tokens": "{\"description\": \"min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)\", \"tab\": \"General information\", \"score\": \"594.51\"}", + "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}", + "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Medicine - # prompt tokens": "{\"description\": \"min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)\", \"tab\": \"General information\", \"score\": \"502.70520231213874\"}", + "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}", + "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Physics - # prompt tokens": "{\"description\": \"min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)\", \"tab\": \"General information\", \"score\": \"503.5686274509804\"}", + "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"clinical_knowledge\"", + "subject": "\"college_physics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_clinical_knowledge\"" + "groups": "\"mmlu_college_physics\"" } } }, { - "evaluation_name": "Conceptual Physics", + "evaluation_name": "Computer Security", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -835,36 +809,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Conceptual Physics", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.528, + "score": 0.71, "details": { - "description": "min=0.528, mean=0.528, max=0.528, sum=1.055 (2)", + "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.351, mean=0.351, max=0.351, sum=0.701 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.35051030605397326\"}", - "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}", - "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Conceptual Physics - # prompt tokens": "{\"description\": \"min=304.834, mean=304.834, max=304.834, sum=609.668 (2)\", \"tab\": \"General information\", \"score\": \"304.83404255319147\"}", - "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.434, mean=0.434, max=0.434, sum=0.867 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43369229555130007\"}", + "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Computer Security - # prompt tokens": "{\"description\": \"min=378.51, mean=378.51, max=378.51, sum=757.02 (2)\", \"tab\": \"General information\", \"score\": \"378.51\"}", + "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"conceptual_physics\"", + "subject": "\"computer_security\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_conceptual_physics\"" + "groups": "\"mmlu_computer_security\"" } } }, { - "evaluation_name": "Electrical Engineering", + "evaluation_name": "Econometrics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -873,36 +847,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Electrical Engineering", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.441, + "score": 0.351, "details": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.883 (2)", + "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.35, mean=0.35, max=0.35, sum=0.7 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.34982287637118636\"}", - "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}", - "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Electrical Engineering - # prompt tokens": "{\"description\": \"min=435.607, mean=435.607, max=435.607, sum=871.214 (2)\", \"tab\": \"General information\", \"score\": \"435.60689655172416\"}", - "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.371, mean=0.371, max=0.371, sum=0.742 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3707838414008157\"}", + "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}", + "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Econometrics - # prompt tokens": "{\"description\": \"min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)\", \"tab\": \"General information\", \"score\": \"614.421052631579\"}", + "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"electrical_engineering\"", + "subject": "\"econometrics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_electrical_engineering\"" + "groups": "\"mmlu_econometrics\"" } } }, { - "evaluation_name": "Elementary Mathematics", + "evaluation_name": "Global Facts", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -911,36 +885,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.429, + "score": 0.26, "details": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.857 (2)", + "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.4, mean=0.4, max=0.4, sum=0.801 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4003569991500289\"}", - "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}", - "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)\", \"tab\": \"General information\", \"score\": \"531.8544973544973\"}", - "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.202, mean=0.202, max=0.202, sum=0.403 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2015515398979187\"}", + "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Global Facts - # prompt tokens": "{\"description\": \"min=399.71, mean=399.71, max=399.71, sum=799.42 (2)\", \"tab\": \"General information\", \"score\": \"399.71\"}", + "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"elementary_mathematics\"", + "subject": "\"global_facts\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_elementary_mathematics\"" + "groups": "\"mmlu_global_facts\"" } } }, { - "evaluation_name": "Formal Logic", + "evaluation_name": "Jurisprudence", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -949,36 +923,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Formal Logic", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.444, + "score": 0.731, "details": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)", + "description": "min=0.731, mean=0.731, max=0.731, sum=1.463 (2)", "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.357, mean=0.357, max=0.357, sum=0.714 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.35707327108534553\"}", - "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}", - "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Formal Logic - # prompt tokens": "{\"description\": \"min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)\", \"tab\": \"General information\", \"score\": \"601.7777777777778\"}", - "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=1.035, mean=1.035, max=1.035, sum=2.07 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.0347525963076838\"}", + "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}", + "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Jurisprudence - # prompt tokens": "{\"description\": \"min=394.63, mean=394.63, max=394.63, sum=789.259 (2)\", \"tab\": \"General information\", \"score\": \"394.6296296296296\"}", + "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"formal_logic\"", + "subject": "\"jurisprudence\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_formal_logic\"" + "groups": "\"mmlu_jurisprudence\"" } } }, { - "evaluation_name": "High School World History", + "evaluation_name": "Philosophy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -987,114 +961,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on High School World History", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.515, + "score": 0.64, "details": { - "description": "min=0.515, mean=0.515, max=0.515, sum=1.03 (2)", + "description": "min=0.64, mean=0.64, max=0.64, sum=1.28 (2)", "tab": "Accuracy", - "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.211, mean=0.211, max=0.211, sum=0.423 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.21137587870320967\"}", - "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.211, mean=0.211, max=0.211, sum=0.423 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2113605567387172\"}", - "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.214, mean=0.214, max=0.214, sum=0.428 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2138903546333313\"}", - "High School European History - Observed inference time (s)": "{\"description\": \"min=0.332, mean=0.332, max=0.332, sum=0.664 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.33188523668231384\"}", - "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.218, mean=0.218, max=0.218, sum=0.435 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.21753037818754561\"}", - "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.558, mean=0.558, max=0.558, sum=1.117 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.558492410985917\"}", - "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.703, mean=0.703, max=0.703, sum=1.407 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7033225890917656\"}", - "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.649, mean=0.649, max=0.649, sum=1.299 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6494572189119127\"}", - "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.612, mean=0.612, max=0.612, sum=1.223 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6115654797113242\"}", - "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.564, mean=0.564, max=0.564, sum=1.127 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5636763351642533\"}", - "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.681, mean=0.681, max=0.681, sum=1.363 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6813242522948378\"}", - "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.606, mean=0.606, max=0.606, sum=1.212 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6060926814874014\"}", - "High School US History - Observed inference time (s)": "{\"description\": \"min=1.122, mean=1.122, max=1.122, sum=2.244 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.1218917334780973\"}", - "High School World History - Observed inference time (s)": "{\"description\": \"min=0.538, mean=0.538, max=0.538, sum=1.076 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5378943324592043\"}", - "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}", - "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Biology - # prompt tokens": "{\"description\": \"min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)\", \"tab\": \"General information\", \"score\": \"513.6709677419354\"}", - "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}", - "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Chemistry - # prompt tokens": "{\"description\": \"min=496.704, mean=496.704, max=496.704, sum=993.409 (2)\", \"tab\": \"General information\", \"score\": \"496.70443349753697\"}", - "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Computer Science - # prompt tokens": "{\"description\": \"min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)\", \"tab\": \"General information\", \"score\": \"867.78\"}", - "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}", - "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School European History - # prompt tokens": "{\"description\": \"min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)\", \"tab\": \"General information\", \"score\": \"2797.8848484848486\"}", - "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}", - "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Geography - # prompt tokens": "{\"description\": \"min=372.035, mean=372.035, max=372.035, sum=744.071 (2)\", \"tab\": \"General information\", \"score\": \"372.0353535353535\"}", - "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}", - "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Government And Politics - # prompt tokens": "{\"description\": \"min=465.824, mean=465.824, max=465.824, sum=931.648 (2)\", \"tab\": \"General information\", \"score\": \"465.8238341968912\"}", - "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}", - "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=370.908, mean=370.908, max=370.908, sum=741.815 (2)\", \"tab\": \"General information\", \"score\": \"370.9076923076923\"}", - "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}", - "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Mathematics - # prompt tokens": "{\"description\": \"min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)\", \"tab\": \"General information\", \"score\": \"532.3555555555556\"}", - "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}", - "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Microeconomics - # prompt tokens": "{\"description\": \"min=399.013, mean=399.013, max=399.013, sum=798.025 (2)\", \"tab\": \"General information\", \"score\": \"399.0126050420168\"}", - "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}", - "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Physics - # prompt tokens": "{\"description\": \"min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)\", \"tab\": \"General information\", \"score\": \"560.4569536423841\"}", - "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}", - "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Psychology - # prompt tokens": "{\"description\": \"min=495.242, mean=495.242, max=495.242, sum=990.484 (2)\", \"tab\": \"General information\", \"score\": \"495.2422018348624\"}", - "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}", - "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Statistics - # prompt tokens": "{\"description\": \"min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)\", \"tab\": \"General information\", \"score\": \"795.6388888888889\"}", - "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}", - "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School US History - # prompt tokens": "{\"description\": \"min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)\", \"tab\": \"General information\", \"score\": \"2217.8088235294117\"}", - "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}", - "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School World History - # prompt tokens": "{\"description\": \"min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)\", \"tab\": \"General information\", \"score\": \"1428.1729957805908\"}", - "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.681, mean=0.681, max=0.681, sum=1.363 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6814629341628391\"}", + "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}", + "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Philosophy - # prompt tokens": "{\"description\": \"min=329.084, mean=329.084, max=329.084, sum=658.167 (2)\", \"tab\": \"General information\", \"score\": \"329.08360128617363\"}", + "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"high_school_world_history\"", + "subject": "\"philosophy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_high_school_world_history\"" + "groups": "\"mmlu_philosophy\"" } } }, { - "evaluation_name": "Human Sexuality", + "evaluation_name": "Professional Psychology", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1103,42 +999,54 @@ ] }, "metric_config": { - "evaluation_description": "EM on Human Sexuality", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.733, + "score": 0.649, "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)", + "description": "min=0.649, mean=0.649, max=0.649, sum=1.297 (2)", "tab": "Accuracy", - "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.685, mean=0.685, max=0.685, sum=1.369 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6845707412257858\"}", - "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=1.227, mean=1.227, max=1.227, sum=2.455 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.2273387745136524\"}", - "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}", - "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Human Aging - # prompt tokens": "{\"description\": \"min=319.888, mean=319.888, max=319.888, sum=639.776 (2)\", \"tab\": \"General information\", \"score\": \"319.88789237668163\"}", - "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}", - "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Human Sexuality - # prompt tokens": "{\"description\": \"min=341.168, mean=341.168, max=341.168, sum=682.336 (2)\", \"tab\": \"General information\", \"score\": \"341.1679389312977\"}", - "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.546, mean=0.546, max=0.546, sum=1.091 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5456299475010704\"}", + "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.538, mean=0.538, max=0.538, sum=1.077 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5383730044601657\"}", + "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.881, mean=0.881, max=0.881, sum=1.762 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8808572895368355\"}", + "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.694, mean=0.694, max=0.694, sum=1.388 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6941978611977272\"}", + "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}", + "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Medicine - # prompt tokens": "{\"description\": \"min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)\", \"tab\": \"General information\", \"score\": \"1094.4889705882354\"}", + "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}", + "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Accounting - # prompt tokens": "{\"description\": \"min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)\", \"tab\": \"General information\", \"score\": \"658.5851063829788\"}", + "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}", + "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Law - # prompt tokens": "{\"description\": \"min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)\", \"tab\": \"General information\", \"score\": \"1637.6010430247718\"}", + "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}", + "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Psychology - # prompt tokens": "{\"description\": \"min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)\", \"tab\": \"General information\", \"score\": \"575.0980392156863\"}", + "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"human_sexuality\"", + "subject": "\"professional_psychology\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_human_sexuality\"" + "groups": "\"mmlu_professional_psychology\"" } } }, { - "evaluation_name": "International Law", + "evaluation_name": "Us Foreign Policy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1147,36 +1055,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on International Law", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.694, + "score": 0.79, "details": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.388 (2)", + "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", "tab": "Accuracy", - "International Law - Observed inference time (s)": "{\"description\": \"min=0.684, mean=0.684, max=0.684, sum=1.369 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6842782950598346\"}", - "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}", - "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "International Law - # prompt tokens": "{\"description\": \"min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)\", \"tab\": \"General information\", \"score\": \"639.8181818181819\"}", - "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.567, mean=0.567, max=0.567, sum=1.135 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5673955392837524\"}", + "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=422.79, mean=422.79, max=422.79, sum=845.58 (2)\", \"tab\": \"General information\", \"score\": \"422.79\"}", + "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"international_law\"", + "subject": "\"us_foreign_policy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_international_law\"" + "groups": "\"mmlu_us_foreign_policy\"" } } }, { - "evaluation_name": "Logical Fallacies", + "evaluation_name": "Astronomy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1185,36 +1093,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Logical Fallacies", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.742, + "score": 0.645, "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=1.485 (2)", + "description": "min=0.645, mean=0.645, max=0.645, sum=1.289 (2)", "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=1.35, mean=1.35, max=1.35, sum=2.7 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.3501118970063566\"}", - "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}", - "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Logical Fallacies - # prompt tokens": "{\"description\": \"min=449.564, mean=449.564, max=449.564, sum=899.129 (2)\", \"tab\": \"General information\", \"score\": \"449.5644171779141\"}", - "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.317, mean=0.317, max=0.317, sum=0.634 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3168644199245854\"}", + "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}", + "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Astronomy - # prompt tokens": "{\"description\": \"min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)\", \"tab\": \"General information\", \"score\": \"579.6842105263158\"}", + "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"logical_fallacies\"", + "subject": "\"astronomy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_logical_fallacies\"" + "groups": "\"mmlu_astronomy\"" } } }, { - "evaluation_name": "Machine Learning", + "evaluation_name": "Business Ethics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1223,36 +1131,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Machine Learning", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.384, + "score": 0.65, "details": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", + "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)", "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.46, mean=0.46, max=0.46, sum=0.919 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.45964209735393524\"}", - "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}", - "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Machine Learning - # prompt tokens": "{\"description\": \"min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)\", \"tab\": \"General information\", \"score\": \"668.0535714285714\"}", - "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.444, mean=0.444, max=0.444, sum=0.888 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44396358251571655\"}", + "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Business Ethics - # prompt tokens": "{\"description\": \"min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)\", \"tab\": \"General information\", \"score\": \"569.52\"}", + "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"machine_learning\"", + "subject": "\"business_ethics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_machine_learning\"" + "groups": "\"mmlu_business_ethics\"" } } }, { - "evaluation_name": "Management", + "evaluation_name": "Clinical Knowledge", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1261,36 +1169,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Management", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.709, + "score": 0.615, "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.417 (2)", + "description": "min=0.615, mean=0.615, max=0.615, sum=1.23 (2)", "tab": "Accuracy", - "Management - Observed inference time (s)": "{\"description\": \"min=0.481, mean=0.481, max=0.481, sum=0.963 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.48132226536574874\"}", - "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}", - "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Management - # prompt tokens": "{\"description\": \"min=283.786, mean=283.786, max=283.786, sum=567.573 (2)\", \"tab\": \"General information\", \"score\": \"283.7864077669903\"}", - "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.369, mean=0.369, max=0.369, sum=0.738 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3692442273193935\"}", + "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}", + "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=397.928, mean=397.928, max=397.928, sum=795.857 (2)\", \"tab\": \"General information\", \"score\": \"397.92830188679244\"}", + "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"management\"", + "subject": "\"clinical_knowledge\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_management\"" + "groups": "\"mmlu_clinical_knowledge\"" } } }, { - "evaluation_name": "Marketing", + "evaluation_name": "Conceptual Physics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1299,36 +1207,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Marketing", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.833, + "score": 0.528, "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", + "description": "min=0.528, mean=0.528, max=0.528, sum=1.055 (2)", "tab": "Accuracy", - "Marketing - Observed inference time (s)": "{\"description\": \"min=0.529, mean=0.529, max=0.529, sum=1.059 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5294545297948723\"}", - "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}", - "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Marketing - # prompt tokens": "{\"description\": \"min=404.218, mean=404.218, max=404.218, sum=808.436 (2)\", \"tab\": \"General information\", \"score\": \"404.21794871794873\"}", - "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.351, mean=0.351, max=0.351, sum=0.701 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.35051030605397326\"}", + "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}", + "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Conceptual Physics - # prompt tokens": "{\"description\": \"min=304.834, mean=304.834, max=304.834, sum=609.668 (2)\", \"tab\": \"General information\", \"score\": \"304.83404255319147\"}", + "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"marketing\"", + "subject": "\"conceptual_physics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_marketing\"" + "groups": "\"mmlu_conceptual_physics\"" } } }, { - "evaluation_name": "Medical Genetics", + "evaluation_name": "Electrical Engineering", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1337,36 +1245,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Medical Genetics", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.66, + "score": 0.441, "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", + "description": "min=0.441, mean=0.441, max=0.441, sum=0.883 (2)", "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.521, mean=0.521, max=0.521, sum=1.041 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.520596706867218\"}", - "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Medical Genetics - # prompt tokens": "{\"description\": \"min=340.99, mean=340.99, max=340.99, sum=681.98 (2)\", \"tab\": \"General information\", \"score\": \"340.99\"}", - "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.35, mean=0.35, max=0.35, sum=0.7 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.34982287637118636\"}", + "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}", + "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Electrical Engineering - # prompt tokens": "{\"description\": \"min=435.607, mean=435.607, max=435.607, sum=871.214 (2)\", \"tab\": \"General information\", \"score\": \"435.60689655172416\"}", + "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"medical_genetics\"", + "subject": "\"electrical_engineering\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_medical_genetics\"" + "groups": "\"mmlu_electrical_engineering\"" } } }, { - "evaluation_name": "Miscellaneous", + "evaluation_name": "Elementary Mathematics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1375,36 +1283,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Miscellaneous", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.653, + "score": 0.429, "details": { - "description": "min=0.653, mean=0.653, max=0.653, sum=1.305 (2)", + "description": "min=0.429, mean=0.429, max=0.429, sum=0.857 (2)", "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.803, mean=0.803, max=0.803, sum=1.606 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8030396217282857\"}", - "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}", - "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Miscellaneous - # prompt tokens": "{\"description\": \"min=299.911, mean=299.911, max=299.911, sum=599.821 (2)\", \"tab\": \"General information\", \"score\": \"299.9106002554278\"}", - "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.4, mean=0.4, max=0.4, sum=0.801 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4003569991500289\"}", + "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}", + "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)\", \"tab\": \"General information\", \"score\": \"531.8544973544973\"}", + "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"miscellaneous\"", + "subject": "\"elementary_mathematics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_miscellaneous\"" + "groups": "\"mmlu_elementary_mathematics\"" } } }, { - "evaluation_name": "Moral Scenarios", + "evaluation_name": "Formal Logic", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1413,42 +1321,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Moral Scenarios", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.368, + "score": 0.444, "details": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.735 (2)", + "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)", "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.657, mean=0.657, max=0.657, sum=1.314 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6570079657383737\"}", - "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.65, mean=0.65, max=0.65, sum=1.299 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.649639103266114\"}", - "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}", - "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Moral Disputes - # prompt tokens": "{\"description\": \"min=476.113, mean=476.113, max=476.113, sum=952.225 (2)\", \"tab\": \"General information\", \"score\": \"476.1127167630058\"}", - "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}", - "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Moral Scenarios - # prompt tokens": "{\"description\": \"min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)\", \"tab\": \"General information\", \"score\": \"656.454748603352\"}", - "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.357, mean=0.357, max=0.357, sum=0.714 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.35707327108534553\"}", + "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}", + "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Formal Logic - # prompt tokens": "{\"description\": \"min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)\", \"tab\": \"General information\", \"score\": \"601.7777777777778\"}", + "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"moral_scenarios\"", + "subject": "\"formal_logic\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_moral_scenarios\"" + "groups": "\"mmlu_formal_logic\"" } } }, { - "evaluation_name": "Nutrition", + "evaluation_name": "High School World History", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1457,36 +1359,114 @@ ] }, "metric_config": { - "evaluation_description": "EM on Nutrition", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.712, + "score": 0.515, "details": { - "description": "min=0.712, mean=0.712, max=0.712, sum=1.425 (2)", + "description": "min=0.515, mean=0.515, max=0.515, sum=1.03 (2)", "tab": "Accuracy", - "Nutrition - Observed inference time (s)": "{\"description\": \"min=1.485, mean=1.485, max=1.485, sum=2.971 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.4853957338270798\"}", - "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}", - "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Nutrition - # prompt tokens": "{\"description\": \"min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)\", \"tab\": \"General information\", \"score\": \"586.8137254901961\"}", - "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.211, mean=0.211, max=0.211, sum=0.423 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.21137587870320967\"}", + "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.211, mean=0.211, max=0.211, sum=0.423 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2113605567387172\"}", + "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.214, mean=0.214, max=0.214, sum=0.428 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2138903546333313\"}", + "High School European History - Observed inference time (s)": "{\"description\": \"min=0.332, mean=0.332, max=0.332, sum=0.664 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.33188523668231384\"}", + "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.218, mean=0.218, max=0.218, sum=0.435 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.21753037818754561\"}", + "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.558, mean=0.558, max=0.558, sum=1.117 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.558492410985917\"}", + "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.703, mean=0.703, max=0.703, sum=1.407 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7033225890917656\"}", + "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.649, mean=0.649, max=0.649, sum=1.299 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6494572189119127\"}", + "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.612, mean=0.612, max=0.612, sum=1.223 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6115654797113242\"}", + "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.564, mean=0.564, max=0.564, sum=1.127 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5636763351642533\"}", + "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.681, mean=0.681, max=0.681, sum=1.363 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6813242522948378\"}", + "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.606, mean=0.606, max=0.606, sum=1.212 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6060926814874014\"}", + "High School US History - Observed inference time (s)": "{\"description\": \"min=1.122, mean=1.122, max=1.122, sum=2.244 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.1218917334780973\"}", + "High School World History - Observed inference time (s)": "{\"description\": \"min=0.538, mean=0.538, max=0.538, sum=1.076 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5378943324592043\"}", + "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}", + "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Biology - # prompt tokens": "{\"description\": \"min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)\", \"tab\": \"General information\", \"score\": \"513.6709677419354\"}", + "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}", + "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Chemistry - # prompt tokens": "{\"description\": \"min=496.704, mean=496.704, max=496.704, sum=993.409 (2)\", \"tab\": \"General information\", \"score\": \"496.70443349753697\"}", + "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Computer Science - # prompt tokens": "{\"description\": \"min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)\", \"tab\": \"General information\", \"score\": \"867.78\"}", + "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}", + "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School European History - # prompt tokens": "{\"description\": \"min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)\", \"tab\": \"General information\", \"score\": \"2797.8848484848486\"}", + "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}", + "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Geography - # prompt tokens": "{\"description\": \"min=372.035, mean=372.035, max=372.035, sum=744.071 (2)\", \"tab\": \"General information\", \"score\": \"372.0353535353535\"}", + "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}", + "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Government And Politics - # prompt tokens": "{\"description\": \"min=465.824, mean=465.824, max=465.824, sum=931.648 (2)\", \"tab\": \"General information\", \"score\": \"465.8238341968912\"}", + "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}", + "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=370.908, mean=370.908, max=370.908, sum=741.815 (2)\", \"tab\": \"General information\", \"score\": \"370.9076923076923\"}", + "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}", + "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Mathematics - # prompt tokens": "{\"description\": \"min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)\", \"tab\": \"General information\", \"score\": \"532.3555555555556\"}", + "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}", + "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Microeconomics - # prompt tokens": "{\"description\": \"min=399.013, mean=399.013, max=399.013, sum=798.025 (2)\", \"tab\": \"General information\", \"score\": \"399.0126050420168\"}", + "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}", + "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Physics - # prompt tokens": "{\"description\": \"min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)\", \"tab\": \"General information\", \"score\": \"560.4569536423841\"}", + "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}", + "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Psychology - # prompt tokens": "{\"description\": \"min=495.242, mean=495.242, max=495.242, sum=990.484 (2)\", \"tab\": \"General information\", \"score\": \"495.2422018348624\"}", + "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}", + "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Statistics - # prompt tokens": "{\"description\": \"min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)\", \"tab\": \"General information\", \"score\": \"795.6388888888889\"}", + "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}", + "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School US History - # prompt tokens": "{\"description\": \"min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)\", \"tab\": \"General information\", \"score\": \"2217.8088235294117\"}", + "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}", + "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School World History - # prompt tokens": "{\"description\": \"min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)\", \"tab\": \"General information\", \"score\": \"1428.1729957805908\"}", + "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"nutrition\"", + "subject": "\"high_school_world_history\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_nutrition\"" + "groups": "\"mmlu_high_school_world_history\"" } } }, { - "evaluation_name": "Prehistory", + "evaluation_name": "Human Sexuality", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1495,36 +1475,42 @@ ] }, "metric_config": { - "evaluation_description": "EM on Prehistory", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.728, + "score": 0.733, "details": { - "description": "min=0.728, mean=0.728, max=0.728, sum=1.457 (2)", + "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)", "tab": "Accuracy", - "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.792, mean=0.792, max=0.792, sum=1.584 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7917959955003526\"}", - "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}", - "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Prehistory - # prompt tokens": "{\"description\": \"min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)\", \"tab\": \"General information\", \"score\": \"514.5277777777778\"}", - "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.685, mean=0.685, max=0.685, sum=1.369 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6845707412257858\"}", + "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=1.227, mean=1.227, max=1.227, sum=2.455 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.2273387745136524\"}", + "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}", + "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Human Aging - # prompt tokens": "{\"description\": \"min=319.888, mean=319.888, max=319.888, sum=639.776 (2)\", \"tab\": \"General information\", \"score\": \"319.88789237668163\"}", + "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}", + "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Human Sexuality - # prompt tokens": "{\"description\": \"min=341.168, mean=341.168, max=341.168, sum=682.336 (2)\", \"tab\": \"General information\", \"score\": \"341.1679389312977\"}", + "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"prehistory\"", + "subject": "\"human_sexuality\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_prehistory\"" + "groups": "\"mmlu_human_sexuality\"" } } }, { - "evaluation_name": "Public Relations", + "evaluation_name": "International Law", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1533,36 +1519,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Public Relations", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.664, + "score": 0.694, "details": { - "description": "min=0.664, mean=0.664, max=0.664, sum=1.327 (2)", + "description": "min=0.694, mean=0.694, max=0.694, sum=1.388 (2)", "tab": "Accuracy", - "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.493, mean=0.493, max=0.493, sum=0.986 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.49318039634011007\"}", - "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}", - "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Public Relations - # prompt tokens": "{\"description\": \"min=405.318, mean=405.318, max=405.318, sum=810.636 (2)\", \"tab\": \"General information\", \"score\": \"405.3181818181818\"}", - "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "International Law - Observed inference time (s)": "{\"description\": \"min=0.684, mean=0.684, max=0.684, sum=1.369 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6842782950598346\"}", + "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}", + "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "International Law - # prompt tokens": "{\"description\": \"min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)\", \"tab\": \"General information\", \"score\": \"639.8181818181819\"}", + "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"public_relations\"", + "subject": "\"international_law\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_public_relations\"" + "groups": "\"mmlu_international_law\"" } } }, { - "evaluation_name": "Security Studies", + "evaluation_name": "Logical Fallacies", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1571,36 +1557,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Security Studies", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.576, + "score": 0.742, "details": { - "description": "min=0.576, mean=0.576, max=0.576, sum=1.151 (2)", + "description": "min=0.742, mean=0.742, max=0.742, sum=1.485 (2)", "tab": "Accuracy", - "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.656, mean=0.656, max=0.656, sum=1.312 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6561975401275012\"}", - "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}", - "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Security Studies - # prompt tokens": "{\"description\": \"min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)\", \"tab\": \"General information\", \"score\": \"1164.4734693877551\"}", - "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=1.35, mean=1.35, max=1.35, sum=2.7 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.3501118970063566\"}", + "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}", + "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Logical Fallacies - # prompt tokens": "{\"description\": \"min=449.564, mean=449.564, max=449.564, sum=899.129 (2)\", \"tab\": \"General information\", \"score\": \"449.5644171779141\"}", + "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"security_studies\"", + "subject": "\"logical_fallacies\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_security_studies\"" + "groups": "\"mmlu_logical_fallacies\"" } } }, { - "evaluation_name": "Sociology", + "evaluation_name": "Machine Learning", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1609,36 +1595,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Sociology", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.701, + "score": 0.384, "details": { - "description": "min=0.701, mean=0.701, max=0.701, sum=1.403 (2)", + "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", "tab": "Accuracy", - "Sociology - Observed inference time (s)": "{\"description\": \"min=0.517, mean=0.517, max=0.517, sum=1.034 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5170851643405744\"}", - "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}", - "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Sociology - # prompt tokens": "{\"description\": \"min=445.517, mean=445.517, max=445.517, sum=891.035 (2)\", \"tab\": \"General information\", \"score\": \"445.51741293532336\"}", - "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.46, mean=0.46, max=0.46, sum=0.919 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.45964209735393524\"}", + "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}", + "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Machine Learning - # prompt tokens": "{\"description\": \"min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)\", \"tab\": \"General information\", \"score\": \"668.0535714285714\"}", + "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"sociology\"", + "subject": "\"machine_learning\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_sociology\"" + "groups": "\"mmlu_machine_learning\"" } } }, { - "evaluation_name": "Virology", + "evaluation_name": "Management", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1647,36 +1633,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Virology", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.446, + "score": 0.709, "details": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.892 (2)", + "description": "min=0.709, mean=0.709, max=0.709, sum=1.417 (2)", "tab": "Accuracy", - "Virology - Observed inference time (s)": "{\"description\": \"min=0.406, mean=0.406, max=0.406, sum=0.813 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40646702553852493\"}", - "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}", - "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Virology - # prompt tokens": "{\"description\": \"min=343.018, mean=343.018, max=343.018, sum=686.036 (2)\", \"tab\": \"General information\", \"score\": \"343.01807228915663\"}", - "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Management - Observed inference time (s)": "{\"description\": \"min=0.481, mean=0.481, max=0.481, sum=0.963 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.48132226536574874\"}", + "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}", + "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Management - # prompt tokens": "{\"description\": \"min=283.786, mean=283.786, max=283.786, sum=567.573 (2)\", \"tab\": \"General information\", \"score\": \"283.7864077669903\"}", + "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"virology\"", + "subject": "\"management\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_virology\"" + "groups": "\"mmlu_management\"" } } }, { - "evaluation_name": "World Religions", + "evaluation_name": "Marketing", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1685,36 +1671,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on World Religions", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.789, + "score": 0.833, "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)", + "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", "tab": "Accuracy", - "World Religions - Observed inference time (s)": "{\"description\": \"min=0.587, mean=0.587, max=0.587, sum=1.173 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5866640882882458\"}", - "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}", - "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "World Religions - # prompt tokens": "{\"description\": \"min=274.52, mean=274.52, max=274.52, sum=549.041 (2)\", \"tab\": \"General information\", \"score\": \"274.5204678362573\"}", - "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Marketing - Observed inference time (s)": "{\"description\": \"min=0.529, mean=0.529, max=0.529, sum=1.059 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5294545297948723\"}", + "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}", + "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Marketing - # prompt tokens": "{\"description\": \"min=404.218, mean=404.218, max=404.218, sum=808.436 (2)\", \"tab\": \"General information\", \"score\": \"404.21794871794873\"}", + "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"world_religions\"", + "subject": "\"marketing\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_world_religions\"" + "groups": "\"mmlu_marketing\"" } } }, { - "evaluation_name": "Mean win rate", + "evaluation_name": "Medical Genetics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1723,404 +1709,418 @@ ] }, "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.475, + "score": 0.66, "details": { - "description": "", - "tab": "Efficiency" + "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", + "tab": "Accuracy", + "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.521, mean=0.521, max=0.521, sum=1.041 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.520596706867218\"}", + "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Medical Genetics - # prompt tokens": "{\"description\": \"min=340.99, mean=340.99, max=340.99, sum=681.98 (2)\", \"tab\": \"General information\", \"score\": \"340.99\"}", + "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"medical_genetics\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_medical_genetics\"" + } } - } - ], - "detailed_evaluation_results": null, - "generation_config": { - "additional_details": { - "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" - } - } - }, - { - "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1774096306.427425", - "retrieved_timestamp": "1774096306.427425", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "helm", - "version": "unknown" - }, - "benchmark": "helm_lite", - "evaluation_results": [ + }, { - "evaluation_name": "Mean win rate", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "helm_lite", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.303, + "score": 0.653, "details": { - "description": "", + "description": "min=0.653, mean=0.653, max=0.653, sum=1.305 (2)", "tab": "Accuracy", - "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.5896504369538077\"}", - "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}" + "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.803, mean=0.803, max=0.803, sum=1.606 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8030396217282857\"}", + "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}", + "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Miscellaneous - # prompt tokens": "{\"description\": \"min=299.911, mean=299.911, max=299.911, sum=599.821 (2)\", \"tab\": \"General information\", \"score\": \"299.9106002554278\"}", + "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"miscellaneous\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_miscellaneous\"" + } } }, { - "evaluation_name": "NarrativeQA", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "NarrativeQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "F1 on NarrativeQA", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.756, + "score": 0.368, "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)", + "description": "min=0.368, mean=0.368, max=0.368, sum=0.735 (2)", "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.581, mean=0.581, max=0.581, sum=0.581 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5813529316808136\"}", - "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", - "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NarrativeQA - # prompt tokens": "{\"description\": \"min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)\", \"tab\": \"General information\", \"score\": \"3484.2676056338028\"}", - "NarrativeQA - # output tokens": "{\"description\": \"min=7.287, mean=7.287, max=7.287, sum=7.287 (1)\", \"tab\": \"General information\", \"score\": \"7.2873239436619714\"}" + "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.657, mean=0.657, max=0.657, sum=1.314 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6570079657383737\"}", + "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.65, mean=0.65, max=0.65, sum=1.299 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.649639103266114\"}", + "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}", + "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Moral Disputes - # prompt tokens": "{\"description\": \"min=476.113, mean=476.113, max=476.113, sum=952.225 (2)\", \"tab\": \"General information\", \"score\": \"476.1127167630058\"}", + "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}", + "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Moral Scenarios - # prompt tokens": "{\"description\": \"min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)\", \"tab\": \"General information\", \"score\": \"656.454748603352\"}", + "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"moral_scenarios\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_moral_scenarios\"" + } } }, { - "evaluation_name": "NaturalQuestions (closed-book)", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.209, + "score": 0.712, "details": { - "description": "min=0.209, mean=0.209, max=0.209, sum=0.209 (1)", + "description": "min=0.712, mean=0.712, max=0.712, sum=1.425 (2)", "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.544, mean=0.544, max=0.544, sum=0.544 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5441543731689453\"}", - "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.752, mean=0.752, max=0.752, sum=0.752 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.751717613697052\"}", - "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "NaturalQuestions (open-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1716.78, mean=1716.78, max=1716.78, sum=1716.78 (1)\", \"tab\": \"General information\", \"score\": \"1716.78\"}", - "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=8.736, mean=8.736, max=8.736, sum=8.736 (1)\", \"tab\": \"General information\", \"score\": \"8.736\"}", - "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=129.12, mean=129.12, max=129.12, sum=129.12 (1)\", \"tab\": \"General information\", \"score\": \"129.12\"}", - "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=11.732, mean=11.732, max=11.732, sum=11.732 (1)\", \"tab\": \"General information\", \"score\": \"11.732\"}" + "Nutrition - Observed inference time (s)": "{\"description\": \"min=1.485, mean=1.485, max=1.485, sum=2.971 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.4853957338270798\"}", + "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}", + "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Nutrition - # prompt tokens": "{\"description\": \"min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)\", \"tab\": \"General information\", \"score\": \"586.8137254901961\"}", + "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "mode": "\"closedbook\"" + "subject": "\"nutrition\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_nutrition\"" } } }, { - "evaluation_name": "OpenbookQA", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "OpenbookQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on OpenbookQA", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.74, + "score": 0.728, "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=0.74 (1)", + "description": "min=0.728, mean=0.728, max=0.728, sum=1.457 (2)", "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=2.937, mean=2.937, max=2.937, sum=2.937 (1)\", \"tab\": \"Efficiency\", \"score\": \"2.9374450149536133\"}", - "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}", - "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "OpenbookQA - # prompt tokens": "{\"description\": \"min=249.776, mean=249.776, max=249.776, sum=249.776 (1)\", \"tab\": \"General information\", \"score\": \"249.776\"}", - "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.792, mean=0.792, max=0.792, sum=1.584 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7917959955003526\"}", + "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}", + "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Prehistory - # prompt tokens": "{\"description\": \"min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)\", \"tab\": \"General information\", \"score\": \"514.5277777777778\"}", + "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "dataset": "\"openbookqa\"", - "method": "\"multiple_choice_joint\"" + "subject": "\"prehistory\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_prehistory\"" } } }, { - "evaluation_name": "MMLU", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "MMLU", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on MMLU", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5, + "score": 0.664, "details": { - "description": "min=0.26, mean=0.5, max=0.79, sum=2.501 (5)", + "description": "min=0.664, mean=0.664, max=0.664, sum=1.327 (2)", "tab": "Accuracy", - "MMLU - Observed inference time (s)": "{\"description\": \"min=0.284, mean=0.417, max=0.567, sum=2.086 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.41729471965421716\"}", - "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", - "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MMLU - # prompt tokens": "{\"description\": \"min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)\", \"tab\": \"General information\", \"score\": \"467.6862105263158\"}", - "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.493, mean=0.493, max=0.493, sum=0.986 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.49318039634011007\"}", + "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}", + "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Public Relations - # prompt tokens": "{\"description\": \"min=405.318, mean=405.318, max=405.318, sum=810.636 (2)\", \"tab\": \"General information\", \"score\": \"405.3181818181818\"}", + "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]", - "method": "\"multiple_choice_joint\"" + "subject": "\"public_relations\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_public_relations\"" } } }, { - "evaluation_name": "MATH", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "MATH", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.703, + "score": 0.576, "details": { - "description": "min=0.509, mean=0.703, max=0.849, sum=4.92 (7)", + "description": "min=0.576, mean=0.576, max=0.576, sum=1.151 (2)", "tab": "Accuracy", - "MATH - Observed inference time (s)": "{\"description\": \"min=1.617, mean=1.927, max=2.175, sum=13.492 (7)\", \"tab\": \"Efficiency\", \"score\": \"1.9274194573191807\"}", - "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}", - "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}", - "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MATH - # prompt tokens": "{\"description\": \"min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)\", \"tab\": \"General information\", \"score\": \"1262.9092130545007\"}", - "MATH - # output tokens": "{\"description\": \"min=203.384, mean=253.982, max=288.596, sum=1777.872 (7)\", \"tab\": \"General information\", \"score\": \"253.98170179473732\"}" + "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.656, mean=0.656, max=0.656, sum=1.312 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6561975401275012\"}", + "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}", + "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Security Studies - # prompt tokens": "{\"description\": \"min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)\", \"tab\": \"General information\", \"score\": \"1164.4734693877551\"}", + "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]", - "level": "\"1\"", - "use_official_examples": "\"False\"", - "use_chain_of_thought": "\"True\"" + "subject": "\"security_studies\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_security_studies\"" } } }, { - "evaluation_name": "GSM8K", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "GSM8K", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on GSM8K", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.798, + "score": 0.701, "details": { - "description": "min=0.798, mean=0.798, max=0.798, sum=0.798 (1)", + "description": "min=0.701, mean=0.701, max=0.701, sum=1.403 (2)", "tab": "Accuracy", - "GSM8K - Observed inference time (s)": "{\"description\": \"min=2.109, mean=2.109, max=2.109, sum=2.109 (1)\", \"tab\": \"Efficiency\", \"score\": \"2.108796592712402\"}", - "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "GSM8K - # prompt tokens": "{\"description\": \"min=959.032, mean=959.032, max=959.032, sum=959.032 (1)\", \"tab\": \"General information\", \"score\": \"959.032\"}", - "GSM8K - # output tokens": "{\"description\": \"min=150.02, mean=150.02, max=150.02, sum=150.02 (1)\", \"tab\": \"General information\", \"score\": \"150.02\"}" + "Sociology - Observed inference time (s)": "{\"description\": \"min=0.517, mean=0.517, max=0.517, sum=1.034 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5170851643405744\"}", + "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}", + "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Sociology - # prompt tokens": "{\"description\": \"min=445.517, mean=445.517, max=445.517, sum=891.035 (2)\", \"tab\": \"General information\", \"score\": \"445.51741293532336\"}", + "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "stop": "\"none\"" + "subject": "\"sociology\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_sociology\"" } } }, { - "evaluation_name": "LegalBench", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "LegalBench", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on LegalBench", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.342, + "score": 0.446, "details": { - "description": "min=0, mean=0.342, max=0.8, sum=1.71 (5)", + "description": "min=0.446, mean=0.446, max=0.446, sum=0.892 (2)", "tab": "Accuracy", - "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.409, mean=0.481, max=0.626, sum=2.407 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.4814103188942614\"}", - "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}", - "LegalBench - # train": "{\"description\": \"min=4, mean=4.8, max=5, sum=24 (5)\", \"tab\": \"General information\", \"score\": \"4.8\"}", - "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "LegalBench - # prompt tokens": "{\"description\": \"min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)\", \"tab\": \"General information\", \"score\": \"1513.8824197238912\"}", - "LegalBench - # output tokens": "{\"description\": \"min=2.032, mean=6.824, max=10.886, sum=34.118 (5)\", \"tab\": \"General information\", \"score\": \"6.823557876005701\"}" + "Virology - Observed inference time (s)": "{\"description\": \"min=0.406, mean=0.406, max=0.406, sum=0.813 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40646702553852493\"}", + "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}", + "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Virology - # prompt tokens": "{\"description\": \"min=343.018, mean=343.018, max=343.018, sum=686.036 (2)\", \"tab\": \"General information\", \"score\": \"343.01807228915663\"}", + "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]" + "subject": "\"virology\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_virology\"" } } }, { - "evaluation_name": "MedQA", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "MedQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on MedQA", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.245, + "score": 0.789, "details": { - "description": "min=0.245, mean=0.245, max=0.245, sum=0.245 (1)", + "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)", "tab": "Accuracy", - "MedQA - Observed inference time (s)": "{\"description\": \"min=0.743, mean=0.743, max=0.743, sum=0.743 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.742541556803891\"}", - "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}", - "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MedQA - # prompt tokens": "{\"description\": \"min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)\", \"tab\": \"General information\", \"score\": \"1025.2743538767395\"}", - "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "World Religions - Observed inference time (s)": "{\"description\": \"min=0.587, mean=0.587, max=0.587, sum=1.173 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5866640882882458\"}", + "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}", + "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "World Religions - # prompt tokens": "{\"description\": \"min=274.52, mean=274.52, max=274.52, sum=549.041 (2)\", \"tab\": \"General information\", \"score\": \"274.5204678362573\"}", + "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"world_religions\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_world_religions\"" + } } }, { - "evaluation_name": "WMT 2014", + "evaluation_name": "Mean win rate", "source_data": { - "dataset_name": "WMT 2014", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", + "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.181, + "score": 0.475, "details": { - "description": "min=0.132, mean=0.181, max=0.219, sum=0.907 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=0.439, mean=0.565, max=0.727, sum=2.826 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.5651802479746801\"}", - "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}", - "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "WMT 2014 - # prompt tokens": "{\"description\": \"min=101.139, mean=120.712, max=141.117, sum=603.559 (5)\", \"tab\": \"General information\", \"score\": \"120.71178123566294\"}", - "WMT 2014 - # output tokens": "{\"description\": \"min=24.354, mean=25.779, max=26.833, sum=128.893 (5)\", \"tab\": \"General information\", \"score\": \"25.778561802263347\"}" + "description": "", + "tab": "Efficiency" } }, "generation_config": { - "additional_details": { - "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]" - } + "additional_details": {} } } ], "detailed_evaluation_results": null, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" + } } } ] diff --git a/data/models/meta_llama-3.2-90b-vision-instruct-turbo.json b/data/models/meta_llama-3.2-90b-vision-instruct-turbo.json index 8db40ff310b6015e156a3e086e416f8136fe7cde..d30ec26b3cfffb3a8ce0593911166de1a589585a 100644 --- a/data/models/meta_llama-3.2-90b-vision-instruct-turbo.json +++ b/data/models/meta_llama-3.2-90b-vision-instruct-turbo.json @@ -7,10 +7,10 @@ }, "evaluations": [ { - "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1774096312.00548", - "retrieved_timestamp": "1774096312.00548", + "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1774096306.427425", + "retrieved_timestamp": "1774096306.427425", "source_metadata": { - "source_name": "helm_mmlu", + "source_name": "helm_lite", "source_type": "documentation", "source_organization_name": "crfm", "evaluator_relationship": "third_party" @@ -19,438 +19,382 @@ "name": "helm", "version": "unknown" }, - "benchmark": "helm_mmlu", + "benchmark": "helm_lite", "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects", + "evaluation_name": "Mean win rate", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "helm_lite", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", + "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.803, + "score": 0.819, "details": { - "description": "min=0.407, mean=0.803, max=0.979, sum=91.503 (114)", + "description": "", "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.256, mean=0.374, max=2.612, sum=42.58 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.37350966276831277\"}", - "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}", - "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)\", \"tab\": \"General information\", \"score\": \"614.6193817308517\"}", - "MMLU All Subjects - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.5839825218476904\"}", + "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}" } }, "generation_config": { - "additional_details": { - "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" - } + "additional_details": {} } }, { - "evaluation_name": "Abstract Algebra", + "evaluation_name": "NarrativeQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "NarrativeQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Abstract Algebra", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.52, + "score": 0.777, "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", + "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=2.612, mean=2.612, max=2.612, sum=5.224 (2)\", \"tab\": \"Efficiency\", \"score\": \"2.611864836215973\"}", - "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Abstract Algebra - # prompt tokens": "{\"description\": \"min=373.43, mean=373.43, max=373.43, sum=746.86 (2)\", \"tab\": \"General information\", \"score\": \"373.43\"}", - "Abstract Algebra - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.83, mean=0.83, max=0.83, sum=0.83 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.8297326531208736\"}", + "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", + "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NarrativeQA - # prompt tokens": "{\"description\": \"min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)\", \"tab\": \"General information\", \"score\": \"3484.2676056338028\"}", + "NarrativeQA - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { - "additional_details": { - "subject": "\"abstract_algebra\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_abstract_algebra\"" - } + "additional_details": {} } }, { - "evaluation_name": "Anatomy", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Anatomy", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8, + "score": 0.457, "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", + "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)", "tab": "Accuracy", - "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.336, mean=0.336, max=0.336, sum=0.672 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3359027315069128\"}", - "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}", - "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Anatomy - # prompt tokens": "{\"description\": \"min=353.874, mean=353.874, max=353.874, sum=707.748 (2)\", \"tab\": \"General information\", \"score\": \"353.8740740740741\"}", - "Anatomy - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=1.111, mean=1.111, max=1.111, sum=1.111 (1)\", \"tab\": \"Efficiency\", \"score\": \"1.110703297138214\"}", + "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.422, mean=0.422, max=0.422, sum=0.422 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4218848171234131\"}", + "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (open-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1716.785, mean=1716.785, max=1716.785, sum=1716.785 (1)\", \"tab\": \"General information\", \"score\": \"1716.785\"}", + "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=129.12, mean=129.12, max=129.12, sum=129.12 (1)\", \"tab\": \"General information\", \"score\": \"129.12\"}", + "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"anatomy\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_anatomy\"" + "mode": "\"closedbook\"" } } }, { - "evaluation_name": "College Physics", + "evaluation_name": "OpenbookQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "OpenbookQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on College Physics", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.539, + "score": 0.942, "details": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.078 (2)", + "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)", "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.31, mean=0.31, max=0.31, sum=0.621 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3104448890686035\"}", - "College Biology - Observed inference time (s)": "{\"description\": \"min=0.272, mean=0.272, max=0.272, sum=0.544 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2720499005582597\"}", - "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.642 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32119542360305786\"}", - "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.315, mean=0.315, max=0.315, sum=0.63 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.31477957487106323\"}", - "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.283, mean=0.283, max=0.283, sum=0.566 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.28313319255850905\"}", - "College Physics - Observed inference time (s)": "{\"description\": \"min=0.317, mean=0.317, max=0.317, sum=0.634 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.31692570097306194\"}", - "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Chemistry - # prompt tokens": "{\"description\": \"min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)\", \"tab\": \"General information\", \"score\": \"549.28\"}", - "College Chemistry - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}", - "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Biology - # prompt tokens": "{\"description\": \"min=473.875, mean=473.875, max=473.875, sum=947.75 (2)\", \"tab\": \"General information\", \"score\": \"473.875\"}", - "College Biology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Computer Science - # prompt tokens": "{\"description\": \"min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)\", \"tab\": \"General information\", \"score\": \"828.29\"}", - "College Computer Science - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Mathematics - # prompt tokens": "{\"description\": \"min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)\", \"tab\": \"General information\", \"score\": \"594.51\"}", - "College Mathematics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}", - "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Medicine - # prompt tokens": "{\"description\": \"min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)\", \"tab\": \"General information\", \"score\": \"502.70520231213874\"}", - "College Medicine - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}", - "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Physics - # prompt tokens": "{\"description\": \"min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)\", \"tab\": \"General information\", \"score\": \"503.5686274509804\"}", - "College Physics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.285, mean=0.285, max=0.285, sum=0.285 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.28476666021347047\"}", + "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}", + "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "OpenbookQA - # prompt tokens": "{\"description\": \"min=249.776, mean=249.776, max=249.776, sum=249.776 (1)\", \"tab\": \"General information\", \"score\": \"249.776\"}", + "OpenbookQA - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"college_physics\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_college_physics\"" + "dataset": "\"openbookqa\"", + "method": "\"multiple_choice_joint\"" } } }, { - "evaluation_name": "Computer Security", + "evaluation_name": "MMLU", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MMLU", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Computer Security", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.81, + "score": 0.703, "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", + "description": "min=0.52, mean=0.703, max=0.93, sum=3.514 (5)", "tab": "Accuracy", - "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.266, mean=0.266, max=0.266, sum=0.532 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.26576273441314696\"}", - "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Computer Security - # prompt tokens": "{\"description\": \"min=378.51, mean=378.51, max=378.51, sum=757.02 (2)\", \"tab\": \"General information\", \"score\": \"378.51\"}", - "Computer Security - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "MMLU - Observed inference time (s)": "{\"description\": \"min=0.266, mean=0.798, max=2.612, sum=3.992 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.7984467656654225\"}", + "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", + "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU - # prompt tokens": "{\"description\": \"min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)\", \"tab\": \"General information\", \"score\": \"467.6862105263158\"}", + "MMLU - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"computer_security\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_computer_security\"" + "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]", + "method": "\"multiple_choice_joint\"" } } }, { - "evaluation_name": "Econometrics", + "evaluation_name": "MATH", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MATH", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Econometrics", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.684, + "score": 0.791, "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.368 (2)", + "description": "min=0.579, mean=0.791, max=0.978, sum=5.54 (7)", "tab": "Accuracy", - "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.297, mean=0.297, max=0.297, sum=0.595 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2972530210227297\"}", - "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}", - "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Econometrics - # prompt tokens": "{\"description\": \"min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)\", \"tab\": \"General information\", \"score\": \"614.421052631579\"}", - "Econometrics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "MATH - Observed inference time (s)": "{\"description\": \"min=4.64, mean=5.739, max=6.652, sum=40.174 (7)\", \"tab\": \"Efficiency\", \"score\": \"5.739186799526185\"}", + "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}", + "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}", + "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MATH - # prompt tokens": "{\"description\": \"min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)\", \"tab\": \"General information\", \"score\": \"1262.9092130545007\"}", + "MATH - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"econometrics\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_econometrics\"" + "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]", + "level": "\"1\"", + "use_official_examples": "\"False\"", + "use_chain_of_thought": "\"True\"" } } }, { - "evaluation_name": "Global Facts", + "evaluation_name": "GSM8K", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "GSM8K", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Global Facts", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6, + "score": 0.936, "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", + "description": "min=0.936, mean=0.936, max=0.936, sum=0.936 (1)", "tab": "Accuracy", - "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.267, mean=0.267, max=0.267, sum=0.533 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2666162133216858\"}", - "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Global Facts - # prompt tokens": "{\"description\": \"min=399.71, mean=399.71, max=399.71, sum=799.42 (2)\", \"tab\": \"General information\", \"score\": \"399.71\"}", - "Global Facts - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "GSM8K - Observed inference time (s)": "{\"description\": \"min=2.889, mean=2.889, max=2.889, sum=2.889 (1)\", \"tab\": \"Efficiency\", \"score\": \"2.8894128675460817\"}", + "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "GSM8K - # prompt tokens": "{\"description\": \"min=959.032, mean=959.032, max=959.032, sum=959.032 (1)\", \"tab\": \"General information\", \"score\": \"959.032\"}", + "GSM8K - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"global_facts\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_global_facts\"" + "stop": "\"none\"" } } }, { - "evaluation_name": "Jurisprudence", + "evaluation_name": "LegalBench", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "LegalBench", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Jurisprudence", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.88, + "score": 0.68, "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", + "description": "min=0.438, mean=0.68, max=0.989, sum=3.398 (5)", "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.279, mean=0.279, max=0.279, sum=0.558 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.278864703796528\"}", - "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}", - "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Jurisprudence - # prompt tokens": "{\"description\": \"min=394.63, mean=394.63, max=394.63, sum=789.259 (2)\", \"tab\": \"General information\", \"score\": \"394.6296296296296\"}", - "Jurisprudence - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.284, mean=0.478, max=1.152, sum=2.389 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.47773526830658064\"}", + "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}", + "LegalBench - # train": "{\"description\": \"min=4, mean=4.8, max=5, sum=24 (5)\", \"tab\": \"General information\", \"score\": \"4.8\"}", + "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "LegalBench - # prompt tokens": "{\"description\": \"min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)\", \"tab\": \"General information\", \"score\": \"1513.8824197238912\"}", + "LegalBench - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"jurisprudence\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_jurisprudence\"" + "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]" } } }, { - "evaluation_name": "Philosophy", + "evaluation_name": "MedQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MedQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Philosophy", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.839, + "score": 0.769, "details": { - "description": "min=0.839, mean=0.839, max=0.839, sum=1.678 (2)", + "description": "min=0.769, mean=0.769, max=0.769, sum=0.769 (1)", "tab": "Accuracy", - "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.297, mean=0.297, max=0.297, sum=0.594 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29689135582117404\"}", - "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}", - "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Philosophy - # prompt tokens": "{\"description\": \"min=329.084, mean=329.084, max=329.084, sum=658.167 (2)\", \"tab\": \"General information\", \"score\": \"329.08360128617363\"}", - "Philosophy - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "MedQA - Observed inference time (s)": "{\"description\": \"min=0.318, mean=0.318, max=0.318, sum=0.318 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.3180293652930743\"}", + "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}", + "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MedQA - # prompt tokens": "{\"description\": \"min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)\", \"tab\": \"General information\", \"score\": \"1025.2743538767395\"}", + "MedQA - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { - "additional_details": { - "subject": "\"philosophy\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_philosophy\"" - } + "additional_details": {} } }, { - "evaluation_name": "Professional Psychology", + "evaluation_name": "WMT 2014", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "WMT 2014", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Professional Psychology", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.843, + "score": 0.224, "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)", + "description": "min=0.182, mean=0.224, max=0.266, sum=1.121 (5)", "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.553, mean=0.553, max=0.553, sum=1.106 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5529017465956071\"}", - "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.323, mean=0.323, max=0.323, sum=0.647 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32346555189038\"}", - "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.372, mean=0.372, max=0.372, sum=0.743 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3715069820859131\"}", - "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.315, mean=0.315, max=0.315, sum=0.63 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3151663907992294\"}", - "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}", - "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Medicine - # prompt tokens": "{\"description\": \"min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)\", \"tab\": \"General information\", \"score\": \"1094.4889705882354\"}", - "Professional Medicine - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}", - "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Accounting - # prompt tokens": "{\"description\": \"min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)\", \"tab\": \"General information\", \"score\": \"658.5851063829788\"}", - "Professional Accounting - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}", - "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Law - # prompt tokens": "{\"description\": \"min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)\", \"tab\": \"General information\", \"score\": \"1637.6010430247718\"}", - "Professional Law - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}", - "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Psychology - # prompt tokens": "{\"description\": \"min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)\", \"tab\": \"General information\", \"score\": \"575.0980392156863\"}", - "Professional Psychology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=0.737, mean=0.816, max=0.848, sum=4.078 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.8156762526912515\"}", + "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}", + "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "WMT 2014 - # prompt tokens": "{\"description\": \"min=101.139, mean=120.868, max=141.33, sum=604.34 (5)\", \"tab\": \"General information\", \"score\": \"120.86804366111025\"}", + "WMT 2014 - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"professional_psychology\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_professional_psychology\"" + "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]" } } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1774096312.00548", + "retrieved_timestamp": "1774096312.00548", + "source_metadata": { + "source_name": "helm_mmlu", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "helm", + "version": "unknown" + }, + "benchmark": "helm_mmlu", + "evaluation_results": [ { - "evaluation_name": "Us Foreign Policy", + "evaluation_name": "MMLU All Subjects", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -459,36 +403,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.93, + "score": 0.803, "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", + "description": "min=0.407, mean=0.803, max=0.979, sum=91.503 (114)", "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.507, mean=0.507, max=0.507, sum=1.014 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5069083476066589\"}", - "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=422.79, mean=422.79, max=422.79, sum=845.58 (2)\", \"tab\": \"General information\", \"score\": \"422.79\"}", - "Us Foreign Policy - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.256, mean=0.374, max=2.612, sum=42.58 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.37350966276831277\"}", + "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}", + "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)\", \"tab\": \"General information\", \"score\": \"614.6193817308517\"}", + "MMLU All Subjects - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"us_foreign_policy\"", + "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_us_foreign_policy\"" + "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" } } }, { - "evaluation_name": "Astronomy", + "evaluation_name": "Abstract Algebra", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -497,36 +441,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Astronomy", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.921, + "score": 0.52, "details": { - "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)", + "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", "tab": "Accuracy", - "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.332, mean=0.332, max=0.332, sum=0.665 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3323579352152975\"}", - "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}", - "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Astronomy - # prompt tokens": "{\"description\": \"min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)\", \"tab\": \"General information\", \"score\": \"579.6842105263158\"}", - "Astronomy - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=2.612, mean=2.612, max=2.612, sum=5.224 (2)\", \"tab\": \"Efficiency\", \"score\": \"2.611864836215973\"}", + "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Abstract Algebra - # prompt tokens": "{\"description\": \"min=373.43, mean=373.43, max=373.43, sum=746.86 (2)\", \"tab\": \"General information\", \"score\": \"373.43\"}", + "Abstract Algebra - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"astronomy\"", + "subject": "\"abstract_algebra\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_astronomy\"" + "groups": "\"mmlu_abstract_algebra\"" } } }, { - "evaluation_name": "Business Ethics", + "evaluation_name": "Anatomy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -535,36 +479,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Business Ethics", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.76, + "score": 0.8, "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", + "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.291, mean=0.291, max=0.291, sum=0.581 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29072295665740966\"}", - "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Business Ethics - # prompt tokens": "{\"description\": \"min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)\", \"tab\": \"General information\", \"score\": \"569.52\"}", - "Business Ethics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.336, mean=0.336, max=0.336, sum=0.672 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3359027315069128\"}", + "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}", + "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Anatomy - # prompt tokens": "{\"description\": \"min=353.874, mean=353.874, max=353.874, sum=707.748 (2)\", \"tab\": \"General information\", \"score\": \"353.8740740740741\"}", + "Anatomy - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"business_ethics\"", + "subject": "\"anatomy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_business_ethics\"" + "groups": "\"mmlu_anatomy\"" } } }, { - "evaluation_name": "Clinical Knowledge", + "evaluation_name": "College Physics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -573,36 +517,66 @@ ] }, "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.845, + "score": 0.539, "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)", + "description": "min=0.539, mean=0.539, max=0.539, sum=1.078 (2)", "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.29, mean=0.29, max=0.29, sum=0.579 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2897273891376999\"}", - "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}", - "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=397.928, mean=397.928, max=397.928, sum=795.857 (2)\", \"tab\": \"General information\", \"score\": \"397.92830188679244\"}", - "Clinical Knowledge - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.31, mean=0.31, max=0.31, sum=0.621 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3104448890686035\"}", + "College Biology - Observed inference time (s)": "{\"description\": \"min=0.272, mean=0.272, max=0.272, sum=0.544 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2720499005582597\"}", + "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.642 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32119542360305786\"}", + "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.315, mean=0.315, max=0.315, sum=0.63 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.31477957487106323\"}", + "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.283, mean=0.283, max=0.283, sum=0.566 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.28313319255850905\"}", + "College Physics - Observed inference time (s)": "{\"description\": \"min=0.317, mean=0.317, max=0.317, sum=0.634 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.31692570097306194\"}", + "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Chemistry - # prompt tokens": "{\"description\": \"min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)\", \"tab\": \"General information\", \"score\": \"549.28\"}", + "College Chemistry - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}", + "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Biology - # prompt tokens": "{\"description\": \"min=473.875, mean=473.875, max=473.875, sum=947.75 (2)\", \"tab\": \"General information\", \"score\": \"473.875\"}", + "College Biology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Computer Science - # prompt tokens": "{\"description\": \"min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)\", \"tab\": \"General information\", \"score\": \"828.29\"}", + "College Computer Science - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Mathematics - # prompt tokens": "{\"description\": \"min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)\", \"tab\": \"General information\", \"score\": \"594.51\"}", + "College Mathematics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}", + "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Medicine - # prompt tokens": "{\"description\": \"min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)\", \"tab\": \"General information\", \"score\": \"502.70520231213874\"}", + "College Medicine - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}", + "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Physics - # prompt tokens": "{\"description\": \"min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)\", \"tab\": \"General information\", \"score\": \"503.5686274509804\"}", + "College Physics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"clinical_knowledge\"", + "subject": "\"college_physics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_clinical_knowledge\"" + "groups": "\"mmlu_college_physics\"" } } }, { - "evaluation_name": "Conceptual Physics", + "evaluation_name": "Computer Security", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -611,36 +585,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Conceptual Physics", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.826, + "score": 0.81, "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.651 (2)", + "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.279, mean=0.279, max=0.279, sum=0.559 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2794749209221373\"}", - "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}", - "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Conceptual Physics - # prompt tokens": "{\"description\": \"min=304.834, mean=304.834, max=304.834, sum=609.668 (2)\", \"tab\": \"General information\", \"score\": \"304.83404255319147\"}", - "Conceptual Physics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.266, mean=0.266, max=0.266, sum=0.532 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.26576273441314696\"}", + "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Computer Security - # prompt tokens": "{\"description\": \"min=378.51, mean=378.51, max=378.51, sum=757.02 (2)\", \"tab\": \"General information\", \"score\": \"378.51\"}", + "Computer Security - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"conceptual_physics\"", + "subject": "\"computer_security\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_conceptual_physics\"" + "groups": "\"mmlu_computer_security\"" } } }, { - "evaluation_name": "Electrical Engineering", + "evaluation_name": "Econometrics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -649,36 +623,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Electrical Engineering", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.759, + "score": 0.684, "details": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.517 (2)", + "description": "min=0.684, mean=0.684, max=0.684, sum=1.368 (2)", "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.256, mean=0.256, max=0.256, sum=0.512 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2558267790695717\"}", - "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}", - "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Electrical Engineering - # prompt tokens": "{\"description\": \"min=435.607, mean=435.607, max=435.607, sum=871.214 (2)\", \"tab\": \"General information\", \"score\": \"435.60689655172416\"}", - "Electrical Engineering - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.297, mean=0.297, max=0.297, sum=0.595 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2972530210227297\"}", + "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}", + "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Econometrics - # prompt tokens": "{\"description\": \"min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)\", \"tab\": \"General information\", \"score\": \"614.421052631579\"}", + "Econometrics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"electrical_engineering\"", + "subject": "\"econometrics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_electrical_engineering\"" + "groups": "\"mmlu_econometrics\"" } } }, { - "evaluation_name": "Elementary Mathematics", + "evaluation_name": "Global Facts", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -687,36 +661,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.688, + "score": 0.6, "details": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.376 (2)", + "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.308, mean=0.308, max=0.308, sum=0.617 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.30840403945357714\"}", - "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}", - "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)\", \"tab\": \"General information\", \"score\": \"531.8544973544973\"}", - "Elementary Mathematics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.267, mean=0.267, max=0.267, sum=0.533 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2666162133216858\"}", + "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Global Facts - # prompt tokens": "{\"description\": \"min=399.71, mean=399.71, max=399.71, sum=799.42 (2)\", \"tab\": \"General information\", \"score\": \"399.71\"}", + "Global Facts - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"elementary_mathematics\"", + "subject": "\"global_facts\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_elementary_mathematics\"" + "groups": "\"mmlu_global_facts\"" } } }, { - "evaluation_name": "Formal Logic", + "evaluation_name": "Jurisprudence", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -725,36 +699,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Formal Logic", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.683, + "score": 0.88, "details": { - "description": "min=0.683, mean=0.683, max=0.683, sum=1.365 (2)", + "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.304, mean=0.304, max=0.304, sum=0.609 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.30448357074979754\"}", - "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}", - "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Formal Logic - # prompt tokens": "{\"description\": \"min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)\", \"tab\": \"General information\", \"score\": \"601.7777777777778\"}", - "Formal Logic - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.279, mean=0.279, max=0.279, sum=0.558 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.278864703796528\"}", + "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}", + "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Jurisprudence - # prompt tokens": "{\"description\": \"min=394.63, mean=394.63, max=394.63, sum=789.259 (2)\", \"tab\": \"General information\", \"score\": \"394.6296296296296\"}", + "Jurisprudence - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"formal_logic\"", + "subject": "\"jurisprudence\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_formal_logic\"" + "groups": "\"mmlu_jurisprudence\"" } } }, { - "evaluation_name": "High School World History", + "evaluation_name": "Philosophy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -763,114 +737,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on High School World History", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.941, + "score": 0.839, "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", + "description": "min=0.839, mean=0.839, max=0.839, sum=1.678 (2)", "tab": "Accuracy", - "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.309, mean=0.309, max=0.309, sum=0.619 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3094667688492806\"}", - "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.294, mean=0.294, max=0.294, sum=0.588 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29394797386207017\"}", - "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.301, mean=0.301, max=0.301, sum=0.602 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.30106969356536867\"}", - "High School European History - Observed inference time (s)": "{\"description\": \"min=0.48, mean=0.48, max=0.48, sum=0.96 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4799844944115841\"}", - "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.297, mean=0.297, max=0.297, sum=0.595 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29747620014229204\"}", - "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.291, mean=0.291, max=0.291, sum=0.583 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2914604300662026\"}", - "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.279, mean=0.279, max=0.279, sum=0.557 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.27857950650728663\"}", - "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.312, mean=0.312, max=0.312, sum=0.625 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3123831342767786\"}", - "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.302, mean=0.302, max=0.302, sum=0.603 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.30159517997453195\"}", - "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.322, mean=0.322, max=0.322, sum=0.643 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32152655108874995\"}", - "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.29, mean=0.29, max=0.29, sum=0.581 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2903494253071076\"}", - "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.333, mean=0.333, max=0.333, sum=0.667 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.33328031720938506\"}", - "High School US History - Observed inference time (s)": "{\"description\": \"min=0.394, mean=0.394, max=0.394, sum=0.788 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39396579826579375\"}", - "High School World History - Observed inference time (s)": "{\"description\": \"min=0.679, mean=0.679, max=0.679, sum=1.359 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6793377369265013\"}", - "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}", - "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Biology - # prompt tokens": "{\"description\": \"min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)\", \"tab\": \"General information\", \"score\": \"513.6709677419354\"}", - "High School Biology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}", - "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Chemistry - # prompt tokens": "{\"description\": \"min=496.704, mean=496.704, max=496.704, sum=993.409 (2)\", \"tab\": \"General information\", \"score\": \"496.70443349753697\"}", - "High School Chemistry - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Computer Science - # prompt tokens": "{\"description\": \"min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)\", \"tab\": \"General information\", \"score\": \"867.78\"}", - "High School Computer Science - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}", - "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School European History - # prompt tokens": "{\"description\": \"min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)\", \"tab\": \"General information\", \"score\": \"2797.8848484848486\"}", - "High School European History - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}", - "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Geography - # prompt tokens": "{\"description\": \"min=372.035, mean=372.035, max=372.035, sum=744.071 (2)\", \"tab\": \"General information\", \"score\": \"372.0353535353535\"}", - "High School Geography - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}", - "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Government And Politics - # prompt tokens": "{\"description\": \"min=465.824, mean=465.824, max=465.824, sum=931.648 (2)\", \"tab\": \"General information\", \"score\": \"465.8238341968912\"}", - "High School Government And Politics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}", - "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=370.908, mean=370.908, max=370.908, sum=741.815 (2)\", \"tab\": \"General information\", \"score\": \"370.9076923076923\"}", - "High School Macroeconomics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}", - "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Mathematics - # prompt tokens": "{\"description\": \"min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)\", \"tab\": \"General information\", \"score\": \"532.3555555555556\"}", - "High School Mathematics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}", - "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Microeconomics - # prompt tokens": "{\"description\": \"min=399.013, mean=399.013, max=399.013, sum=798.025 (2)\", \"tab\": \"General information\", \"score\": \"399.0126050420168\"}", - "High School Microeconomics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}", - "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Physics - # prompt tokens": "{\"description\": \"min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)\", \"tab\": \"General information\", \"score\": \"560.4569536423841\"}", - "High School Physics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}", - "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Psychology - # prompt tokens": "{\"description\": \"min=495.242, mean=495.242, max=495.242, sum=990.484 (2)\", \"tab\": \"General information\", \"score\": \"495.2422018348624\"}", - "High School Psychology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}", - "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Statistics - # prompt tokens": "{\"description\": \"min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)\", \"tab\": \"General information\", \"score\": \"795.6388888888889\"}", - "High School Statistics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}", - "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School US History - # prompt tokens": "{\"description\": \"min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)\", \"tab\": \"General information\", \"score\": \"2217.8088235294117\"}", - "High School US History - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}", - "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School World History - # prompt tokens": "{\"description\": \"min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)\", \"tab\": \"General information\", \"score\": \"1428.1729957805908\"}", - "High School World History - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.297, mean=0.297, max=0.297, sum=0.594 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29689135582117404\"}", + "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}", + "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Philosophy - # prompt tokens": "{\"description\": \"min=329.084, mean=329.084, max=329.084, sum=658.167 (2)\", \"tab\": \"General information\", \"score\": \"329.08360128617363\"}", + "Philosophy - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"high_school_world_history\"", + "subject": "\"philosophy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_high_school_world_history\"" + "groups": "\"mmlu_philosophy\"" } } }, { - "evaluation_name": "Human Sexuality", + "evaluation_name": "Professional Psychology", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -879,42 +775,54 @@ ] }, "metric_config": { - "evaluation_description": "EM on Human Sexuality", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.87, + "score": 0.843, "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", + "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)", "tab": "Accuracy", - "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.388, mean=0.388, max=0.388, sum=0.776 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.38789880863754206\"}", - "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=0.293, mean=0.293, max=0.293, sum=0.586 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2929920222013051\"}", - "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}", - "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Human Aging - # prompt tokens": "{\"description\": \"min=319.888, mean=319.888, max=319.888, sum=639.776 (2)\", \"tab\": \"General information\", \"score\": \"319.88789237668163\"}", - "Human Aging - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}", - "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Human Sexuality - # prompt tokens": "{\"description\": \"min=341.168, mean=341.168, max=341.168, sum=682.336 (2)\", \"tab\": \"General information\", \"score\": \"341.1679389312977\"}", - "Human Sexuality - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.553, mean=0.553, max=0.553, sum=1.106 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5529017465956071\"}", + "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.323, mean=0.323, max=0.323, sum=0.647 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32346555189038\"}", + "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.372, mean=0.372, max=0.372, sum=0.743 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3715069820859131\"}", + "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.315, mean=0.315, max=0.315, sum=0.63 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3151663907992294\"}", + "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}", + "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Medicine - # prompt tokens": "{\"description\": \"min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)\", \"tab\": \"General information\", \"score\": \"1094.4889705882354\"}", + "Professional Medicine - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}", + "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Accounting - # prompt tokens": "{\"description\": \"min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)\", \"tab\": \"General information\", \"score\": \"658.5851063829788\"}", + "Professional Accounting - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}", + "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Law - # prompt tokens": "{\"description\": \"min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)\", \"tab\": \"General information\", \"score\": \"1637.6010430247718\"}", + "Professional Law - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}", + "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Psychology - # prompt tokens": "{\"description\": \"min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)\", \"tab\": \"General information\", \"score\": \"575.0980392156863\"}", + "Professional Psychology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"human_sexuality\"", + "subject": "\"professional_psychology\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_human_sexuality\"" + "groups": "\"mmlu_professional_psychology\"" } } }, { - "evaluation_name": "International Law", + "evaluation_name": "Us Foreign Policy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -923,36 +831,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on International Law", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.934, + "score": 0.93, "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", + "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", "tab": "Accuracy", - "International Law - Observed inference time (s)": "{\"description\": \"min=0.342, mean=0.342, max=0.342, sum=0.685 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.34241620962284813\"}", - "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}", - "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "International Law - # prompt tokens": "{\"description\": \"min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)\", \"tab\": \"General information\", \"score\": \"639.8181818181819\"}", - "International Law - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.507, mean=0.507, max=0.507, sum=1.014 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5069083476066589\"}", + "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=422.79, mean=422.79, max=422.79, sum=845.58 (2)\", \"tab\": \"General information\", \"score\": \"422.79\"}", + "Us Foreign Policy - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"international_law\"", + "subject": "\"us_foreign_policy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_international_law\"" + "groups": "\"mmlu_us_foreign_policy\"" } } }, { - "evaluation_name": "Logical Fallacies", + "evaluation_name": "Astronomy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -961,36 +869,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Logical Fallacies", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.834, + "score": 0.921, "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.669 (2)", + "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)", "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.282, mean=0.282, max=0.282, sum=0.565 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.28232605325663745\"}", - "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}", - "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Logical Fallacies - # prompt tokens": "{\"description\": \"min=449.564, mean=449.564, max=449.564, sum=899.129 (2)\", \"tab\": \"General information\", \"score\": \"449.5644171779141\"}", - "Logical Fallacies - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.332, mean=0.332, max=0.332, sum=0.665 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3323579352152975\"}", + "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}", + "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Astronomy - # prompt tokens": "{\"description\": \"min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)\", \"tab\": \"General information\", \"score\": \"579.6842105263158\"}", + "Astronomy - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"logical_fallacies\"", + "subject": "\"astronomy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_logical_fallacies\"" + "groups": "\"mmlu_astronomy\"" } } }, { - "evaluation_name": "Machine Learning", + "evaluation_name": "Business Ethics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -999,36 +907,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Machine Learning", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.688, + "score": 0.76, "details": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.375 (2)", + "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.338, mean=0.338, max=0.338, sum=0.676 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.33782388057027546\"}", - "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}", - "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Machine Learning - # prompt tokens": "{\"description\": \"min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)\", \"tab\": \"General information\", \"score\": \"668.0535714285714\"}", - "Machine Learning - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.291, mean=0.291, max=0.291, sum=0.581 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29072295665740966\"}", + "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Business Ethics - # prompt tokens": "{\"description\": \"min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)\", \"tab\": \"General information\", \"score\": \"569.52\"}", + "Business Ethics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"machine_learning\"", + "subject": "\"business_ethics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_machine_learning\"" + "groups": "\"mmlu_business_ethics\"" } } }, { - "evaluation_name": "Management", + "evaluation_name": "Clinical Knowledge", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1037,36 +945,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Management", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.913, + "score": 0.845, "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)", + "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)", "tab": "Accuracy", - "Management - Observed inference time (s)": "{\"description\": \"min=0.285, mean=0.285, max=0.285, sum=0.571 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2853238027072647\"}", - "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}", - "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Management - # prompt tokens": "{\"description\": \"min=283.786, mean=283.786, max=283.786, sum=567.573 (2)\", \"tab\": \"General information\", \"score\": \"283.7864077669903\"}", - "Management - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.29, mean=0.29, max=0.29, sum=0.579 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2897273891376999\"}", + "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}", + "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=397.928, mean=397.928, max=397.928, sum=795.857 (2)\", \"tab\": \"General information\", \"score\": \"397.92830188679244\"}", + "Clinical Knowledge - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"management\"", + "subject": "\"clinical_knowledge\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_management\"" + "groups": "\"mmlu_clinical_knowledge\"" } } }, { - "evaluation_name": "Marketing", + "evaluation_name": "Conceptual Physics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1075,36 +983,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Marketing", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.944, + "score": 0.826, "details": { - "description": "min=0.944, mean=0.944, max=0.944, sum=1.889 (2)", + "description": "min=0.826, mean=0.826, max=0.826, sum=1.651 (2)", "tab": "Accuracy", - "Marketing - Observed inference time (s)": "{\"description\": \"min=0.28, mean=0.28, max=0.28, sum=0.561 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.28032574796269083\"}", - "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}", - "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Marketing - # prompt tokens": "{\"description\": \"min=404.218, mean=404.218, max=404.218, sum=808.436 (2)\", \"tab\": \"General information\", \"score\": \"404.21794871794873\"}", - "Marketing - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.279, mean=0.279, max=0.279, sum=0.559 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2794749209221373\"}", + "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}", + "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Conceptual Physics - # prompt tokens": "{\"description\": \"min=304.834, mean=304.834, max=304.834, sum=609.668 (2)\", \"tab\": \"General information\", \"score\": \"304.83404255319147\"}", + "Conceptual Physics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"marketing\"", + "subject": "\"conceptual_physics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_marketing\"" + "groups": "\"mmlu_conceptual_physics\"" } } }, { - "evaluation_name": "Medical Genetics", + "evaluation_name": "Electrical Engineering", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1113,36 +1021,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Medical Genetics", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.92, + "score": 0.759, "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", + "description": "min=0.759, mean=0.759, max=0.759, sum=1.517 (2)", "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.296, mean=0.296, max=0.296, sum=0.592 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29611136198043825\"}", - "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Medical Genetics - # prompt tokens": "{\"description\": \"min=340.99, mean=340.99, max=340.99, sum=681.98 (2)\", \"tab\": \"General information\", \"score\": \"340.99\"}", - "Medical Genetics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.256, mean=0.256, max=0.256, sum=0.512 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2558267790695717\"}", + "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}", + "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Electrical Engineering - # prompt tokens": "{\"description\": \"min=435.607, mean=435.607, max=435.607, sum=871.214 (2)\", \"tab\": \"General information\", \"score\": \"435.60689655172416\"}", + "Electrical Engineering - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"medical_genetics\"", + "subject": "\"electrical_engineering\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_medical_genetics\"" + "groups": "\"mmlu_electrical_engineering\"" } } }, { - "evaluation_name": "Miscellaneous", + "evaluation_name": "Elementary Mathematics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1151,36 +1059,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Miscellaneous", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.913, + "score": 0.688, "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.826 (2)", + "description": "min=0.688, mean=0.688, max=0.688, sum=1.376 (2)", "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.324, mean=0.324, max=0.324, sum=0.647 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3237126984967735\"}", - "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}", - "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Miscellaneous - # prompt tokens": "{\"description\": \"min=299.911, mean=299.911, max=299.911, sum=599.821 (2)\", \"tab\": \"General information\", \"score\": \"299.9106002554278\"}", - "Miscellaneous - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.308, mean=0.308, max=0.308, sum=0.617 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.30840403945357714\"}", + "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}", + "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)\", \"tab\": \"General information\", \"score\": \"531.8544973544973\"}", + "Elementary Mathematics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"miscellaneous\"", + "subject": "\"elementary_mathematics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_miscellaneous\"" + "groups": "\"mmlu_elementary_mathematics\"" } } }, { - "evaluation_name": "Moral Scenarios", + "evaluation_name": "Formal Logic", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1189,42 +1097,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Moral Scenarios", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.841, + "score": 0.683, "details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.683 (2)", + "description": "min=0.683, mean=0.683, max=0.683, sum=1.365 (2)", "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.29, mean=0.29, max=0.29, sum=0.58 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2901734975032035\"}", - "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.506, mean=0.506, max=0.506, sum=1.012 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5058047955262595\"}", - "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}", - "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Moral Disputes - # prompt tokens": "{\"description\": \"min=476.113, mean=476.113, max=476.113, sum=952.225 (2)\", \"tab\": \"General information\", \"score\": \"476.1127167630058\"}", - "Moral Disputes - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}", - "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Moral Scenarios - # prompt tokens": "{\"description\": \"min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)\", \"tab\": \"General information\", \"score\": \"656.454748603352\"}", - "Moral Scenarios - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.304, mean=0.304, max=0.304, sum=0.609 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.30448357074979754\"}", + "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}", + "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Formal Logic - # prompt tokens": "{\"description\": \"min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)\", \"tab\": \"General information\", \"score\": \"601.7777777777778\"}", + "Formal Logic - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"moral_scenarios\"", + "subject": "\"formal_logic\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_moral_scenarios\"" + "groups": "\"mmlu_formal_logic\"" } } }, { - "evaluation_name": "Nutrition", + "evaluation_name": "High School World History", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1233,36 +1135,114 @@ ] }, "metric_config": { - "evaluation_description": "EM on Nutrition", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.889, + "score": 0.941, "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", + "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", "tab": "Accuracy", - "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.641 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32064209264867444\"}", - "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}", - "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Nutrition - # prompt tokens": "{\"description\": \"min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)\", \"tab\": \"General information\", \"score\": \"586.8137254901961\"}", - "Nutrition - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.309, mean=0.309, max=0.309, sum=0.619 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3094667688492806\"}", + "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.294, mean=0.294, max=0.294, sum=0.588 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29394797386207017\"}", + "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.301, mean=0.301, max=0.301, sum=0.602 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.30106969356536867\"}", + "High School European History - Observed inference time (s)": "{\"description\": \"min=0.48, mean=0.48, max=0.48, sum=0.96 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4799844944115841\"}", + "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.297, mean=0.297, max=0.297, sum=0.595 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29747620014229204\"}", + "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.291, mean=0.291, max=0.291, sum=0.583 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2914604300662026\"}", + "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.279, mean=0.279, max=0.279, sum=0.557 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.27857950650728663\"}", + "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.312, mean=0.312, max=0.312, sum=0.625 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3123831342767786\"}", + "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.302, mean=0.302, max=0.302, sum=0.603 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.30159517997453195\"}", + "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.322, mean=0.322, max=0.322, sum=0.643 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32152655108874995\"}", + "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.29, mean=0.29, max=0.29, sum=0.581 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2903494253071076\"}", + "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.333, mean=0.333, max=0.333, sum=0.667 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.33328031720938506\"}", + "High School US History - Observed inference time (s)": "{\"description\": \"min=0.394, mean=0.394, max=0.394, sum=0.788 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39396579826579375\"}", + "High School World History - Observed inference time (s)": "{\"description\": \"min=0.679, mean=0.679, max=0.679, sum=1.359 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6793377369265013\"}", + "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}", + "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Biology - # prompt tokens": "{\"description\": \"min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)\", \"tab\": \"General information\", \"score\": \"513.6709677419354\"}", + "High School Biology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}", + "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Chemistry - # prompt tokens": "{\"description\": \"min=496.704, mean=496.704, max=496.704, sum=993.409 (2)\", \"tab\": \"General information\", \"score\": \"496.70443349753697\"}", + "High School Chemistry - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Computer Science - # prompt tokens": "{\"description\": \"min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)\", \"tab\": \"General information\", \"score\": \"867.78\"}", + "High School Computer Science - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}", + "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School European History - # prompt tokens": "{\"description\": \"min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)\", \"tab\": \"General information\", \"score\": \"2797.8848484848486\"}", + "High School European History - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}", + "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Geography - # prompt tokens": "{\"description\": \"min=372.035, mean=372.035, max=372.035, sum=744.071 (2)\", \"tab\": \"General information\", \"score\": \"372.0353535353535\"}", + "High School Geography - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}", + "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Government And Politics - # prompt tokens": "{\"description\": \"min=465.824, mean=465.824, max=465.824, sum=931.648 (2)\", \"tab\": \"General information\", \"score\": \"465.8238341968912\"}", + "High School Government And Politics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}", + "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=370.908, mean=370.908, max=370.908, sum=741.815 (2)\", \"tab\": \"General information\", \"score\": \"370.9076923076923\"}", + "High School Macroeconomics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}", + "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Mathematics - # prompt tokens": "{\"description\": \"min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)\", \"tab\": \"General information\", \"score\": \"532.3555555555556\"}", + "High School Mathematics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}", + "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Microeconomics - # prompt tokens": "{\"description\": \"min=399.013, mean=399.013, max=399.013, sum=798.025 (2)\", \"tab\": \"General information\", \"score\": \"399.0126050420168\"}", + "High School Microeconomics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}", + "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Physics - # prompt tokens": "{\"description\": \"min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)\", \"tab\": \"General information\", \"score\": \"560.4569536423841\"}", + "High School Physics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}", + "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Psychology - # prompt tokens": "{\"description\": \"min=495.242, mean=495.242, max=495.242, sum=990.484 (2)\", \"tab\": \"General information\", \"score\": \"495.2422018348624\"}", + "High School Psychology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}", + "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Statistics - # prompt tokens": "{\"description\": \"min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)\", \"tab\": \"General information\", \"score\": \"795.6388888888889\"}", + "High School Statistics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}", + "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School US History - # prompt tokens": "{\"description\": \"min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)\", \"tab\": \"General information\", \"score\": \"2217.8088235294117\"}", + "High School US History - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}", + "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School World History - # prompt tokens": "{\"description\": \"min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)\", \"tab\": \"General information\", \"score\": \"1428.1729957805908\"}", + "High School World History - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"nutrition\"", + "subject": "\"high_school_world_history\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_nutrition\"" + "groups": "\"mmlu_high_school_world_history\"" } } }, { - "evaluation_name": "Prehistory", + "evaluation_name": "Human Sexuality", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1271,36 +1251,42 @@ ] }, "metric_config": { - "evaluation_description": "EM on Prehistory", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.886, + "score": 0.87, "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.772 (2)", + "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", "tab": "Accuracy", - "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.614, mean=0.614, max=0.614, sum=1.227 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6136744522754057\"}", - "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}", - "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Prehistory - # prompt tokens": "{\"description\": \"min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)\", \"tab\": \"General information\", \"score\": \"514.5277777777778\"}", - "Prehistory - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.388, mean=0.388, max=0.388, sum=0.776 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.38789880863754206\"}", + "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=0.293, mean=0.293, max=0.293, sum=0.586 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2929920222013051\"}", + "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}", + "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Human Aging - # prompt tokens": "{\"description\": \"min=319.888, mean=319.888, max=319.888, sum=639.776 (2)\", \"tab\": \"General information\", \"score\": \"319.88789237668163\"}", + "Human Aging - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}", + "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Human Sexuality - # prompt tokens": "{\"description\": \"min=341.168, mean=341.168, max=341.168, sum=682.336 (2)\", \"tab\": \"General information\", \"score\": \"341.1679389312977\"}", + "Human Sexuality - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"prehistory\"", + "subject": "\"human_sexuality\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_prehistory\"" + "groups": "\"mmlu_human_sexuality\"" } } }, { - "evaluation_name": "Public Relations", + "evaluation_name": "International Law", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1309,36 +1295,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Public Relations", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.718, + "score": 0.934, "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.436 (2)", + "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", "tab": "Accuracy", - "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.3, mean=0.3, max=0.3, sum=0.599 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29952496832067316\"}", - "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}", - "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Public Relations - # prompt tokens": "{\"description\": \"min=405.318, mean=405.318, max=405.318, sum=810.636 (2)\", \"tab\": \"General information\", \"score\": \"405.3181818181818\"}", - "Public Relations - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "International Law - Observed inference time (s)": "{\"description\": \"min=0.342, mean=0.342, max=0.342, sum=0.685 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.34241620962284813\"}", + "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}", + "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "International Law - # prompt tokens": "{\"description\": \"min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)\", \"tab\": \"General information\", \"score\": \"639.8181818181819\"}", + "International Law - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"public_relations\"", + "subject": "\"international_law\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_public_relations\"" + "groups": "\"mmlu_international_law\"" } } }, { - "evaluation_name": "Security Studies", + "evaluation_name": "Logical Fallacies", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1347,36 +1333,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Security Studies", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.853, + "score": 0.834, "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", + "description": "min=0.834, mean=0.834, max=0.834, sum=1.669 (2)", "tab": "Accuracy", - "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.348, mean=0.348, max=0.348, sum=0.697 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.348436891789339\"}", - "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}", - "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Security Studies - # prompt tokens": "{\"description\": \"min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)\", \"tab\": \"General information\", \"score\": \"1164.4734693877551\"}", - "Security Studies - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.282, mean=0.282, max=0.282, sum=0.565 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.28232605325663745\"}", + "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}", + "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Logical Fallacies - # prompt tokens": "{\"description\": \"min=449.564, mean=449.564, max=449.564, sum=899.129 (2)\", \"tab\": \"General information\", \"score\": \"449.5644171779141\"}", + "Logical Fallacies - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"security_studies\"", + "subject": "\"logical_fallacies\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_security_studies\"" + "groups": "\"mmlu_logical_fallacies\"" } } }, { - "evaluation_name": "Sociology", + "evaluation_name": "Machine Learning", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1385,36 +1371,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Sociology", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.92, + "score": 0.688, "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.841 (2)", + "description": "min=0.688, mean=0.688, max=0.688, sum=1.375 (2)", "tab": "Accuracy", - "Sociology - Observed inference time (s)": "{\"description\": \"min=0.297, mean=0.297, max=0.297, sum=0.595 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29732529915387357\"}", - "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}", - "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Sociology - # prompt tokens": "{\"description\": \"min=445.517, mean=445.517, max=445.517, sum=891.035 (2)\", \"tab\": \"General information\", \"score\": \"445.51741293532336\"}", - "Sociology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.338, mean=0.338, max=0.338, sum=0.676 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.33782388057027546\"}", + "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}", + "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Machine Learning - # prompt tokens": "{\"description\": \"min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)\", \"tab\": \"General information\", \"score\": \"668.0535714285714\"}", + "Machine Learning - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"sociology\"", + "subject": "\"machine_learning\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_sociology\"" + "groups": "\"mmlu_machine_learning\"" } } }, { - "evaluation_name": "Virology", + "evaluation_name": "Management", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1423,36 +1409,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Virology", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.584, + "score": 0.913, "details": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.169 (2)", + "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)", "tab": "Accuracy", - "Virology - Observed inference time (s)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.642 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32124968609177923\"}", - "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}", - "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Virology - # prompt tokens": "{\"description\": \"min=343.018, mean=343.018, max=343.018, sum=686.036 (2)\", \"tab\": \"General information\", \"score\": \"343.01807228915663\"}", - "Virology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Management - Observed inference time (s)": "{\"description\": \"min=0.285, mean=0.285, max=0.285, sum=0.571 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2853238027072647\"}", + "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}", + "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Management - # prompt tokens": "{\"description\": \"min=283.786, mean=283.786, max=283.786, sum=567.573 (2)\", \"tab\": \"General information\", \"score\": \"283.7864077669903\"}", + "Management - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"virology\"", + "subject": "\"management\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_virology\"" + "groups": "\"mmlu_management\"" } } }, { - "evaluation_name": "World Religions", + "evaluation_name": "Marketing", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1461,36 +1447,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on World Religions", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.901, + "score": 0.944, "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.801 (2)", + "description": "min=0.944, mean=0.944, max=0.944, sum=1.889 (2)", "tab": "Accuracy", - "World Religions - Observed inference time (s)": "{\"description\": \"min=0.277, mean=0.277, max=0.277, sum=0.554 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.27723441068191973\"}", - "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}", - "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "World Religions - # prompt tokens": "{\"description\": \"min=274.52, mean=274.52, max=274.52, sum=549.041 (2)\", \"tab\": \"General information\", \"score\": \"274.5204678362573\"}", - "World Religions - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Marketing - Observed inference time (s)": "{\"description\": \"min=0.28, mean=0.28, max=0.28, sum=0.561 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.28032574796269083\"}", + "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}", + "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Marketing - # prompt tokens": "{\"description\": \"min=404.218, mean=404.218, max=404.218, sum=808.436 (2)\", \"tab\": \"General information\", \"score\": \"404.21794871794873\"}", + "Marketing - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"world_religions\"", + "subject": "\"marketing\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_world_religions\"" + "groups": "\"mmlu_marketing\"" } } }, { - "evaluation_name": "Mean win rate", + "evaluation_name": "Medical Genetics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1499,404 +1485,418 @@ ] }, "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.773, + "score": 0.92, "details": { - "description": "", - "tab": "Efficiency" + "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", + "tab": "Accuracy", + "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.296, mean=0.296, max=0.296, sum=0.592 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29611136198043825\"}", + "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Medical Genetics - # prompt tokens": "{\"description\": \"min=340.99, mean=340.99, max=340.99, sum=681.98 (2)\", \"tab\": \"General information\", \"score\": \"340.99\"}", + "Medical Genetics - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"medical_genetics\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_medical_genetics\"" + } } - } - ], - "detailed_evaluation_results": null, - "generation_config": { - "additional_details": { - "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" - } - } - }, - { - "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1774096306.427425", - "retrieved_timestamp": "1774096306.427425", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "helm", - "version": "unknown" - }, - "benchmark": "helm_lite", - "evaluation_results": [ + }, { - "evaluation_name": "Mean win rate", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "helm_lite", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.819, + "score": 0.913, "details": { - "description": "", + "description": "min=0.913, mean=0.913, max=0.913, sum=1.826 (2)", "tab": "Accuracy", - "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.5839825218476904\"}", - "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}" + "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.324, mean=0.324, max=0.324, sum=0.647 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3237126984967735\"}", + "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}", + "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Miscellaneous - # prompt tokens": "{\"description\": \"min=299.911, mean=299.911, max=299.911, sum=599.821 (2)\", \"tab\": \"General information\", \"score\": \"299.9106002554278\"}", + "Miscellaneous - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"miscellaneous\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_miscellaneous\"" + } } }, { - "evaluation_name": "NarrativeQA", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "NarrativeQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "F1 on NarrativeQA", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.777, + "score": 0.841, "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", + "description": "min=0.841, mean=0.841, max=0.841, sum=1.683 (2)", "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.83, mean=0.83, max=0.83, sum=0.83 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.8297326531208736\"}", - "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", - "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NarrativeQA - # prompt tokens": "{\"description\": \"min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)\", \"tab\": \"General information\", \"score\": \"3484.2676056338028\"}", - "NarrativeQA - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.29, mean=0.29, max=0.29, sum=0.58 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2901734975032035\"}", + "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.506, mean=0.506, max=0.506, sum=1.012 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5058047955262595\"}", + "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}", + "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Moral Disputes - # prompt tokens": "{\"description\": \"min=476.113, mean=476.113, max=476.113, sum=952.225 (2)\", \"tab\": \"General information\", \"score\": \"476.1127167630058\"}", + "Moral Disputes - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}", + "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Moral Scenarios - # prompt tokens": "{\"description\": \"min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)\", \"tab\": \"General information\", \"score\": \"656.454748603352\"}", + "Moral Scenarios - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"moral_scenarios\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_moral_scenarios\"" + } } }, { - "evaluation_name": "NaturalQuestions (closed-book)", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.457, + "score": 0.889, "details": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)", + "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=1.111, mean=1.111, max=1.111, sum=1.111 (1)\", \"tab\": \"Efficiency\", \"score\": \"1.110703297138214\"}", - "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.422, mean=0.422, max=0.422, sum=0.422 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4218848171234131\"}", - "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "NaturalQuestions (open-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1716.785, mean=1716.785, max=1716.785, sum=1716.785 (1)\", \"tab\": \"General information\", \"score\": \"1716.785\"}", - "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=129.12, mean=129.12, max=129.12, sum=129.12 (1)\", \"tab\": \"General information\", \"score\": \"129.12\"}", - "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.641 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32064209264867444\"}", + "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}", + "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Nutrition - # prompt tokens": "{\"description\": \"min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)\", \"tab\": \"General information\", \"score\": \"586.8137254901961\"}", + "Nutrition - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "mode": "\"closedbook\"" + "subject": "\"nutrition\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_nutrition\"" } } }, { - "evaluation_name": "OpenbookQA", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "OpenbookQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on OpenbookQA", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.942, + "score": 0.886, "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)", + "description": "min=0.886, mean=0.886, max=0.886, sum=1.772 (2)", "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.285, mean=0.285, max=0.285, sum=0.285 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.28476666021347047\"}", - "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}", - "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "OpenbookQA - # prompt tokens": "{\"description\": \"min=249.776, mean=249.776, max=249.776, sum=249.776 (1)\", \"tab\": \"General information\", \"score\": \"249.776\"}", - "OpenbookQA - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.614, mean=0.614, max=0.614, sum=1.227 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6136744522754057\"}", + "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}", + "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Prehistory - # prompt tokens": "{\"description\": \"min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)\", \"tab\": \"General information\", \"score\": \"514.5277777777778\"}", + "Prehistory - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "dataset": "\"openbookqa\"", - "method": "\"multiple_choice_joint\"" + "subject": "\"prehistory\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_prehistory\"" } } }, { - "evaluation_name": "MMLU", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "MMLU", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on MMLU", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.703, + "score": 0.718, "details": { - "description": "min=0.52, mean=0.703, max=0.93, sum=3.514 (5)", + "description": "min=0.718, mean=0.718, max=0.718, sum=1.436 (2)", "tab": "Accuracy", - "MMLU - Observed inference time (s)": "{\"description\": \"min=0.266, mean=0.798, max=2.612, sum=3.992 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.7984467656654225\"}", - "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", - "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MMLU - # prompt tokens": "{\"description\": \"min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)\", \"tab\": \"General information\", \"score\": \"467.6862105263158\"}", - "MMLU - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.3, mean=0.3, max=0.3, sum=0.599 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29952496832067316\"}", + "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}", + "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Public Relations - # prompt tokens": "{\"description\": \"min=405.318, mean=405.318, max=405.318, sum=810.636 (2)\", \"tab\": \"General information\", \"score\": \"405.3181818181818\"}", + "Public Relations - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]", - "method": "\"multiple_choice_joint\"" + "subject": "\"public_relations\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_public_relations\"" } } }, { - "evaluation_name": "MATH", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "MATH", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.791, + "score": 0.853, "details": { - "description": "min=0.579, mean=0.791, max=0.978, sum=5.54 (7)", + "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", "tab": "Accuracy", - "MATH - Observed inference time (s)": "{\"description\": \"min=4.64, mean=5.739, max=6.652, sum=40.174 (7)\", \"tab\": \"Efficiency\", \"score\": \"5.739186799526185\"}", - "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}", - "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}", - "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MATH - # prompt tokens": "{\"description\": \"min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)\", \"tab\": \"General information\", \"score\": \"1262.9092130545007\"}", - "MATH - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.348, mean=0.348, max=0.348, sum=0.697 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.348436891789339\"}", + "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}", + "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Security Studies - # prompt tokens": "{\"description\": \"min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)\", \"tab\": \"General information\", \"score\": \"1164.4734693877551\"}", + "Security Studies - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]", - "level": "\"1\"", - "use_official_examples": "\"False\"", - "use_chain_of_thought": "\"True\"" + "subject": "\"security_studies\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_security_studies\"" } } }, { - "evaluation_name": "GSM8K", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "GSM8K", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on GSM8K", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.936, + "score": 0.92, "details": { - "description": "min=0.936, mean=0.936, max=0.936, sum=0.936 (1)", + "description": "min=0.92, mean=0.92, max=0.92, sum=1.841 (2)", "tab": "Accuracy", - "GSM8K - Observed inference time (s)": "{\"description\": \"min=2.889, mean=2.889, max=2.889, sum=2.889 (1)\", \"tab\": \"Efficiency\", \"score\": \"2.8894128675460817\"}", - "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "GSM8K - # prompt tokens": "{\"description\": \"min=959.032, mean=959.032, max=959.032, sum=959.032 (1)\", \"tab\": \"General information\", \"score\": \"959.032\"}", - "GSM8K - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Sociology - Observed inference time (s)": "{\"description\": \"min=0.297, mean=0.297, max=0.297, sum=0.595 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29732529915387357\"}", + "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}", + "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Sociology - # prompt tokens": "{\"description\": \"min=445.517, mean=445.517, max=445.517, sum=891.035 (2)\", \"tab\": \"General information\", \"score\": \"445.51741293532336\"}", + "Sociology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "stop": "\"none\"" + "subject": "\"sociology\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_sociology\"" } } }, { - "evaluation_name": "LegalBench", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "LegalBench", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on LegalBench", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.68, + "score": 0.584, "details": { - "description": "min=0.438, mean=0.68, max=0.989, sum=3.398 (5)", + "description": "min=0.584, mean=0.584, max=0.584, sum=1.169 (2)", "tab": "Accuracy", - "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.284, mean=0.478, max=1.152, sum=2.389 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.47773526830658064\"}", - "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}", - "LegalBench - # train": "{\"description\": \"min=4, mean=4.8, max=5, sum=24 (5)\", \"tab\": \"General information\", \"score\": \"4.8\"}", - "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "LegalBench - # prompt tokens": "{\"description\": \"min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)\", \"tab\": \"General information\", \"score\": \"1513.8824197238912\"}", - "LegalBench - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "Virology - Observed inference time (s)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.642 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32124968609177923\"}", + "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}", + "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Virology - # prompt tokens": "{\"description\": \"min=343.018, mean=343.018, max=343.018, sum=686.036 (2)\", \"tab\": \"General information\", \"score\": \"343.01807228915663\"}", + "Virology - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { "additional_details": { - "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]" + "subject": "\"virology\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_virology\"" } } }, { - "evaluation_name": "MedQA", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "MedQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on MedQA", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.769, + "score": 0.901, "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=0.769 (1)", + "description": "min=0.901, mean=0.901, max=0.901, sum=1.801 (2)", "tab": "Accuracy", - "MedQA - Observed inference time (s)": "{\"description\": \"min=0.318, mean=0.318, max=0.318, sum=0.318 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.3180293652930743\"}", - "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}", - "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MedQA - # prompt tokens": "{\"description\": \"min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)\", \"tab\": \"General information\", \"score\": \"1025.2743538767395\"}", - "MedQA - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "World Religions - Observed inference time (s)": "{\"description\": \"min=0.277, mean=0.277, max=0.277, sum=0.554 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.27723441068191973\"}", + "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}", + "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "World Religions - # prompt tokens": "{\"description\": \"min=274.52, mean=274.52, max=274.52, sum=549.041 (2)\", \"tab\": \"General information\", \"score\": \"274.5204678362573\"}", + "World Religions - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"world_religions\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_world_religions\"" + } } }, { - "evaluation_name": "WMT 2014", + "evaluation_name": "Mean win rate", "source_data": { - "dataset_name": "WMT 2014", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", + "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.224, + "score": 0.773, "details": { - "description": "min=0.182, mean=0.224, max=0.266, sum=1.121 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=0.737, mean=0.816, max=0.848, sum=4.078 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.8156762526912515\"}", - "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}", - "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "WMT 2014 - # prompt tokens": "{\"description\": \"min=101.139, mean=120.868, max=141.33, sum=604.34 (5)\", \"tab\": \"General information\", \"score\": \"120.86804366111025\"}", - "WMT 2014 - # output tokens": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}" + "description": "", + "tab": "Efficiency" } }, "generation_config": { - "additional_details": { - "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]" - } + "additional_details": {} } } ], "detailed_evaluation_results": null, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" + } } } ] diff --git a/data/models/minimax_minimax-m2.1.json b/data/models/minimax_minimax-m2.1.json index 33905b0ffaa72b65b1d021a73faac651c1ebfecd..a5b957c9841bcbc5016fb37de174c988ce273877 100644 --- a/data/models/minimax_minimax-m2.1.json +++ b/data/models/minimax_minimax-m2.1.json @@ -4,13 +4,13 @@ "id": "minimax/minimax-m2.1", "developer": "MiniMax", "additional_details": { - "agent_name": "Crux", - "agent_organization": "Roam" + "agent_name": "Terminus 2", + "agent_organization": "Terminal Bench" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/crux__minimax-m2.1/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__minimax-m2.1/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-22", + "evaluation_timestamp": "2025-12-23", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,7 +43,7 @@ "max_score": 100.0 }, "score_details": { - "score": 36.6, + "score": 29.2, "uncertainty": { "standard_error": { "value": 2.9 @@ -53,7 +53,7 @@ }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"MiniMax M2.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"MiniMax M2.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"MiniMax M2.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"MiniMax M2.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__minimax-m2.1/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/crux__minimax-m2.1/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-23", + "evaluation_timestamp": "2025-12-22", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,7 +117,7 @@ "max_score": 100.0 }, "score_details": { - "score": 29.2, + "score": 36.6, "uncertainty": { "standard_error": { "value": 2.9 @@ -127,7 +127,7 @@ }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"MiniMax M2.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"MiniMax M2.1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"MiniMax M2.1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"MiniMax M2.1\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/mistralai_mistral-7b-instruct-v0.3.json b/data/models/mistralai_mistral-7b-instruct-v0.3.json index d35f23e1653c8321bbb94b0e748858ca165accef..0f7fe68f0b7a1a194dc76677de8b8b0fb7a1f4bb 100644 --- a/data/models/mistralai_mistral-7b-instruct-v0.3.json +++ b/data/models/mistralai_mistral-7b-instruct-v0.3.json @@ -236,10 +236,10 @@ } }, { - "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1774096312.00548", - "retrieved_timestamp": "1774096312.00548", + "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1774096306.427425", + "retrieved_timestamp": "1774096306.427425", "source_metadata": { - "source_name": "helm_mmlu", + "source_name": "helm_lite", "source_type": "documentation", "source_organization_name": "crfm", "evaluator_relationship": "third_party" @@ -248,438 +248,382 @@ "name": "helm", "version": "unknown" }, - "benchmark": "helm_mmlu", + "benchmark": "helm_lite", "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects", + "evaluation_name": "Mean win rate", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "helm_lite", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", + "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.599, + "score": 0.196, "details": { - "description": "min=0.258, mean=0.599, max=0.881, sum=68.3 (114)", + "description": "", "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.212, mean=0.526, max=1.438, sum=59.959 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.525951832745908\"}", - "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}", - "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=317.924, mean=705.273, max=3098.109, sum=80401.178 (114)\", \"tab\": \"General information\", \"score\": \"705.2734899593811\"}", - "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.6493133583021223\"}", + "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}" } }, "generation_config": { - "additional_details": { - "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" - } + "additional_details": {} } }, { - "evaluation_name": "Abstract Algebra", + "evaluation_name": "NarrativeQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "NarrativeQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Abstract Algebra", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.27, + "score": 0.716, "details": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", + "description": "min=0.716, mean=0.716, max=0.716, sum=0.716 (1)", "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.642 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32117165088653565\"}", - "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Abstract Algebra - # prompt tokens": "{\"description\": \"min=411.44, mean=411.44, max=411.44, sum=822.88 (2)\", \"tab\": \"General information\", \"score\": \"411.44\"}", - "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.813, mean=0.813, max=0.813, sum=0.813 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.8132137520212522\"}", + "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", + "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NarrativeQA - # prompt tokens": "{\"description\": \"min=3924.33, mean=3924.33, max=3924.33, sum=3924.33 (1)\", \"tab\": \"General information\", \"score\": \"3924.3295774647886\"}", + "NarrativeQA - # output tokens": "{\"description\": \"min=7.107, mean=7.107, max=7.107, sum=7.107 (1)\", \"tab\": \"General information\", \"score\": \"7.107042253521127\"}" } }, "generation_config": { - "additional_details": { - "subject": "\"abstract_algebra\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_abstract_algebra\"" - } + "additional_details": {} } }, { - "evaluation_name": "Anatomy", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Anatomy", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.585, + "score": 0.253, "details": { - "description": "min=0.585, mean=0.585, max=0.585, sum=1.17 (2)", + "description": "min=0.253, mean=0.253, max=0.253, sum=0.253 (1)", "tab": "Accuracy", - "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.246, mean=0.246, max=0.246, sum=0.493 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.24627229902479383\"}", - "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}", - "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Anatomy - # prompt tokens": "{\"description\": \"min=416.089, mean=416.089, max=416.089, sum=832.178 (2)\", \"tab\": \"General information\", \"score\": \"416.0888888888889\"}", - "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.563, mean=0.563, max=0.563, sum=0.563 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5634698050022126\"}", + "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.535, mean=0.535, max=0.535, sum=0.535 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5347676448822022\"}", + "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (open-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=2498.79, mean=2498.79, max=2498.79, sum=2498.79 (1)\", \"tab\": \"General information\", \"score\": \"2498.79\"}", + "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=12.448, mean=12.448, max=12.448, sum=12.448 (1)\", \"tab\": \"General information\", \"score\": \"12.448\"}", + "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=172.069, mean=172.069, max=172.069, sum=172.069 (1)\", \"tab\": \"General information\", \"score\": \"172.069\"}", + "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=20.461, mean=20.461, max=20.461, sum=20.461 (1)\", \"tab\": \"General information\", \"score\": \"20.461\"}" } }, "generation_config": { "additional_details": { - "subject": "\"anatomy\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_anatomy\"" + "mode": "\"closedbook\"" } } }, { - "evaluation_name": "College Physics", + "evaluation_name": "OpenbookQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "OpenbookQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on College Physics", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.343, + "score": 0.79, "details": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", + "description": "min=0.79, mean=0.79, max=0.79, sum=0.79 (1)", "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.221, mean=0.221, max=0.221, sum=0.442 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.22099271774291993\"}", - "College Biology - Observed inference time (s)": "{\"description\": \"min=0.7, mean=0.7, max=0.7, sum=1.399 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6997380173868604\"}", - "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.466, mean=0.466, max=0.466, sum=0.932 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4661028146743774\"}", - "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.212, mean=0.212, max=0.212, sum=0.424 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.21210591793060302\"}", - "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.387, mean=0.387, max=0.387, sum=0.774 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3871537646806309\"}", - "College Physics - Observed inference time (s)": "{\"description\": \"min=0.455, mean=0.455, max=0.455, sum=0.91 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.45503536392660704\"}", - "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Chemistry - # prompt tokens": "{\"description\": \"min=636.71, mean=636.71, max=636.71, sum=1273.42 (2)\", \"tab\": \"General information\", \"score\": \"636.71\"}", - "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}", - "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Biology - # prompt tokens": "{\"description\": \"min=559.799, mean=559.799, max=559.799, sum=1119.597 (2)\", \"tab\": \"General information\", \"score\": \"559.7986111111111\"}", - "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Computer Science - # prompt tokens": "{\"description\": \"min=911.17, mean=911.17, max=911.17, sum=1822.34 (2)\", \"tab\": \"General information\", \"score\": \"911.17\"}", - "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Mathematics - # prompt tokens": "{\"description\": \"min=667.31, mean=667.31, max=667.31, sum=1334.62 (2)\", \"tab\": \"General information\", \"score\": \"667.31\"}", - "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}", - "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Medicine - # prompt tokens": "{\"description\": \"min=601.41, mean=601.41, max=601.41, sum=1202.821 (2)\", \"tab\": \"General information\", \"score\": \"601.4104046242775\"}", - "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}", - "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Physics - # prompt tokens": "{\"description\": \"min=560.029, mean=560.029, max=560.029, sum=1120.059 (2)\", \"tab\": \"General information\", \"score\": \"560.0294117647059\"}", - "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.256, mean=0.256, max=0.256, sum=0.256 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.25593132400512697\"}", + "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}", + "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "OpenbookQA - # prompt tokens": "{\"description\": \"min=289.15, mean=289.15, max=289.15, sum=289.15 (1)\", \"tab\": \"General information\", \"score\": \"289.15\"}", + "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"college_physics\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_college_physics\"" + "dataset": "\"openbookqa\"", + "method": "\"multiple_choice_joint\"" } } }, { - "evaluation_name": "Computer Security", + "evaluation_name": "MMLU", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MMLU", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Computer Security", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7, + "score": 0.51, "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", + "description": "min=0.27, mean=0.51, max=0.79, sum=2.551 (5)", "tab": "Accuracy", - "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.426, mean=0.426, max=0.426, sum=0.853 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4263953256607056\"}", - "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Computer Security - # prompt tokens": "{\"description\": \"min=433.94, mean=433.94, max=433.94, sum=867.88 (2)\", \"tab\": \"General information\", \"score\": \"433.94\"}", - "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MMLU - Observed inference time (s)": "{\"description\": \"min=0.221, mean=0.372, max=0.487, sum=1.862 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.37230395750413864\"}", + "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", + "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU - # prompt tokens": "{\"description\": \"min=411.44, mean=532.091, max=696.175, sum=2660.455 (5)\", \"tab\": \"General information\", \"score\": \"532.0910877192983\"}", + "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"computer_security\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_computer_security\"" + "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]", + "method": "\"multiple_choice_joint\"" } } }, { - "evaluation_name": "Econometrics", + "evaluation_name": "MATH", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MATH", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Econometrics", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.421, + "score": 0.289, "details": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", + "description": "min=0.115, mean=0.289, max=0.477, sum=2.02 (7)", "tab": "Accuracy", - "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.406, mean=0.406, max=0.406, sum=0.813 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.406455958098696\"}", - "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}", - "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Econometrics - # prompt tokens": "{\"description\": \"min=696.175, mean=696.175, max=696.175, sum=1392.351 (2)\", \"tab\": \"General information\", \"score\": \"696.1754385964912\"}", - "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MATH - Observed inference time (s)": "{\"description\": \"min=2.027, mean=2.656, max=3.039, sum=18.593 (7)\", \"tab\": \"Efficiency\", \"score\": \"2.656151831465352\"}", + "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}", + "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}", + "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MATH - # prompt tokens": "{\"description\": \"min=991.615, mean=1455.266, max=2502.962, sum=10186.865 (7)\", \"tab\": \"General information\", \"score\": \"1455.2664139976257\"}", + "MATH - # output tokens": "{\"description\": \"min=123.616, mean=149.99, max=172.789, sum=1049.933 (7)\", \"tab\": \"General information\", \"score\": \"149.99043902740354\"}" } }, "generation_config": { "additional_details": { - "subject": "\"econometrics\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_econometrics\"" + "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]", + "level": "\"1\"", + "use_official_examples": "\"False\"", + "use_chain_of_thought": "\"True\"" } } }, { - "evaluation_name": "Global Facts", + "evaluation_name": "GSM8K", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "GSM8K", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Global Facts", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.33, + "score": 0.538, "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", + "description": "min=0.538, mean=0.538, max=0.538, sum=0.538 (1)", "tab": "Accuracy", - "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.299, mean=0.299, max=0.299, sum=0.598 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29881003856658933\"}", - "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Global Facts - # prompt tokens": "{\"description\": \"min=492.47, mean=492.47, max=492.47, sum=984.94 (2)\", \"tab\": \"General information\", \"score\": \"492.47\"}", - "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "GSM8K - Observed inference time (s)": "{\"description\": \"min=3.95, mean=3.95, max=3.95, sum=3.95 (1)\", \"tab\": \"Efficiency\", \"score\": \"3.949965229511261\"}", + "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "GSM8K - # prompt tokens": "{\"description\": \"min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)\", \"tab\": \"General information\", \"score\": \"1187.268\"}", + "GSM8K - # output tokens": "{\"description\": \"min=196.611, mean=196.611, max=196.611, sum=196.611 (1)\", \"tab\": \"General information\", \"score\": \"196.611\"}" } }, "generation_config": { "additional_details": { - "subject": "\"global_facts\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_global_facts\"" + "stop": "\"none\"" } } }, { - "evaluation_name": "Jurisprudence", + "evaluation_name": "LegalBench", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "LegalBench", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Jurisprudence", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.713, + "score": 0.331, "details": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.426 (2)", + "description": "min=0.063, mean=0.331, max=0.733, sum=1.655 (5)", "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.232, mean=0.232, max=0.232, sum=0.465 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.23237781833719323\"}", - "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}", - "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Jurisprudence - # prompt tokens": "{\"description\": \"min=460.093, mean=460.093, max=460.093, sum=920.185 (2)\", \"tab\": \"General information\", \"score\": \"460.0925925925926\"}", - "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.316, mean=0.489, max=0.855, sum=2.444 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.4887186054518059\"}", + "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}", + "LegalBench - # train": "{\"description\": \"min=4, mean=4.8, max=5, sum=24 (5)\", \"tab\": \"General information\", \"score\": \"4.8\"}", + "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "LegalBench - # prompt tokens": "{\"description\": \"min=236.453, mean=1750.748, max=7224.488, sum=8753.741 (5)\", \"tab\": \"General information\", \"score\": \"1750.7482458432962\"}", + "LegalBench - # output tokens": "{\"description\": \"min=2, mean=9.174, max=15.242, sum=45.871 (5)\", \"tab\": \"General information\", \"score\": \"9.17419274343898\"}" } }, "generation_config": { "additional_details": { - "subject": "\"jurisprudence\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_jurisprudence\"" + "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]" } } }, { - "evaluation_name": "Philosophy", + "evaluation_name": "MedQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MedQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Philosophy", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.659, + "score": 0.517, "details": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.318 (2)", + "description": "min=0.517, mean=0.517, max=0.517, sum=0.517 (1)", "tab": "Accuracy", - "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.899, mean=0.899, max=0.899, sum=1.798 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8987545852109167\"}", - "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}", - "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Philosophy - # prompt tokens": "{\"description\": \"min=382.82, mean=382.82, max=382.82, sum=765.64 (2)\", \"tab\": \"General information\", \"score\": \"382.81993569131834\"}", - "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MedQA - Observed inference time (s)": "{\"description\": \"min=0.418, mean=0.418, max=0.418, sum=0.418 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4182186216767692\"}", + "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}", + "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MedQA - # prompt tokens": "{\"description\": \"min=1202.093, mean=1202.093, max=1202.093, sum=1202.093 (1)\", \"tab\": \"General information\", \"score\": \"1202.0934393638172\"}", + "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": { - "subject": "\"philosophy\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_philosophy\"" - } + "additional_details": {} } }, { - "evaluation_name": "Professional Psychology", + "evaluation_name": "WMT 2014", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "WMT 2014", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Professional Psychology", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.641, + "score": 0.142, "details": { - "description": "min=0.641, mean=0.641, max=0.641, sum=1.281 (2)", + "description": "min=0.047, mean=0.142, max=0.184, sum=0.712 (5)", "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.615, mean=0.615, max=0.615, sum=1.23 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6148438769228318\"}", - "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.825, mean=0.825, max=0.825, sum=1.651 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8254362666015084\"}", - "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.682, mean=0.682, max=0.682, sum=1.364 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.68212915414937\"}", - "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.506, mean=0.506, max=0.506, sum=1.012 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.505940170459498\"}", - "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}", - "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Medicine - # prompt tokens": "{\"description\": \"min=1288.143, mean=1288.143, max=1288.143, sum=2576.287 (2)\", \"tab\": \"General information\", \"score\": \"1288.1433823529412\"}", - "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}", - "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Accounting - # prompt tokens": "{\"description\": \"min=805.496, mean=805.496, max=805.496, sum=1610.993 (2)\", \"tab\": \"General information\", \"score\": \"805.4964539007092\"}", - "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}", - "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Law - # prompt tokens": "{\"description\": \"min=1858.711, mean=1858.711, max=1858.711, sum=3717.421 (2)\", \"tab\": \"General information\", \"score\": \"1858.7105606258149\"}", - "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}", - "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Psychology - # prompt tokens": "{\"description\": \"min=654.278, mean=654.278, max=654.278, sum=1308.556 (2)\", \"tab\": \"General information\", \"score\": \"654.2777777777778\"}", - "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=0.582, mean=0.775, max=0.872, sum=3.875 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.7750062139801958\"}", + "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}", + "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "WMT 2014 - # prompt tokens": "{\"description\": \"min=148.306, mean=162.433, max=181.018, sum=812.166 (5)\", \"tab\": \"General information\", \"score\": \"162.43317355482492\"}", + "WMT 2014 - # output tokens": "{\"description\": \"min=28.3, mean=30.51, max=31.912, sum=152.552 (5)\", \"tab\": \"General information\", \"score\": \"30.510483732222053\"}" } }, "generation_config": { "additional_details": { - "subject": "\"professional_psychology\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_professional_psychology\"" + "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]" } } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1774096312.00548", + "retrieved_timestamp": "1774096312.00548", + "source_metadata": { + "source_name": "helm_mmlu", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "helm", + "version": "unknown" + }, + "benchmark": "helm_mmlu", + "evaluation_results": [ { - "evaluation_name": "Us Foreign Policy", + "evaluation_name": "MMLU All Subjects", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -688,36 +632,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.79, + "score": 0.599, "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", + "description": "min=0.258, mean=0.599, max=0.881, sum=68.3 (114)", "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.487, mean=0.487, max=0.487, sum=0.973 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.48650413513183594\"}", - "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=482.19, mean=482.19, max=482.19, sum=964.38 (2)\", \"tab\": \"General information\", \"score\": \"482.19\"}", - "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.212, mean=0.526, max=1.438, sum=59.959 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.525951832745908\"}", + "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}", + "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=317.924, mean=705.273, max=3098.109, sum=80401.178 (114)\", \"tab\": \"General information\", \"score\": \"705.2734899593811\"}", + "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"us_foreign_policy\"", + "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_us_foreign_policy\"" + "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" } } }, { - "evaluation_name": "Astronomy", + "evaluation_name": "Abstract Algebra", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -726,36 +670,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Astronomy", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.638, + "score": 0.27, "details": { - "description": "min=0.638, mean=0.638, max=0.638, sum=1.276 (2)", + "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", "tab": "Accuracy", - "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.678, mean=0.678, max=0.678, sum=1.355 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6775346147386652\"}", - "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}", - "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Astronomy - # prompt tokens": "{\"description\": \"min=674.987, mean=674.987, max=674.987, sum=1349.974 (2)\", \"tab\": \"General information\", \"score\": \"674.9868421052631\"}", - "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.321, mean=0.321, max=0.321, sum=0.642 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.32117165088653565\"}", + "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Abstract Algebra - # prompt tokens": "{\"description\": \"min=411.44, mean=411.44, max=411.44, sum=822.88 (2)\", \"tab\": \"General information\", \"score\": \"411.44\"}", + "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"astronomy\"", + "subject": "\"abstract_algebra\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_astronomy\"" + "groups": "\"mmlu_abstract_algebra\"" } } }, { - "evaluation_name": "Business Ethics", + "evaluation_name": "Anatomy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -764,36 +708,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Business Ethics", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.57, + "score": 0.585, "details": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.14 (2)", + "description": "min=0.585, mean=0.585, max=0.585, sum=1.17 (2)", "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.645, mean=0.645, max=0.645, sum=1.289 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6446590375900269\"}", - "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Business Ethics - # prompt tokens": "{\"description\": \"min=653.6, mean=653.6, max=653.6, sum=1307.2 (2)\", \"tab\": \"General information\", \"score\": \"653.6\"}", - "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.246, mean=0.246, max=0.246, sum=0.493 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.24627229902479383\"}", + "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}", + "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Anatomy - # prompt tokens": "{\"description\": \"min=416.089, mean=416.089, max=416.089, sum=832.178 (2)\", \"tab\": \"General information\", \"score\": \"416.0888888888889\"}", + "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"business_ethics\"", + "subject": "\"anatomy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_business_ethics\"" + "groups": "\"mmlu_anatomy\"" } } }, { - "evaluation_name": "Clinical Knowledge", + "evaluation_name": "College Physics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -802,36 +746,66 @@ ] }, "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.687, + "score": 0.343, "details": { - "description": "min=0.687, mean=0.687, max=0.687, sum=1.374 (2)", + "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.844, mean=0.844, max=0.844, sum=1.687 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8436905698956184\"}", - "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}", - "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=496.174, mean=496.174, max=496.174, sum=992.347 (2)\", \"tab\": \"General information\", \"score\": \"496.1735849056604\"}", - "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.221, mean=0.221, max=0.221, sum=0.442 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.22099271774291993\"}", + "College Biology - Observed inference time (s)": "{\"description\": \"min=0.7, mean=0.7, max=0.7, sum=1.399 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6997380173868604\"}", + "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.466, mean=0.466, max=0.466, sum=0.932 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4661028146743774\"}", + "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.212, mean=0.212, max=0.212, sum=0.424 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.21210591793060302\"}", + "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.387, mean=0.387, max=0.387, sum=0.774 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3871537646806309\"}", + "College Physics - Observed inference time (s)": "{\"description\": \"min=0.455, mean=0.455, max=0.455, sum=0.91 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.45503536392660704\"}", + "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Chemistry - # prompt tokens": "{\"description\": \"min=636.71, mean=636.71, max=636.71, sum=1273.42 (2)\", \"tab\": \"General information\", \"score\": \"636.71\"}", + "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}", + "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Biology - # prompt tokens": "{\"description\": \"min=559.799, mean=559.799, max=559.799, sum=1119.597 (2)\", \"tab\": \"General information\", \"score\": \"559.7986111111111\"}", + "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Computer Science - # prompt tokens": "{\"description\": \"min=911.17, mean=911.17, max=911.17, sum=1822.34 (2)\", \"tab\": \"General information\", \"score\": \"911.17\"}", + "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Mathematics - # prompt tokens": "{\"description\": \"min=667.31, mean=667.31, max=667.31, sum=1334.62 (2)\", \"tab\": \"General information\", \"score\": \"667.31\"}", + "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}", + "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Medicine - # prompt tokens": "{\"description\": \"min=601.41, mean=601.41, max=601.41, sum=1202.821 (2)\", \"tab\": \"General information\", \"score\": \"601.4104046242775\"}", + "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}", + "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Physics - # prompt tokens": "{\"description\": \"min=560.029, mean=560.029, max=560.029, sum=1120.059 (2)\", \"tab\": \"General information\", \"score\": \"560.0294117647059\"}", + "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"clinical_knowledge\"", + "subject": "\"college_physics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_clinical_knowledge\"" + "groups": "\"mmlu_college_physics\"" } } }, { - "evaluation_name": "Conceptual Physics", + "evaluation_name": "Computer Security", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -840,36 +814,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Conceptual Physics", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.549, + "score": 0.7, "details": { - "description": "min=0.549, mean=0.549, max=0.549, sum=1.098 (2)", + "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.333, mean=0.333, max=0.333, sum=0.666 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.33306963900302317\"}", - "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}", - "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Conceptual Physics - # prompt tokens": "{\"description\": \"min=343.285, mean=343.285, max=343.285, sum=686.57 (2)\", \"tab\": \"General information\", \"score\": \"343.2851063829787\"}", - "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.426, mean=0.426, max=0.426, sum=0.853 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4263953256607056\"}", + "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Computer Security - # prompt tokens": "{\"description\": \"min=433.94, mean=433.94, max=433.94, sum=867.88 (2)\", \"tab\": \"General information\", \"score\": \"433.94\"}", + "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"conceptual_physics\"", + "subject": "\"computer_security\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_conceptual_physics\"" + "groups": "\"mmlu_computer_security\"" } } }, { - "evaluation_name": "Electrical Engineering", + "evaluation_name": "Econometrics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -878,36 +852,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Electrical Engineering", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.572, + "score": 0.421, "details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", + "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.392, mean=0.392, max=0.392, sum=0.784 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3922290703345989\"}", - "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}", - "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Electrical Engineering - # prompt tokens": "{\"description\": \"min=510.379, mean=510.379, max=510.379, sum=1020.759 (2)\", \"tab\": \"General information\", \"score\": \"510.37931034482756\"}", - "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.406, mean=0.406, max=0.406, sum=0.813 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.406455958098696\"}", + "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}", + "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Econometrics - # prompt tokens": "{\"description\": \"min=696.175, mean=696.175, max=696.175, sum=1392.351 (2)\", \"tab\": \"General information\", \"score\": \"696.1754385964912\"}", + "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"electrical_engineering\"", + "subject": "\"econometrics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_electrical_engineering\"" + "groups": "\"mmlu_econometrics\"" } } }, { - "evaluation_name": "Elementary Mathematics", + "evaluation_name": "Global Facts", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -916,36 +890,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.402, + "score": 0.33, "details": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", + "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.676, mean=0.676, max=0.676, sum=1.352 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6761655416438188\"}", - "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}", - "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=622.386, mean=622.386, max=622.386, sum=1244.772 (2)\", \"tab\": \"General information\", \"score\": \"622.3862433862433\"}", - "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.299, mean=0.299, max=0.299, sum=0.598 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.29881003856658933\"}", + "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Global Facts - # prompt tokens": "{\"description\": \"min=492.47, mean=492.47, max=492.47, sum=984.94 (2)\", \"tab\": \"General information\", \"score\": \"492.47\"}", + "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"elementary_mathematics\"", + "subject": "\"global_facts\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_elementary_mathematics\"" + "groups": "\"mmlu_global_facts\"" } } }, { - "evaluation_name": "Formal Logic", + "evaluation_name": "Jurisprudence", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -954,36 +928,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Formal Logic", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.397, + "score": 0.713, "details": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", + "description": "min=0.713, mean=0.713, max=0.713, sum=1.426 (2)", "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.734, mean=0.734, max=0.734, sum=1.467 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7336057802987477\"}", - "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}", - "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Formal Logic - # prompt tokens": "{\"description\": \"min=727.984, mean=727.984, max=727.984, sum=1455.968 (2)\", \"tab\": \"General information\", \"score\": \"727.984126984127\"}", - "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.232, mean=0.232, max=0.232, sum=0.465 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.23237781833719323\"}", + "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}", + "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Jurisprudence - # prompt tokens": "{\"description\": \"min=460.093, mean=460.093, max=460.093, sum=920.185 (2)\", \"tab\": \"General information\", \"score\": \"460.0925925925926\"}", + "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"formal_logic\"", + "subject": "\"jurisprudence\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_formal_logic\"" + "groups": "\"mmlu_jurisprudence\"" } } }, { - "evaluation_name": "High School World History", + "evaluation_name": "Philosophy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -992,114 +966,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on High School World History", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.759, + "score": 0.659, "details": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.519 (2)", + "description": "min=0.659, mean=0.659, max=0.659, sum=1.318 (2)", "tab": "Accuracy", - "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.805, mean=0.805, max=0.805, sum=1.61 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8049156188964843\"}", - "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.44, mean=0.44, max=0.44, sum=0.881 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44036899529067164\"}", - "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.435, mean=0.435, max=0.435, sum=0.869 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4347002100944519\"}", - "High School European History - Observed inference time (s)": "{\"description\": \"min=0.445, mean=0.445, max=0.445, sum=0.891 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4453156341205944\"}", - "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.331, mean=0.331, max=0.331, sum=0.661 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3305177327358361\"}", - "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.545, mean=0.545, max=0.545, sum=1.089 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5445178654527417\"}", - "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.53, mean=0.53, max=0.53, sum=1.061 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5302642871172\"}", - "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.585, mean=0.585, max=0.585, sum=1.169 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5845282289716932\"}", - "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.234, mean=0.234, max=0.234, sum=0.468 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.23408917118521297\"}", - "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.384, mean=0.384, max=0.384, sum=0.768 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3838195042894376\"}", - "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.274, mean=0.274, max=0.274, sum=0.547 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2735835779697523\"}", - "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.654, mean=0.654, max=0.654, sum=1.308 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6539056665367551\"}", - "High School US History - Observed inference time (s)": "{\"description\": \"min=0.942, mean=0.942, max=0.942, sum=1.883 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.9417344308366963\"}", - "High School World History - Observed inference time (s)": "{\"description\": \"min=0.864, mean=0.864, max=0.864, sum=1.727 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8635432951561006\"}", - "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}", - "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Biology - # prompt tokens": "{\"description\": \"min=609.561, mean=609.561, max=609.561, sum=1219.123 (2)\", \"tab\": \"General information\", \"score\": \"609.5612903225806\"}", - "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}", - "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Chemistry - # prompt tokens": "{\"description\": \"min=581.798, mean=581.798, max=581.798, sum=1163.596 (2)\", \"tab\": \"General information\", \"score\": \"581.7980295566502\"}", - "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Computer Science - # prompt tokens": "{\"description\": \"min=997.24, mean=997.24, max=997.24, sum=1994.48 (2)\", \"tab\": \"General information\", \"score\": \"997.24\"}", - "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}", - "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School European History - # prompt tokens": "{\"description\": \"min=3098.109, mean=3098.109, max=3098.109, sum=6196.218 (2)\", \"tab\": \"General information\", \"score\": \"3098.109090909091\"}", - "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}", - "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Geography - # prompt tokens": "{\"description\": \"min=438.207, mean=438.207, max=438.207, sum=876.414 (2)\", \"tab\": \"General information\", \"score\": \"438.2070707070707\"}", - "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}", - "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Government And Politics - # prompt tokens": "{\"description\": \"min=523.808, mean=523.808, max=523.808, sum=1047.617 (2)\", \"tab\": \"General information\", \"score\": \"523.8082901554404\"}", - "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}", - "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=432.815, mean=432.815, max=432.815, sum=865.631 (2)\", \"tab\": \"General information\", \"score\": \"432.81538461538463\"}", - "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}", - "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Mathematics - # prompt tokens": "{\"description\": \"min=593.13, mean=593.13, max=593.13, sum=1186.259 (2)\", \"tab\": \"General information\", \"score\": \"593.1296296296297\"}", - "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}", - "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Microeconomics - # prompt tokens": "{\"description\": \"min=452.345, mean=452.345, max=452.345, sum=904.689 (2)\", \"tab\": \"General information\", \"score\": \"452.34453781512605\"}", - "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}", - "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Physics - # prompt tokens": "{\"description\": \"min=631.775, mean=631.775, max=631.775, sum=1263.55 (2)\", \"tab\": \"General information\", \"score\": \"631.774834437086\"}", - "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}", - "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Psychology - # prompt tokens": "{\"description\": \"min=567.873, mean=567.873, max=567.873, sum=1135.747 (2)\", \"tab\": \"General information\", \"score\": \"567.8733944954129\"}", - "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}", - "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Statistics - # prompt tokens": "{\"description\": \"min=922.644, mean=922.644, max=922.644, sum=1845.287 (2)\", \"tab\": \"General information\", \"score\": \"922.6435185185185\"}", - "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}", - "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School US History - # prompt tokens": "{\"description\": \"min=2486.446, mean=2486.446, max=2486.446, sum=4972.892 (2)\", \"tab\": \"General information\", \"score\": \"2486.4460784313724\"}", - "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}", - "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School World History - # prompt tokens": "{\"description\": \"min=1594.553, mean=1594.553, max=1594.553, sum=3189.105 (2)\", \"tab\": \"General information\", \"score\": \"1594.5527426160338\"}", - "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.899, mean=0.899, max=0.899, sum=1.798 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8987545852109167\"}", + "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}", + "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Philosophy - # prompt tokens": "{\"description\": \"min=382.82, mean=382.82, max=382.82, sum=765.64 (2)\", \"tab\": \"General information\", \"score\": \"382.81993569131834\"}", + "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"high_school_world_history\"", + "subject": "\"philosophy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_high_school_world_history\"" + "groups": "\"mmlu_philosophy\"" } } }, { - "evaluation_name": "Human Sexuality", + "evaluation_name": "Professional Psychology", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1108,42 +1004,54 @@ ] }, "metric_config": { - "evaluation_description": "EM on Human Sexuality", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.702, + "score": 0.641, "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.405 (2)", + "description": "min=0.641, mean=0.641, max=0.641, sum=1.281 (2)", "tab": "Accuracy", - "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.809, mean=0.809, max=0.809, sum=1.618 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8091403518557014\"}", - "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=1.438, mean=1.438, max=1.438, sum=2.875 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.437711750278036\"}", - "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}", - "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Human Aging - # prompt tokens": "{\"description\": \"min=362.152, mean=362.152, max=362.152, sum=724.305 (2)\", \"tab\": \"General information\", \"score\": \"362.15246636771303\"}", - "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}", - "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Human Sexuality - # prompt tokens": "{\"description\": \"min=403.748, mean=403.748, max=403.748, sum=807.496 (2)\", \"tab\": \"General information\", \"score\": \"403.7480916030534\"}", - "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.615, mean=0.615, max=0.615, sum=1.23 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6148438769228318\"}", + "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.825, mean=0.825, max=0.825, sum=1.651 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8254362666015084\"}", + "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.682, mean=0.682, max=0.682, sum=1.364 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.68212915414937\"}", + "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.506, mean=0.506, max=0.506, sum=1.012 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.505940170459498\"}", + "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}", + "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Medicine - # prompt tokens": "{\"description\": \"min=1288.143, mean=1288.143, max=1288.143, sum=2576.287 (2)\", \"tab\": \"General information\", \"score\": \"1288.1433823529412\"}", + "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}", + "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Accounting - # prompt tokens": "{\"description\": \"min=805.496, mean=805.496, max=805.496, sum=1610.993 (2)\", \"tab\": \"General information\", \"score\": \"805.4964539007092\"}", + "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}", + "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Law - # prompt tokens": "{\"description\": \"min=1858.711, mean=1858.711, max=1858.711, sum=3717.421 (2)\", \"tab\": \"General information\", \"score\": \"1858.7105606258149\"}", + "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}", + "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Psychology - # prompt tokens": "{\"description\": \"min=654.278, mean=654.278, max=654.278, sum=1308.556 (2)\", \"tab\": \"General information\", \"score\": \"654.2777777777778\"}", + "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"human_sexuality\"", + "subject": "\"professional_psychology\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_human_sexuality\"" + "groups": "\"mmlu_professional_psychology\"" } } }, { - "evaluation_name": "International Law", + "evaluation_name": "Us Foreign Policy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1152,36 +1060,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on International Law", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.76, + "score": 0.79, "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.521 (2)", + "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", "tab": "Accuracy", - "International Law - Observed inference time (s)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.787 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3933255593638775\"}", - "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}", - "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "International Law - # prompt tokens": "{\"description\": \"min=729.182, mean=729.182, max=729.182, sum=1458.364 (2)\", \"tab\": \"General information\", \"score\": \"729.1818181818181\"}", - "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.487, mean=0.487, max=0.487, sum=0.973 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.48650413513183594\"}", + "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=482.19, mean=482.19, max=482.19, sum=964.38 (2)\", \"tab\": \"General information\", \"score\": \"482.19\"}", + "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"international_law\"", + "subject": "\"us_foreign_policy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_international_law\"" + "groups": "\"mmlu_us_foreign_policy\"" } } }, { - "evaluation_name": "Logical Fallacies", + "evaluation_name": "Astronomy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1190,36 +1098,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Logical Fallacies", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.712, + "score": 0.638, "details": { - "description": "min=0.712, mean=0.712, max=0.712, sum=1.423 (2)", + "description": "min=0.638, mean=0.638, max=0.638, sum=1.276 (2)", "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.848, mean=0.848, max=0.848, sum=1.695 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8476987660296855\"}", - "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}", - "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Logical Fallacies - # prompt tokens": "{\"description\": \"min=495.779, mean=495.779, max=495.779, sum=991.558 (2)\", \"tab\": \"General information\", \"score\": \"495.77914110429447\"}", - "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.678, mean=0.678, max=0.678, sum=1.355 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6775346147386652\"}", + "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}", + "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Astronomy - # prompt tokens": "{\"description\": \"min=674.987, mean=674.987, max=674.987, sum=1349.974 (2)\", \"tab\": \"General information\", \"score\": \"674.9868421052631\"}", + "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"logical_fallacies\"", + "subject": "\"astronomy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_logical_fallacies\"" + "groups": "\"mmlu_astronomy\"" } } }, { - "evaluation_name": "Machine Learning", + "evaluation_name": "Business Ethics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1228,36 +1136,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Machine Learning", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.455, + "score": 0.57, "details": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.911 (2)", + "description": "min=0.57, mean=0.57, max=0.57, sum=1.14 (2)", "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.557, mean=0.557, max=0.557, sum=1.113 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5566470899752208\"}", - "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}", - "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Machine Learning - # prompt tokens": "{\"description\": \"min=743.83, mean=743.83, max=743.83, sum=1487.661 (2)\", \"tab\": \"General information\", \"score\": \"743.8303571428571\"}", - "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.645, mean=0.645, max=0.645, sum=1.289 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6446590375900269\"}", + "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Business Ethics - # prompt tokens": "{\"description\": \"min=653.6, mean=653.6, max=653.6, sum=1307.2 (2)\", \"tab\": \"General information\", \"score\": \"653.6\"}", + "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"machine_learning\"", + "subject": "\"business_ethics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_machine_learning\"" + "groups": "\"mmlu_business_ethics\"" } } }, { - "evaluation_name": "Management", + "evaluation_name": "Clinical Knowledge", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1266,36 +1174,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Management", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.767, + "score": 0.687, "details": { - "description": "min=0.767, mean=0.767, max=0.767, sum=1.534 (2)", + "description": "min=0.687, mean=0.687, max=0.687, sum=1.374 (2)", "tab": "Accuracy", - "Management - Observed inference time (s)": "{\"description\": \"min=0.365, mean=0.365, max=0.365, sum=0.73 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.36507687059420985\"}", - "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}", - "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Management - # prompt tokens": "{\"description\": \"min=324.359, mean=324.359, max=324.359, sum=648.718 (2)\", \"tab\": \"General information\", \"score\": \"324.3592233009709\"}", - "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.844, mean=0.844, max=0.844, sum=1.687 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8436905698956184\"}", + "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}", + "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=496.174, mean=496.174, max=496.174, sum=992.347 (2)\", \"tab\": \"General information\", \"score\": \"496.1735849056604\"}", + "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"management\"", + "subject": "\"clinical_knowledge\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_management\"" + "groups": "\"mmlu_clinical_knowledge\"" } } }, { - "evaluation_name": "Marketing", + "evaluation_name": "Conceptual Physics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1304,36 +1212,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Marketing", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.842, + "score": 0.549, "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", + "description": "min=0.549, mean=0.549, max=0.549, sum=1.098 (2)", "tab": "Accuracy", - "Marketing - Observed inference time (s)": "{\"description\": \"min=0.585, mean=0.585, max=0.585, sum=1.17 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.58499161606161\"}", - "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}", - "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Marketing - # prompt tokens": "{\"description\": \"min=472.423, mean=472.423, max=472.423, sum=944.846 (2)\", \"tab\": \"General information\", \"score\": \"472.4230769230769\"}", - "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.333, mean=0.333, max=0.333, sum=0.666 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.33306963900302317\"}", + "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}", + "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Conceptual Physics - # prompt tokens": "{\"description\": \"min=343.285, mean=343.285, max=343.285, sum=686.57 (2)\", \"tab\": \"General information\", \"score\": \"343.2851063829787\"}", + "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"marketing\"", + "subject": "\"conceptual_physics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_marketing\"" + "groups": "\"mmlu_conceptual_physics\"" } } }, { - "evaluation_name": "Medical Genetics", + "evaluation_name": "Electrical Engineering", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1342,36 +1250,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Medical Genetics", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.75, + "score": 0.572, "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", + "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.268, mean=0.268, max=0.268, sum=0.535 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2675498366355896\"}", - "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Medical Genetics - # prompt tokens": "{\"description\": \"min=414.71, mean=414.71, max=414.71, sum=829.42 (2)\", \"tab\": \"General information\", \"score\": \"414.71\"}", - "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.392, mean=0.392, max=0.392, sum=0.784 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3922290703345989\"}", + "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}", + "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Electrical Engineering - # prompt tokens": "{\"description\": \"min=510.379, mean=510.379, max=510.379, sum=1020.759 (2)\", \"tab\": \"General information\", \"score\": \"510.37931034482756\"}", + "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"medical_genetics\"", + "subject": "\"electrical_engineering\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_medical_genetics\"" + "groups": "\"mmlu_electrical_engineering\"" } } }, { - "evaluation_name": "Miscellaneous", + "evaluation_name": "Elementary Mathematics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1380,36 +1288,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Miscellaneous", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.785, + "score": 0.402, "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.571 (2)", + "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.504, mean=0.504, max=0.504, sum=1.008 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5038632959850599\"}", - "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}", - "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Miscellaneous - # prompt tokens": "{\"description\": \"min=357.519, mean=357.519, max=357.519, sum=715.037 (2)\", \"tab\": \"General information\", \"score\": \"357.51851851851853\"}", - "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.676, mean=0.676, max=0.676, sum=1.352 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6761655416438188\"}", + "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}", + "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=622.386, mean=622.386, max=622.386, sum=1244.772 (2)\", \"tab\": \"General information\", \"score\": \"622.3862433862433\"}", + "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"miscellaneous\"", + "subject": "\"elementary_mathematics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_miscellaneous\"" + "groups": "\"mmlu_elementary_mathematics\"" } } }, { - "evaluation_name": "Moral Scenarios", + "evaluation_name": "Formal Logic", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1418,42 +1326,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Moral Scenarios", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.393, + "score": 0.397, "details": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)", + "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.777, mean=0.777, max=0.777, sum=1.553 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7765735477381359\"}", - "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.493, mean=0.493, max=0.493, sum=0.986 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4927780463042872\"}", - "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}", - "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Moral Disputes - # prompt tokens": "{\"description\": \"min=549.038, mean=549.038, max=549.038, sum=1098.075 (2)\", \"tab\": \"General information\", \"score\": \"549.0375722543353\"}", - "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}", - "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Moral Scenarios - # prompt tokens": "{\"description\": \"min=754.516, mean=754.516, max=754.516, sum=1509.032 (2)\", \"tab\": \"General information\", \"score\": \"754.5162011173185\"}", - "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.734, mean=0.734, max=0.734, sum=1.467 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7336057802987477\"}", + "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}", + "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Formal Logic - # prompt tokens": "{\"description\": \"min=727.984, mean=727.984, max=727.984, sum=1455.968 (2)\", \"tab\": \"General information\", \"score\": \"727.984126984127\"}", + "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"moral_scenarios\"", + "subject": "\"formal_logic\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_moral_scenarios\"" + "groups": "\"mmlu_formal_logic\"" } } }, { - "evaluation_name": "Nutrition", + "evaluation_name": "High School World History", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1462,36 +1364,114 @@ ] }, "metric_config": { - "evaluation_description": "EM on Nutrition", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.676, + "score": 0.759, "details": { - "description": "min=0.676, mean=0.676, max=0.676, sum=1.353 (2)", + "description": "min=0.759, mean=0.759, max=0.759, sum=1.519 (2)", "tab": "Accuracy", - "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.236, mean=0.236, max=0.236, sum=0.471 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.23563866054310517\"}", - "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}", - "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Nutrition - # prompt tokens": "{\"description\": \"min=689.69, mean=689.69, max=689.69, sum=1379.379 (2)\", \"tab\": \"General information\", \"score\": \"689.6895424836601\"}", - "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.805, mean=0.805, max=0.805, sum=1.61 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8049156188964843\"}", + "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.44, mean=0.44, max=0.44, sum=0.881 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44036899529067164\"}", + "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.435, mean=0.435, max=0.435, sum=0.869 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4347002100944519\"}", + "High School European History - Observed inference time (s)": "{\"description\": \"min=0.445, mean=0.445, max=0.445, sum=0.891 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4453156341205944\"}", + "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.331, mean=0.331, max=0.331, sum=0.661 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3305177327358361\"}", + "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.545, mean=0.545, max=0.545, sum=1.089 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5445178654527417\"}", + "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.53, mean=0.53, max=0.53, sum=1.061 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5302642871172\"}", + "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.585, mean=0.585, max=0.585, sum=1.169 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5845282289716932\"}", + "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.234, mean=0.234, max=0.234, sum=0.468 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.23408917118521297\"}", + "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.384, mean=0.384, max=0.384, sum=0.768 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3838195042894376\"}", + "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.274, mean=0.274, max=0.274, sum=0.547 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2735835779697523\"}", + "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.654, mean=0.654, max=0.654, sum=1.308 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6539056665367551\"}", + "High School US History - Observed inference time (s)": "{\"description\": \"min=0.942, mean=0.942, max=0.942, sum=1.883 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.9417344308366963\"}", + "High School World History - Observed inference time (s)": "{\"description\": \"min=0.864, mean=0.864, max=0.864, sum=1.727 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8635432951561006\"}", + "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}", + "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Biology - # prompt tokens": "{\"description\": \"min=609.561, mean=609.561, max=609.561, sum=1219.123 (2)\", \"tab\": \"General information\", \"score\": \"609.5612903225806\"}", + "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}", + "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Chemistry - # prompt tokens": "{\"description\": \"min=581.798, mean=581.798, max=581.798, sum=1163.596 (2)\", \"tab\": \"General information\", \"score\": \"581.7980295566502\"}", + "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Computer Science - # prompt tokens": "{\"description\": \"min=997.24, mean=997.24, max=997.24, sum=1994.48 (2)\", \"tab\": \"General information\", \"score\": \"997.24\"}", + "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}", + "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School European History - # prompt tokens": "{\"description\": \"min=3098.109, mean=3098.109, max=3098.109, sum=6196.218 (2)\", \"tab\": \"General information\", \"score\": \"3098.109090909091\"}", + "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}", + "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Geography - # prompt tokens": "{\"description\": \"min=438.207, mean=438.207, max=438.207, sum=876.414 (2)\", \"tab\": \"General information\", \"score\": \"438.2070707070707\"}", + "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}", + "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Government And Politics - # prompt tokens": "{\"description\": \"min=523.808, mean=523.808, max=523.808, sum=1047.617 (2)\", \"tab\": \"General information\", \"score\": \"523.8082901554404\"}", + "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}", + "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=432.815, mean=432.815, max=432.815, sum=865.631 (2)\", \"tab\": \"General information\", \"score\": \"432.81538461538463\"}", + "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}", + "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Mathematics - # prompt tokens": "{\"description\": \"min=593.13, mean=593.13, max=593.13, sum=1186.259 (2)\", \"tab\": \"General information\", \"score\": \"593.1296296296297\"}", + "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}", + "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Microeconomics - # prompt tokens": "{\"description\": \"min=452.345, mean=452.345, max=452.345, sum=904.689 (2)\", \"tab\": \"General information\", \"score\": \"452.34453781512605\"}", + "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}", + "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Physics - # prompt tokens": "{\"description\": \"min=631.775, mean=631.775, max=631.775, sum=1263.55 (2)\", \"tab\": \"General information\", \"score\": \"631.774834437086\"}", + "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}", + "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Psychology - # prompt tokens": "{\"description\": \"min=567.873, mean=567.873, max=567.873, sum=1135.747 (2)\", \"tab\": \"General information\", \"score\": \"567.8733944954129\"}", + "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}", + "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Statistics - # prompt tokens": "{\"description\": \"min=922.644, mean=922.644, max=922.644, sum=1845.287 (2)\", \"tab\": \"General information\", \"score\": \"922.6435185185185\"}", + "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}", + "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School US History - # prompt tokens": "{\"description\": \"min=2486.446, mean=2486.446, max=2486.446, sum=4972.892 (2)\", \"tab\": \"General information\", \"score\": \"2486.4460784313724\"}", + "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}", + "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School World History - # prompt tokens": "{\"description\": \"min=1594.553, mean=1594.553, max=1594.553, sum=3189.105 (2)\", \"tab\": \"General information\", \"score\": \"1594.5527426160338\"}", + "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"nutrition\"", + "subject": "\"high_school_world_history\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_nutrition\"" + "groups": "\"mmlu_high_school_world_history\"" } } }, { - "evaluation_name": "Prehistory", + "evaluation_name": "Human Sexuality", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1500,36 +1480,42 @@ ] }, "metric_config": { - "evaluation_description": "EM on Prehistory", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.673, + "score": 0.702, "details": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.346 (2)", + "description": "min=0.702, mean=0.702, max=0.702, sum=1.405 (2)", "tab": "Accuracy", - "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.345, mean=0.345, max=0.345, sum=0.69 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.34476134880089465\"}", - "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}", - "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Prehistory - # prompt tokens": "{\"description\": \"min=611.145, mean=611.145, max=611.145, sum=1222.29 (2)\", \"tab\": \"General information\", \"score\": \"611.145061728395\"}", - "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.809, mean=0.809, max=0.809, sum=1.618 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8091403518557014\"}", + "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=1.438, mean=1.438, max=1.438, sum=2.875 (2)\", \"tab\": \"Efficiency\", \"score\": \"1.437711750278036\"}", + "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}", + "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Human Aging - # prompt tokens": "{\"description\": \"min=362.152, mean=362.152, max=362.152, sum=724.305 (2)\", \"tab\": \"General information\", \"score\": \"362.15246636771303\"}", + "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}", + "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Human Sexuality - # prompt tokens": "{\"description\": \"min=403.748, mean=403.748, max=403.748, sum=807.496 (2)\", \"tab\": \"General information\", \"score\": \"403.7480916030534\"}", + "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"prehistory\"", + "subject": "\"human_sexuality\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_prehistory\"" + "groups": "\"mmlu_human_sexuality\"" } } }, { - "evaluation_name": "Public Relations", + "evaluation_name": "International Law", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1538,36 +1524,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Public Relations", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.636, + "score": 0.76, "details": { - "description": "min=0.636, mean=0.636, max=0.636, sum=1.273 (2)", + "description": "min=0.76, mean=0.76, max=0.76, sum=1.521 (2)", "tab": "Accuracy", - "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.327, mean=0.327, max=0.327, sum=0.654 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3271717678416859\"}", - "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}", - "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Public Relations - # prompt tokens": "{\"description\": \"min=471.036, mean=471.036, max=471.036, sum=942.073 (2)\", \"tab\": \"General information\", \"score\": \"471.03636363636366\"}", - "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "International Law - Observed inference time (s)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.787 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3933255593638775\"}", + "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}", + "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "International Law - # prompt tokens": "{\"description\": \"min=729.182, mean=729.182, max=729.182, sum=1458.364 (2)\", \"tab\": \"General information\", \"score\": \"729.1818181818181\"}", + "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"public_relations\"", + "subject": "\"international_law\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_public_relations\"" + "groups": "\"mmlu_international_law\"" } } }, { - "evaluation_name": "Security Studies", + "evaluation_name": "Logical Fallacies", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1576,36 +1562,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Security Studies", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.682, + "score": 0.712, "details": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.363 (2)", + "description": "min=0.712, mean=0.712, max=0.712, sum=1.423 (2)", "tab": "Accuracy", - "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.561, mean=0.561, max=0.561, sum=1.121 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5606838294437954\"}", - "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}", - "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Security Studies - # prompt tokens": "{\"description\": \"min=1324.865, mean=1324.865, max=1324.865, sum=2649.731 (2)\", \"tab\": \"General information\", \"score\": \"1324.865306122449\"}", - "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.848, mean=0.848, max=0.848, sum=1.695 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.8476987660296855\"}", + "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}", + "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Logical Fallacies - # prompt tokens": "{\"description\": \"min=495.779, mean=495.779, max=495.779, sum=991.558 (2)\", \"tab\": \"General information\", \"score\": \"495.77914110429447\"}", + "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"security_studies\"", + "subject": "\"logical_fallacies\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_security_studies\"" + "groups": "\"mmlu_logical_fallacies\"" } } }, { - "evaluation_name": "Sociology", + "evaluation_name": "Machine Learning", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1614,36 +1600,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Sociology", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.806, + "score": 0.455, "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.612 (2)", + "description": "min=0.455, mean=0.455, max=0.455, sum=0.911 (2)", "tab": "Accuracy", - "Sociology - Observed inference time (s)": "{\"description\": \"min=0.413, mean=0.413, max=0.413, sum=0.825 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41272182962787685\"}", - "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}", - "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Sociology - # prompt tokens": "{\"description\": \"min=496.95, mean=496.95, max=496.95, sum=993.9 (2)\", \"tab\": \"General information\", \"score\": \"496.9502487562189\"}", - "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.557, mean=0.557, max=0.557, sum=1.113 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5566470899752208\"}", + "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}", + "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Machine Learning - # prompt tokens": "{\"description\": \"min=743.83, mean=743.83, max=743.83, sum=1487.661 (2)\", \"tab\": \"General information\", \"score\": \"743.8303571428571\"}", + "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"sociology\"", + "subject": "\"machine_learning\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_sociology\"" + "groups": "\"mmlu_machine_learning\"" } } }, { - "evaluation_name": "Virology", + "evaluation_name": "Management", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1652,36 +1638,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Virology", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.47, + "score": 0.767, "details": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", + "description": "min=0.767, mean=0.767, max=0.767, sum=1.534 (2)", "tab": "Accuracy", - "Virology - Observed inference time (s)": "{\"description\": \"min=0.644, mean=0.644, max=0.644, sum=1.288 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6437842285776713\"}", - "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}", - "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Virology - # prompt tokens": "{\"description\": \"min=404.349, mean=404.349, max=404.349, sum=808.699 (2)\", \"tab\": \"General information\", \"score\": \"404.34939759036143\"}", - "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Management - Observed inference time (s)": "{\"description\": \"min=0.365, mean=0.365, max=0.365, sum=0.73 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.36507687059420985\"}", + "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}", + "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Management - # prompt tokens": "{\"description\": \"min=324.359, mean=324.359, max=324.359, sum=648.718 (2)\", \"tab\": \"General information\", \"score\": \"324.3592233009709\"}", + "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"virology\"", + "subject": "\"management\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_virology\"" + "groups": "\"mmlu_management\"" } } }, { - "evaluation_name": "World Religions", + "evaluation_name": "Marketing", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1690,36 +1676,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on World Religions", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.825, + "score": 0.842, "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.649 (2)", + "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", "tab": "Accuracy", - "World Religions - Observed inference time (s)": "{\"description\": \"min=0.266, mean=0.266, max=0.266, sum=0.532 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.26615772330970094\"}", - "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}", - "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "World Religions - # prompt tokens": "{\"description\": \"min=317.924, mean=317.924, max=317.924, sum=635.848 (2)\", \"tab\": \"General information\", \"score\": \"317.92397660818716\"}", - "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Marketing - Observed inference time (s)": "{\"description\": \"min=0.585, mean=0.585, max=0.585, sum=1.17 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.58499161606161\"}", + "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}", + "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Marketing - # prompt tokens": "{\"description\": \"min=472.423, mean=472.423, max=472.423, sum=944.846 (2)\", \"tab\": \"General information\", \"score\": \"472.4230769230769\"}", + "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"world_religions\"", + "subject": "\"marketing\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_world_religions\"" + "groups": "\"mmlu_marketing\"" } } }, { - "evaluation_name": "Mean win rate", + "evaluation_name": "Medical Genetics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1728,404 +1714,418 @@ ] }, "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.509, + "score": 0.75, "details": { - "description": "", - "tab": "Efficiency" + "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", + "tab": "Accuracy", + "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.268, mean=0.268, max=0.268, sum=0.535 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.2675498366355896\"}", + "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Medical Genetics - # prompt tokens": "{\"description\": \"min=414.71, mean=414.71, max=414.71, sum=829.42 (2)\", \"tab\": \"General information\", \"score\": \"414.71\"}", + "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"medical_genetics\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_medical_genetics\"" + } } - } - ], - "detailed_evaluation_results": null, - "generation_config": { - "additional_details": { - "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" - } - } - }, - { - "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1774096306.427425", - "retrieved_timestamp": "1774096306.427425", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "helm", - "version": "unknown" - }, - "benchmark": "helm_lite", - "evaluation_results": [ + }, { - "evaluation_name": "Mean win rate", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "helm_lite", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.196, + "score": 0.785, "details": { - "description": "", + "description": "min=0.785, mean=0.785, max=0.785, sum=1.571 (2)", "tab": "Accuracy", - "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.6493133583021223\"}", - "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}" + "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.504, mean=0.504, max=0.504, sum=1.008 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5038632959850599\"}", + "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}", + "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Miscellaneous - # prompt tokens": "{\"description\": \"min=357.519, mean=357.519, max=357.519, sum=715.037 (2)\", \"tab\": \"General information\", \"score\": \"357.51851851851853\"}", + "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"miscellaneous\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_miscellaneous\"" + } } }, { - "evaluation_name": "NarrativeQA", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "NarrativeQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "F1 on NarrativeQA", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.716, + "score": 0.393, "details": { - "description": "min=0.716, mean=0.716, max=0.716, sum=0.716 (1)", + "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)", "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.813, mean=0.813, max=0.813, sum=0.813 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.8132137520212522\"}", - "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", - "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NarrativeQA - # prompt tokens": "{\"description\": \"min=3924.33, mean=3924.33, max=3924.33, sum=3924.33 (1)\", \"tab\": \"General information\", \"score\": \"3924.3295774647886\"}", - "NarrativeQA - # output tokens": "{\"description\": \"min=7.107, mean=7.107, max=7.107, sum=7.107 (1)\", \"tab\": \"General information\", \"score\": \"7.107042253521127\"}" + "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.777, mean=0.777, max=0.777, sum=1.553 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7765735477381359\"}", + "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.493, mean=0.493, max=0.493, sum=0.986 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4927780463042872\"}", + "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}", + "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Moral Disputes - # prompt tokens": "{\"description\": \"min=549.038, mean=549.038, max=549.038, sum=1098.075 (2)\", \"tab\": \"General information\", \"score\": \"549.0375722543353\"}", + "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}", + "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Moral Scenarios - # prompt tokens": "{\"description\": \"min=754.516, mean=754.516, max=754.516, sum=1509.032 (2)\", \"tab\": \"General information\", \"score\": \"754.5162011173185\"}", + "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"moral_scenarios\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_moral_scenarios\"" + } } }, { - "evaluation_name": "NaturalQuestions (closed-book)", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.253, + "score": 0.676, "details": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.253 (1)", + "description": "min=0.676, mean=0.676, max=0.676, sum=1.353 (2)", "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.563, mean=0.563, max=0.563, sum=0.563 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5634698050022126\"}", - "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.535, mean=0.535, max=0.535, sum=0.535 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5347676448822022\"}", - "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "NaturalQuestions (open-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=2498.79, mean=2498.79, max=2498.79, sum=2498.79 (1)\", \"tab\": \"General information\", \"score\": \"2498.79\"}", - "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=12.448, mean=12.448, max=12.448, sum=12.448 (1)\", \"tab\": \"General information\", \"score\": \"12.448\"}", - "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=172.069, mean=172.069, max=172.069, sum=172.069 (1)\", \"tab\": \"General information\", \"score\": \"172.069\"}", - "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=20.461, mean=20.461, max=20.461, sum=20.461 (1)\", \"tab\": \"General information\", \"score\": \"20.461\"}" + "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.236, mean=0.236, max=0.236, sum=0.471 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.23563866054310517\"}", + "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}", + "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Nutrition - # prompt tokens": "{\"description\": \"min=689.69, mean=689.69, max=689.69, sum=1379.379 (2)\", \"tab\": \"General information\", \"score\": \"689.6895424836601\"}", + "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "mode": "\"closedbook\"" + "subject": "\"nutrition\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_nutrition\"" } } }, { - "evaluation_name": "OpenbookQA", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "OpenbookQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on OpenbookQA", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.79, + "score": 0.673, "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=0.79 (1)", + "description": "min=0.673, mean=0.673, max=0.673, sum=1.346 (2)", "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.256, mean=0.256, max=0.256, sum=0.256 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.25593132400512697\"}", - "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}", - "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "OpenbookQA - # prompt tokens": "{\"description\": \"min=289.15, mean=289.15, max=289.15, sum=289.15 (1)\", \"tab\": \"General information\", \"score\": \"289.15\"}", - "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.345, mean=0.345, max=0.345, sum=0.69 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.34476134880089465\"}", + "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}", + "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Prehistory - # prompt tokens": "{\"description\": \"min=611.145, mean=611.145, max=611.145, sum=1222.29 (2)\", \"tab\": \"General information\", \"score\": \"611.145061728395\"}", + "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "dataset": "\"openbookqa\"", - "method": "\"multiple_choice_joint\"" + "subject": "\"prehistory\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_prehistory\"" } } }, { - "evaluation_name": "MMLU", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "MMLU", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on MMLU", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.51, + "score": 0.636, "details": { - "description": "min=0.27, mean=0.51, max=0.79, sum=2.551 (5)", + "description": "min=0.636, mean=0.636, max=0.636, sum=1.273 (2)", "tab": "Accuracy", - "MMLU - Observed inference time (s)": "{\"description\": \"min=0.221, mean=0.372, max=0.487, sum=1.862 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.37230395750413864\"}", - "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", - "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MMLU - # prompt tokens": "{\"description\": \"min=411.44, mean=532.091, max=696.175, sum=2660.455 (5)\", \"tab\": \"General information\", \"score\": \"532.0910877192983\"}", - "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.327, mean=0.327, max=0.327, sum=0.654 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3271717678416859\"}", + "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}", + "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Public Relations - # prompt tokens": "{\"description\": \"min=471.036, mean=471.036, max=471.036, sum=942.073 (2)\", \"tab\": \"General information\", \"score\": \"471.03636363636366\"}", + "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]", - "method": "\"multiple_choice_joint\"" + "subject": "\"public_relations\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_public_relations\"" } } }, { - "evaluation_name": "MATH", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "MATH", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.289, + "score": 0.682, "details": { - "description": "min=0.115, mean=0.289, max=0.477, sum=2.02 (7)", + "description": "min=0.682, mean=0.682, max=0.682, sum=1.363 (2)", "tab": "Accuracy", - "MATH - Observed inference time (s)": "{\"description\": \"min=2.027, mean=2.656, max=3.039, sum=18.593 (7)\", \"tab\": \"Efficiency\", \"score\": \"2.656151831465352\"}", - "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}", - "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}", - "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MATH - # prompt tokens": "{\"description\": \"min=991.615, mean=1455.266, max=2502.962, sum=10186.865 (7)\", \"tab\": \"General information\", \"score\": \"1455.2664139976257\"}", - "MATH - # output tokens": "{\"description\": \"min=123.616, mean=149.99, max=172.789, sum=1049.933 (7)\", \"tab\": \"General information\", \"score\": \"149.99043902740354\"}" + "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.561, mean=0.561, max=0.561, sum=1.121 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5606838294437954\"}", + "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}", + "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Security Studies - # prompt tokens": "{\"description\": \"min=1324.865, mean=1324.865, max=1324.865, sum=2649.731 (2)\", \"tab\": \"General information\", \"score\": \"1324.865306122449\"}", + "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]", - "level": "\"1\"", - "use_official_examples": "\"False\"", - "use_chain_of_thought": "\"True\"" + "subject": "\"security_studies\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_security_studies\"" } } }, { - "evaluation_name": "GSM8K", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "GSM8K", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on GSM8K", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.538, + "score": 0.806, "details": { - "description": "min=0.538, mean=0.538, max=0.538, sum=0.538 (1)", + "description": "min=0.806, mean=0.806, max=0.806, sum=1.612 (2)", "tab": "Accuracy", - "GSM8K - Observed inference time (s)": "{\"description\": \"min=3.95, mean=3.95, max=3.95, sum=3.95 (1)\", \"tab\": \"Efficiency\", \"score\": \"3.949965229511261\"}", - "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "GSM8K - # prompt tokens": "{\"description\": \"min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)\", \"tab\": \"General information\", \"score\": \"1187.268\"}", - "GSM8K - # output tokens": "{\"description\": \"min=196.611, mean=196.611, max=196.611, sum=196.611 (1)\", \"tab\": \"General information\", \"score\": \"196.611\"}" + "Sociology - Observed inference time (s)": "{\"description\": \"min=0.413, mean=0.413, max=0.413, sum=0.825 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41272182962787685\"}", + "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}", + "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Sociology - # prompt tokens": "{\"description\": \"min=496.95, mean=496.95, max=496.95, sum=993.9 (2)\", \"tab\": \"General information\", \"score\": \"496.9502487562189\"}", + "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "stop": "\"none\"" + "subject": "\"sociology\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_sociology\"" } } }, { - "evaluation_name": "LegalBench", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "LegalBench", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on LegalBench", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.331, + "score": 0.47, "details": { - "description": "min=0.063, mean=0.331, max=0.733, sum=1.655 (5)", + "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", "tab": "Accuracy", - "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.316, mean=0.489, max=0.855, sum=2.444 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.4887186054518059\"}", - "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}", - "LegalBench - # train": "{\"description\": \"min=4, mean=4.8, max=5, sum=24 (5)\", \"tab\": \"General information\", \"score\": \"4.8\"}", - "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "LegalBench - # prompt tokens": "{\"description\": \"min=236.453, mean=1750.748, max=7224.488, sum=8753.741 (5)\", \"tab\": \"General information\", \"score\": \"1750.7482458432962\"}", - "LegalBench - # output tokens": "{\"description\": \"min=2, mean=9.174, max=15.242, sum=45.871 (5)\", \"tab\": \"General information\", \"score\": \"9.17419274343898\"}" + "Virology - Observed inference time (s)": "{\"description\": \"min=0.644, mean=0.644, max=0.644, sum=1.288 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6437842285776713\"}", + "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}", + "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Virology - # prompt tokens": "{\"description\": \"min=404.349, mean=404.349, max=404.349, sum=808.699 (2)\", \"tab\": \"General information\", \"score\": \"404.34939759036143\"}", + "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]" + "subject": "\"virology\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_virology\"" } } }, { - "evaluation_name": "MedQA", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "MedQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on MedQA", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.517, + "score": 0.825, "details": { - "description": "min=0.517, mean=0.517, max=0.517, sum=0.517 (1)", + "description": "min=0.825, mean=0.825, max=0.825, sum=1.649 (2)", "tab": "Accuracy", - "MedQA - Observed inference time (s)": "{\"description\": \"min=0.418, mean=0.418, max=0.418, sum=0.418 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4182186216767692\"}", - "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}", - "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MedQA - # prompt tokens": "{\"description\": \"min=1202.093, mean=1202.093, max=1202.093, sum=1202.093 (1)\", \"tab\": \"General information\", \"score\": \"1202.0934393638172\"}", - "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "World Religions - Observed inference time (s)": "{\"description\": \"min=0.266, mean=0.266, max=0.266, sum=0.532 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.26615772330970094\"}", + "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}", + "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "World Religions - # prompt tokens": "{\"description\": \"min=317.924, mean=317.924, max=317.924, sum=635.848 (2)\", \"tab\": \"General information\", \"score\": \"317.92397660818716\"}", + "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"world_religions\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_world_religions\"" + } } }, { - "evaluation_name": "WMT 2014", + "evaluation_name": "Mean win rate", "source_data": { - "dataset_name": "WMT 2014", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", + "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.142, + "score": 0.509, "details": { - "description": "min=0.047, mean=0.142, max=0.184, sum=0.712 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=0.582, mean=0.775, max=0.872, sum=3.875 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.7750062139801958\"}", - "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}", - "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "WMT 2014 - # prompt tokens": "{\"description\": \"min=148.306, mean=162.433, max=181.018, sum=812.166 (5)\", \"tab\": \"General information\", \"score\": \"162.43317355482492\"}", - "WMT 2014 - # output tokens": "{\"description\": \"min=28.3, mean=30.51, max=31.912, sum=152.552 (5)\", \"tab\": \"General information\", \"score\": \"30.510483732222053\"}" + "description": "", + "tab": "Efficiency" } }, "generation_config": { - "additional_details": { - "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]" - } + "additional_details": {} } } ], "detailed_evaluation_results": null, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" + } } }, { diff --git a/data/models/mistralai_mistral-small-2503.json b/data/models/mistralai_mistral-small-2503.json index be5d73de7278abb3c747dbebd44b60d3fa624503..6df0d972b005ae32b060fdc4673d6092e770670f 100644 --- a/data/models/mistralai_mistral-small-2503.json +++ b/data/models/mistralai_mistral-small-2503.json @@ -10,8 +10,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/mistralai_mistral-small-2503/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/mistralai_mistral-small-2503/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -525,8 +525,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/mistralai_mistral-small-2503/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/mistralai_mistral-small-2503/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/models/mistralai_mixtral-8x7b-v0.1.json b/data/models/mistralai_mixtral-8x7b-v0.1.json index 9d997e3527157e47894ae0f49b424a1634297e78..c3ac844f0072de0a86748b754d6c90570298768c 100644 --- a/data/models/mistralai_mixtral-8x7b-v0.1.json +++ b/data/models/mistralai_mixtral-8x7b-v0.1.json @@ -5,7 +5,7 @@ "developer": "mistralai", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "MixtralForCausalLM", "params_billions": "46.703" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2415 + "score": 0.2326 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5087 + "score": 0.5098 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.102 + "score": 0.0937 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3138 + "score": 0.3205 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4321 + "score": 0.4413 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.385 + "score": 0.3871 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2326 + "score": 0.2415 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5098 + "score": 0.5087 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0937 + "score": 0.102 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3205 + "score": 0.3138 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4413 + "score": 0.4321 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3871 + "score": 0.385 } } ], diff --git a/data/models/mlabonne_neuraldaredevil-8b-abliterated.json b/data/models/mlabonne_neuraldaredevil-8b-abliterated.json index d443de39bb7ed82b00df80190432e583c21fd660..7ef165972eafdeec56d923c82e02fdbbc9479eac 100644 --- a/data/models/mlabonne_neuraldaredevil-8b-abliterated.json +++ b/data/models/mlabonne_neuraldaredevil-8b-abliterated.json @@ -5,7 +5,7 @@ "developer": "mlabonne", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": "8.03" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7561 + "score": 0.4162 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5111 + "score": 0.5124 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0906 + "score": 0.0853 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3062 + "score": 0.3029 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4019 + "score": 0.415 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3841 + "score": 0.3802 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4162 + "score": 0.7561 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5124 + "score": 0.5111 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0853 + "score": 0.0906 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3029 + "score": 0.3062 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.415 + "score": 0.4019 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3802 + "score": 0.3841 } } ], diff --git a/data/models/moonshot-ai_kimi-k2-instruct.json b/data/models/moonshot-ai_kimi-k2-instruct.json index 2cfded0145e5c6821159f45b392f6b86e15c7f49..758984500ae56b028445a56fb8562c74a90a3a8f 100644 --- a/data/models/moonshot-ai_kimi-k2-instruct.json +++ b/data/models/moonshot-ai_kimi-k2-instruct.json @@ -4,13 +4,13 @@ "id": "moonshot-ai/kimi-k2-instruct", "developer": "Moonshot AI", "additional_details": { - "agent_name": "Terminus 2", - "agent_organization": "Terminal Bench" + "agent_name": "OpenHands", + "agent_organization": "OpenHands" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/terminus-2__kimi-k2-instruct/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/openhands__kimi-k2-instruct/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-01", + "evaluation_timestamp": "2025-11-02", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 27.8, + "score": 26.7, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.7 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Kimi K2 Instruct\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Kimi K2 Instruct\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Kimi K2 Instruct\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Kimi K2 Instruct\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/openhands__kimi-k2-instruct/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__kimi-k2-instruct/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-02", + "evaluation_timestamp": "2025-11-01", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 26.7, + "score": 27.8, "uncertainty": { "standard_error": { - "value": 2.7 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Kimi K2 Instruct\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Kimi K2 Instruct\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"Kimi K2 Instruct\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Kimi K2 Instruct\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/multiple_multiple.json b/data/models/multiple_multiple.json index 0a0e35b1e041b4e94e4f88e456a39900b87f0e44..5fae5f56e342c454584c0143a055e41581a00848 100644 --- a/data/models/multiple_multiple.json +++ b/data/models/multiple_multiple.json @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-11", + "evaluation_timestamp": "2025-11-20", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,10 +43,10 @@ "max_score": 100.0 }, "score_details": { - "score": 50.1, + "score": 59.1, "uncertainty": { "standard_error": { - "value": 2.7 + "value": 2.8 }, "num_samples": 435 } @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/warp__multiple/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/junie-cli__multiple/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-20", + "evaluation_timestamp": "2026-03-07", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 59.1, + "score": 71.0, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Multiple\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Multiple\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/warp__multiple/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/abacus-ai-desktop__multiple/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-12", + "evaluation_timestamp": "2025-12-11", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 61.2, + "score": 58.4, "uncertainty": { "standard_error": { - "value": 3.0 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Abacus AI Desktop\" -m \"Multiple\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Abacus AI Desktop\" -m \"Multiple\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/abacus-ai-desktop__multiple/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/warp__multiple/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-11", + "evaluation_timestamp": "2025-12-12", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,17 +265,17 @@ "max_score": 100.0 }, "score_details": { - "score": 58.4, + "score": 61.2, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 3.0 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Abacus AI Desktop\" -m \"Multiple\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +292,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Abacus AI Desktop\" -m \"Multiple\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -306,7 +306,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/ob-1__multiple/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/warp__multiple/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -330,7 +330,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-03-05", + "evaluation_timestamp": "2025-11-11", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -339,17 +339,17 @@ "max_score": 100.0 }, "score_details": { - "score": 72.4, + "score": 50.1, "uncertainty": { "standard_error": { - "value": 2.3 + "value": 2.7 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OB-1\" -m \"Multiple\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -366,7 +366,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OB-1\" -m \"Multiple\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Warp\" -m \"Multiple\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -380,7 +380,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/junie-cli__multiple/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/ob-1__multiple/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -404,7 +404,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-03-07", + "evaluation_timestamp": "2026-03-05", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -413,17 +413,17 @@ "max_score": 100.0 }, "score_details": { - "score": 71.0, + "score": 72.4, "uncertainty": { "standard_error": { - "value": 2.9 + "value": 2.3 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Multiple\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OB-1\" -m \"Multiple\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -440,7 +440,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Junie CLI\" -m \"Multiple\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OB-1\" -m \"Multiple\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/nazimali_mistral-nemo-kurdish-instruct.json b/data/models/nazimali_mistral-nemo-kurdish-instruct.json index bf12d1ac4ce4431e1cc4657892f1de48dd5df10b..7bcf436e7a29205a95d8b228b8e52bcfb9264e7a 100644 --- a/data/models/nazimali_mistral-nemo-kurdish-instruct.json +++ b/data/models/nazimali_mistral-nemo-kurdish-instruct.json @@ -5,7 +5,7 @@ "developer": "nazimali", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "MistralForCausalLM", "params_billions": "12.248" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.486 + "score": 0.4964 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4721 + "score": 0.4699 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0846 + "score": 0.0045 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2844 + "score": 0.2827 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4006 + "score": 0.3979 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3087 + "score": 0.3063 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4964 + "score": 0.486 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4699 + "score": 0.4721 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0045 + "score": 0.0846 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2827 + "score": 0.2844 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3979 + "score": 0.4006 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3063 + "score": 0.3087 } } ], diff --git a/data/models/nicolinho_qrm-gemma-2-27b.json b/data/models/nicolinho_qrm-gemma-2-27b.json index 98185886d3c230ddcd90456c69a5aeed49795fc5..1dea90f885df9d34139a9ef21e55b3dcce1a25fd 100644 --- a/data/models/nicolinho_qrm-gemma-2-27b.json +++ b/data/models/nicolinho_qrm-gemma-2-27b.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/nicolinho_QRM-Gemma-2-27B/1766412838.146816", + "evaluation_id": "reward-bench/nicolinho_QRM-Gemma-2-27B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,104 +31,128 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7667 + "score": 0.9444 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7853 + "score": 0.9665 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3719 + "score": 0.9013 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6995 + "score": 0.927 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9578 + "score": 0.9826 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench-2/nicolinho_QRM-Gemma-2-27B/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench 2", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Focus", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9535 + "score": 0.7667 }, "source_data": { "dataset_name": "RewardBench 2", @@ -137,135 +161,111 @@ } }, { - "evaluation_name": "Ties", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8321 + "score": 0.7853 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench/nicolinho_QRM-Gemma-2-27B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9444 + "score": 0.3719 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9665 + "score": 0.6995 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9013 + "score": 0.9578 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.927 + "score": 0.9535 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9826 + "score": 0.8321 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/models/nicolinho_qrm-llama3.1-8b-v2.json b/data/models/nicolinho_qrm-llama3.1-8b-v2.json index 0df8878cad15f33ea391f78f6a5406e147177591..71e586c5d191f366e8b76150e58f0f9807a69f6b 100644 --- a/data/models/nicolinho_qrm-llama3.1-8b-v2.json +++ b/data/models/nicolinho_qrm-llama3.1-8b-v2.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/nicolinho_QRM-Llama3.1-8B-v2/1766412838.146816", + "evaluation_id": "reward-bench/nicolinho_QRM-Llama3.1-8B-v2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,104 +31,128 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7074 + "score": 0.9314 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6653 + "score": 0.9637 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4062 + "score": 0.8684 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.612 + "score": 0.9257 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9467 + "score": 0.9677 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench-2/nicolinho_QRM-Llama3.1-8B-v2/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench 2", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Focus", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8909 + "score": 0.7074 }, "source_data": { "dataset_name": "RewardBench 2", @@ -137,135 +161,111 @@ } }, { - "evaluation_name": "Ties", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7234 + "score": 0.6653 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench/nicolinho_QRM-Llama3.1-8B-v2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9314 + "score": 0.4062 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9637 + "score": 0.612 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8684 + "score": 0.9467 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9257 + "score": 0.8909 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9677 + "score": 0.7234 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/models/nisten_franqwenstein-35b.json b/data/models/nisten_franqwenstein-35b.json index a3333a53a7553456405aceea78020ff40ff402f8..3a70722384deca25a07bb44eb7942735ee94aa9c 100644 --- a/data/models/nisten_franqwenstein-35b.json +++ b/data/models/nisten_franqwenstein-35b.json @@ -5,7 +5,7 @@ "developer": "nisten", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "Qwen2ForCausalLM", "params_billions": "34.714" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3799 + "score": 0.3914 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6647 + "score": 0.6591 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3406 + "score": 0.3044 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4035 + "score": 0.3591 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.494 + "score": 0.4681 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5731 + "score": 0.5611 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3914 + "score": 0.3799 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6591 + "score": 0.6647 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3044 + "score": 0.3406 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3591 + "score": 0.4035 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4681 + "score": 0.494 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5611 + "score": 0.5731 } } ], diff --git a/data/models/nousresearch_yarn-llama-2-7b-128k.json b/data/models/nousresearch_yarn-llama-2-7b-128k.json deleted file mode 100644 index 031faa4d0ff78244e61988ea0bfc24cefdd224de..0000000000000000000000000000000000000000 --- a/data/models/nousresearch_yarn-llama-2-7b-128k.json +++ /dev/null @@ -1,145 +0,0 @@ -{ - "model_info": { - "name": "Yarn-Llama-2-7b-128k", - "id": "NousResearch/Yarn-Llama-2-7b-128k", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": "7.0" - } - }, - "evaluations": [ - { - "evaluation_id": "hfopenllm_v2/NousResearch_Yarn-Llama-2-7b-128k/1773936498.240187", - "retrieved_timestamp": "1773936498.240187", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "lm-evaluation-harness", - "version": "0.4.0", - "additional_details": { - "fork": "https://github.com/huggingface/lm-evaluation-harness/tree/adding_all_changess" - } - }, - "benchmark": "hfopenllm_v2", - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1485 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3967 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1791 - } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - } - ] -} \ No newline at end of file diff --git a/data/models/omkar1102_code-yi.json b/data/models/omkar1102_code-yi.json index c43a8e6f44d50964e1b475e97cca0076acc3fcc2..420be452467271270dd617b6ac43e82dcaa608b1 100644 --- a/data/models/omkar1102_code-yi.json +++ b/data/models/omkar1102_code-yi.json @@ -5,7 +5,7 @@ "developer": "Omkar1102", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": "2.084" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2148 + "score": 0.2254 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.276 + "score": 0.275 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2508 + "score": 0.2576 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3802 + "score": 0.3762 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1126 + "score": 0.1123 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2254 + "score": 0.2148 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.275 + "score": 0.276 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2576 + "score": 0.2508 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3762 + "score": 0.3802 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1123 + "score": 0.1126 } } ], diff --git a/data/models/openai_gpt-3.5-turbo-0613.json b/data/models/openai_gpt-3.5-turbo-0613.json index 9d2e47e97b66d9b4bf8d0c5ba01a8d6f70327065..c24676b03a30239bea93de7207f3f13985ab4587 100644 --- a/data/models/openai_gpt-3.5-turbo-0613.json +++ b/data/models/openai_gpt-3.5-turbo-0613.json @@ -6,233 +6,6 @@ "inference_platform": "unknown" }, "evaluations": [ - { - "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1774096309.537868", - "retrieved_timestamp": "1774096309.537868", - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "helm", - "version": "unknown" - }, - "benchmark": "helm_instruct", - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689, - "details": { - "description": "", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "Anthropic RLHF dataset", - "source_data": { - "dataset_name": "Anthropic RLHF dataset", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Anthropic RLHF dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.964, - "details": { - "description": "min=4.915, mean=4.964, max=5, sum=39.715 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "subset": "[\"hh\", \"hh\", \"hh\", \"hh\", \"red_team\", \"red_team\", \"red_team\", \"red_team\"]", - "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\", \"claude\", \"gpt4\", \"mturk\", \"scale\"]" - } - } - }, - { - "evaluation_name": "Best ChatGPT Prompts", - "source_data": { - "dataset_name": "Best ChatGPT Prompts", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Best ChatGPT Prompts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.986, - "details": { - "description": "min=4.95, mean=4.986, max=5, sum=19.945 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "path": "\"src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml\"", - "tags": "\"\"", - "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]" - } - } - }, - { - "evaluation_name": "Koala test dataset", - "source_data": { - "dataset_name": "Koala test dataset", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Koala test dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.987, - "details": { - "description": "min=4.969, mean=4.987, max=5, sum=19.95 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]" - } - } - }, - { - "evaluation_name": "Open Assistant", - "source_data": { - "dataset_name": "Open Assistant", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Open Assistant", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.987, - "details": { - "description": "min=4.96, mean=4.987, max=5, sum=19.95 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "language": "\"en\"", - "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]" - } - } - }, - { - "evaluation_name": "Self Instruct", - "source_data": { - "dataset_name": "Self Instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Self Instruct", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.99, - "details": { - "description": "min=4.97, mean=4.99, max=5, sum=19.96 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]" - } - } - }, - { - "evaluation_name": "Vicuna", - "source_data": { - "dataset_name": "Vicuna", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Vicuna", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.992, - "details": { - "description": "min=4.975, mean=4.992, max=5, sum=19.969 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "category": "\"all\"", - "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]" - } - } - } - ], - "detailed_evaluation_results": null, - "generation_config": { - "additional_details": {} - } - }, { "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0613/1774096308.339228", "retrieved_timestamp": "1774096308.339228", @@ -897,6 +670,233 @@ "additional_details": {} } }, + { + "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1774096309.537868", + "retrieved_timestamp": "1774096309.537868", + "source_metadata": { + "source_name": "helm_instruct", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "helm", + "version": "unknown" + }, + "benchmark": "helm_instruct", + "evaluation_results": [ + { + "evaluation_name": "Mean win rate", + "source_data": { + "dataset_name": "helm_instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "How many models this model outperform on average (over columns).", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.689, + "details": { + "description": "", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_name": "Anthropic RLHF dataset", + "source_data": { + "dataset_name": "Anthropic RLHF dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Anthropic RLHF dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.964, + "details": { + "description": "min=4.915, mean=4.964, max=5, sum=39.715 (8)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "subset": "[\"hh\", \"hh\", \"hh\", \"hh\", \"red_team\", \"red_team\", \"red_team\", \"red_team\"]", + "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\", \"claude\", \"gpt4\", \"mturk\", \"scale\"]" + } + } + }, + { + "evaluation_name": "Best ChatGPT Prompts", + "source_data": { + "dataset_name": "Best ChatGPT Prompts", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Best ChatGPT Prompts", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.986, + "details": { + "description": "min=4.95, mean=4.986, max=5, sum=19.945 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "path": "\"src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml\"", + "tags": "\"\"", + "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]" + } + } + }, + { + "evaluation_name": "Koala test dataset", + "source_data": { + "dataset_name": "Koala test dataset", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Koala test dataset", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.987, + "details": { + "description": "min=4.969, mean=4.987, max=5, sum=19.95 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]" + } + } + }, + { + "evaluation_name": "Open Assistant", + "source_data": { + "dataset_name": "Open Assistant", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Open Assistant", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.987, + "details": { + "description": "min=4.96, mean=4.987, max=5, sum=19.95 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "language": "\"en\"", + "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]" + } + } + }, + { + "evaluation_name": "Self Instruct", + "source_data": { + "dataset_name": "Self Instruct", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Self Instruct", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.99, + "details": { + "description": "min=4.97, mean=4.99, max=5, sum=19.96 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]" + } + } + }, + { + "evaluation_name": "Vicuna", + "source_data": { + "dataset_name": "Vicuna", + "source_type": "url", + "url": [ + "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" + ] + }, + "metric_config": { + "evaluation_description": "Harmlessness on Vicuna", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 5.0 + }, + "score_details": { + "score": 4.992, + "details": { + "description": "min=4.975, mean=4.992, max=5, sum=19.969 (4)", + "tab": "Instruction Following" + } + }, + "generation_config": { + "additional_details": { + "category": "\"all\"", + "evaluator": "[\"claude\", \"gpt4\", \"mturk\", \"scale\"]" + } + } + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": {} + } + }, { "evaluation_id": "helm_lite/openai_gpt-3.5-turbo-0613/1774096306.427425", "retrieved_timestamp": "1774096306.427425", diff --git a/data/models/openai_gpt-4-0613.json b/data/models/openai_gpt-4-0613.json index 968890d0737b80810e6c39ba99b0e4eb0e407fa8..f9edf8b400bcfd0d8167fb29ff1bda235867f788 100644 --- a/data/models/openai_gpt-4-0613.json +++ b/data/models/openai_gpt-4-0613.json @@ -7,10 +7,10 @@ }, "evaluations": [ { - "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1774096312.00548", - "retrieved_timestamp": "1774096312.00548", + "evaluation_id": "helm_lite/openai_gpt-4-0613/1774096306.427425", + "retrieved_timestamp": "1774096306.427425", "source_metadata": { - "source_name": "helm_mmlu", + "source_name": "helm_lite", "source_type": "documentation", "source_organization_name": "crfm", "evaluator_relationship": "third_party" @@ -19,438 +19,380 @@ "name": "helm", "version": "unknown" }, - "benchmark": "helm_mmlu", + "benchmark": "helm_lite", "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects", + "evaluation_name": "Mean win rate", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "helm_lite", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", + "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.824, + "score": 0.867, "details": { - "description": "min=0.54, mean=0.824, max=0.99, sum=93.978 (114)", + "description": "", "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.364, mean=0.447, max=0.579, sum=51.005 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.4474144183932911\"}", - "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}", - "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=268.561, mean=607.852, max=2791.073, sum=69295.086 (114)\", \"tab\": \"General information\", \"score\": \"607.851634217556\"}", - "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.5158801498127341\"}", + "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}" } }, "generation_config": { - "additional_details": { - "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" - } + "additional_details": {} } }, { - "evaluation_name": "Abstract Algebra", + "evaluation_name": "NarrativeQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "NarrativeQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Abstract Algebra", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.63, + "score": 0.768, "details": { - "description": "min=0.63, mean=0.63, max=0.63, sum=1.26 (2)", + "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)", "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.787 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39332568168640136\"}", - "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Abstract Algebra - # prompt tokens": "{\"description\": \"min=366.44, mean=366.44, max=366.44, sum=732.88 (2)\", \"tab\": \"General information\", \"score\": \"366.44\"}", - "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.976, mean=0.976, max=0.976, sum=0.976 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.9758186582108619\"}", + "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", + "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NarrativeQA - # prompt tokens": "{\"description\": \"min=3522.67, mean=3522.67, max=3522.67, sum=3522.67 (1)\", \"tab\": \"General information\", \"score\": \"3522.6704225352114\"}", + "NarrativeQA - # output tokens": "{\"description\": \"min=8.515, mean=8.515, max=8.515, sum=8.515 (1)\", \"tab\": \"General information\", \"score\": \"8.51549295774648\"}" } }, "generation_config": { - "additional_details": { - "subject": "\"abstract_algebra\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_abstract_algebra\"" - } + "additional_details": {} } }, { - "evaluation_name": "Anatomy", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Anatomy", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8, + "score": 0.457, "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", + "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)", "tab": "Accuracy", - "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.545, mean=0.545, max=0.545, sum=1.09 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5451150911825674\"}", - "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}", - "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Anatomy - # prompt tokens": "{\"description\": \"min=346.978, mean=346.978, max=346.978, sum=693.956 (2)\", \"tab\": \"General information\", \"score\": \"346.97777777777776\"}", - "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.908, mean=0.908, max=0.908, sum=0.908 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.9083020164966583\"}", + "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.512, mean=0.512, max=0.512, sum=0.512 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5116857671737671\"}", + "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (open-book) - # train": "{\"description\": \"min=4.964, mean=4.964, max=4.964, sum=4.964 (1)\", \"tab\": \"General information\", \"score\": \"4.964\"}", + "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0.007, mean=0.007, max=0.007, sum=0.007 (1)\", \"tab\": \"General information\", \"score\": \"0.007\"}", + "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1717.847, mean=1717.847, max=1717.847, sum=1717.847 (1)\", \"tab\": \"General information\", \"score\": \"1717.847\"}", + "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=8.055, mean=8.055, max=8.055, sum=8.055 (1)\", \"tab\": \"General information\", \"score\": \"8.055\"}", + "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=173.127, mean=173.127, max=173.127, sum=173.127 (1)\", \"tab\": \"General information\", \"score\": \"173.127\"}", + "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=3.832, mean=3.832, max=3.832, sum=3.832 (1)\", \"tab\": \"General information\", \"score\": \"3.832\"}" } }, "generation_config": { "additional_details": { - "subject": "\"anatomy\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_anatomy\"" + "mode": "\"closedbook\"" } } }, { - "evaluation_name": "College Physics", + "evaluation_name": "OpenbookQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "OpenbookQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on College Physics", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.627, + "score": 0.96, "details": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.255 (2)", + "description": "min=0.96, mean=0.96, max=0.96, sum=0.96 (1)", "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.389, mean=0.389, max=0.389, sum=0.778 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3888898015022278\"}", - "College Biology - Observed inference time (s)": "{\"description\": \"min=0.433, mean=0.433, max=0.433, sum=0.866 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43280420700709027\"}", - "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.492, mean=0.492, max=0.492, sum=0.984 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.49212974786758423\"}", - "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.435, mean=0.435, max=0.435, sum=0.871 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4354128074645996\"}", - "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.431, mean=0.431, max=0.431, sum=0.861 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4306242893196944\"}", - "College Physics - Observed inference time (s)": "{\"description\": \"min=0.415, mean=0.415, max=0.415, sum=0.83 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41519686287524654\"}", - "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Chemistry - # prompt tokens": "{\"description\": \"min=542.4, mean=542.4, max=542.4, sum=1084.8 (2)\", \"tab\": \"General information\", \"score\": \"542.4\"}", - "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}", - "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Biology - # prompt tokens": "{\"description\": \"min=466.917, mean=466.917, max=466.917, sum=933.833 (2)\", \"tab\": \"General information\", \"score\": \"466.9166666666667\"}", - "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Computer Science - # prompt tokens": "{\"description\": \"min=821.39, mean=821.39, max=821.39, sum=1642.78 (2)\", \"tab\": \"General information\", \"score\": \"821.39\"}", - "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Mathematics - # prompt tokens": "{\"description\": \"min=587.52, mean=587.52, max=587.52, sum=1175.04 (2)\", \"tab\": \"General information\", \"score\": \"587.52\"}", - "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}", - "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Medicine - # prompt tokens": "{\"description\": \"min=495.728, mean=495.728, max=495.728, sum=991.457 (2)\", \"tab\": \"General information\", \"score\": \"495.728323699422\"}", - "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}", - "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Physics - # prompt tokens": "{\"description\": \"min=496.608, mean=496.608, max=496.608, sum=993.216 (2)\", \"tab\": \"General information\", \"score\": \"496.6078431372549\"}", - "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.401, mean=0.401, max=0.401, sum=0.401 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.40061268854141235\"}", + "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}", + "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "OpenbookQA - # prompt tokens": "{\"description\": \"min=242.782, mean=242.782, max=242.782, sum=242.782 (1)\", \"tab\": \"General information\", \"score\": \"242.782\"}", + "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"college_physics\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_college_physics\"" + "dataset": "\"openbookqa\"", + "method": "\"multiple_choice_joint\"" } } }, { - "evaluation_name": "Computer Security", + "evaluation_name": "MMLU", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MMLU", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Computer Security", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.86, + "score": 0.735, "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", + "description": "min=0.55, mean=0.735, max=0.95, sum=3.674 (5)", "tab": "Accuracy", - "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.373, mean=0.373, max=0.373, sum=0.746 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3729291558265686\"}", - "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Computer Security - # prompt tokens": "{\"description\": \"min=371.54, mean=371.54, max=371.54, sum=743.08 (2)\", \"tab\": \"General information\", \"score\": \"371.54\"}", - "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MMLU - Observed inference time (s)": "{\"description\": \"min=0.364, mean=0.391, max=0.434, sum=1.954 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.39080846048656265\"}", + "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", + "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU - # prompt tokens": "{\"description\": \"min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)\", \"tab\": \"General information\", \"score\": \"460.71996491228066\"}", + "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"computer_security\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_computer_security\"" + "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]", + "method": "\"multiple_choice_joint\"" } } }, { - "evaluation_name": "Econometrics", + "evaluation_name": "MATH", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MATH", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Econometrics", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.684, + "score": 0.802, "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.368 (2)", + "description": "min=0.673, mean=0.802, max=0.948, sum=5.617 (7)", "tab": "Accuracy", - "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.364, mean=0.364, max=0.364, sum=0.729 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.36447873241023016\"}", - "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}", - "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Econometrics - # prompt tokens": "{\"description\": \"min=607.43, mean=607.43, max=607.43, sum=1214.86 (2)\", \"tab\": \"General information\", \"score\": \"607.4298245614035\"}", - "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MATH - Observed inference time (s)": "{\"description\": \"min=2.95, mean=3.472, max=4.247, sum=24.303 (7)\", \"tab\": \"Efficiency\", \"score\": \"3.4718795228507955\"}", + "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}", + "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}", + "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MATH - # prompt tokens": "{\"description\": \"min=942.363, mean=1323.911, max=2258.577, sum=9267.376 (7)\", \"tab\": \"General information\", \"score\": \"1323.910874184069\"}", + "MATH - # output tokens": "{\"description\": \"min=59.674, mean=73.257, max=81.1, sum=512.799 (7)\", \"tab\": \"General information\", \"score\": \"73.25695858608955\"}" } }, "generation_config": { "additional_details": { - "subject": "\"econometrics\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_econometrics\"" + "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]", + "level": "\"1\"", + "use_official_examples": "\"False\"", + "use_chain_of_thought": "\"True\"" } } }, { - "evaluation_name": "Global Facts", + "evaluation_name": "GSM8K", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "GSM8K", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Global Facts", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.62, + "score": 0.932, "details": { - "description": "min=0.62, mean=0.62, max=0.62, sum=1.24 (2)", + "description": "min=0.932, mean=0.932, max=0.932, sum=0.932 (1)", "tab": "Accuracy", - "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.476, mean=0.476, max=0.476, sum=0.952 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4758000469207764\"}", - "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Global Facts - # prompt tokens": "{\"description\": \"min=392.71, mean=392.71, max=392.71, sum=785.42 (2)\", \"tab\": \"General information\", \"score\": \"392.71\"}", - "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "GSM8K - Observed inference time (s)": "{\"description\": \"min=4.948, mean=4.948, max=4.948, sum=4.948 (1)\", \"tab\": \"Efficiency\", \"score\": \"4.947624314308166\"}", + "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "GSM8K - # prompt tokens": "{\"description\": \"min=1020.035, mean=1020.035, max=1020.035, sum=1020.035 (1)\", \"tab\": \"General information\", \"score\": \"1020.035\"}", + "GSM8K - # output tokens": "{\"description\": \"min=111.209, mean=111.209, max=111.209, sum=111.209 (1)\", \"tab\": \"General information\", \"score\": \"111.209\"}" } }, "generation_config": { - "additional_details": { - "subject": "\"global_facts\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_global_facts\"" - } + "additional_details": {} } }, { - "evaluation_name": "Jurisprudence", + "evaluation_name": "LegalBench", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "LegalBench", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Jurisprudence", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.889, + "score": 0.713, "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", + "description": "min=0.452, mean=0.713, max=0.905, sum=3.564 (5)", "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.439, mean=0.439, max=0.439, sum=0.878 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43886900389636\"}", - "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}", - "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Jurisprudence - # prompt tokens": "{\"description\": \"min=387.639, mean=387.639, max=387.639, sum=775.278 (2)\", \"tab\": \"General information\", \"score\": \"387.6388888888889\"}", - "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.46, mean=0.558, max=0.886, sum=2.791 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.5582764348578453\"}", + "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}", + "LegalBench - # train": "{\"description\": \"min=4, mean=4.798, max=5, sum=23.992 (5)\", \"tab\": \"General information\", \"score\": \"4.798367346938775\"}", + "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "LegalBench - # prompt tokens": "{\"description\": \"min=253.442, mean=1568.687, max=6350.008, sum=7843.435 (5)\", \"tab\": \"General information\", \"score\": \"1568.6870529886412\"}", + "LegalBench - # output tokens": "{\"description\": \"min=1, mean=1.34, max=2.063, sum=6.698 (5)\", \"tab\": \"General information\", \"score\": \"1.3396070557866055\"}" } }, "generation_config": { "additional_details": { - "subject": "\"jurisprudence\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_jurisprudence\"" + "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]" } } }, { - "evaluation_name": "Philosophy", + "evaluation_name": "MedQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MedQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Philosophy", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.859, + "score": 0.815, "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.717 (2)", + "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)", "tab": "Accuracy", - "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.403, mean=0.403, max=0.403, sum=0.807 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40341131480177117\"}", - "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}", - "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Philosophy - # prompt tokens": "{\"description\": \"min=322.084, mean=322.084, max=322.084, sum=644.167 (2)\", \"tab\": \"General information\", \"score\": \"322.08360128617363\"}", - "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MedQA - Observed inference time (s)": "{\"description\": \"min=0.414, mean=0.414, max=0.414, sum=0.414 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4136932588239787\"}", + "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}", + "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MedQA - # prompt tokens": "{\"description\": \"min=1020.414, mean=1020.414, max=1020.414, sum=1020.414 (1)\", \"tab\": \"General information\", \"score\": \"1020.4135188866799\"}", + "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": { - "subject": "\"philosophy\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_philosophy\"" - } + "additional_details": {} } }, { - "evaluation_name": "Professional Psychology", + "evaluation_name": "WMT 2014", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "WMT 2014", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Professional Psychology", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.891, + "score": 0.211, "details": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", + "description": "min=0.149, mean=0.211, max=0.256, sum=1.053 (5)", "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.483, mean=0.483, max=0.483, sum=0.966 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.48306868356816907\"}", - "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.444, mean=0.444, max=0.444, sum=0.888 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44407470006469296\"}", - "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.578, mean=0.578, max=0.578, sum=1.157 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.578451920053017\"}", - "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.469, mean=0.469, max=0.469, sum=0.938 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4690242421393301\"}", - "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}", - "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Medicine - # prompt tokens": "{\"description\": \"min=1087.585, mean=1087.585, max=1087.585, sum=2175.169 (2)\", \"tab\": \"General information\", \"score\": \"1087.5845588235295\"}", - "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}", - "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Accounting - # prompt tokens": "{\"description\": \"min=651.592, mean=651.592, max=651.592, sum=1303.184 (2)\", \"tab\": \"General information\", \"score\": \"651.5921985815603\"}", - "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}", - "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Law - # prompt tokens": "{\"description\": \"min=1630.787, mean=1630.787, max=1630.787, sum=3261.574 (2)\", \"tab\": \"General information\", \"score\": \"1630.7868318122555\"}", - "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}", - "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Psychology - # prompt tokens": "{\"description\": \"min=568.114, mean=568.114, max=568.114, sum=1136.229 (2)\", \"tab\": \"General information\", \"score\": \"568.1143790849674\"}", - "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=1.448, mean=1.58, max=1.724, sum=7.899 (5)\", \"tab\": \"Efficiency\", \"score\": \"1.5797039644192494\"}", + "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}", + "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "WMT 2014 - # prompt tokens": "{\"description\": \"min=169.901, mean=193.043, max=213.185, sum=965.213 (5)\", \"tab\": \"General information\", \"score\": \"193.04258583116683\"}", + "WMT 2014 - # output tokens": "{\"description\": \"min=23.767, mean=25.424, max=26.121, sum=127.122 (5)\", \"tab\": \"General information\", \"score\": \"25.424382072946933\"}" } }, "generation_config": { "additional_details": { - "subject": "\"professional_psychology\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_professional_psychology\"" + "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]" } } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1774096312.00548", + "retrieved_timestamp": "1774096312.00548", + "source_metadata": { + "source_name": "helm_mmlu", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "helm", + "version": "unknown" + }, + "benchmark": "helm_mmlu", + "evaluation_results": [ { - "evaluation_name": "Us Foreign Policy", + "evaluation_name": "MMLU All Subjects", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -459,36 +401,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.95, + "score": 0.824, "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", + "description": "min=0.54, mean=0.824, max=0.99, sum=93.978 (114)", "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.434, mean=0.434, max=0.434, sum=0.869 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43441893100738527\"}", - "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=415.79, mean=415.79, max=415.79, sum=831.58 (2)\", \"tab\": \"General information\", \"score\": \"415.79\"}", - "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.364, mean=0.447, max=0.579, sum=51.005 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.4474144183932911\"}", + "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}", + "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=268.561, mean=607.852, max=2791.073, sum=69295.086 (114)\", \"tab\": \"General information\", \"score\": \"607.851634217556\"}", + "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"us_foreign_policy\"", + "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_us_foreign_policy\"" + "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" } } }, { - "evaluation_name": "Astronomy", + "evaluation_name": "Abstract Algebra", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -497,36 +439,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Astronomy", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.934, + "score": 0.63, "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", + "description": "min=0.63, mean=0.63, max=0.63, sum=1.26 (2)", "tab": "Accuracy", - "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.472, mean=0.472, max=0.472, sum=0.944 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4718977307018481\"}", - "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}", - "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Astronomy - # prompt tokens": "{\"description\": \"min=572.691, mean=572.691, max=572.691, sum=1145.382 (2)\", \"tab\": \"General information\", \"score\": \"572.6907894736842\"}", - "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.787 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39332568168640136\"}", + "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Abstract Algebra - # prompt tokens": "{\"description\": \"min=366.44, mean=366.44, max=366.44, sum=732.88 (2)\", \"tab\": \"General information\", \"score\": \"366.44\"}", + "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"astronomy\"", + "subject": "\"abstract_algebra\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_astronomy\"" + "groups": "\"mmlu_abstract_algebra\"" } } }, { - "evaluation_name": "Business Ethics", + "evaluation_name": "Anatomy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -535,36 +477,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Business Ethics", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.79, + "score": 0.8, "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", + "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.477, mean=0.477, max=0.477, sum=0.953 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4765148901939392\"}", - "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Business Ethics - # prompt tokens": "{\"description\": \"min=562.52, mean=562.52, max=562.52, sum=1125.04 (2)\", \"tab\": \"General information\", \"score\": \"562.52\"}", - "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.545, mean=0.545, max=0.545, sum=1.09 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5451150911825674\"}", + "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}", + "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Anatomy - # prompt tokens": "{\"description\": \"min=346.978, mean=346.978, max=346.978, sum=693.956 (2)\", \"tab\": \"General information\", \"score\": \"346.97777777777776\"}", + "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"business_ethics\"", + "subject": "\"anatomy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_business_ethics\"" + "groups": "\"mmlu_anatomy\"" } } }, { - "evaluation_name": "Clinical Knowledge", + "evaluation_name": "College Physics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -573,36 +515,66 @@ ] }, "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.845, + "score": 0.627, "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)", + "description": "min=0.627, mean=0.627, max=0.627, sum=1.255 (2)", "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.415, mean=0.415, max=0.415, sum=0.829 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.414557883424579\"}", - "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}", - "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=390.947, mean=390.947, max=390.947, sum=781.894 (2)\", \"tab\": \"General information\", \"score\": \"390.94716981132075\"}", - "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.389, mean=0.389, max=0.389, sum=0.778 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3888898015022278\"}", + "College Biology - Observed inference time (s)": "{\"description\": \"min=0.433, mean=0.433, max=0.433, sum=0.866 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43280420700709027\"}", + "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.492, mean=0.492, max=0.492, sum=0.984 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.49212974786758423\"}", + "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.435, mean=0.435, max=0.435, sum=0.871 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4354128074645996\"}", + "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.431, mean=0.431, max=0.431, sum=0.861 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4306242893196944\"}", + "College Physics - Observed inference time (s)": "{\"description\": \"min=0.415, mean=0.415, max=0.415, sum=0.83 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41519686287524654\"}", + "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Chemistry - # prompt tokens": "{\"description\": \"min=542.4, mean=542.4, max=542.4, sum=1084.8 (2)\", \"tab\": \"General information\", \"score\": \"542.4\"}", + "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}", + "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Biology - # prompt tokens": "{\"description\": \"min=466.917, mean=466.917, max=466.917, sum=933.833 (2)\", \"tab\": \"General information\", \"score\": \"466.9166666666667\"}", + "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Computer Science - # prompt tokens": "{\"description\": \"min=821.39, mean=821.39, max=821.39, sum=1642.78 (2)\", \"tab\": \"General information\", \"score\": \"821.39\"}", + "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Mathematics - # prompt tokens": "{\"description\": \"min=587.52, mean=587.52, max=587.52, sum=1175.04 (2)\", \"tab\": \"General information\", \"score\": \"587.52\"}", + "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}", + "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Medicine - # prompt tokens": "{\"description\": \"min=495.728, mean=495.728, max=495.728, sum=991.457 (2)\", \"tab\": \"General information\", \"score\": \"495.728323699422\"}", + "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}", + "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Physics - # prompt tokens": "{\"description\": \"min=496.608, mean=496.608, max=496.608, sum=993.216 (2)\", \"tab\": \"General information\", \"score\": \"496.6078431372549\"}", + "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"clinical_knowledge\"", + "subject": "\"college_physics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_clinical_knowledge\"" + "groups": "\"mmlu_college_physics\"" } } }, { - "evaluation_name": "Conceptual Physics", + "evaluation_name": "Computer Security", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -611,36 +583,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Conceptual Physics", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.868, + "score": 0.86, "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)", + "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.384, mean=0.384, max=0.384, sum=0.767 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3836827186827964\"}", - "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}", - "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Conceptual Physics - # prompt tokens": "{\"description\": \"min=297.838, mean=297.838, max=297.838, sum=595.677 (2)\", \"tab\": \"General information\", \"score\": \"297.83829787234043\"}", - "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.373, mean=0.373, max=0.373, sum=0.746 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3729291558265686\"}", + "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Computer Security - # prompt tokens": "{\"description\": \"min=371.54, mean=371.54, max=371.54, sum=743.08 (2)\", \"tab\": \"General information\", \"score\": \"371.54\"}", + "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"conceptual_physics\"", + "subject": "\"computer_security\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_conceptual_physics\"" + "groups": "\"mmlu_computer_security\"" } } }, { - "evaluation_name": "Electrical Engineering", + "evaluation_name": "Econometrics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -649,36 +621,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Electrical Engineering", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.786, + "score": 0.684, "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.572 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.399, mean=0.399, max=0.399, sum=0.798 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39915286919166304\"}", - "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}", - "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Electrical Engineering - # prompt tokens": "{\"description\": \"min=433.641, mean=433.641, max=433.641, sum=867.283 (2)\", \"tab\": \"General information\", \"score\": \"433.6413793103448\"}", - "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "description": "min=0.684, mean=0.684, max=0.684, sum=1.368 (2)", + "tab": "Accuracy", + "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.364, mean=0.364, max=0.364, sum=0.729 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.36447873241023016\"}", + "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}", + "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Econometrics - # prompt tokens": "{\"description\": \"min=607.43, mean=607.43, max=607.43, sum=1214.86 (2)\", \"tab\": \"General information\", \"score\": \"607.4298245614035\"}", + "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"electrical_engineering\"", + "subject": "\"econometrics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_electrical_engineering\"" + "groups": "\"mmlu_econometrics\"" } } }, { - "evaluation_name": "Elementary Mathematics", + "evaluation_name": "Global Facts", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -687,36 +659,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.807, + "score": 0.62, "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)", + "description": "min=0.62, mean=0.62, max=0.62, sum=1.24 (2)", "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.423, mean=0.423, max=0.423, sum=0.845 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4225258120784053\"}", - "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}", - "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=524.862, mean=524.862, max=524.862, sum=1049.725 (2)\", \"tab\": \"General information\", \"score\": \"524.8624338624338\"}", - "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.476, mean=0.476, max=0.476, sum=0.952 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4758000469207764\"}", + "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Global Facts - # prompt tokens": "{\"description\": \"min=392.71, mean=392.71, max=392.71, sum=785.42 (2)\", \"tab\": \"General information\", \"score\": \"392.71\"}", + "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"elementary_mathematics\"", + "subject": "\"global_facts\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_elementary_mathematics\"" + "groups": "\"mmlu_global_facts\"" } } }, { - "evaluation_name": "Formal Logic", + "evaluation_name": "Jurisprudence", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -725,36 +697,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Formal Logic", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.643, + "score": 0.889, "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", + "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.486, mean=0.486, max=0.486, sum=0.973 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.48647683007376535\"}", - "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}", - "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Formal Logic - # prompt tokens": "{\"description\": \"min=599.762, mean=599.762, max=599.762, sum=1199.524 (2)\", \"tab\": \"General information\", \"score\": \"599.7619047619048\"}", - "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.439, mean=0.439, max=0.439, sum=0.878 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43886900389636\"}", + "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}", + "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Jurisprudence - # prompt tokens": "{\"description\": \"min=387.639, mean=387.639, max=387.639, sum=775.278 (2)\", \"tab\": \"General information\", \"score\": \"387.6388888888889\"}", + "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"formal_logic\"", + "subject": "\"jurisprudence\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_formal_logic\"" + "groups": "\"mmlu_jurisprudence\"" } } }, { - "evaluation_name": "High School World History", + "evaluation_name": "Philosophy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -763,114 +735,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on High School World History", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.945, + "score": 0.859, "details": { - "description": "min=0.945, mean=0.945, max=0.945, sum=1.89 (2)", + "description": "min=0.859, mean=0.859, max=0.859, sum=1.717 (2)", "tab": "Accuracy", - "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.436, mean=0.436, max=0.436, sum=0.872 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4360047817230225\"}", - "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.413, mean=0.413, max=0.413, sum=0.827 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41338158710836775\"}", - "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.5, mean=0.5, max=0.5, sum=1.001 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5002665758132935\"}", - "High School European History - Observed inference time (s)": "{\"description\": \"min=0.579, mean=0.579, max=0.579, sum=1.158 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.578774525902488\"}", - "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.414, mean=0.414, max=0.414, sum=0.829 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4142996747084338\"}", - "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.43, mean=0.43, max=0.43, sum=0.86 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43005221001224814\"}", - "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.416, mean=0.416, max=0.416, sum=0.832 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4160928750649477\"}", - "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.423, mean=0.423, max=0.423, sum=0.846 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4231933620240953\"}", - "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.474, mean=0.474, max=0.474, sum=0.948 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4740273321376127\"}", - "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.462, mean=0.462, max=0.462, sum=0.924 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4620048778736039\"}", - "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.407, mean=0.407, max=0.407, sum=0.813 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40661886022725235\"}", - "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.463, mean=0.463, max=0.463, sum=0.926 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.46296725780875597\"}", - "High School US History - Observed inference time (s)": "{\"description\": \"min=0.546, mean=0.546, max=0.546, sum=1.091 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5456923538563299\"}", - "High School World History - Observed inference time (s)": "{\"description\": \"min=0.517, mean=0.517, max=0.517, sum=1.033 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5166646488608188\"}", - "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}", - "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Biology - # prompt tokens": "{\"description\": \"min=506.677, mean=506.677, max=506.677, sum=1013.355 (2)\", \"tab\": \"General information\", \"score\": \"506.6774193548387\"}", - "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}", - "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Chemistry - # prompt tokens": "{\"description\": \"min=489.714, mean=489.714, max=489.714, sum=979.429 (2)\", \"tab\": \"General information\", \"score\": \"489.7142857142857\"}", - "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Computer Science - # prompt tokens": "{\"description\": \"min=860.78, mean=860.78, max=860.78, sum=1721.56 (2)\", \"tab\": \"General information\", \"score\": \"860.78\"}", - "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}", - "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School European History - # prompt tokens": "{\"description\": \"min=2791.073, mean=2791.073, max=2791.073, sum=5582.145 (2)\", \"tab\": \"General information\", \"score\": \"2791.072727272727\"}", - "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}", - "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Geography - # prompt tokens": "{\"description\": \"min=365.045, mean=365.045, max=365.045, sum=730.091 (2)\", \"tab\": \"General information\", \"score\": \"365.04545454545456\"}", - "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}", - "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Government And Politics - # prompt tokens": "{\"description\": \"min=458.824, mean=458.824, max=458.824, sum=917.648 (2)\", \"tab\": \"General information\", \"score\": \"458.8238341968912\"}", - "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}", - "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=364.562, mean=364.562, max=364.562, sum=729.123 (2)\", \"tab\": \"General information\", \"score\": \"364.5615384615385\"}", - "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}", - "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Mathematics - # prompt tokens": "{\"description\": \"min=525.374, mean=525.374, max=525.374, sum=1050.748 (2)\", \"tab\": \"General information\", \"score\": \"525.3740740740741\"}", - "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}", - "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Microeconomics - # prompt tokens": "{\"description\": \"min=392.025, mean=392.025, max=392.025, sum=784.05 (2)\", \"tab\": \"General information\", \"score\": \"392.02521008403363\"}", - "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}", - "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Physics - # prompt tokens": "{\"description\": \"min=553.464, mean=553.464, max=553.464, sum=1106.927 (2)\", \"tab\": \"General information\", \"score\": \"553.4635761589404\"}", - "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}", - "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Psychology - # prompt tokens": "{\"description\": \"min=488.246, mean=488.246, max=488.246, sum=976.492 (2)\", \"tab\": \"General information\", \"score\": \"488.24587155963303\"}", - "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}", - "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Statistics - # prompt tokens": "{\"description\": \"min=788.699, mean=788.699, max=788.699, sum=1577.398 (2)\", \"tab\": \"General information\", \"score\": \"788.699074074074\"}", - "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}", - "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School US History - # prompt tokens": "{\"description\": \"min=2210.809, mean=2210.809, max=2210.809, sum=4421.618 (2)\", \"tab\": \"General information\", \"score\": \"2210.8088235294117\"}", - "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}", - "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School World History - # prompt tokens": "{\"description\": \"min=1421.27, mean=1421.27, max=1421.27, sum=2842.54 (2)\", \"tab\": \"General information\", \"score\": \"1421.2700421940929\"}", - "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.403, mean=0.403, max=0.403, sum=0.807 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40341131480177117\"}", + "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}", + "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Philosophy - # prompt tokens": "{\"description\": \"min=322.084, mean=322.084, max=322.084, sum=644.167 (2)\", \"tab\": \"General information\", \"score\": \"322.08360128617363\"}", + "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"high_school_world_history\"", + "subject": "\"philosophy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_high_school_world_history\"" + "groups": "\"mmlu_philosophy\"" } } }, { - "evaluation_name": "Human Sexuality", + "evaluation_name": "Professional Psychology", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -879,42 +773,54 @@ ] }, "metric_config": { - "evaluation_description": "EM on Human Sexuality", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.908, + "score": 0.891, "details": { - "description": "min=0.908, mean=0.908, max=0.908, sum=1.817 (2)", + "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", "tab": "Accuracy", - "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.406, mean=0.406, max=0.406, sum=0.812 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4058152218036053\"}", - "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=0.466, mean=0.466, max=0.466, sum=0.932 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.46620041541470825\"}", - "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}", - "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Human Aging - # prompt tokens": "{\"description\": \"min=312.906, mean=312.906, max=312.906, sum=625.812 (2)\", \"tab\": \"General information\", \"score\": \"312.90582959641256\"}", - "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}", - "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Human Sexuality - # prompt tokens": "{\"description\": \"min=334.183, mean=334.183, max=334.183, sum=668.366 (2)\", \"tab\": \"General information\", \"score\": \"334.1832061068702\"}", - "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.483, mean=0.483, max=0.483, sum=0.966 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.48306868356816907\"}", + "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.444, mean=0.444, max=0.444, sum=0.888 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44407470006469296\"}", + "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.578, mean=0.578, max=0.578, sum=1.157 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.578451920053017\"}", + "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.469, mean=0.469, max=0.469, sum=0.938 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4690242421393301\"}", + "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}", + "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Medicine - # prompt tokens": "{\"description\": \"min=1087.585, mean=1087.585, max=1087.585, sum=2175.169 (2)\", \"tab\": \"General information\", \"score\": \"1087.5845588235295\"}", + "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}", + "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Accounting - # prompt tokens": "{\"description\": \"min=651.592, mean=651.592, max=651.592, sum=1303.184 (2)\", \"tab\": \"General information\", \"score\": \"651.5921985815603\"}", + "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}", + "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Law - # prompt tokens": "{\"description\": \"min=1630.787, mean=1630.787, max=1630.787, sum=3261.574 (2)\", \"tab\": \"General information\", \"score\": \"1630.7868318122555\"}", + "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}", + "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Psychology - # prompt tokens": "{\"description\": \"min=568.114, mean=568.114, max=568.114, sum=1136.229 (2)\", \"tab\": \"General information\", \"score\": \"568.1143790849674\"}", + "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"human_sexuality\"", + "subject": "\"professional_psychology\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_human_sexuality\"" + "groups": "\"mmlu_professional_psychology\"" } } }, { - "evaluation_name": "International Law", + "evaluation_name": "Us Foreign Policy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -923,36 +829,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on International Law", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.917, + "score": 0.95, "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.835 (2)", + "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", "tab": "Accuracy", - "International Law - Observed inference time (s)": "{\"description\": \"min=0.461, mean=0.461, max=0.461, sum=0.922 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4608367139642889\"}", - "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}", - "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "International Law - # prompt tokens": "{\"description\": \"min=632.851, mean=632.851, max=632.851, sum=1265.702 (2)\", \"tab\": \"General information\", \"score\": \"632.8512396694215\"}", - "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.434, mean=0.434, max=0.434, sum=0.869 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43441893100738527\"}", + "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=415.79, mean=415.79, max=415.79, sum=831.58 (2)\", \"tab\": \"General information\", \"score\": \"415.79\"}", + "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"international_law\"", + "subject": "\"us_foreign_policy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_international_law\"" + "groups": "\"mmlu_us_foreign_policy\"" } } }, { - "evaluation_name": "Logical Fallacies", + "evaluation_name": "Astronomy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -961,36 +867,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Logical Fallacies", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.871, + "score": 0.934, "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)", + "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.432, mean=0.432, max=0.432, sum=0.864 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4321035870745138\"}", - "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}", - "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Logical Fallacies - # prompt tokens": "{\"description\": \"min=442.595, mean=442.595, max=442.595, sum=885.19 (2)\", \"tab\": \"General information\", \"score\": \"442.5950920245399\"}", - "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.472, mean=0.472, max=0.472, sum=0.944 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4718977307018481\"}", + "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}", + "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Astronomy - # prompt tokens": "{\"description\": \"min=572.691, mean=572.691, max=572.691, sum=1145.382 (2)\", \"tab\": \"General information\", \"score\": \"572.6907894736842\"}", + "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"logical_fallacies\"", + "subject": "\"astronomy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_logical_fallacies\"" + "groups": "\"mmlu_astronomy\"" } } }, { - "evaluation_name": "Machine Learning", + "evaluation_name": "Business Ethics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -999,36 +905,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Machine Learning", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.759, + "score": 0.79, "details": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.518 (2)", + "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.463, mean=0.463, max=0.463, sum=0.926 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.46302694933755056\"}", - "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}", - "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Machine Learning - # prompt tokens": "{\"description\": \"min=661.054, mean=661.054, max=661.054, sum=1322.107 (2)\", \"tab\": \"General information\", \"score\": \"661.0535714285714\"}", - "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.477, mean=0.477, max=0.477, sum=0.953 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4765148901939392\"}", + "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Business Ethics - # prompt tokens": "{\"description\": \"min=562.52, mean=562.52, max=562.52, sum=1125.04 (2)\", \"tab\": \"General information\", \"score\": \"562.52\"}", + "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"machine_learning\"", + "subject": "\"business_ethics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_machine_learning\"" + "groups": "\"mmlu_business_ethics\"" } } }, { - "evaluation_name": "Management", + "evaluation_name": "Clinical Knowledge", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1037,36 +943,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Management", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.932, + "score": 0.845, "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.864 (2)", + "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)", "tab": "Accuracy", - "Management - Observed inference time (s)": "{\"description\": \"min=0.446, mean=0.446, max=0.446, sum=0.891 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4455798760201167\"}", - "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}", - "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Management - # prompt tokens": "{\"description\": \"min=276.796, mean=276.796, max=276.796, sum=553.592 (2)\", \"tab\": \"General information\", \"score\": \"276.79611650485435\"}", - "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.415, mean=0.415, max=0.415, sum=0.829 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.414557883424579\"}", + "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}", + "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=390.947, mean=390.947, max=390.947, sum=781.894 (2)\", \"tab\": \"General information\", \"score\": \"390.94716981132075\"}", + "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"management\"", + "subject": "\"clinical_knowledge\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_management\"" + "groups": "\"mmlu_clinical_knowledge\"" } } }, { - "evaluation_name": "Marketing", + "evaluation_name": "Conceptual Physics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1075,36 +981,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Marketing", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.962, + "score": 0.868, "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=1.923 (2)", + "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)", "tab": "Accuracy", - "Marketing - Observed inference time (s)": "{\"description\": \"min=0.421, mean=0.421, max=0.421, sum=0.843 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4213859372668796\"}", - "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}", - "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Marketing - # prompt tokens": "{\"description\": \"min=397.218, mean=397.218, max=397.218, sum=794.436 (2)\", \"tab\": \"General information\", \"score\": \"397.21794871794873\"}", - "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.384, mean=0.384, max=0.384, sum=0.767 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3836827186827964\"}", + "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}", + "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Conceptual Physics - # prompt tokens": "{\"description\": \"min=297.838, mean=297.838, max=297.838, sum=595.677 (2)\", \"tab\": \"General information\", \"score\": \"297.83829787234043\"}", + "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"marketing\"", + "subject": "\"conceptual_physics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_marketing\"" + "groups": "\"mmlu_conceptual_physics\"" } } }, { - "evaluation_name": "Medical Genetics", + "evaluation_name": "Electrical Engineering", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1113,36 +1019,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Medical Genetics", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.94, + "score": 0.786, "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", + "description": "min=0.786, mean=0.786, max=0.786, sum=1.572 (2)", "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.411, mean=0.411, max=0.411, sum=0.823 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41135803937911986\"}", - "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Medical Genetics - # prompt tokens": "{\"description\": \"min=334, mean=334, max=334, sum=668 (2)\", \"tab\": \"General information\", \"score\": \"334.0\"}", - "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.399, mean=0.399, max=0.399, sum=0.798 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39915286919166304\"}", + "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}", + "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Electrical Engineering - # prompt tokens": "{\"description\": \"min=433.641, mean=433.641, max=433.641, sum=867.283 (2)\", \"tab\": \"General information\", \"score\": \"433.6413793103448\"}", + "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"medical_genetics\"", + "subject": "\"electrical_engineering\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_medical_genetics\"" + "groups": "\"mmlu_electrical_engineering\"" } } }, { - "evaluation_name": "Miscellaneous", + "evaluation_name": "Elementary Mathematics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1151,36 +1057,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Miscellaneous", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.949, + "score": 0.807, "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.898 (2)", + "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)", "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.451, mean=0.451, max=0.451, sum=0.901 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4505587230088001\"}", - "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}", - "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Miscellaneous - # prompt tokens": "{\"description\": \"min=292.925, mean=292.925, max=292.925, sum=585.849 (2)\", \"tab\": \"General information\", \"score\": \"292.92464878671774\"}", - "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.423, mean=0.423, max=0.423, sum=0.845 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4225258120784053\"}", + "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}", + "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=524.862, mean=524.862, max=524.862, sum=1049.725 (2)\", \"tab\": \"General information\", \"score\": \"524.8624338624338\"}", + "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"miscellaneous\"", + "subject": "\"elementary_mathematics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_miscellaneous\"" + "groups": "\"mmlu_elementary_mathematics\"" } } }, { - "evaluation_name": "Moral Scenarios", + "evaluation_name": "Formal Logic", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1189,42 +1095,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Moral Scenarios", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.902, + "score": 0.643, "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.803 (2)", + "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.428, mean=0.428, max=0.428, sum=0.856 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4281756044123214\"}", - "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.445, mean=0.445, max=0.445, sum=0.89 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44513606945229645\"}", - "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}", - "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Moral Disputes - # prompt tokens": "{\"description\": \"min=469.145, mean=469.145, max=469.145, sum=938.289 (2)\", \"tab\": \"General information\", \"score\": \"469.1445086705202\"}", - "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}", - "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Moral Scenarios - # prompt tokens": "{\"description\": \"min=649.455, mean=649.455, max=649.455, sum=1298.909 (2)\", \"tab\": \"General information\", \"score\": \"649.454748603352\"}", - "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.486, mean=0.486, max=0.486, sum=0.973 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.48647683007376535\"}", + "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}", + "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Formal Logic - # prompt tokens": "{\"description\": \"min=599.762, mean=599.762, max=599.762, sum=1199.524 (2)\", \"tab\": \"General information\", \"score\": \"599.7619047619048\"}", + "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"moral_scenarios\"", + "subject": "\"formal_logic\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_moral_scenarios\"" + "groups": "\"mmlu_formal_logic\"" } } }, { - "evaluation_name": "Nutrition", + "evaluation_name": "High School World History", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1233,36 +1133,114 @@ ] }, "metric_config": { - "evaluation_description": "EM on Nutrition", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.892, + "score": 0.945, "details": { - "description": "min=0.892, mean=0.892, max=0.892, sum=1.784 (2)", + "description": "min=0.945, mean=0.945, max=0.945, sum=1.89 (2)", "tab": "Accuracy", - "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.446, mean=0.446, max=0.446, sum=0.892 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4460979816960354\"}", - "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}", - "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Nutrition - # prompt tokens": "{\"description\": \"min=579.817, mean=579.817, max=579.817, sum=1159.634 (2)\", \"tab\": \"General information\", \"score\": \"579.8169934640523\"}", - "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.436, mean=0.436, max=0.436, sum=0.872 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4360047817230225\"}", + "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.413, mean=0.413, max=0.413, sum=0.827 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41338158710836775\"}", + "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.5, mean=0.5, max=0.5, sum=1.001 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5002665758132935\"}", + "High School European History - Observed inference time (s)": "{\"description\": \"min=0.579, mean=0.579, max=0.579, sum=1.158 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.578774525902488\"}", + "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.414, mean=0.414, max=0.414, sum=0.829 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4142996747084338\"}", + "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.43, mean=0.43, max=0.43, sum=0.86 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43005221001224814\"}", + "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.416, mean=0.416, max=0.416, sum=0.832 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4160928750649477\"}", + "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.423, mean=0.423, max=0.423, sum=0.846 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4231933620240953\"}", + "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.474, mean=0.474, max=0.474, sum=0.948 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4740273321376127\"}", + "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.462, mean=0.462, max=0.462, sum=0.924 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4620048778736039\"}", + "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.407, mean=0.407, max=0.407, sum=0.813 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40661886022725235\"}", + "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.463, mean=0.463, max=0.463, sum=0.926 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.46296725780875597\"}", + "High School US History - Observed inference time (s)": "{\"description\": \"min=0.546, mean=0.546, max=0.546, sum=1.091 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5456923538563299\"}", + "High School World History - Observed inference time (s)": "{\"description\": \"min=0.517, mean=0.517, max=0.517, sum=1.033 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5166646488608188\"}", + "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}", + "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Biology - # prompt tokens": "{\"description\": \"min=506.677, mean=506.677, max=506.677, sum=1013.355 (2)\", \"tab\": \"General information\", \"score\": \"506.6774193548387\"}", + "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}", + "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Chemistry - # prompt tokens": "{\"description\": \"min=489.714, mean=489.714, max=489.714, sum=979.429 (2)\", \"tab\": \"General information\", \"score\": \"489.7142857142857\"}", + "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Computer Science - # prompt tokens": "{\"description\": \"min=860.78, mean=860.78, max=860.78, sum=1721.56 (2)\", \"tab\": \"General information\", \"score\": \"860.78\"}", + "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}", + "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School European History - # prompt tokens": "{\"description\": \"min=2791.073, mean=2791.073, max=2791.073, sum=5582.145 (2)\", \"tab\": \"General information\", \"score\": \"2791.072727272727\"}", + "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}", + "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Geography - # prompt tokens": "{\"description\": \"min=365.045, mean=365.045, max=365.045, sum=730.091 (2)\", \"tab\": \"General information\", \"score\": \"365.04545454545456\"}", + "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}", + "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Government And Politics - # prompt tokens": "{\"description\": \"min=458.824, mean=458.824, max=458.824, sum=917.648 (2)\", \"tab\": \"General information\", \"score\": \"458.8238341968912\"}", + "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}", + "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=364.562, mean=364.562, max=364.562, sum=729.123 (2)\", \"tab\": \"General information\", \"score\": \"364.5615384615385\"}", + "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}", + "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Mathematics - # prompt tokens": "{\"description\": \"min=525.374, mean=525.374, max=525.374, sum=1050.748 (2)\", \"tab\": \"General information\", \"score\": \"525.3740740740741\"}", + "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}", + "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Microeconomics - # prompt tokens": "{\"description\": \"min=392.025, mean=392.025, max=392.025, sum=784.05 (2)\", \"tab\": \"General information\", \"score\": \"392.02521008403363\"}", + "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}", + "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Physics - # prompt tokens": "{\"description\": \"min=553.464, mean=553.464, max=553.464, sum=1106.927 (2)\", \"tab\": \"General information\", \"score\": \"553.4635761589404\"}", + "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}", + "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Psychology - # prompt tokens": "{\"description\": \"min=488.246, mean=488.246, max=488.246, sum=976.492 (2)\", \"tab\": \"General information\", \"score\": \"488.24587155963303\"}", + "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}", + "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Statistics - # prompt tokens": "{\"description\": \"min=788.699, mean=788.699, max=788.699, sum=1577.398 (2)\", \"tab\": \"General information\", \"score\": \"788.699074074074\"}", + "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}", + "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School US History - # prompt tokens": "{\"description\": \"min=2210.809, mean=2210.809, max=2210.809, sum=4421.618 (2)\", \"tab\": \"General information\", \"score\": \"2210.8088235294117\"}", + "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}", + "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School World History - # prompt tokens": "{\"description\": \"min=1421.27, mean=1421.27, max=1421.27, sum=2842.54 (2)\", \"tab\": \"General information\", \"score\": \"1421.2700421940929\"}", + "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"nutrition\"", + "subject": "\"high_school_world_history\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_nutrition\"" + "groups": "\"mmlu_high_school_world_history\"" } } }, { - "evaluation_name": "Prehistory", + "evaluation_name": "Human Sexuality", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1271,36 +1249,42 @@ ] }, "metric_config": { - "evaluation_description": "EM on Prehistory", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.926, + "score": 0.908, "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.852 (2)", + "description": "min=0.908, mean=0.908, max=0.908, sum=1.817 (2)", "tab": "Accuracy", - "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.426, mean=0.426, max=0.426, sum=0.852 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42610209665180726\"}", - "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}", - "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Prehistory - # prompt tokens": "{\"description\": \"min=507.559, mean=507.559, max=507.559, sum=1015.117 (2)\", \"tab\": \"General information\", \"score\": \"507.55864197530866\"}", - "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.406, mean=0.406, max=0.406, sum=0.812 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4058152218036053\"}", + "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=0.466, mean=0.466, max=0.466, sum=0.932 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.46620041541470825\"}", + "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}", + "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Human Aging - # prompt tokens": "{\"description\": \"min=312.906, mean=312.906, max=312.906, sum=625.812 (2)\", \"tab\": \"General information\", \"score\": \"312.90582959641256\"}", + "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}", + "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Human Sexuality - # prompt tokens": "{\"description\": \"min=334.183, mean=334.183, max=334.183, sum=668.366 (2)\", \"tab\": \"General information\", \"score\": \"334.1832061068702\"}", + "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"prehistory\"", + "subject": "\"human_sexuality\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_prehistory\"" + "groups": "\"mmlu_human_sexuality\"" } } }, { - "evaluation_name": "Public Relations", + "evaluation_name": "International Law", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1309,36 +1293,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Public Relations", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.745, + "score": 0.917, "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", + "description": "min=0.917, mean=0.917, max=0.917, sum=1.835 (2)", "tab": "Accuracy", - "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.496, mean=0.496, max=0.496, sum=0.992 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.49601870450106533\"}", - "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}", - "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Public Relations - # prompt tokens": "{\"description\": \"min=398.318, mean=398.318, max=398.318, sum=796.636 (2)\", \"tab\": \"General information\", \"score\": \"398.3181818181818\"}", - "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "International Law - Observed inference time (s)": "{\"description\": \"min=0.461, mean=0.461, max=0.461, sum=0.922 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4608367139642889\"}", + "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}", + "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "International Law - # prompt tokens": "{\"description\": \"min=632.851, mean=632.851, max=632.851, sum=1265.702 (2)\", \"tab\": \"General information\", \"score\": \"632.8512396694215\"}", + "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"public_relations\"", + "subject": "\"international_law\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_public_relations\"" + "groups": "\"mmlu_international_law\"" } } }, { - "evaluation_name": "Security Studies", + "evaluation_name": "Logical Fallacies", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1347,36 +1331,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Security Studies", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.861, + "score": 0.871, "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", + "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)", "tab": "Accuracy", - "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.471, mean=0.471, max=0.471, sum=0.941 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.47064581306613223\"}", - "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}", - "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Security Studies - # prompt tokens": "{\"description\": \"min=1157.473, mean=1157.473, max=1157.473, sum=2314.947 (2)\", \"tab\": \"General information\", \"score\": \"1157.4734693877551\"}", - "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.432, mean=0.432, max=0.432, sum=0.864 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4321035870745138\"}", + "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}", + "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Logical Fallacies - # prompt tokens": "{\"description\": \"min=442.595, mean=442.595, max=442.595, sum=885.19 (2)\", \"tab\": \"General information\", \"score\": \"442.5950920245399\"}", + "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"security_studies\"", + "subject": "\"logical_fallacies\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_security_studies\"" + "groups": "\"mmlu_logical_fallacies\"" } } }, { - "evaluation_name": "Sociology", + "evaluation_name": "Machine Learning", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1385,36 +1369,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Sociology", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.93, + "score": 0.759, "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.861 (2)", + "description": "min=0.759, mean=0.759, max=0.759, sum=1.518 (2)", "tab": "Accuracy", - "Sociology - Observed inference time (s)": "{\"description\": \"min=0.43, mean=0.43, max=0.43, sum=0.86 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42976075143956427\"}", - "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}", - "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Sociology - # prompt tokens": "{\"description\": \"min=438.522, mean=438.522, max=438.522, sum=877.045 (2)\", \"tab\": \"General information\", \"score\": \"438.5223880597015\"}", - "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.463, mean=0.463, max=0.463, sum=0.926 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.46302694933755056\"}", + "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}", + "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Machine Learning - # prompt tokens": "{\"description\": \"min=661.054, mean=661.054, max=661.054, sum=1322.107 (2)\", \"tab\": \"General information\", \"score\": \"661.0535714285714\"}", + "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"sociology\"", + "subject": "\"machine_learning\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_sociology\"" + "groups": "\"mmlu_machine_learning\"" } } }, { - "evaluation_name": "Virology", + "evaluation_name": "Management", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1423,36 +1407,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Virology", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.596, + "score": 0.932, "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)", + "description": "min=0.932, mean=0.932, max=0.932, sum=1.864 (2)", "tab": "Accuracy", - "Virology - Observed inference time (s)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.84 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42023470890091125\"}", - "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}", - "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Virology - # prompt tokens": "{\"description\": \"min=336.09, mean=336.09, max=336.09, sum=672.181 (2)\", \"tab\": \"General information\", \"score\": \"336.0903614457831\"}", - "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Management - Observed inference time (s)": "{\"description\": \"min=0.446, mean=0.446, max=0.446, sum=0.891 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4455798760201167\"}", + "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}", + "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Management - # prompt tokens": "{\"description\": \"min=276.796, mean=276.796, max=276.796, sum=553.592 (2)\", \"tab\": \"General information\", \"score\": \"276.79611650485435\"}", + "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"virology\"", + "subject": "\"management\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_virology\"" + "groups": "\"mmlu_management\"" } } }, { - "evaluation_name": "World Religions", + "evaluation_name": "Marketing", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1461,36 +1445,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on World Religions", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.877, + "score": 0.962, "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", + "description": "min=0.962, mean=0.962, max=0.962, sum=1.923 (2)", "tab": "Accuracy", - "World Religions - Observed inference time (s)": "{\"description\": \"min=0.451, mean=0.451, max=0.451, sum=0.901 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4507097779658803\"}", - "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}", - "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "World Religions - # prompt tokens": "{\"description\": \"min=268.561, mean=268.561, max=268.561, sum=537.123 (2)\", \"tab\": \"General information\", \"score\": \"268.56140350877195\"}", - "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Marketing - Observed inference time (s)": "{\"description\": \"min=0.421, mean=0.421, max=0.421, sum=0.843 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4213859372668796\"}", + "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}", + "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Marketing - # prompt tokens": "{\"description\": \"min=397.218, mean=397.218, max=397.218, sum=794.436 (2)\", \"tab\": \"General information\", \"score\": \"397.21794871794873\"}", + "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"world_religions\"", + "subject": "\"marketing\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_world_religions\"" + "groups": "\"mmlu_marketing\"" } } }, { - "evaluation_name": "Mean win rate", + "evaluation_name": "Medical Genetics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1499,402 +1483,418 @@ ] }, "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.517, + "score": 0.94, "details": { - "description": "", - "tab": "Efficiency" + "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", + "tab": "Accuracy", + "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.411, mean=0.411, max=0.411, sum=0.823 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41135803937911986\"}", + "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Medical Genetics - # prompt tokens": "{\"description\": \"min=334, mean=334, max=334, sum=668 (2)\", \"tab\": \"General information\", \"score\": \"334.0\"}", + "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"medical_genetics\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_medical_genetics\"" + } } - } - ], - "detailed_evaluation_results": null, - "generation_config": { - "additional_details": { - "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" - } - } - }, - { - "evaluation_id": "helm_lite/openai_gpt-4-0613/1774096306.427425", - "retrieved_timestamp": "1774096306.427425", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "helm", - "version": "unknown" - }, - "benchmark": "helm_lite", - "evaluation_results": [ + }, { - "evaluation_name": "Mean win rate", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "helm_lite", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.867, + "score": 0.949, "details": { - "description": "", + "description": "min=0.949, mean=0.949, max=0.949, sum=1.898 (2)", "tab": "Accuracy", - "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.5158801498127341\"}", - "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}" + "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.451, mean=0.451, max=0.451, sum=0.901 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4505587230088001\"}", + "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}", + "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Miscellaneous - # prompt tokens": "{\"description\": \"min=292.925, mean=292.925, max=292.925, sum=585.849 (2)\", \"tab\": \"General information\", \"score\": \"292.92464878671774\"}", + "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"miscellaneous\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_miscellaneous\"" + } } }, { - "evaluation_name": "NarrativeQA", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "NarrativeQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "F1 on NarrativeQA", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.768, + "score": 0.902, "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)", + "description": "min=0.902, mean=0.902, max=0.902, sum=1.803 (2)", "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.976, mean=0.976, max=0.976, sum=0.976 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.9758186582108619\"}", - "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", - "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NarrativeQA - # prompt tokens": "{\"description\": \"min=3522.67, mean=3522.67, max=3522.67, sum=3522.67 (1)\", \"tab\": \"General information\", \"score\": \"3522.6704225352114\"}", - "NarrativeQA - # output tokens": "{\"description\": \"min=8.515, mean=8.515, max=8.515, sum=8.515 (1)\", \"tab\": \"General information\", \"score\": \"8.51549295774648\"}" + "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.428, mean=0.428, max=0.428, sum=0.856 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4281756044123214\"}", + "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.445, mean=0.445, max=0.445, sum=0.89 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44513606945229645\"}", + "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}", + "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Moral Disputes - # prompt tokens": "{\"description\": \"min=469.145, mean=469.145, max=469.145, sum=938.289 (2)\", \"tab\": \"General information\", \"score\": \"469.1445086705202\"}", + "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}", + "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Moral Scenarios - # prompt tokens": "{\"description\": \"min=649.455, mean=649.455, max=649.455, sum=1298.909 (2)\", \"tab\": \"General information\", \"score\": \"649.454748603352\"}", + "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"moral_scenarios\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_moral_scenarios\"" + } } }, { - "evaluation_name": "NaturalQuestions (closed-book)", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.457, + "score": 0.892, "details": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)", + "description": "min=0.892, mean=0.892, max=0.892, sum=1.784 (2)", "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.908, mean=0.908, max=0.908, sum=0.908 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.9083020164966583\"}", - "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.512, mean=0.512, max=0.512, sum=0.512 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.5116857671737671\"}", - "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "NaturalQuestions (open-book) - # train": "{\"description\": \"min=4.964, mean=4.964, max=4.964, sum=4.964 (1)\", \"tab\": \"General information\", \"score\": \"4.964\"}", - "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0.007, mean=0.007, max=0.007, sum=0.007 (1)\", \"tab\": \"General information\", \"score\": \"0.007\"}", - "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1717.847, mean=1717.847, max=1717.847, sum=1717.847 (1)\", \"tab\": \"General information\", \"score\": \"1717.847\"}", - "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=8.055, mean=8.055, max=8.055, sum=8.055 (1)\", \"tab\": \"General information\", \"score\": \"8.055\"}", - "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=173.127, mean=173.127, max=173.127, sum=173.127 (1)\", \"tab\": \"General information\", \"score\": \"173.127\"}", - "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=3.832, mean=3.832, max=3.832, sum=3.832 (1)\", \"tab\": \"General information\", \"score\": \"3.832\"}" + "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.446, mean=0.446, max=0.446, sum=0.892 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4460979816960354\"}", + "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}", + "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Nutrition - # prompt tokens": "{\"description\": \"min=579.817, mean=579.817, max=579.817, sum=1159.634 (2)\", \"tab\": \"General information\", \"score\": \"579.8169934640523\"}", + "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "mode": "\"closedbook\"" + "subject": "\"nutrition\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_nutrition\"" } } }, { - "evaluation_name": "OpenbookQA", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "OpenbookQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on OpenbookQA", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.96, + "score": 0.926, "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=0.96 (1)", + "description": "min=0.926, mean=0.926, max=0.926, sum=1.852 (2)", "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.401, mean=0.401, max=0.401, sum=0.401 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.40061268854141235\"}", - "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}", - "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "OpenbookQA - # prompt tokens": "{\"description\": \"min=242.782, mean=242.782, max=242.782, sum=242.782 (1)\", \"tab\": \"General information\", \"score\": \"242.782\"}", - "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.426, mean=0.426, max=0.426, sum=0.852 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42610209665180726\"}", + "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}", + "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Prehistory - # prompt tokens": "{\"description\": \"min=507.559, mean=507.559, max=507.559, sum=1015.117 (2)\", \"tab\": \"General information\", \"score\": \"507.55864197530866\"}", + "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "dataset": "\"openbookqa\"", - "method": "\"multiple_choice_joint\"" + "subject": "\"prehistory\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_prehistory\"" } } }, { - "evaluation_name": "MMLU", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "MMLU", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on MMLU", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.735, + "score": 0.745, "details": { - "description": "min=0.55, mean=0.735, max=0.95, sum=3.674 (5)", + "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", "tab": "Accuracy", - "MMLU - Observed inference time (s)": "{\"description\": \"min=0.364, mean=0.391, max=0.434, sum=1.954 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.39080846048656265\"}", - "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", - "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MMLU - # prompt tokens": "{\"description\": \"min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)\", \"tab\": \"General information\", \"score\": \"460.71996491228066\"}", - "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.496, mean=0.496, max=0.496, sum=0.992 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.49601870450106533\"}", + "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}", + "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Public Relations - # prompt tokens": "{\"description\": \"min=398.318, mean=398.318, max=398.318, sum=796.636 (2)\", \"tab\": \"General information\", \"score\": \"398.3181818181818\"}", + "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]", - "method": "\"multiple_choice_joint\"" + "subject": "\"public_relations\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_public_relations\"" } } }, { - "evaluation_name": "MATH", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "MATH", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.802, + "score": 0.861, "details": { - "description": "min=0.673, mean=0.802, max=0.948, sum=5.617 (7)", + "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", "tab": "Accuracy", - "MATH - Observed inference time (s)": "{\"description\": \"min=2.95, mean=3.472, max=4.247, sum=24.303 (7)\", \"tab\": \"Efficiency\", \"score\": \"3.4718795228507955\"}", - "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}", - "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}", - "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MATH - # prompt tokens": "{\"description\": \"min=942.363, mean=1323.911, max=2258.577, sum=9267.376 (7)\", \"tab\": \"General information\", \"score\": \"1323.910874184069\"}", - "MATH - # output tokens": "{\"description\": \"min=59.674, mean=73.257, max=81.1, sum=512.799 (7)\", \"tab\": \"General information\", \"score\": \"73.25695858608955\"}" + "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.471, mean=0.471, max=0.471, sum=0.941 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.47064581306613223\"}", + "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}", + "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Security Studies - # prompt tokens": "{\"description\": \"min=1157.473, mean=1157.473, max=1157.473, sum=2314.947 (2)\", \"tab\": \"General information\", \"score\": \"1157.4734693877551\"}", + "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]", - "level": "\"1\"", - "use_official_examples": "\"False\"", - "use_chain_of_thought": "\"True\"" + "subject": "\"security_studies\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_security_studies\"" } } }, { - "evaluation_name": "GSM8K", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "GSM8K", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on GSM8K", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.932, + "score": 0.93, "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=0.932 (1)", + "description": "min=0.93, mean=0.93, max=0.93, sum=1.861 (2)", "tab": "Accuracy", - "GSM8K - Observed inference time (s)": "{\"description\": \"min=4.948, mean=4.948, max=4.948, sum=4.948 (1)\", \"tab\": \"Efficiency\", \"score\": \"4.947624314308166\"}", - "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "GSM8K - # prompt tokens": "{\"description\": \"min=1020.035, mean=1020.035, max=1020.035, sum=1020.035 (1)\", \"tab\": \"General information\", \"score\": \"1020.035\"}", - "GSM8K - # output tokens": "{\"description\": \"min=111.209, mean=111.209, max=111.209, sum=111.209 (1)\", \"tab\": \"General information\", \"score\": \"111.209\"}" + "Sociology - Observed inference time (s)": "{\"description\": \"min=0.43, mean=0.43, max=0.43, sum=0.86 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42976075143956427\"}", + "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}", + "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Sociology - # prompt tokens": "{\"description\": \"min=438.522, mean=438.522, max=438.522, sum=877.045 (2)\", \"tab\": \"General information\", \"score\": \"438.5223880597015\"}", + "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"sociology\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_sociology\"" + } } }, { - "evaluation_name": "LegalBench", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "LegalBench", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on LegalBench", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.713, + "score": 0.596, "details": { - "description": "min=0.452, mean=0.713, max=0.905, sum=3.564 (5)", + "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)", "tab": "Accuracy", - "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.46, mean=0.558, max=0.886, sum=2.791 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.5582764348578453\"}", - "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}", - "LegalBench - # train": "{\"description\": \"min=4, mean=4.798, max=5, sum=23.992 (5)\", \"tab\": \"General information\", \"score\": \"4.798367346938775\"}", - "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "LegalBench - # prompt tokens": "{\"description\": \"min=253.442, mean=1568.687, max=6350.008, sum=7843.435 (5)\", \"tab\": \"General information\", \"score\": \"1568.6870529886412\"}", - "LegalBench - # output tokens": "{\"description\": \"min=1, mean=1.34, max=2.063, sum=6.698 (5)\", \"tab\": \"General information\", \"score\": \"1.3396070557866055\"}" + "Virology - Observed inference time (s)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.84 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42023470890091125\"}", + "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}", + "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Virology - # prompt tokens": "{\"description\": \"min=336.09, mean=336.09, max=336.09, sum=672.181 (2)\", \"tab\": \"General information\", \"score\": \"336.0903614457831\"}", + "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]" + "subject": "\"virology\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_virology\"" } } }, { - "evaluation_name": "MedQA", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "MedQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on MedQA", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.815, + "score": 0.877, "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)", + "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", "tab": "Accuracy", - "MedQA - Observed inference time (s)": "{\"description\": \"min=0.414, mean=0.414, max=0.414, sum=0.414 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4136932588239787\"}", - "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}", - "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MedQA - # prompt tokens": "{\"description\": \"min=1020.414, mean=1020.414, max=1020.414, sum=1020.414 (1)\", \"tab\": \"General information\", \"score\": \"1020.4135188866799\"}", - "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "World Religions - Observed inference time (s)": "{\"description\": \"min=0.451, mean=0.451, max=0.451, sum=0.901 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4507097779658803\"}", + "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}", + "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "World Religions - # prompt tokens": "{\"description\": \"min=268.561, mean=268.561, max=268.561, sum=537.123 (2)\", \"tab\": \"General information\", \"score\": \"268.56140350877195\"}", + "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"world_religions\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_world_religions\"" + } } }, { - "evaluation_name": "WMT 2014", + "evaluation_name": "Mean win rate", "source_data": { - "dataset_name": "WMT 2014", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", + "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.211, + "score": 0.517, "details": { - "description": "min=0.149, mean=0.211, max=0.256, sum=1.053 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=1.448, mean=1.58, max=1.724, sum=7.899 (5)\", \"tab\": \"Efficiency\", \"score\": \"1.5797039644192494\"}", - "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}", - "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "WMT 2014 - # prompt tokens": "{\"description\": \"min=169.901, mean=193.043, max=213.185, sum=965.213 (5)\", \"tab\": \"General information\", \"score\": \"193.04258583116683\"}", - "WMT 2014 - # output tokens": "{\"description\": \"min=23.767, mean=25.424, max=26.121, sum=127.122 (5)\", \"tab\": \"General information\", \"score\": \"25.424382072946933\"}" + "description": "", + "tab": "Efficiency" } }, "generation_config": { - "additional_details": { - "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]" - } + "additional_details": {} } } ], "detailed_evaluation_results": null, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" + } } } ] diff --git a/data/models/openai_gpt-4-turbo-2024-04-09.json b/data/models/openai_gpt-4-turbo-2024-04-09.json index 1fe1d15bc389a507225c8274f004fa48f42d4b5a..e16c126fc0c5b9aa15f8e7eae24f7699fd6de829 100644 --- a/data/models/openai_gpt-4-turbo-2024-04-09.json +++ b/data/models/openai_gpt-4-turbo-2024-04-09.json @@ -7,10 +7,10 @@ }, "evaluations": [ { - "evaluation_id": "helm_mmlu/openai_gpt-4-turbo-2024-04-09/1774096312.00548", - "retrieved_timestamp": "1774096312.00548", + "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1774096306.427425", + "retrieved_timestamp": "1774096306.427425", "source_metadata": { - "source_name": "helm_mmlu", + "source_name": "helm_lite", "source_type": "documentation", "source_organization_name": "crfm", "evaluator_relationship": "third_party" @@ -19,438 +19,382 @@ "name": "helm", "version": "unknown" }, - "benchmark": "helm_mmlu", + "benchmark": "helm_lite", "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects", + "evaluation_name": "Mean win rate", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "helm_lite", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", + "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.813, + "score": 0.864, "details": { - "description": "min=0.515, mean=0.813, max=0.974, sum=92.65 (114)", + "description": "", "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.479, mean=0.617, max=0.934, sum=70.3 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.6166649052297876\"}", - "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}", - "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=275.561, mean=614.852, max=2798.073, sum=70093.086 (114)\", \"tab\": \"General information\", \"score\": \"614.851634217556\"}", - "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.4568414481897628\"}", + "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}" } }, "generation_config": { - "additional_details": { - "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" - } + "additional_details": {} } }, { - "evaluation_name": "Abstract Algebra", + "evaluation_name": "NarrativeQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "NarrativeQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Abstract Algebra", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.56, + "score": 0.761, "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", + "description": "min=0.761, mean=0.761, max=0.761, sum=0.761 (1)", "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.54, mean=0.54, max=0.54, sum=1.08 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.539907853603363\"}", - "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Abstract Algebra - # prompt tokens": "{\"description\": \"min=373.44, mean=373.44, max=373.44, sum=746.88 (2)\", \"tab\": \"General information\", \"score\": \"373.44\"}", - "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.804, mean=0.804, max=0.804, sum=0.804 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.8043310716118611\"}", + "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", + "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NarrativeQA - # prompt tokens": "{\"description\": \"min=3495.67, mean=3495.67, max=3495.67, sum=3495.67 (1)\", \"tab\": \"General information\", \"score\": \"3495.6704225352114\"}", + "NarrativeQA - # output tokens": "{\"description\": \"min=6.037, mean=6.037, max=6.037, sum=6.037 (1)\", \"tab\": \"General information\", \"score\": \"6.0366197183098596\"}" } }, "generation_config": { - "additional_details": { - "subject": "\"abstract_algebra\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_abstract_algebra\"" - } + "additional_details": {} } }, { - "evaluation_name": "Anatomy", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Anatomy", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.822, + "score": 0.482, "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)", + "description": "min=0.482, mean=0.482, max=0.482, sum=0.482 (1)", "tab": "Accuracy", - "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.53, mean=0.53, max=0.53, sum=1.06 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5299274744810881\"}", - "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}", - "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Anatomy - # prompt tokens": "{\"description\": \"min=353.978, mean=353.978, max=353.978, sum=707.956 (2)\", \"tab\": \"General information\", \"score\": \"353.97777777777776\"}", - "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.712, mean=0.712, max=0.712, sum=0.712 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.7120162718296051\"}", + "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.605, mean=0.605, max=0.605, sum=0.605 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.6052222681045533\"}", + "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (open-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1728.593, mean=1728.593, max=1728.593, sum=1728.593 (1)\", \"tab\": \"General information\", \"score\": \"1728.593\"}", + "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=5.902, mean=5.902, max=5.902, sum=5.902 (1)\", \"tab\": \"General information\", \"score\": \"5.902\"}", + "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=139.127, mean=139.127, max=139.127, sum=139.127 (1)\", \"tab\": \"General information\", \"score\": \"139.127\"}", + "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=5.263, mean=5.263, max=5.263, sum=5.263 (1)\", \"tab\": \"General information\", \"score\": \"5.263\"}" } }, "generation_config": { "additional_details": { - "subject": "\"anatomy\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_anatomy\"" + "mode": "\"closedbook\"" } } }, { - "evaluation_name": "College Physics", + "evaluation_name": "OpenbookQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "OpenbookQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on College Physics", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.539, + "score": 0.97, "details": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.078 (2)", + "description": "min=0.97, mean=0.97, max=0.97, sum=0.97 (1)", "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.549, mean=0.549, max=0.549, sum=1.099 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5493535542488098\"}", - "College Biology - Observed inference time (s)": "{\"description\": \"min=0.6, mean=0.6, max=0.6, sum=1.199 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5995734184980392\"}", - "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.691, mean=0.691, max=0.691, sum=1.382 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6911867094039917\"}", - "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.609, mean=0.609, max=0.609, sum=1.219 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6092576813697815\"}", - "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.67, mean=0.67, max=0.67, sum=1.34 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6697626251705809\"}", - "College Physics - Observed inference time (s)": "{\"description\": \"min=0.706, mean=0.706, max=0.706, sum=1.412 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7058592660754335\"}", - "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Chemistry - # prompt tokens": "{\"description\": \"min=549.4, mean=549.4, max=549.4, sum=1098.8 (2)\", \"tab\": \"General information\", \"score\": \"549.4\"}", - "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}", - "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Biology - # prompt tokens": "{\"description\": \"min=473.917, mean=473.917, max=473.917, sum=947.833 (2)\", \"tab\": \"General information\", \"score\": \"473.9166666666667\"}", - "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Computer Science - # prompt tokens": "{\"description\": \"min=828.39, mean=828.39, max=828.39, sum=1656.78 (2)\", \"tab\": \"General information\", \"score\": \"828.39\"}", - "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Mathematics - # prompt tokens": "{\"description\": \"min=594.52, mean=594.52, max=594.52, sum=1189.04 (2)\", \"tab\": \"General information\", \"score\": \"594.52\"}", - "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}", - "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Medicine - # prompt tokens": "{\"description\": \"min=502.728, mean=502.728, max=502.728, sum=1005.457 (2)\", \"tab\": \"General information\", \"score\": \"502.728323699422\"}", - "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}", - "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Physics - # prompt tokens": "{\"description\": \"min=503.608, mean=503.608, max=503.608, sum=1007.216 (2)\", \"tab\": \"General information\", \"score\": \"503.6078431372549\"}", - "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.438, mean=0.438, max=0.438, sum=0.438 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4376141686439514\"}", + "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}", + "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "OpenbookQA - # prompt tokens": "{\"description\": \"min=249.782, mean=249.782, max=249.782, sum=249.782 (1)\", \"tab\": \"General information\", \"score\": \"249.782\"}", + "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"college_physics\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_college_physics\"" + "dataset": "\"openbookqa\"", + "method": "\"multiple_choice_joint\"" } } }, { - "evaluation_name": "Computer Security", + "evaluation_name": "MMLU", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MMLU", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Computer Security", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.83, + "score": 0.711, "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", + "description": "min=0.53, mean=0.711, max=0.96, sum=3.555 (5)", "tab": "Accuracy", - "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.53, mean=0.53, max=0.53, sum=1.061 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5303381824493408\"}", - "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Computer Security - # prompt tokens": "{\"description\": \"min=378.54, mean=378.54, max=378.54, sum=757.08 (2)\", \"tab\": \"General information\", \"score\": \"378.54\"}", - "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MMLU - Observed inference time (s)": "{\"description\": \"min=0.53, mean=0.55, max=0.572, sum=2.749 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.5498773384847139\"}", + "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", + "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU - # prompt tokens": "{\"description\": \"min=373.44, mean=467.72, max=614.43, sum=2338.6 (5)\", \"tab\": \"General information\", \"score\": \"467.71996491228066\"}", + "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"computer_security\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_computer_security\"" + "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]", + "method": "\"multiple_choice_joint\"" } } }, { - "evaluation_name": "Econometrics", + "evaluation_name": "MATH", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MATH", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Econometrics", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.675, + "score": 0.833, "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.351 (2)", + "description": "min=0.684, mean=0.833, max=0.97, sum=5.83 (7)", "tab": "Accuracy", - "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.572, mean=0.572, max=0.572, sum=1.144 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5721135453173989\"}", - "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}", - "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Econometrics - # prompt tokens": "{\"description\": \"min=614.43, mean=614.43, max=614.43, sum=1228.86 (2)\", \"tab\": \"General information\", \"score\": \"614.4298245614035\"}", - "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MATH - Observed inference time (s)": "{\"description\": \"min=4.92, mean=6.678, max=8.338, sum=46.748 (7)\", \"tab\": \"Efficiency\", \"score\": \"6.678270916932833\"}", + "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}", + "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}", + "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MATH - # prompt tokens": "{\"description\": \"min=881.363, mean=1262.911, max=2197.577, sum=8840.376 (7)\", \"tab\": \"General information\", \"score\": \"1262.9108741840687\"}", + "MATH - # output tokens": "{\"description\": \"min=135.163, mean=189.561, max=219.316, sum=1326.926 (7)\", \"tab\": \"General information\", \"score\": \"189.56082409362702\"}" } }, "generation_config": { "additional_details": { - "subject": "\"econometrics\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_econometrics\"" + "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]", + "level": "\"1\"", + "use_official_examples": "\"False\"", + "use_chain_of_thought": "\"True\"" } } }, { - "evaluation_name": "Global Facts", + "evaluation_name": "GSM8K", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "GSM8K", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Global Facts", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.58, + "score": 0.824, "details": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)", + "description": "min=0.824, mean=0.824, max=0.824, sum=0.824 (1)", "tab": "Accuracy", - "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.479, mean=0.479, max=0.479, sum=0.958 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.47900029182434084\"}", - "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Global Facts - # prompt tokens": "{\"description\": \"min=399.71, mean=399.71, max=399.71, sum=799.42 (2)\", \"tab\": \"General information\", \"score\": \"399.71\"}", - "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "GSM8K - Observed inference time (s)": "{\"description\": \"min=6.915, mean=6.915, max=6.915, sum=6.915 (1)\", \"tab\": \"Efficiency\", \"score\": \"6.91472976398468\"}", + "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "GSM8K - # prompt tokens": "{\"description\": \"min=959.035, mean=959.035, max=959.035, sum=959.035 (1)\", \"tab\": \"General information\", \"score\": \"959.035\"}", + "GSM8K - # output tokens": "{\"description\": \"min=141.712, mean=141.712, max=141.712, sum=141.712 (1)\", \"tab\": \"General information\", \"score\": \"141.712\"}" } }, "generation_config": { "additional_details": { - "subject": "\"global_facts\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_global_facts\"" + "stop": "\"none\"" } } }, { - "evaluation_name": "Jurisprudence", + "evaluation_name": "LegalBench", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "LegalBench", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Jurisprudence", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.88, + "score": 0.727, "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", + "description": "min=0.417, mean=0.727, max=0.947, sum=3.637 (5)", "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.539, mean=0.539, max=0.539, sum=1.079 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5393155504156042\"}", - "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}", - "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Jurisprudence - # prompt tokens": "{\"description\": \"min=394.639, mean=394.639, max=394.639, sum=789.278 (2)\", \"tab\": \"General information\", \"score\": \"394.6388888888889\"}", - "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.514, mean=0.608, max=0.803, sum=3.041 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.6081070231398068\"}", + "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}", + "LegalBench - # train": "{\"description\": \"min=4, mean=4.8, max=5, sum=24 (5)\", \"tab\": \"General information\", \"score\": \"4.8\"}", + "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "LegalBench - # prompt tokens": "{\"description\": \"min=207.442, mean=1524.163, max=6311.388, sum=7620.815 (5)\", \"tab\": \"General information\", \"score\": \"1524.162971355988\"}", + "LegalBench - # output tokens": "{\"description\": \"min=1, mean=1.325, max=2.032, sum=6.626 (5)\", \"tab\": \"General information\", \"score\": \"1.3251168793919403\"}" } }, "generation_config": { "additional_details": { - "subject": "\"jurisprudence\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_jurisprudence\"" + "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]" } } }, { - "evaluation_name": "Philosophy", + "evaluation_name": "MedQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MedQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Philosophy", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.868, + "score": 0.783, "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)", + "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", "tab": "Accuracy", - "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.543, mean=0.543, max=0.543, sum=1.087 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5434573969273705\"}", - "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}", - "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Philosophy - # prompt tokens": "{\"description\": \"min=329.084, mean=329.084, max=329.084, sum=658.167 (2)\", \"tab\": \"General information\", \"score\": \"329.08360128617363\"}", - "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MedQA - Observed inference time (s)": "{\"description\": \"min=0.455, mean=0.455, max=0.455, sum=0.455 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4549296101329341\"}", + "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}", + "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MedQA - # prompt tokens": "{\"description\": \"min=1027.414, mean=1027.414, max=1027.414, sum=1027.414 (1)\", \"tab\": \"General information\", \"score\": \"1027.4135188866799\"}", + "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": { - "subject": "\"philosophy\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_philosophy\"" - } + "additional_details": {} } }, { - "evaluation_name": "Professional Psychology", + "evaluation_name": "WMT 2014", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "WMT 2014", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Professional Psychology", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.873, + "score": 0.218, "details": { - "description": "min=0.873, mean=0.873, max=0.873, sum=1.745 (2)", + "description": "min=0.169, mean=0.218, max=0.264, sum=1.088 (5)", "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.579, mean=0.579, max=0.579, sum=1.159 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5794552100055358\"}", - "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.59, mean=0.59, max=0.59, sum=1.18 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5898241354218612\"}", - "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.639, mean=0.639, max=0.639, sum=1.278 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6388053317424371\"}", - "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.671, mean=0.671, max=0.671, sum=1.342 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6712259284031936\"}", - "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}", - "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Medicine - # prompt tokens": "{\"description\": \"min=1094.585, mean=1094.585, max=1094.585, sum=2189.169 (2)\", \"tab\": \"General information\", \"score\": \"1094.5845588235295\"}", - "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}", - "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Accounting - # prompt tokens": "{\"description\": \"min=658.592, mean=658.592, max=658.592, sum=1317.184 (2)\", \"tab\": \"General information\", \"score\": \"658.5921985815603\"}", - "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}", - "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Law - # prompt tokens": "{\"description\": \"min=1637.787, mean=1637.787, max=1637.787, sum=3275.574 (2)\", \"tab\": \"General information\", \"score\": \"1637.7868318122555\"}", - "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}", - "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Psychology - # prompt tokens": "{\"description\": \"min=575.114, mean=575.114, max=575.114, sum=1150.229 (2)\", \"tab\": \"General information\", \"score\": \"575.1143790849674\"}", - "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=1.131, mean=1.185, max=1.222, sum=5.925 (5)\", \"tab\": \"Efficiency\", \"score\": \"1.1850423664020953\"}", + "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}", + "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "WMT 2014 - # prompt tokens": "{\"description\": \"min=124.901, mean=148.043, max=168.185, sum=740.213 (5)\", \"tab\": \"General information\", \"score\": \"148.04258583116683\"}", + "WMT 2014 - # output tokens": "{\"description\": \"min=23.744, mean=25.264, max=25.938, sum=126.322 (5)\", \"tab\": \"General information\", \"score\": \"25.26444840571953\"}" } }, "generation_config": { "additional_details": { - "subject": "\"professional_psychology\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_professional_psychology\"" + "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]" } } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_id": "helm_mmlu/openai_gpt-4-turbo-2024-04-09/1774096312.00548", + "retrieved_timestamp": "1774096312.00548", + "source_metadata": { + "source_name": "helm_mmlu", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "helm", + "version": "unknown" + }, + "benchmark": "helm_mmlu", + "evaluation_results": [ { - "evaluation_name": "Us Foreign Policy", + "evaluation_name": "MMLU All Subjects", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -459,36 +403,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.96, + "score": 0.813, "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", + "description": "min=0.515, mean=0.813, max=0.974, sum=92.65 (114)", "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.558, mean=0.558, max=0.558, sum=1.115 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.557673556804657\"}", - "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=422.79, mean=422.79, max=422.79, sum=845.58 (2)\", \"tab\": \"General information\", \"score\": \"422.79\"}", - "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.479, mean=0.617, max=0.934, sum=70.3 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.6166649052297876\"}", + "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}", + "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=275.561, mean=614.852, max=2798.073, sum=70093.086 (114)\", \"tab\": \"General information\", \"score\": \"614.851634217556\"}", + "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"us_foreign_policy\"", + "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_us_foreign_policy\"" + "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" } } }, { - "evaluation_name": "Astronomy", + "evaluation_name": "Abstract Algebra", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -497,36 +441,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Astronomy", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.941, + "score": 0.56, "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", + "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", "tab": "Accuracy", - "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.666, mean=0.666, max=0.666, sum=1.332 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6662032525790366\"}", - "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}", - "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Astronomy - # prompt tokens": "{\"description\": \"min=579.691, mean=579.691, max=579.691, sum=1159.382 (2)\", \"tab\": \"General information\", \"score\": \"579.6907894736842\"}", - "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.54, mean=0.54, max=0.54, sum=1.08 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.539907853603363\"}", + "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Abstract Algebra - # prompt tokens": "{\"description\": \"min=373.44, mean=373.44, max=373.44, sum=746.88 (2)\", \"tab\": \"General information\", \"score\": \"373.44\"}", + "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"astronomy\"", + "subject": "\"abstract_algebra\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_astronomy\"" + "groups": "\"mmlu_abstract_algebra\"" } } }, { - "evaluation_name": "Business Ethics", + "evaluation_name": "Anatomy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -535,36 +479,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Business Ethics", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.82, + "score": 0.822, "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", + "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)", "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.598, mean=0.598, max=0.598, sum=1.196 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5981367039680481\"}", - "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Business Ethics - # prompt tokens": "{\"description\": \"min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)\", \"tab\": \"General information\", \"score\": \"569.52\"}", - "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.53, mean=0.53, max=0.53, sum=1.06 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5299274744810881\"}", + "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}", + "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Anatomy - # prompt tokens": "{\"description\": \"min=353.978, mean=353.978, max=353.978, sum=707.956 (2)\", \"tab\": \"General information\", \"score\": \"353.97777777777776\"}", + "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"business_ethics\"", + "subject": "\"anatomy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_business_ethics\"" + "groups": "\"mmlu_anatomy\"" } } }, { - "evaluation_name": "Clinical Knowledge", + "evaluation_name": "College Physics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -573,36 +517,66 @@ ] }, "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.83, + "score": 0.539, "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", + "description": "min=0.539, mean=0.539, max=0.539, sum=1.078 (2)", "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.591, mean=0.591, max=0.591, sum=1.183 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5912713131814633\"}", - "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}", - "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=397.947, mean=397.947, max=397.947, sum=795.894 (2)\", \"tab\": \"General information\", \"score\": \"397.94716981132075\"}", - "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.549, mean=0.549, max=0.549, sum=1.099 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5493535542488098\"}", + "College Biology - Observed inference time (s)": "{\"description\": \"min=0.6, mean=0.6, max=0.6, sum=1.199 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5995734184980392\"}", + "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.691, mean=0.691, max=0.691, sum=1.382 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6911867094039917\"}", + "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.609, mean=0.609, max=0.609, sum=1.219 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6092576813697815\"}", + "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.67, mean=0.67, max=0.67, sum=1.34 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6697626251705809\"}", + "College Physics - Observed inference time (s)": "{\"description\": \"min=0.706, mean=0.706, max=0.706, sum=1.412 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7058592660754335\"}", + "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Chemistry - # prompt tokens": "{\"description\": \"min=549.4, mean=549.4, max=549.4, sum=1098.8 (2)\", \"tab\": \"General information\", \"score\": \"549.4\"}", + "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}", + "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Biology - # prompt tokens": "{\"description\": \"min=473.917, mean=473.917, max=473.917, sum=947.833 (2)\", \"tab\": \"General information\", \"score\": \"473.9166666666667\"}", + "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Computer Science - # prompt tokens": "{\"description\": \"min=828.39, mean=828.39, max=828.39, sum=1656.78 (2)\", \"tab\": \"General information\", \"score\": \"828.39\"}", + "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Mathematics - # prompt tokens": "{\"description\": \"min=594.52, mean=594.52, max=594.52, sum=1189.04 (2)\", \"tab\": \"General information\", \"score\": \"594.52\"}", + "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}", + "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Medicine - # prompt tokens": "{\"description\": \"min=502.728, mean=502.728, max=502.728, sum=1005.457 (2)\", \"tab\": \"General information\", \"score\": \"502.728323699422\"}", + "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}", + "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Physics - # prompt tokens": "{\"description\": \"min=503.608, mean=503.608, max=503.608, sum=1007.216 (2)\", \"tab\": \"General information\", \"score\": \"503.6078431372549\"}", + "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"clinical_knowledge\"", + "subject": "\"college_physics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_clinical_knowledge\"" + "groups": "\"mmlu_college_physics\"" } } }, { - "evaluation_name": "Conceptual Physics", + "evaluation_name": "Computer Security", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -611,36 +585,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Conceptual Physics", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.894, + "score": 0.83, "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.787 (2)", + "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.685, mean=0.685, max=0.685, sum=1.369 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.684603402969685\"}", - "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}", - "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Conceptual Physics - # prompt tokens": "{\"description\": \"min=304.838, mean=304.838, max=304.838, sum=609.677 (2)\", \"tab\": \"General information\", \"score\": \"304.83829787234043\"}", - "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.53, mean=0.53, max=0.53, sum=1.061 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5303381824493408\"}", + "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Computer Security - # prompt tokens": "{\"description\": \"min=378.54, mean=378.54, max=378.54, sum=757.08 (2)\", \"tab\": \"General information\", \"score\": \"378.54\"}", + "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"conceptual_physics\"", + "subject": "\"computer_security\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_conceptual_physics\"" + "groups": "\"mmlu_computer_security\"" } } }, { - "evaluation_name": "Electrical Engineering", + "evaluation_name": "Econometrics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -649,36 +623,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Electrical Engineering", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.752, + "score": 0.675, "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.503 (2)", + "description": "min=0.675, mean=0.675, max=0.675, sum=1.351 (2)", "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.649, mean=0.649, max=0.649, sum=1.297 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6487039006989578\"}", - "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}", - "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Electrical Engineering - # prompt tokens": "{\"description\": \"min=440.641, mean=440.641, max=440.641, sum=881.283 (2)\", \"tab\": \"General information\", \"score\": \"440.6413793103448\"}", - "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.572, mean=0.572, max=0.572, sum=1.144 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5721135453173989\"}", + "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}", + "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Econometrics - # prompt tokens": "{\"description\": \"min=614.43, mean=614.43, max=614.43, sum=1228.86 (2)\", \"tab\": \"General information\", \"score\": \"614.4298245614035\"}", + "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"electrical_engineering\"", + "subject": "\"econometrics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_electrical_engineering\"" + "groups": "\"mmlu_econometrics\"" } } }, { - "evaluation_name": "Elementary Mathematics", + "evaluation_name": "Global Facts", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -687,36 +661,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.72, + "score": 0.58, "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.439 (2)", + "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)", "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.708, mean=0.708, max=0.708, sum=1.417 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.708430844009238\"}", - "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}", - "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=531.862, mean=531.862, max=531.862, sum=1063.725 (2)\", \"tab\": \"General information\", \"score\": \"531.8624338624338\"}", - "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.479, mean=0.479, max=0.479, sum=0.958 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.47900029182434084\"}", + "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Global Facts - # prompt tokens": "{\"description\": \"min=399.71, mean=399.71, max=399.71, sum=799.42 (2)\", \"tab\": \"General information\", \"score\": \"399.71\"}", + "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"elementary_mathematics\"", + "subject": "\"global_facts\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_elementary_mathematics\"" + "groups": "\"mmlu_global_facts\"" } } }, { - "evaluation_name": "Formal Logic", + "evaluation_name": "Jurisprudence", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -725,36 +699,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Formal Logic", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.706, + "score": 0.88, "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.413 (2)", + "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.635, mean=0.635, max=0.635, sum=1.27 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6347800322941372\"}", - "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}", - "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Formal Logic - # prompt tokens": "{\"description\": \"min=606.762, mean=606.762, max=606.762, sum=1213.524 (2)\", \"tab\": \"General information\", \"score\": \"606.7619047619048\"}", - "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.539, mean=0.539, max=0.539, sum=1.079 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5393155504156042\"}", + "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}", + "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Jurisprudence - # prompt tokens": "{\"description\": \"min=394.639, mean=394.639, max=394.639, sum=789.278 (2)\", \"tab\": \"General information\", \"score\": \"394.6388888888889\"}", + "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"formal_logic\"", + "subject": "\"jurisprudence\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_formal_logic\"" + "groups": "\"mmlu_jurisprudence\"" } } }, { - "evaluation_name": "High School World History", + "evaluation_name": "Philosophy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -763,114 +737,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on High School World History", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.941, + "score": 0.868, "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", + "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)", "tab": "Accuracy", - "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.674, mean=0.674, max=0.674, sum=1.348 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6741217144073979\"}", - "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.673, mean=0.673, max=0.673, sum=1.346 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6728476491467706\"}", - "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.626, mean=0.626, max=0.626, sum=1.252 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6261640882492066\"}", - "High School European History - Observed inference time (s)": "{\"description\": \"min=0.747, mean=0.747, max=0.747, sum=1.495 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7474224538514108\"}", - "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=1.335 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6672574221485793\"}", - "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.683, mean=0.683, max=0.683, sum=1.366 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6831059715290762\"}", - "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.613, mean=0.613, max=0.613, sum=1.226 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6132381714307344\"}", - "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.594, mean=0.594, max=0.594, sum=1.188 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5939316025486698\"}", - "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.585, mean=0.585, max=0.585, sum=1.169 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5845635728675778\"}", - "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.934, mean=0.934, max=0.934, sum=1.868 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.9341671135251886\"}", - "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.741, mean=0.741, max=0.741, sum=1.482 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7410666920723171\"}", - "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.72, mean=0.72, max=0.72, sum=1.439 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7196061655327126\"}", - "High School US History - Observed inference time (s)": "{\"description\": \"min=0.745, mean=0.745, max=0.745, sum=1.491 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7454434785188413\"}", - "High School World History - Observed inference time (s)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=1.333 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6665283818788166\"}", - "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}", - "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Biology - # prompt tokens": "{\"description\": \"min=513.677, mean=513.677, max=513.677, sum=1027.355 (2)\", \"tab\": \"General information\", \"score\": \"513.6774193548387\"}", - "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}", - "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Chemistry - # prompt tokens": "{\"description\": \"min=496.714, mean=496.714, max=496.714, sum=993.429 (2)\", \"tab\": \"General information\", \"score\": \"496.7142857142857\"}", - "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Computer Science - # prompt tokens": "{\"description\": \"min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)\", \"tab\": \"General information\", \"score\": \"867.78\"}", - "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}", - "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School European History - # prompt tokens": "{\"description\": \"min=2798.073, mean=2798.073, max=2798.073, sum=5596.145 (2)\", \"tab\": \"General information\", \"score\": \"2798.072727272727\"}", - "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}", - "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Geography - # prompt tokens": "{\"description\": \"min=372.045, mean=372.045, max=372.045, sum=744.091 (2)\", \"tab\": \"General information\", \"score\": \"372.04545454545456\"}", - "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}", - "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Government And Politics - # prompt tokens": "{\"description\": \"min=465.824, mean=465.824, max=465.824, sum=931.648 (2)\", \"tab\": \"General information\", \"score\": \"465.8238341968912\"}", - "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}", - "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=371.562, mean=371.562, max=371.562, sum=743.123 (2)\", \"tab\": \"General information\", \"score\": \"371.5615384615385\"}", - "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}", - "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Mathematics - # prompt tokens": "{\"description\": \"min=532.374, mean=532.374, max=532.374, sum=1064.748 (2)\", \"tab\": \"General information\", \"score\": \"532.3740740740741\"}", - "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}", - "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Microeconomics - # prompt tokens": "{\"description\": \"min=399.025, mean=399.025, max=399.025, sum=798.05 (2)\", \"tab\": \"General information\", \"score\": \"399.02521008403363\"}", - "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}", - "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Physics - # prompt tokens": "{\"description\": \"min=560.464, mean=560.464, max=560.464, sum=1120.927 (2)\", \"tab\": \"General information\", \"score\": \"560.4635761589404\"}", - "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}", - "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Psychology - # prompt tokens": "{\"description\": \"min=495.246, mean=495.246, max=495.246, sum=990.492 (2)\", \"tab\": \"General information\", \"score\": \"495.24587155963303\"}", - "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}", - "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Statistics - # prompt tokens": "{\"description\": \"min=795.699, mean=795.699, max=795.699, sum=1591.398 (2)\", \"tab\": \"General information\", \"score\": \"795.699074074074\"}", - "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}", - "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School US History - # prompt tokens": "{\"description\": \"min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)\", \"tab\": \"General information\", \"score\": \"2217.8088235294117\"}", - "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}", - "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School World History - # prompt tokens": "{\"description\": \"min=1428.27, mean=1428.27, max=1428.27, sum=2856.54 (2)\", \"tab\": \"General information\", \"score\": \"1428.2700421940929\"}", - "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.543, mean=0.543, max=0.543, sum=1.087 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5434573969273705\"}", + "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}", + "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Philosophy - # prompt tokens": "{\"description\": \"min=329.084, mean=329.084, max=329.084, sum=658.167 (2)\", \"tab\": \"General information\", \"score\": \"329.08360128617363\"}", + "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"high_school_world_history\"", + "subject": "\"philosophy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_high_school_world_history\"" + "groups": "\"mmlu_philosophy\"" } } }, { - "evaluation_name": "Human Sexuality", + "evaluation_name": "Professional Psychology", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -879,42 +775,54 @@ ] }, "metric_config": { - "evaluation_description": "EM on Human Sexuality", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.901, + "score": 0.873, "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", + "description": "min=0.873, mean=0.873, max=0.873, sum=1.745 (2)", "tab": "Accuracy", - "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.656, mean=0.656, max=0.656, sum=1.313 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6564141239286003\"}", - "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=0.613, mean=0.613, max=0.613, sum=1.226 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6131143715545422\"}", - "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}", - "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Human Aging - # prompt tokens": "{\"description\": \"min=319.906, mean=319.906, max=319.906, sum=639.812 (2)\", \"tab\": \"General information\", \"score\": \"319.90582959641256\"}", - "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}", - "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Human Sexuality - # prompt tokens": "{\"description\": \"min=341.183, mean=341.183, max=341.183, sum=682.366 (2)\", \"tab\": \"General information\", \"score\": \"341.1832061068702\"}", - "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.579, mean=0.579, max=0.579, sum=1.159 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5794552100055358\"}", + "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.59, mean=0.59, max=0.59, sum=1.18 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5898241354218612\"}", + "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.639, mean=0.639, max=0.639, sum=1.278 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6388053317424371\"}", + "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.671, mean=0.671, max=0.671, sum=1.342 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6712259284031936\"}", + "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}", + "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Medicine - # prompt tokens": "{\"description\": \"min=1094.585, mean=1094.585, max=1094.585, sum=2189.169 (2)\", \"tab\": \"General information\", \"score\": \"1094.5845588235295\"}", + "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}", + "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Accounting - # prompt tokens": "{\"description\": \"min=658.592, mean=658.592, max=658.592, sum=1317.184 (2)\", \"tab\": \"General information\", \"score\": \"658.5921985815603\"}", + "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}", + "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Law - # prompt tokens": "{\"description\": \"min=1637.787, mean=1637.787, max=1637.787, sum=3275.574 (2)\", \"tab\": \"General information\", \"score\": \"1637.7868318122555\"}", + "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}", + "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Psychology - # prompt tokens": "{\"description\": \"min=575.114, mean=575.114, max=575.114, sum=1150.229 (2)\", \"tab\": \"General information\", \"score\": \"575.1143790849674\"}", + "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"human_sexuality\"", + "subject": "\"professional_psychology\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_human_sexuality\"" + "groups": "\"mmlu_professional_psychology\"" } } }, { - "evaluation_name": "International Law", + "evaluation_name": "Us Foreign Policy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -923,36 +831,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on International Law", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.942, + "score": 0.96, "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.884 (2)", + "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", "tab": "Accuracy", - "International Law - Observed inference time (s)": "{\"description\": \"min=0.63, mean=0.63, max=0.63, sum=1.26 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6297830116650289\"}", - "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}", - "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "International Law - # prompt tokens": "{\"description\": \"min=639.851, mean=639.851, max=639.851, sum=1279.702 (2)\", \"tab\": \"General information\", \"score\": \"639.8512396694215\"}", - "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.558, mean=0.558, max=0.558, sum=1.115 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.557673556804657\"}", + "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=422.79, mean=422.79, max=422.79, sum=845.58 (2)\", \"tab\": \"General information\", \"score\": \"422.79\"}", + "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"international_law\"", + "subject": "\"us_foreign_policy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_international_law\"" + "groups": "\"mmlu_us_foreign_policy\"" } } }, { - "evaluation_name": "Logical Fallacies", + "evaluation_name": "Astronomy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -961,36 +869,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Logical Fallacies", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.871, + "score": 0.941, "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)", + "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.585, mean=0.585, max=0.585, sum=1.171 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.585445927695994\"}", - "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}", - "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Logical Fallacies - # prompt tokens": "{\"description\": \"min=449.595, mean=449.595, max=449.595, sum=899.19 (2)\", \"tab\": \"General information\", \"score\": \"449.5950920245399\"}", - "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.666, mean=0.666, max=0.666, sum=1.332 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6662032525790366\"}", + "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}", + "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Astronomy - # prompt tokens": "{\"description\": \"min=579.691, mean=579.691, max=579.691, sum=1159.382 (2)\", \"tab\": \"General information\", \"score\": \"579.6907894736842\"}", + "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"logical_fallacies\"", + "subject": "\"astronomy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_logical_fallacies\"" + "groups": "\"mmlu_astronomy\"" } } }, { - "evaluation_name": "Machine Learning", + "evaluation_name": "Business Ethics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -999,36 +907,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Machine Learning", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.741, + "score": 0.82, "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=1.482 (2)", + "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.718, mean=0.718, max=0.718, sum=1.436 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.718035706451961\"}", - "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}", - "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Machine Learning - # prompt tokens": "{\"description\": \"min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)\", \"tab\": \"General information\", \"score\": \"668.0535714285714\"}", - "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.598, mean=0.598, max=0.598, sum=1.196 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5981367039680481\"}", + "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Business Ethics - # prompt tokens": "{\"description\": \"min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)\", \"tab\": \"General information\", \"score\": \"569.52\"}", + "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"machine_learning\"", + "subject": "\"business_ethics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_machine_learning\"" + "groups": "\"mmlu_business_ethics\"" } } }, { - "evaluation_name": "Management", + "evaluation_name": "Clinical Knowledge", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1037,36 +945,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Management", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.883, + "score": 0.83, "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.767 (2)", + "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", "tab": "Accuracy", - "Management - Observed inference time (s)": "{\"description\": \"min=0.592, mean=0.592, max=0.592, sum=1.184 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5921963488013999\"}", - "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}", - "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Management - # prompt tokens": "{\"description\": \"min=283.796, mean=283.796, max=283.796, sum=567.592 (2)\", \"tab\": \"General information\", \"score\": \"283.79611650485435\"}", - "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.591, mean=0.591, max=0.591, sum=1.183 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5912713131814633\"}", + "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}", + "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=397.947, mean=397.947, max=397.947, sum=795.894 (2)\", \"tab\": \"General information\", \"score\": \"397.94716981132075\"}", + "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"management\"", + "subject": "\"clinical_knowledge\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_management\"" + "groups": "\"mmlu_clinical_knowledge\"" } } }, { - "evaluation_name": "Marketing", + "evaluation_name": "Conceptual Physics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1075,36 +983,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Marketing", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.949, + "score": 0.894, "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.897 (2)", + "description": "min=0.894, mean=0.894, max=0.894, sum=1.787 (2)", "tab": "Accuracy", - "Marketing - Observed inference time (s)": "{\"description\": \"min=0.588, mean=0.588, max=0.588, sum=1.176 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5880082672477788\"}", - "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}", - "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Marketing - # prompt tokens": "{\"description\": \"min=404.218, mean=404.218, max=404.218, sum=808.436 (2)\", \"tab\": \"General information\", \"score\": \"404.21794871794873\"}", - "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.685, mean=0.685, max=0.685, sum=1.369 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.684603402969685\"}", + "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}", + "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Conceptual Physics - # prompt tokens": "{\"description\": \"min=304.838, mean=304.838, max=304.838, sum=609.677 (2)\", \"tab\": \"General information\", \"score\": \"304.83829787234043\"}", + "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"marketing\"", + "subject": "\"conceptual_physics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_marketing\"" + "groups": "\"mmlu_conceptual_physics\"" } } }, { - "evaluation_name": "Medical Genetics", + "evaluation_name": "Electrical Engineering", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1113,36 +1021,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Medical Genetics", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.92, + "score": 0.752, "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", + "description": "min=0.752, mean=0.752, max=0.752, sum=1.503 (2)", "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.52, mean=0.52, max=0.52, sum=1.04 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5201336288452149\"}", - "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Medical Genetics - # prompt tokens": "{\"description\": \"min=341, mean=341, max=341, sum=682 (2)\", \"tab\": \"General information\", \"score\": \"341.0\"}", - "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.649, mean=0.649, max=0.649, sum=1.297 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6487039006989578\"}", + "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}", + "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Electrical Engineering - # prompt tokens": "{\"description\": \"min=440.641, mean=440.641, max=440.641, sum=881.283 (2)\", \"tab\": \"General information\", \"score\": \"440.6413793103448\"}", + "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"medical_genetics\"", + "subject": "\"electrical_engineering\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_medical_genetics\"" + "groups": "\"mmlu_electrical_engineering\"" } } }, { - "evaluation_name": "Miscellaneous", + "evaluation_name": "Elementary Mathematics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1151,36 +1059,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Miscellaneous", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.945, + "score": 0.72, "details": { - "description": "min=0.945, mean=0.945, max=0.945, sum=1.89 (2)", + "description": "min=0.72, mean=0.72, max=0.72, sum=1.439 (2)", "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.565, mean=0.565, max=0.565, sum=1.13 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5650817577561809\"}", - "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}", - "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Miscellaneous - # prompt tokens": "{\"description\": \"min=299.925, mean=299.925, max=299.925, sum=599.849 (2)\", \"tab\": \"General information\", \"score\": \"299.92464878671774\"}", - "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.708, mean=0.708, max=0.708, sum=1.417 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.708430844009238\"}", + "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}", + "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=531.862, mean=531.862, max=531.862, sum=1063.725 (2)\", \"tab\": \"General information\", \"score\": \"531.8624338624338\"}", + "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"miscellaneous\"", + "subject": "\"elementary_mathematics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_miscellaneous\"" + "groups": "\"mmlu_elementary_mathematics\"" } } }, { - "evaluation_name": "Moral Scenarios", + "evaluation_name": "Formal Logic", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1189,42 +1097,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Moral Scenarios", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.803, + "score": 0.706, "details": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.607 (2)", + "description": "min=0.706, mean=0.706, max=0.706, sum=1.413 (2)", "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.564, mean=0.564, max=0.564, sum=1.129 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5643301023913256\"}", - "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.599, mean=0.599, max=0.599, sum=1.197 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5985688052363902\"}", - "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}", - "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Moral Disputes - # prompt tokens": "{\"description\": \"min=476.145, mean=476.145, max=476.145, sum=952.289 (2)\", \"tab\": \"General information\", \"score\": \"476.1445086705202\"}", - "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}", - "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Moral Scenarios - # prompt tokens": "{\"description\": \"min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)\", \"tab\": \"General information\", \"score\": \"656.454748603352\"}", - "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.635, mean=0.635, max=0.635, sum=1.27 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6347800322941372\"}", + "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}", + "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Formal Logic - # prompt tokens": "{\"description\": \"min=606.762, mean=606.762, max=606.762, sum=1213.524 (2)\", \"tab\": \"General information\", \"score\": \"606.7619047619048\"}", + "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"moral_scenarios\"", + "subject": "\"formal_logic\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_moral_scenarios\"" + "groups": "\"mmlu_formal_logic\"" } } }, { - "evaluation_name": "Nutrition", + "evaluation_name": "High School World History", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1233,36 +1135,114 @@ ] }, "metric_config": { - "evaluation_description": "EM on Nutrition", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.892, + "score": 0.941, "details": { - "description": "min=0.892, mean=0.892, max=0.892, sum=1.784 (2)", + "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", "tab": "Accuracy", - "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.532, mean=0.532, max=0.532, sum=1.063 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5316595968857311\"}", - "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}", - "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Nutrition - # prompt tokens": "{\"description\": \"min=586.817, mean=586.817, max=586.817, sum=1173.634 (2)\", \"tab\": \"General information\", \"score\": \"586.8169934640523\"}", - "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.674, mean=0.674, max=0.674, sum=1.348 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6741217144073979\"}", + "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.673, mean=0.673, max=0.673, sum=1.346 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6728476491467706\"}", + "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.626, mean=0.626, max=0.626, sum=1.252 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6261640882492066\"}", + "High School European History - Observed inference time (s)": "{\"description\": \"min=0.747, mean=0.747, max=0.747, sum=1.495 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7474224538514108\"}", + "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=1.335 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6672574221485793\"}", + "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.683, mean=0.683, max=0.683, sum=1.366 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6831059715290762\"}", + "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.613, mean=0.613, max=0.613, sum=1.226 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6132381714307344\"}", + "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.594, mean=0.594, max=0.594, sum=1.188 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5939316025486698\"}", + "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.585, mean=0.585, max=0.585, sum=1.169 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5845635728675778\"}", + "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.934, mean=0.934, max=0.934, sum=1.868 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.9341671135251886\"}", + "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.741, mean=0.741, max=0.741, sum=1.482 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7410666920723171\"}", + "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.72, mean=0.72, max=0.72, sum=1.439 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7196061655327126\"}", + "High School US History - Observed inference time (s)": "{\"description\": \"min=0.745, mean=0.745, max=0.745, sum=1.491 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.7454434785188413\"}", + "High School World History - Observed inference time (s)": "{\"description\": \"min=0.667, mean=0.667, max=0.667, sum=1.333 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6665283818788166\"}", + "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}", + "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Biology - # prompt tokens": "{\"description\": \"min=513.677, mean=513.677, max=513.677, sum=1027.355 (2)\", \"tab\": \"General information\", \"score\": \"513.6774193548387\"}", + "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}", + "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Chemistry - # prompt tokens": "{\"description\": \"min=496.714, mean=496.714, max=496.714, sum=993.429 (2)\", \"tab\": \"General information\", \"score\": \"496.7142857142857\"}", + "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Computer Science - # prompt tokens": "{\"description\": \"min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)\", \"tab\": \"General information\", \"score\": \"867.78\"}", + "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}", + "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School European History - # prompt tokens": "{\"description\": \"min=2798.073, mean=2798.073, max=2798.073, sum=5596.145 (2)\", \"tab\": \"General information\", \"score\": \"2798.072727272727\"}", + "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}", + "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Geography - # prompt tokens": "{\"description\": \"min=372.045, mean=372.045, max=372.045, sum=744.091 (2)\", \"tab\": \"General information\", \"score\": \"372.04545454545456\"}", + "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}", + "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Government And Politics - # prompt tokens": "{\"description\": \"min=465.824, mean=465.824, max=465.824, sum=931.648 (2)\", \"tab\": \"General information\", \"score\": \"465.8238341968912\"}", + "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}", + "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=371.562, mean=371.562, max=371.562, sum=743.123 (2)\", \"tab\": \"General information\", \"score\": \"371.5615384615385\"}", + "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}", + "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Mathematics - # prompt tokens": "{\"description\": \"min=532.374, mean=532.374, max=532.374, sum=1064.748 (2)\", \"tab\": \"General information\", \"score\": \"532.3740740740741\"}", + "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}", + "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Microeconomics - # prompt tokens": "{\"description\": \"min=399.025, mean=399.025, max=399.025, sum=798.05 (2)\", \"tab\": \"General information\", \"score\": \"399.02521008403363\"}", + "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}", + "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Physics - # prompt tokens": "{\"description\": \"min=560.464, mean=560.464, max=560.464, sum=1120.927 (2)\", \"tab\": \"General information\", \"score\": \"560.4635761589404\"}", + "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}", + "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Psychology - # prompt tokens": "{\"description\": \"min=495.246, mean=495.246, max=495.246, sum=990.492 (2)\", \"tab\": \"General information\", \"score\": \"495.24587155963303\"}", + "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}", + "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Statistics - # prompt tokens": "{\"description\": \"min=795.699, mean=795.699, max=795.699, sum=1591.398 (2)\", \"tab\": \"General information\", \"score\": \"795.699074074074\"}", + "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}", + "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School US History - # prompt tokens": "{\"description\": \"min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)\", \"tab\": \"General information\", \"score\": \"2217.8088235294117\"}", + "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}", + "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School World History - # prompt tokens": "{\"description\": \"min=1428.27, mean=1428.27, max=1428.27, sum=2856.54 (2)\", \"tab\": \"General information\", \"score\": \"1428.2700421940929\"}", + "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"nutrition\"", + "subject": "\"high_school_world_history\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_nutrition\"" + "groups": "\"mmlu_high_school_world_history\"" } } }, { - "evaluation_name": "Prehistory", + "evaluation_name": "Human Sexuality", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1271,36 +1251,42 @@ ] }, "metric_config": { - "evaluation_description": "EM on Prehistory", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.92, + "score": 0.901, "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", + "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", "tab": "Accuracy", - "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.54, mean=0.54, max=0.54, sum=1.079 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5397091279795141\"}", - "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}", - "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Prehistory - # prompt tokens": "{\"description\": \"min=514.559, mean=514.559, max=514.559, sum=1029.117 (2)\", \"tab\": \"General information\", \"score\": \"514.5586419753087\"}", - "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.656, mean=0.656, max=0.656, sum=1.313 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6564141239286003\"}", + "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=0.613, mean=0.613, max=0.613, sum=1.226 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6131143715545422\"}", + "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}", + "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Human Aging - # prompt tokens": "{\"description\": \"min=319.906, mean=319.906, max=319.906, sum=639.812 (2)\", \"tab\": \"General information\", \"score\": \"319.90582959641256\"}", + "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}", + "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Human Sexuality - # prompt tokens": "{\"description\": \"min=341.183, mean=341.183, max=341.183, sum=682.366 (2)\", \"tab\": \"General information\", \"score\": \"341.1832061068702\"}", + "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"prehistory\"", + "subject": "\"human_sexuality\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_prehistory\"" + "groups": "\"mmlu_human_sexuality\"" } } }, { - "evaluation_name": "Public Relations", + "evaluation_name": "International Law", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1309,36 +1295,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Public Relations", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.755, + "score": 0.942, "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", + "description": "min=0.942, mean=0.942, max=0.942, sum=1.884 (2)", "tab": "Accuracy", - "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.584, mean=0.584, max=0.584, sum=1.168 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5840315688740123\"}", - "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}", - "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Public Relations - # prompt tokens": "{\"description\": \"min=405.318, mean=405.318, max=405.318, sum=810.636 (2)\", \"tab\": \"General information\", \"score\": \"405.3181818181818\"}", - "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "International Law - Observed inference time (s)": "{\"description\": \"min=0.63, mean=0.63, max=0.63, sum=1.26 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.6297830116650289\"}", + "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}", + "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "International Law - # prompt tokens": "{\"description\": \"min=639.851, mean=639.851, max=639.851, sum=1279.702 (2)\", \"tab\": \"General information\", \"score\": \"639.8512396694215\"}", + "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"public_relations\"", + "subject": "\"international_law\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_public_relations\"" + "groups": "\"mmlu_international_law\"" } } }, { - "evaluation_name": "Security Studies", + "evaluation_name": "Logical Fallacies", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1347,36 +1333,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Security Studies", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8, + "score": 0.871, "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", + "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)", "tab": "Accuracy", - "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.529, mean=0.529, max=0.529, sum=1.058 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.529095221538933\"}", - "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}", - "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Security Studies - # prompt tokens": "{\"description\": \"min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)\", \"tab\": \"General information\", \"score\": \"1164.4734693877551\"}", - "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.585, mean=0.585, max=0.585, sum=1.171 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.585445927695994\"}", + "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}", + "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Logical Fallacies - # prompt tokens": "{\"description\": \"min=449.595, mean=449.595, max=449.595, sum=899.19 (2)\", \"tab\": \"General information\", \"score\": \"449.5950920245399\"}", + "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"security_studies\"", + "subject": "\"logical_fallacies\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_security_studies\"" + "groups": "\"mmlu_logical_fallacies\"" } } }, { - "evaluation_name": "Sociology", + "evaluation_name": "Machine Learning", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1385,36 +1371,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Sociology", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.915, + "score": 0.741, "details": { - "description": "min=0.915, mean=0.915, max=0.915, sum=1.831 (2)", + "description": "min=0.741, mean=0.741, max=0.741, sum=1.482 (2)", "tab": "Accuracy", - "Sociology - Observed inference time (s)": "{\"description\": \"min=0.52, mean=0.52, max=0.52, sum=1.04 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5199050891458692\"}", - "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}", - "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Sociology - # prompt tokens": "{\"description\": \"min=445.522, mean=445.522, max=445.522, sum=891.045 (2)\", \"tab\": \"General information\", \"score\": \"445.5223880597015\"}", - "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.718, mean=0.718, max=0.718, sum=1.436 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.718035706451961\"}", + "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}", + "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Machine Learning - # prompt tokens": "{\"description\": \"min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)\", \"tab\": \"General information\", \"score\": \"668.0535714285714\"}", + "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"sociology\"", + "subject": "\"machine_learning\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_sociology\"" + "groups": "\"mmlu_machine_learning\"" } } }, { - "evaluation_name": "Virology", + "evaluation_name": "Management", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1423,36 +1409,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Virology", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.602, + "score": 0.883, "details": { - "description": "min=0.602, mean=0.602, max=0.602, sum=1.205 (2)", + "description": "min=0.883, mean=0.883, max=0.883, sum=1.767 (2)", "tab": "Accuracy", - "Virology - Observed inference time (s)": "{\"description\": \"min=0.523, mean=0.523, max=0.523, sum=1.045 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5226844951330897\"}", - "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}", - "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Virology - # prompt tokens": "{\"description\": \"min=343.09, mean=343.09, max=343.09, sum=686.181 (2)\", \"tab\": \"General information\", \"score\": \"343.0903614457831\"}", - "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Management - Observed inference time (s)": "{\"description\": \"min=0.592, mean=0.592, max=0.592, sum=1.184 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5921963488013999\"}", + "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}", + "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Management - # prompt tokens": "{\"description\": \"min=283.796, mean=283.796, max=283.796, sum=567.592 (2)\", \"tab\": \"General information\", \"score\": \"283.79611650485435\"}", + "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"virology\"", + "subject": "\"management\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_virology\"" + "groups": "\"mmlu_management\"" } } }, { - "evaluation_name": "World Religions", + "evaluation_name": "Marketing", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1461,36 +1447,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on World Religions", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.848, + "score": 0.949, "details": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)", + "description": "min=0.949, mean=0.949, max=0.949, sum=1.897 (2)", "tab": "Accuracy", - "World Religions - Observed inference time (s)": "{\"description\": \"min=0.494, mean=0.494, max=0.494, sum=0.988 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.49407080739562276\"}", - "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}", - "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "World Religions - # prompt tokens": "{\"description\": \"min=275.561, mean=275.561, max=275.561, sum=551.123 (2)\", \"tab\": \"General information\", \"score\": \"275.56140350877195\"}", - "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Marketing - Observed inference time (s)": "{\"description\": \"min=0.588, mean=0.588, max=0.588, sum=1.176 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5880082672477788\"}", + "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}", + "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Marketing - # prompt tokens": "{\"description\": \"min=404.218, mean=404.218, max=404.218, sum=808.436 (2)\", \"tab\": \"General information\", \"score\": \"404.21794871794873\"}", + "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"world_religions\"", + "subject": "\"marketing\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_world_religions\"" + "groups": "\"mmlu_marketing\"" } } }, { - "evaluation_name": "Mean win rate", + "evaluation_name": "Medical Genetics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1499,404 +1485,418 @@ ] }, "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.351, + "score": 0.92, "details": { - "description": "", - "tab": "Efficiency" + "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", + "tab": "Accuracy", + "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.52, mean=0.52, max=0.52, sum=1.04 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5201336288452149\"}", + "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Medical Genetics - # prompt tokens": "{\"description\": \"min=341, mean=341, max=341, sum=682 (2)\", \"tab\": \"General information\", \"score\": \"341.0\"}", + "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"medical_genetics\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_medical_genetics\"" + } } - } - ], - "detailed_evaluation_results": null, - "generation_config": { - "additional_details": { - "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" - } - } - }, - { - "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1774096306.427425", - "retrieved_timestamp": "1774096306.427425", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "helm", - "version": "unknown" - }, - "benchmark": "helm_lite", - "evaluation_results": [ + }, { - "evaluation_name": "Mean win rate", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "helm_lite", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.864, + "score": 0.945, "details": { - "description": "", + "description": "min=0.945, mean=0.945, max=0.945, sum=1.89 (2)", "tab": "Accuracy", - "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.4568414481897628\"}", - "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}" + "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.565, mean=0.565, max=0.565, sum=1.13 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5650817577561809\"}", + "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}", + "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Miscellaneous - # prompt tokens": "{\"description\": \"min=299.925, mean=299.925, max=299.925, sum=599.849 (2)\", \"tab\": \"General information\", \"score\": \"299.92464878671774\"}", + "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"miscellaneous\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_miscellaneous\"" + } } }, { - "evaluation_name": "NarrativeQA", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "NarrativeQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "F1 on NarrativeQA", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.761, + "score": 0.803, "details": { - "description": "min=0.761, mean=0.761, max=0.761, sum=0.761 (1)", + "description": "min=0.803, mean=0.803, max=0.803, sum=1.607 (2)", "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.804, mean=0.804, max=0.804, sum=0.804 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.8043310716118611\"}", - "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", - "NarrativeQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NarrativeQA - # prompt tokens": "{\"description\": \"min=3495.67, mean=3495.67, max=3495.67, sum=3495.67 (1)\", \"tab\": \"General information\", \"score\": \"3495.6704225352114\"}", - "NarrativeQA - # output tokens": "{\"description\": \"min=6.037, mean=6.037, max=6.037, sum=6.037 (1)\", \"tab\": \"General information\", \"score\": \"6.0366197183098596\"}" + "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.564, mean=0.564, max=0.564, sum=1.129 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5643301023913256\"}", + "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.599, mean=0.599, max=0.599, sum=1.197 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5985688052363902\"}", + "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}", + "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Moral Disputes - # prompt tokens": "{\"description\": \"min=476.145, mean=476.145, max=476.145, sum=952.289 (2)\", \"tab\": \"General information\", \"score\": \"476.1445086705202\"}", + "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}", + "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Moral Scenarios - # prompt tokens": "{\"description\": \"min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)\", \"tab\": \"General information\", \"score\": \"656.454748603352\"}", + "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"moral_scenarios\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_moral_scenarios\"" + } } }, { - "evaluation_name": "NaturalQuestions (closed-book)", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.482, + "score": 0.892, "details": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.482 (1)", + "description": "min=0.892, mean=0.892, max=0.892, sum=1.784 (2)", "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.712, mean=0.712, max=0.712, sum=0.712 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.7120162718296051\"}", - "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.605, mean=0.605, max=0.605, sum=0.605 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.6052222681045533\"}", - "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "NaturalQuestions (open-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=1728.593, mean=1728.593, max=1728.593, sum=1728.593 (1)\", \"tab\": \"General information\", \"score\": \"1728.593\"}", - "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=5.902, mean=5.902, max=5.902, sum=5.902 (1)\", \"tab\": \"General information\", \"score\": \"5.902\"}", - "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=139.127, mean=139.127, max=139.127, sum=139.127 (1)\", \"tab\": \"General information\", \"score\": \"139.127\"}", - "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=5.263, mean=5.263, max=5.263, sum=5.263 (1)\", \"tab\": \"General information\", \"score\": \"5.263\"}" + "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.532, mean=0.532, max=0.532, sum=1.063 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5316595968857311\"}", + "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}", + "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Nutrition - # prompt tokens": "{\"description\": \"min=586.817, mean=586.817, max=586.817, sum=1173.634 (2)\", \"tab\": \"General information\", \"score\": \"586.8169934640523\"}", + "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "mode": "\"closedbook\"" + "subject": "\"nutrition\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_nutrition\"" } } }, { - "evaluation_name": "OpenbookQA", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "OpenbookQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on OpenbookQA", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.97, + "score": 0.92, "details": { - "description": "min=0.97, mean=0.97, max=0.97, sum=0.97 (1)", + "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.438, mean=0.438, max=0.438, sum=0.438 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4376141686439514\"}", - "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}", - "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "OpenbookQA - # prompt tokens": "{\"description\": \"min=249.782, mean=249.782, max=249.782, sum=249.782 (1)\", \"tab\": \"General information\", \"score\": \"249.782\"}", - "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.54, mean=0.54, max=0.54, sum=1.079 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5397091279795141\"}", + "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}", + "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Prehistory - # prompt tokens": "{\"description\": \"min=514.559, mean=514.559, max=514.559, sum=1029.117 (2)\", \"tab\": \"General information\", \"score\": \"514.5586419753087\"}", + "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "dataset": "\"openbookqa\"", - "method": "\"multiple_choice_joint\"" + "subject": "\"prehistory\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_prehistory\"" } } }, { - "evaluation_name": "MMLU", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "MMLU", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on MMLU", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.711, + "score": 0.755, "details": { - "description": "min=0.53, mean=0.711, max=0.96, sum=3.555 (5)", + "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", "tab": "Accuracy", - "MMLU - Observed inference time (s)": "{\"description\": \"min=0.53, mean=0.55, max=0.572, sum=2.749 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.5498773384847139\"}", - "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", - "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MMLU - # prompt tokens": "{\"description\": \"min=373.44, mean=467.72, max=614.43, sum=2338.6 (5)\", \"tab\": \"General information\", \"score\": \"467.71996491228066\"}", - "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.584, mean=0.584, max=0.584, sum=1.168 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5840315688740123\"}", + "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}", + "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Public Relations - # prompt tokens": "{\"description\": \"min=405.318, mean=405.318, max=405.318, sum=810.636 (2)\", \"tab\": \"General information\", \"score\": \"405.3181818181818\"}", + "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]", - "method": "\"multiple_choice_joint\"" + "subject": "\"public_relations\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_public_relations\"" } } }, { - "evaluation_name": "MATH", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "MATH", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.833, + "score": 0.8, "details": { - "description": "min=0.684, mean=0.833, max=0.97, sum=5.83 (7)", + "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", "tab": "Accuracy", - "MATH - Observed inference time (s)": "{\"description\": \"min=4.92, mean=6.678, max=8.338, sum=46.748 (7)\", \"tab\": \"Efficiency\", \"score\": \"6.678270916932833\"}", - "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}", - "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}", - "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MATH - # prompt tokens": "{\"description\": \"min=881.363, mean=1262.911, max=2197.577, sum=8840.376 (7)\", \"tab\": \"General information\", \"score\": \"1262.9108741840687\"}", - "MATH - # output tokens": "{\"description\": \"min=135.163, mean=189.561, max=219.316, sum=1326.926 (7)\", \"tab\": \"General information\", \"score\": \"189.56082409362702\"}" + "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.529, mean=0.529, max=0.529, sum=1.058 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.529095221538933\"}", + "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}", + "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Security Studies - # prompt tokens": "{\"description\": \"min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)\", \"tab\": \"General information\", \"score\": \"1164.4734693877551\"}", + "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]", - "level": "\"1\"", - "use_official_examples": "\"False\"", - "use_chain_of_thought": "\"True\"" + "subject": "\"security_studies\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_security_studies\"" } } }, { - "evaluation_name": "GSM8K", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "GSM8K", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on GSM8K", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.824, + "score": 0.915, "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=0.824 (1)", + "description": "min=0.915, mean=0.915, max=0.915, sum=1.831 (2)", "tab": "Accuracy", - "GSM8K - Observed inference time (s)": "{\"description\": \"min=6.915, mean=6.915, max=6.915, sum=6.915 (1)\", \"tab\": \"Efficiency\", \"score\": \"6.91472976398468\"}", - "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "GSM8K - # prompt tokens": "{\"description\": \"min=959.035, mean=959.035, max=959.035, sum=959.035 (1)\", \"tab\": \"General information\", \"score\": \"959.035\"}", - "GSM8K - # output tokens": "{\"description\": \"min=141.712, mean=141.712, max=141.712, sum=141.712 (1)\", \"tab\": \"General information\", \"score\": \"141.712\"}" + "Sociology - Observed inference time (s)": "{\"description\": \"min=0.52, mean=0.52, max=0.52, sum=1.04 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5199050891458692\"}", + "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}", + "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Sociology - # prompt tokens": "{\"description\": \"min=445.522, mean=445.522, max=445.522, sum=891.045 (2)\", \"tab\": \"General information\", \"score\": \"445.5223880597015\"}", + "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "stop": "\"none\"" + "subject": "\"sociology\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_sociology\"" } } }, { - "evaluation_name": "LegalBench", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "LegalBench", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on LegalBench", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.727, + "score": 0.602, "details": { - "description": "min=0.417, mean=0.727, max=0.947, sum=3.637 (5)", + "description": "min=0.602, mean=0.602, max=0.602, sum=1.205 (2)", "tab": "Accuracy", - "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.514, mean=0.608, max=0.803, sum=3.041 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.6081070231398068\"}", - "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}", - "LegalBench - # train": "{\"description\": \"min=4, mean=4.8, max=5, sum=24 (5)\", \"tab\": \"General information\", \"score\": \"4.8\"}", - "LegalBench - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "LegalBench - # prompt tokens": "{\"description\": \"min=207.442, mean=1524.163, max=6311.388, sum=7620.815 (5)\", \"tab\": \"General information\", \"score\": \"1524.162971355988\"}", - "LegalBench - # output tokens": "{\"description\": \"min=1, mean=1.325, max=2.032, sum=6.626 (5)\", \"tab\": \"General information\", \"score\": \"1.3251168793919403\"}" + "Virology - Observed inference time (s)": "{\"description\": \"min=0.523, mean=0.523, max=0.523, sum=1.045 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5226844951330897\"}", + "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}", + "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Virology - # prompt tokens": "{\"description\": \"min=343.09, mean=343.09, max=343.09, sum=686.181 (2)\", \"tab\": \"General information\", \"score\": \"343.0903614457831\"}", + "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]" + "subject": "\"virology\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_virology\"" } } }, { - "evaluation_name": "MedQA", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "MedQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on MedQA", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.783, + "score": 0.848, "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", + "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)", "tab": "Accuracy", - "MedQA - Observed inference time (s)": "{\"description\": \"min=0.455, mean=0.455, max=0.455, sum=0.455 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4549296101329341\"}", - "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}", - "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MedQA - # prompt tokens": "{\"description\": \"min=1027.414, mean=1027.414, max=1027.414, sum=1027.414 (1)\", \"tab\": \"General information\", \"score\": \"1027.4135188866799\"}", - "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "World Religions - Observed inference time (s)": "{\"description\": \"min=0.494, mean=0.494, max=0.494, sum=0.988 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.49407080739562276\"}", + "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}", + "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "World Religions - # prompt tokens": "{\"description\": \"min=275.561, mean=275.561, max=275.561, sum=551.123 (2)\", \"tab\": \"General information\", \"score\": \"275.56140350877195\"}", + "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"world_religions\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_world_religions\"" + } } }, { - "evaluation_name": "WMT 2014", + "evaluation_name": "Mean win rate", "source_data": { - "dataset_name": "WMT 2014", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", + "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.218, + "score": 0.351, "details": { - "description": "min=0.169, mean=0.218, max=0.264, sum=1.088 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=1.131, mean=1.185, max=1.222, sum=5.925 (5)\", \"tab\": \"Efficiency\", \"score\": \"1.1850423664020953\"}", - "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}", - "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "WMT 2014 - # prompt tokens": "{\"description\": \"min=124.901, mean=148.043, max=168.185, sum=740.213 (5)\", \"tab\": \"General information\", \"score\": \"148.04258583116683\"}", - "WMT 2014 - # output tokens": "{\"description\": \"min=23.744, mean=25.264, max=25.938, sum=126.322 (5)\", \"tab\": \"General information\", \"score\": \"25.26444840571953\"}" + "description": "", + "tab": "Efficiency" } }, "generation_config": { - "additional_details": { - "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]" - } + "additional_details": {} } } ], "detailed_evaluation_results": null, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" + } } }, { diff --git a/data/models/openai_gpt-4o-2024-08-06.json b/data/models/openai_gpt-4o-2024-08-06.json index 4523783fb76c33e56985cf8706dfcde93c76d009..ca15abfce433f9cf3b132cc5fefadfcef719867a 100644 --- a/data/models/openai_gpt-4o-2024-08-06.json +++ b/data/models/openai_gpt-4o-2024-08-06.json @@ -1900,10 +1900,10 @@ } }, { - "evaluation_id": "reward-bench/openai_gpt-4o-2024-08-06/1766412838.146816", + "evaluation_id": "reward-bench-2/openai_gpt-4o-2024-08-06/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -1922,128 +1922,104 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8673 + "score": 0.6493 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9609 + "score": 0.5684 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.761 + "score": 0.3312 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8811 + "score": 0.623 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8661 + "score": 0.8619 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench-2/openai_gpt-4o-2024-08-06/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6493 + "score": 0.7293 }, "source_data": { "dataset_name": "RewardBench 2", @@ -2052,111 +2028,135 @@ } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5684 + "score": 0.7819 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench/openai_gpt-4o-2024-08-06/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Precise IF", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3312 + "score": 0.8673 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.623 + "score": 0.9609 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8619 + "score": 0.761 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7293 + "score": 0.8811 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7819 + "score": 0.8661 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/models/openai_gpt-5-2025-08-07.json b/data/models/openai_gpt-5-2025-08-07.json index 0853492fcc4bbd45f07185718454686350fe1be2..532bfb57f278c67f06480977b0f49dd924ee82c1 100644 --- a/data/models/openai_gpt-5-2025-08-07.json +++ b/data/models/openai_gpt-5-2025-08-07.json @@ -1264,13 +1264,13 @@ } }, { - "evaluation_id": "livecodebenchpro/gpt-5-2025-08-07/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", + "evaluation_id": "livecodebenchpro/gpt-5-2025-08-07/1770683238.099205", + "retrieved_timestamp": "1770683238.099205", "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", "source_name": "Live Code Bench Pro", - "source_type": "documentation" + "source_type": "documentation", + "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", + "evaluator_relationship": "third_party" }, "eval_library": { "name": "unknown", @@ -1280,62 +1280,62 @@ "evaluation_results": [ { "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.04225352112676056 - }, "source_data": { "dataset_name": "Hard Problems", "source_type": "url", "url": [ "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" ] - } - }, - { - "evaluation_name": "Medium Problems", + }, "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", + "evaluation_description": "Pass@1 on Hard Problems", "lower_is_better": false, "score_type": "continuous", - "min_score": 0, - "max_score": 1 + "min_score": 0.0, + "max_score": 1.0 }, "score_details": { - "score": 0.4084507042253521 - }, + "score": 0.0423 + } + }, + { + "evaluation_name": "Medium Problems", "source_data": { "dataset_name": "Medium Problems", "source_type": "url", "url": [ "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" ] - } - }, - { - "evaluation_name": "Easy Problems", + }, "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", + "evaluation_description": "Pass@1 on Medium Problems", "lower_is_better": false, "score_type": "continuous", - "min_score": 0, - "max_score": 1 + "min_score": 0.0, + "max_score": 1.0 }, "score_details": { - "score": 0.8873239436619719 - }, + "score": 0.4085 + } + }, + { + "evaluation_name": "Easy Problems", "source_data": { "dataset_name": "Easy Problems", "source_type": "url", "url": [ "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" ] + }, + "metric_config": { + "evaluation_description": "Pass@1 on Easy Problems", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.9014 } } ], @@ -1343,13 +1343,13 @@ "generation_config": null }, { - "evaluation_id": "livecodebenchpro/gpt-5-2025-08-07/1770683238.099205", - "retrieved_timestamp": "1770683238.099205", + "evaluation_id": "livecodebenchpro/gpt-5-2025-08-07/1760492095.8105888", + "retrieved_timestamp": "1760492095.8105888", "source_metadata": { + "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", + "evaluator_relationship": "third_party", "source_name": "Live Code Bench Pro", - "source_type": "documentation", - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party" + "source_type": "documentation" }, "eval_library": { "name": "unknown", @@ -1359,62 +1359,62 @@ "evaluation_results": [ { "evaluation_name": "Hard Problems", + "metric_config": { + "evaluation_description": "Pass@1 on Hard Problems", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0, + "max_score": 1 + }, + "score_details": { + "score": 0.04225352112676056 + }, "source_data": { "dataset_name": "Hard Problems", "source_type": "url", "url": [ "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" ] - }, + } + }, + { + "evaluation_name": "Medium Problems", "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", + "evaluation_description": "Pass@1 on Medium Problems", "lower_is_better": false, "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 + "min_score": 0, + "max_score": 1 }, "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "Medium Problems", + "score": 0.4084507042253521 + }, "source_data": { "dataset_name": "Medium Problems", "source_type": "url", "url": [ "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" ] - }, + } + }, + { + "evaluation_name": "Easy Problems", "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", + "evaluation_description": "Pass@1 on Easy Problems", "lower_is_better": false, "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 + "min_score": 0, + "max_score": 1 }, "score_details": { - "score": 0.4085 - } - }, - { - "evaluation_name": "Easy Problems", + "score": 0.8873239436619719 + }, "source_data": { "dataset_name": "Easy Problems", "source_type": "url", "url": [ "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9014 } } ], diff --git a/data/models/openai_gpt-5-codex.json b/data/models/openai_gpt-5-codex.json index c82c5f95f908380f688e9ef7118281bf4bf2ade4..a875eb11356a27d3b1b0f42875c13e7560d24fec 100644 --- a/data/models/openai_gpt-5-codex.json +++ b/data/models/openai_gpt-5-codex.json @@ -4,13 +4,13 @@ "id": "openai/gpt-5-codex", "developer": "OpenAI", "additional_details": { - "agent_name": "Mini-SWE-Agent", - "agent_organization": "Princeton" + "agent_name": "Codex CLI", + "agent_organization": "OpenAI" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-5-codex/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5-codex/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-11-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 41.3, + "score": 44.3, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.7 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5-codex/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-5-codex/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-10-31", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 43.4, + "score": 41.3, "uncertainty": { "standard_error": { - "value": 2.9 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5-codex/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5-codex/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-04", + "evaluation_timestamp": "2025-10-31", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 44.3, + "score": 43.4, "uncertainty": { "standard_error": { - "value": 2.7 + "value": 2.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/openai_gpt-5-mini.json b/data/models/openai_gpt-5-mini.json index d47eef41739a278ed131ade530579316195a6b05..8229a45bea599ef0cf4ec037999fa4d4fdccb65f 100644 --- a/data/models/openai_gpt-5-mini.json +++ b/data/models/openai_gpt-5-mini.json @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-5-mini/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/openhands__gpt-5-mini/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-11-02", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 22.2, + "score": 29.2, "uncertainty": { "standard_error": { - "value": 2.6 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Mini\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Mini\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Mini\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Mini\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/openhands__gpt-5-mini/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-5-mini/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-02", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 29.2, + "score": 22.2, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.6 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Mini\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Mini\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Mini\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-5-Mini\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5-mini/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/spoox-m__gpt-5-mini/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-10-31", + "evaluation_timestamp": "2025-12-24", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,17 +265,17 @@ "max_score": 100.0 }, "score_details": { - "score": 24.0, + "score": 34.8, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.7 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Mini\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"spoox-m\" -m \"GPT-5-Mini\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +292,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Mini\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"spoox-m\" -m \"GPT-5-Mini\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -306,7 +306,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/spoox-m__gpt-5-mini/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5-mini/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -330,7 +330,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-24", + "evaluation_timestamp": "2025-10-31", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -339,17 +339,17 @@ "max_score": 100.0 }, "score_details": { - "score": 34.8, + "score": 24.0, "uncertainty": { "standard_error": { - "value": 2.7 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"spoox-m\" -m \"GPT-5-Mini\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Mini\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -366,7 +366,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"spoox-m\" -m \"GPT-5-Mini\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5-Mini\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/openai_gpt-5-nano.json b/data/models/openai_gpt-5-nano.json index ce03029c7ef9b6ec65527da76a59725ce6e990db..9816cdcd5d573c1531a9f756bbbe26744d018229 100644 --- a/data/models/openai_gpt-5-nano.json +++ b/data/models/openai_gpt-5-nano.json @@ -4,13 +4,13 @@ "id": "openai/gpt-5-nano", "developer": "OpenAI", "additional_details": { - "agent_name": "Codex CLI", - "agent_organization": "OpenAI" + "agent_name": "OpenHands", + "agent_organization": "OpenHands" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5-nano/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/openhands__gpt-5-nano/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-04", + "evaluation_timestamp": "2025-11-02", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 11.5, + "score": 9.9, "uncertainty": { "standard_error": { - "value": 2.3 + "value": 2.1 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Nano\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Nano\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Nano\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Nano\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/openhands__gpt-5-nano/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5-nano/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-02", + "evaluation_timestamp": "2025-11-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,17 +265,17 @@ "max_score": 100.0 }, "score_details": { - "score": 9.9, + "score": 11.5, "uncertainty": { "standard_error": { - "value": 2.1 + "value": 2.3 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Nano\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Nano\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +292,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"OpenHands\" -m \"GPT-5-Nano\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5-Nano\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/openai_gpt-5.1-codex.json b/data/models/openai_gpt-5.1-codex.json index c0cb4dd134e66b0f125db24b4289a9c34bf024c0..a2aac5e629083b81e1729673348d7d31d575cd9b 100644 --- a/data/models/openai_gpt-5.1-codex.json +++ b/data/models/openai_gpt-5.1-codex.json @@ -4,13 +4,13 @@ "id": "openai/gpt-5.1-codex", "developer": "OpenAI", "additional_details": { - "agent_name": "Crux", - "agent_organization": "Roam" + "agent_name": "Terminus 2", + "agent_organization": "Terminal Bench" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/crux__gpt-5.1-codex/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5.1-codex/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-16", + "evaluation_timestamp": "2025-11-17", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 57.8, + "score": 36.9, "uncertainty": { "standard_error": { - "value": 2.9 + "value": 3.2 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"GPT-5.1-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.1-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"GPT-5.1-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.1-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5.1-codex/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/letta-code__gpt-5.1-codex/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-17", + "evaluation_timestamp": "2025-12-17", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 36.9, + "score": 53.5, "uncertainty": { "standard_error": { - "value": 3.2 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.1-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"GPT-5.1-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.1-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"GPT-5.1-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/letta-code__gpt-5.1-codex/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/crux__gpt-5.1-codex/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-17", + "evaluation_timestamp": "2025-11-16", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 53.5, + "score": 57.8, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"GPT-5.1-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"GPT-5.1-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Letta Code\" -m \"GPT-5.1-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Crux\" -m \"GPT-5.1-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/openai_gpt-5.2-2025-12-11.json b/data/models/openai_gpt-5.2-2025-12-11.json index 6e12cfe05d65d67e592592610eaa134b705b88c4..b70c28d6ee613e835fe13ebe0ae4b3e2cc5d0ae8 100644 --- a/data/models/openai_gpt-5.2-2025-12-11.json +++ b/data/models/openai_gpt-5.2-2025-12-11.json @@ -78,7 +78,7 @@ } }, { - "evaluation_id": "appworld/test_normal/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "appworld/test_normal/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -125,8 +125,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -138,15 +138,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "appworld/test_normal/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "appworld/test_normal/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -193,8 +193,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -206,8 +206,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -282,7 +282,7 @@ } }, { - "evaluation_id": "appworld/test_normal/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "appworld/test_normal/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -329,8 +329,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -342,15 +342,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "browsecompplus/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "browsecompplus/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -382,23 +382,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.46, + "score": 0.48, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.3", - "total_run_cost": "29.78", - "average_steps": "8.14", - "percent_finished": "0.99" + "average_agent_cost": "0.38", + "total_run_cost": "38.21", + "average_steps": "14.27", + "percent_finished": "1.0" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -410,15 +410,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "browsecompplus/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "browsecompplus/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -450,14 +450,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.26, + "score": 0.46, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.17", - "total_run_cost": "17.31", - "average_steps": "6.57", + "average_agent_cost": "0.3", + "total_run_cost": "29.78", + "average_steps": "8.14", "percent_finished": "0.99" } }, @@ -465,8 +465,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -478,15 +478,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "browsecompplus/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "browsecompplus/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -518,23 +518,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.43, + "score": 0.46, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.43", - "total_run_cost": "43.11", - "average_steps": "8.97", - "percent_finished": "1.0" + "average_agent_cost": "0.3", + "total_run_cost": "29.78", + "average_steps": "8.14", + "percent_finished": "0.99" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -546,15 +546,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "browsecompplus/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "browsecompplus/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -586,14 +586,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.46, + "score": 0.26, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.3", - "total_run_cost": "29.78", - "average_steps": "8.14", + "average_agent_cost": "0.17", + "total_run_cost": "17.31", + "average_steps": "6.57", "percent_finished": "0.99" } }, @@ -601,8 +601,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -614,15 +614,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "browsecompplus/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "browsecompplus/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -654,14 +654,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.48, + "score": 0.43, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.38", - "total_run_cost": "38.21", - "average_steps": "14.27", + "average_agent_cost": "0.43", + "total_run_cost": "43.11", + "average_steps": "8.97", "percent_finished": "1.0" } }, @@ -669,8 +669,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -682,8 +682,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -769,7 +769,7 @@ "generation_config": null }, { - "evaluation_id": "swe-bench/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "swe-bench/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -801,14 +801,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5253, + "score": 0.58, "uncertainty": { - "num_samples": 99 + "num_samples": 100 }, "details": { - "average_agent_cost": "0.45", - "total_run_cost": "44.58", - "average_steps": "19.98", + "average_agent_cost": "0.94", + "total_run_cost": "93.98", + "average_steps": "23.99", "percent_finished": "1.0" } }, @@ -816,8 +816,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -829,8 +829,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -905,7 +905,7 @@ } }, { - "evaluation_id": "swe-bench/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "swe-bench/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -937,14 +937,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.58, + "score": 0.57, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.94", - "total_run_cost": "93.98", - "average_steps": "23.99", + "average_agent_cost": "0.25", + "total_run_cost": "24.76", + "average_steps": "20.47", "percent_finished": "1.0" } }, @@ -952,8 +952,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -965,15 +965,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "swe-bench/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "swe-bench/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1005,14 +1005,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.57, + "score": 0.5253, "uncertainty": { - "num_samples": 100 + "num_samples": 99 }, "details": { - "average_agent_cost": "0.25", - "total_run_cost": "24.76", - "average_steps": "20.47", + "average_agent_cost": "0.45", + "total_run_cost": "44.58", + "average_steps": "19.98", "percent_finished": "1.0" } }, @@ -1020,8 +1020,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -1033,8 +1033,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -1109,7 +1109,7 @@ } }, { - "evaluation_id": "tau-bench-2/airline/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1141,14 +1141,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.54, + "score": 0.48, "uncertainty": { "num_samples": 50 }, "details": { - "average_agent_cost": "0.13", - "total_run_cost": "6.96", - "average_steps": "11.22", + "average_agent_cost": "0.21", + "total_run_cost": "11.23", + "average_steps": "10.18", "percent_finished": "1.0" } }, @@ -1156,8 +1156,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -1169,8 +1169,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -1245,7 +1245,7 @@ } }, { - "evaluation_id": "tau-bench-2/airline/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1277,14 +1277,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.48, + "score": 0.54, "uncertainty": { "num_samples": 50 }, "details": { - "average_agent_cost": "0.21", - "total_run_cost": "11.23", - "average_steps": "10.18", + "average_agent_cost": "0.13", + "total_run_cost": "6.96", + "average_steps": "11.22", "percent_finished": "1.0" } }, @@ -1292,8 +1292,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -1305,15 +1305,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "tau-bench-2/airline/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1360,8 +1360,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -1373,15 +1373,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "tau-bench-2/airline/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1394,33 +1394,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_airline", + "benchmark": "tau-bench-2_retail", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/airline", + "evaluation_name": "tau-bench-2/retail", "source_data": { - "dataset_name": "tau-bench-2/airline", + "dataset_name": "tau-bench-2/retail", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", + "evaluation_description": "Tau Bench 2 benchmark evaluation (retail subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6, + "score": 0.68, "uncertainty": { - "num_samples": 50 + "num_samples": 100 }, "details": { - "average_agent_cost": "0.29", - "total_run_cost": "15.28", - "average_steps": "10.68", + "average_agent_cost": "0.25", + "total_run_cost": "26.27", + "average_steps": "11.08", "percent_finished": "1.0" } }, @@ -1449,7 +1449,7 @@ } }, { - "evaluation_id": "tau-bench-2/retail/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1481,23 +1481,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5354, + "score": 0.51, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.11", - "total_run_cost": "11.54", - "average_steps": "9.55", - "percent_finished": "0.99" + "average_agent_cost": "0.12", + "total_run_cost": "12.63", + "average_steps": "9.92", + "percent_finished": "0.98" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } @@ -1509,15 +1509,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "Claude Code CLI", + "agent_framework": "claude_code" } } } } }, { - "evaluation_id": "tau-bench-2/retail/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/airline/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1530,33 +1530,33 @@ "name": "exgentic", "version": "0.1.0" }, - "benchmark": "tau-bench-2_retail", + "benchmark": "tau-bench-2_airline", "evaluation_results": [ { - "evaluation_name": "tau-bench-2/retail", + "evaluation_name": "tau-bench-2/airline", "source_data": { - "dataset_name": "tau-bench-2/retail", + "dataset_name": "tau-bench-2/airline", "source_type": "url", "url": [ "https://github.com/Exgentic/exgentic" ] }, "metric_config": { - "evaluation_description": "Tau Bench 2 benchmark evaluation (retail subset)", + "evaluation_description": "Tau Bench 2 benchmark evaluation (airline subset)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.73, + "score": 0.6, "uncertainty": { - "num_samples": 100 + "num_samples": 50 }, "details": { - "average_agent_cost": "0.11", - "total_run_cost": "12.27", - "average_steps": "10.33", + "average_agent_cost": "0.29", + "total_run_cost": "15.28", + "average_steps": "10.68", "percent_finished": "1.0" } }, @@ -1564,8 +1564,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -1577,15 +1577,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling with Shortlisting", - "agent_framework": "tool_calling_with_shortlisting" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } } }, { - "evaluation_id": "tau-bench-2/retail/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1617,23 +1617,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.73, + "score": 0.5354, "uncertainty": { "num_samples": 100 }, "details": { "average_agent_cost": "0.11", - "total_run_cost": "12.27", - "average_steps": "10.33", - "percent_finished": "1.0" + "total_run_cost": "11.54", + "average_steps": "9.55", + "percent_finished": "0.99" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -1645,15 +1645,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } } }, { - "evaluation_id": "tau-bench-2/retail/claude-code-cli__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1685,23 +1685,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.51, + "score": 0.73, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.12", - "total_run_cost": "12.63", - "average_steps": "9.92", - "percent_finished": "0.98" + "average_agent_cost": "0.11", + "total_run_cost": "12.27", + "average_steps": "10.33", + "percent_finished": "1.0" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -1713,15 +1713,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "Claude Code CLI", - "agent_framework": "claude_code" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "tau-bench-2/retail/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/retail/litellm-tool-calling-with-shortlisting__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1753,14 +1753,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.68, + "score": 0.73, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.25", - "total_run_cost": "26.27", - "average_steps": "11.08", + "average_agent_cost": "0.11", + "total_run_cost": "12.27", + "average_steps": "10.33", "percent_finished": "1.0" } }, @@ -1768,8 +1768,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } @@ -1781,15 +1781,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "LiteLLM Tool Calling with Shortlisting", + "agent_framework": "tool_calling_with_shortlisting" } } } } }, { - "evaluation_id": "tau-bench-2/telecom/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -1821,14 +1821,14 @@ "max_score": 1.0 }, "score_details": { - "score": 0.71, + "score": 0.53, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.3", - "total_run_cost": "35.31", - "average_steps": "10.11", + "average_agent_cost": "0.15", + "total_run_cost": "18.88", + "average_steps": "9.92", "percent_finished": "1.0" } }, @@ -1836,8 +1836,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -1849,8 +1849,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "SmolAgents Code", - "agent_framework": "smolagents_code" + "agent_name": "OpenAI Solo", + "agent_framework": "openai_solo" } } } @@ -1993,7 +1993,7 @@ } }, { - "evaluation_id": "tau-bench-2/telecom/openai-solo__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2025,23 +2025,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.53, + "score": 0.5354, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.15", - "total_run_cost": "18.88", - "average_steps": "9.92", - "percent_finished": "1.0" + "average_agent_cost": "0.14", + "total_run_cost": "19.92", + "average_steps": "10.18", + "percent_finished": "0.99" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } @@ -2053,15 +2053,15 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "OpenAI Solo", - "agent_framework": "openai_solo" + "agent_name": "LiteLLM Tool Calling", + "agent_framework": "tool_calling" } } } } }, { - "evaluation_id": "tau-bench-2/telecom/litellm-tool-calling__openai_gpt-5.2-2025-12-11/1774263615.0201504", + "evaluation_id": "tau-bench-2/telecom/smolagents-code__openai_gpt-5.2-2025-12-11/1774263615.0201504", "retrieved_timestamp": "1774263615.0201504", "source_metadata": { "source_name": "Exgentic Open Agent Leaderboard", @@ -2093,23 +2093,23 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5354, + "score": 0.71, "uncertainty": { "num_samples": 100 }, "details": { - "average_agent_cost": "0.14", - "total_run_cost": "19.92", - "average_steps": "10.18", - "percent_finished": "0.99" + "average_agent_cost": "0.3", + "total_run_cost": "35.31", + "average_steps": "10.11", + "percent_finished": "1.0" } }, "generation_config": { "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } @@ -2121,8 +2121,8 @@ "generation_args": { "agentic_eval_config": { "additional_details": { - "agent_name": "LiteLLM Tool Calling", - "agent_framework": "tool_calling" + "agent_name": "SmolAgents Code", + "agent_framework": "smolagents_code" } } } diff --git a/data/models/openai_gpt-5.2.json b/data/models/openai_gpt-5.2.json index 77930b98006acfd2c6efa9a1e831cbd44874ee48..26fe4dd6a4fc6668b751e0d5655642084b59ba75 100644 --- a/data/models/openai_gpt-5.2.json +++ b/data/models/openai_gpt-5.2.json @@ -4,13 +4,13 @@ "id": "openai/gpt-5.2", "developer": "OpenAI", "additional_details": { - "agent_name": "Codex CLI", - "agent_organization": "OpenAI" + "agent_name": "Droid", + "agent_organization": "Factory" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5.2/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/droid__gpt-5.2/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-18", + "evaluation_timestamp": "2025-12-24", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 62.9, + "score": 64.9, "uncertainty": { "standard_error": { - "value": 3.0 + "value": 2.8 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5.2\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"GPT-5.2\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5.2\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"GPT-5.2\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/droid__gpt-5.2/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5.2/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-24", + "evaluation_timestamp": "2025-12-12", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 64.9, + "score": 54.0, "uncertainty": { "standard_error": { - "value": 2.8 + "value": 2.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"GPT-5.2\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.2\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"GPT-5.2\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.2\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5.2/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mux__gpt-5.2/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-12-12", + "evaluation_timestamp": "2026-01-17", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,11 @@ "max_score": 100.0 }, "score_details": { - "score": 54.0, - "uncertainty": { - "standard_error": { - "value": 2.9 - }, - "num_samples": 435 - } + "score": 60.7 }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.2\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"GPT-5.2\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +212,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.2\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"GPT-5.2\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +226,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/mux__gpt-5.2/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5.2/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +250,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-01-17", + "evaluation_timestamp": "2025-12-18", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,11 +259,17 @@ "max_score": 100.0 }, "score_details": { - "score": 60.7 + "score": 62.9, + "uncertainty": { + "standard_error": { + "value": 3.0 + }, + "num_samples": 435 + } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"GPT-5.2\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5.2\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -286,7 +286,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mux\" -m \"GPT-5.2\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5.2\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/openai_gpt-5.3-codex.json b/data/models/openai_gpt-5.3-codex.json index bc1283e272bffaad3e91b4e9076eec602a9b6d32..d48b8e986526c7f815b5efdb141e51c853537ca1 100644 --- a/data/models/openai_gpt-5.3-codex.json +++ b/data/models/openai_gpt-5.3-codex.json @@ -4,13 +4,13 @@ "id": "openai/gpt-5.3-codex", "developer": "OpenAI", "additional_details": { - "agent_name": "Simple Codex", - "agent_organization": "OpenAI" + "agent_name": "Terminus 2", + "agent_organization": "Terminal Bench" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/simple-codex__gpt-5.3-codex/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5.3-codex/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-06", + "evaluation_timestamp": "2026-02-05", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 75.1, + "score": 64.7, "uncertainty": { "standard_error": { - "value": 2.4 + "value": 2.7 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Simple Codex\" -m \"GPT-5.3-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.3-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Simple Codex\" -m \"GPT-5.3-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.3-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/codebrain-1__gpt-5.3-codex/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/simple-codex__gpt-5.3-codex/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-10", + "evaluation_timestamp": "2026-02-06", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,17 +191,17 @@ "max_score": 100.0 }, "score_details": { - "score": 70.3, + "score": 75.1, "uncertainty": { "standard_error": { - "value": 2.6 + "value": 2.4 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"CodeBrain-1\" -m \"GPT-5.3-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Simple Codex\" -m \"GPT-5.3-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"CodeBrain-1\" -m \"GPT-5.3-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Simple Codex\" -m \"GPT-5.3-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/droid__gpt-5.3-codex/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/codebrain-1__gpt-5.3-codex/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-24", + "evaluation_timestamp": "2026-02-10", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,17 +265,17 @@ "max_score": 100.0 }, "score_details": { - "score": 77.3, + "score": 70.3, "uncertainty": { "standard_error": { - "value": 2.2 + "value": 2.6 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"GPT-5.3-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"CodeBrain-1\" -m \"GPT-5.3-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +292,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"GPT-5.3-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"CodeBrain-1\" -m \"GPT-5.3-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -306,7 +306,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5.3-codex/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/droid__gpt-5.3-codex/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -330,7 +330,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2026-02-05", + "evaluation_timestamp": "2026-02-24", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -339,17 +339,17 @@ "max_score": 100.0 }, "score_details": { - "score": 64.7, + "score": 77.3, "uncertainty": { "standard_error": { - "value": 2.7 + "value": 2.2 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.3-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"GPT-5.3-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -366,7 +366,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5.3-Codex\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Droid\" -m \"GPT-5.3-Codex\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/openai_gpt-5.json b/data/models/openai_gpt-5.json index 27068eec1e4a30557b2050b38502f142637aeb61..539597ca005b6b7d3304665ed895ff7bb0860332 100644 --- a/data/models/openai_gpt-5.json +++ b/data/models/openai_gpt-5.json @@ -4,13 +4,13 @@ "id": "openai/gpt-5", "developer": "OpenAI", "additional_details": { - "agent_name": "Codex CLI", - "agent_organization": "OpenAI" + "agent_name": "Terminus 2", + "agent_organization": "Terminal Bench" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-04", + "evaluation_timestamp": "2025-10-31", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 49.6, + "score": 35.2, "uncertainty": { "standard_error": { - "value": 2.9 + "value": 3.1 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -232,7 +232,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-5/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/codex-cli__gpt-5/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -256,7 +256,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-10-31", + "evaluation_timestamp": "2025-11-04", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -265,17 +265,17 @@ "max_score": 100.0 }, "score_details": { - "score": 35.2, + "score": 49.6, "uncertainty": { "standard_error": { - "value": 3.1 + "value": 2.9 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -292,7 +292,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-5\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Codex CLI\" -m \"GPT-5\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/openai_gpt-oss-120b.json b/data/models/openai_gpt-oss-120b.json index 396f861a54b2b1a7762f9a8f0810f5cd018d5e52..cc688cac8bf3c3ea698efc0de68329e08bff64c1 100644 --- a/data/models/openai_gpt-oss-120b.json +++ b/data/models/openai_gpt-oss-120b.json @@ -310,7 +310,7 @@ "generation_config": null }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-oss-120b/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-oss-120b/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -334,7 +334,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-01", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -343,17 +343,17 @@ "max_score": 100.0 }, "score_details": { - "score": 18.7, + "score": 14.2, "uncertainty": { "standard_error": { - "value": 2.7 + "value": 2.3 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-OSS-120B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-OSS-120B\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -370,7 +370,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-OSS-120B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-OSS-120B\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -384,7 +384,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-oss-120b/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-oss-120b/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -408,7 +408,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-11-01", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -417,17 +417,17 @@ "max_score": 100.0 }, "score_details": { - "score": 14.2, + "score": 18.7, "uncertainty": { "standard_error": { - "value": 2.3 + "value": 2.7 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-OSS-120B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-OSS-120B\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -444,7 +444,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-OSS-120B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-OSS-120B\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/openai_gpt-oss-20b.json b/data/models/openai_gpt-oss-20b.json index d658cd5464b4783e6e0372fda0cb20e4e0a6a422..584e35a6df7897b2cd4e9e1cdb5c6db8d87d90ab 100644 --- a/data/models/openai_gpt-oss-20b.json +++ b/data/models/openai_gpt-oss-20b.json @@ -310,7 +310,7 @@ "generation_config": null }, { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-oss-20b/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-oss-20b/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -334,7 +334,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-11-01", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -343,17 +343,17 @@ "max_score": 100.0 }, "score_details": { - "score": 3.4, + "score": 3.1, "uncertainty": { "standard_error": { - "value": 1.4 + "value": 1.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-OSS-20B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-OSS-20B\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -370,7 +370,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-OSS-20B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-OSS-20B\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -384,7 +384,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__gpt-oss-20b/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__gpt-oss-20b/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -408,7 +408,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-01", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -417,17 +417,17 @@ "max_score": 100.0 }, "score_details": { - "score": 3.1, + "score": 3.4, "uncertainty": { "standard_error": { - "value": 1.5 + "value": 1.4 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-OSS-20B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-OSS-20B\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -444,7 +444,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"GPT-OSS-20B\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"GPT-OSS-20B\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/openai_gpt_5.2.json b/data/models/openai_gpt_5.2.json index e5de01feca478b0842db2b2533d006446492021b..fec8e3d9659540e746cd368872cad0ec3c496416 100644 --- a/data/models/openai_gpt_5.2.json +++ b/data/models/openai_gpt_5.2.json @@ -7,10 +7,10 @@ }, "evaluations": [ { - "evaluation_id": "apex-agents/openai_gpt-5.2/1773260200", + "evaluation_id": "ace/openai_gpt-5.2/1773260200", "retrieved_timestamp": "1773260200", "source_metadata": { - "source_name": "Mercor APEX-Agents Leaderboard", + "source_name": "Mercor ACE Leaderboard", "source_type": "evaluation_run", "source_organization_name": "Mercor", "source_organization_url": "https://www.mercor.com", @@ -20,24 +20,24 @@ "name": "archipelago", "version": "1.0.0" }, - "benchmark": "apex-agents", + "benchmark": "ace", "evaluation_results": [ { - "evaluation_name": "Overall Pass@1", + "evaluation_name": "Overall Score", "source_data": { - "dataset_name": "apex-agents", + "dataset_name": "ace", "source_type": "hf_dataset", - "hf_repo": "mercor/apex-agents" + "hf_repo": "Mercor/ACE" }, "metric_config": { - "evaluation_description": "Overall Pass@1 (dataset card / paper snapshot).", + "evaluation_description": "Overall ACE score across all consumer-task domains.", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.23, + "score": 0.515, "uncertainty": { "confidence_interval": { "lower": -0.032, @@ -53,28 +53,21 @@ } }, { - "evaluation_name": "Overall Pass@8", + "evaluation_name": "Food Score", "source_data": { - "dataset_name": "apex-agents", + "dataset_name": "ace", "source_type": "hf_dataset", - "hf_repo": "mercor/apex-agents" + "hf_repo": "Mercor/ACE" }, "metric_config": { - "evaluation_description": "Overall Pass@8 (dataset card / paper snapshot).", + "evaluation_description": "Food domain score.", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.4, - "uncertainty": { - "confidence_interval": { - "lower": -0.044, - "upper": 0.044, - "method": "bootstrap" - } - } + "score": 0.65 }, "generation_config": { "additional_details": { @@ -83,44 +76,75 @@ } }, { - "evaluation_name": "Overall Mean Score", + "evaluation_name": "Gaming Score", "source_data": { - "dataset_name": "apex-agents", + "dataset_name": "ace", "source_type": "hf_dataset", - "hf_repo": "mercor/apex-agents" + "hf_repo": "Mercor/ACE" }, "metric_config": { - "evaluation_description": "Overall mean rubric score.", + "evaluation_description": "Gaming domain score.", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.387 + "score": 0.578 }, "generation_config": { "additional_details": { "run_setting": "High" } } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": { + "run_setting": "High" + } + } + }, + { + "evaluation_id": "apex-agents/openai_gpt-5.2/1773260200", + "retrieved_timestamp": "1773260200", + "source_metadata": { + "source_name": "Mercor APEX-Agents Leaderboard", + "source_type": "evaluation_run", + "source_organization_name": "Mercor", + "source_organization_url": "https://www.mercor.com", + "evaluator_relationship": "first_party" + }, + "eval_library": { + "name": "archipelago", + "version": "1.0.0" + }, + "benchmark": "apex-agents", + "evaluation_results": [ { - "evaluation_name": "Investment Banking Pass@1", + "evaluation_name": "Overall Pass@1", "source_data": { "dataset_name": "apex-agents", "source_type": "hf_dataset", "hf_repo": "mercor/apex-agents" }, "metric_config": { - "evaluation_description": "Investment banking world Pass@1.", + "evaluation_description": "Overall Pass@1 (dataset card / paper snapshot).", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.273 + "score": 0.23, + "uncertainty": { + "confidence_interval": { + "lower": -0.032, + "upper": 0.032, + "method": "bootstrap" + } + } }, "generation_config": { "additional_details": { @@ -129,21 +153,28 @@ } }, { - "evaluation_name": "Management Consulting Pass@1", + "evaluation_name": "Overall Pass@8", "source_data": { "dataset_name": "apex-agents", "source_type": "hf_dataset", "hf_repo": "mercor/apex-agents" }, "metric_config": { - "evaluation_description": "Management consulting world Pass@1.", + "evaluation_description": "Overall Pass@8 (dataset card / paper snapshot).", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.227 + "score": 0.4, + "uncertainty": { + "confidence_interval": { + "lower": -0.044, + "upper": 0.044, + "method": "bootstrap" + } + } }, "generation_config": { "additional_details": { @@ -152,21 +183,21 @@ } }, { - "evaluation_name": "Corporate Law Pass@1", + "evaluation_name": "Overall Mean Score", "source_data": { "dataset_name": "apex-agents", "source_type": "hf_dataset", "hf_repo": "mercor/apex-agents" }, "metric_config": { - "evaluation_description": "Corporate law world Pass@1.", + "evaluation_description": "Overall mean rubric score.", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.189 + "score": 0.387 }, "generation_config": { "additional_details": { @@ -175,75 +206,44 @@ } }, { - "evaluation_name": "Corporate Lawyer Mean Score", + "evaluation_name": "Investment Banking Pass@1", "source_data": { "dataset_name": "apex-agents", "source_type": "hf_dataset", "hf_repo": "mercor/apex-agents" }, "metric_config": { - "evaluation_description": "Corporate lawyer world mean score.", + "evaluation_description": "Investment banking world Pass@1.", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.443 + "score": 0.273 }, "generation_config": { "additional_details": { "run_setting": "High" } } - } - ], - "detailed_evaluation_results": null, - "generation_config": { - "additional_details": { - "run_setting": "High" - } - } - }, - { - "evaluation_id": "ace/openai_gpt-5.2/1773260200", - "retrieved_timestamp": "1773260200", - "source_metadata": { - "source_name": "Mercor ACE Leaderboard", - "source_type": "evaluation_run", - "source_organization_name": "Mercor", - "source_organization_url": "https://www.mercor.com", - "evaluator_relationship": "first_party" - }, - "eval_library": { - "name": "archipelago", - "version": "1.0.0" - }, - "benchmark": "ace", - "evaluation_results": [ + }, { - "evaluation_name": "Overall Score", + "evaluation_name": "Management Consulting Pass@1", "source_data": { - "dataset_name": "ace", + "dataset_name": "apex-agents", "source_type": "hf_dataset", - "hf_repo": "Mercor/ACE" + "hf_repo": "mercor/apex-agents" }, "metric_config": { - "evaluation_description": "Overall ACE score across all consumer-task domains.", + "evaluation_description": "Management consulting world Pass@1.", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.515, - "uncertainty": { - "confidence_interval": { - "lower": -0.032, - "upper": 0.032, - "method": "bootstrap" - } - } + "score": 0.227 }, "generation_config": { "additional_details": { @@ -252,21 +252,21 @@ } }, { - "evaluation_name": "Food Score", + "evaluation_name": "Corporate Law Pass@1", "source_data": { - "dataset_name": "ace", + "dataset_name": "apex-agents", "source_type": "hf_dataset", - "hf_repo": "Mercor/ACE" + "hf_repo": "mercor/apex-agents" }, "metric_config": { - "evaluation_description": "Food domain score.", + "evaluation_description": "Corporate law world Pass@1.", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.65 + "score": 0.189 }, "generation_config": { "additional_details": { @@ -275,21 +275,21 @@ } }, { - "evaluation_name": "Gaming Score", + "evaluation_name": "Corporate Lawyer Mean Score", "source_data": { - "dataset_name": "ace", + "dataset_name": "apex-agents", "source_type": "hf_dataset", - "hf_repo": "Mercor/ACE" + "hf_repo": "mercor/apex-agents" }, "metric_config": { - "evaluation_description": "Gaming domain score.", + "evaluation_description": "Corporate lawyer world mean score.", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1 }, "score_details": { - "score": 0.578 + "score": 0.443 }, "generation_config": { "additional_details": { diff --git a/data/models/openassistant_oasst-rm-2-pythia-6.9b-epoch-1.json b/data/models/openassistant_oasst-rm-2-pythia-6.9b-epoch-1.json index cdfb6729d8ee2c00438e79bd148b5b9825fbce54..34afb9663c74112d98337da6fd83b3594c465f50 100644 --- a/data/models/openassistant_oasst-rm-2-pythia-6.9b-epoch-1.json +++ b/data/models/openassistant_oasst-rm-2-pythia-6.9b-epoch-1.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1/1766412838.146816", + "evaluation_id": "reward-bench/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,127 +31,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2653 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3979 + "score": 0.615 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2875 + "score": 0.9246 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.377 + "score": 0.3728 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3289 + "score": 0.5446 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.1535 + "score": 0.5855 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.047 + "score": 0.6801 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], @@ -159,10 +141,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1/1766412838.146816", + "evaluation_id": "reward-bench-2/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -181,109 +163,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.615 + "score": 0.2653 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9246 + "score": 0.3979 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3728 + "score": 0.2875 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.377 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5446 + "score": 0.3289 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5855 + "score": 0.1535 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6801 + "score": 0.047 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/models/openassistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5.json b/data/models/openassistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5.json index d028cad4784e7392a6a614670c5c27a9b8900f31..4f200b5bed952e0790f7951e7076e297510a1c46 100644 --- a/data/models/openassistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5.json +++ b/data/models/openassistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5/1766412838.146816", + "evaluation_id": "reward-bench-2/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6901 + "score": 0.2648 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8855 + "score": 0.3179 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4868 + "score": 0.2625 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3934 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6311 + "score": 0.3244 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7752 + "score": 0.2707 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6533 + "score": 0.0198 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5/1766412838.146816", + "evaluation_id": "reward-bench/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2648 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3179 + "score": 0.6901 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2625 + "score": 0.8855 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3934 + "score": 0.4868 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3244 + "score": 0.6311 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2707 + "score": 0.7752 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.0198 + "score": 0.6533 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/models/openbmb_eurus-rm-7b.json b/data/models/openbmb_eurus-rm-7b.json index 44637ca17276f082725f61866dcfa3229cc54274..e1154a660c89f219cec7ec844602d9d8fdfa08ad 100644 --- a/data/models/openbmb_eurus-rm-7b.json +++ b/data/models/openbmb_eurus-rm-7b.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/openbmb_Eurus-RM-7b/1766412838.146816", + "evaluation_id": "reward-bench-2/openbmb_Eurus-RM-7b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8159 + "score": 0.5806 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9804 + "score": 0.6 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6557 + "score": 0.3438 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5683 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8135 + "score": 0.6267 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8633 + "score": 0.7475 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7172 + "score": 0.5972 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/openbmb_Eurus-RM-7b/1766412838.146816", + "evaluation_id": "reward-bench/openbmb_Eurus-RM-7b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5806 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6 + "score": 0.8159 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3438 + "score": 0.9804 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5683 + "score": 0.6557 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6267 + "score": 0.8135 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7475 + "score": 0.8633 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5972 + "score": 0.7172 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/models/pku-alignment_beaver-7b-v1.0-cost.json b/data/models/pku-alignment_beaver-7b-v1.0-cost.json index 8e786484059e3101c1249c1cce8c4b2be81faa5a..3777eba3edfdc470c669a503ac85994bf8139135 100644 --- a/data/models/pku-alignment_beaver-7b-v1.0-cost.json +++ b/data/models/pku-alignment_beaver-7b-v1.0-cost.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v1.0-cost/1766412838.146816", + "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v1.0-cost/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5798 + "score": 0.3332 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6173 + "score": 0.3263 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4232 + "score": 0.2313 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3989 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7351 + "score": 0.7589 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5482 + "score": 0.2939 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.57 + "score": -0.01 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v1.0-cost/1766412838.146816", + "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v1.0-cost/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3332 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3263 + "score": 0.5798 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2313 + "score": 0.6173 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3989 + "score": 0.4232 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7589 + "score": 0.7351 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2939 + "score": 0.5482 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": -0.01 + "score": 0.57 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/models/pku-alignment_beaver-7b-v1.0-reward.json b/data/models/pku-alignment_beaver-7b-v1.0-reward.json index adf890d6a20deeb5311651870c4e5638866b9733..ee66fd95461643d78856676dc7b45eb953fc109f 100644 --- a/data/models/pku-alignment_beaver-7b-v1.0-reward.json +++ b/data/models/pku-alignment_beaver-7b-v1.0-reward.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v1.0-reward/1766412838.146816", + "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v1.0-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,127 +31,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1606 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2105 + "score": 0.4727 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2938 + "score": 0.8184 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2623 + "score": 0.2873 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.1422 + "score": 0.3757 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.0646 + "score": 0.346 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": -0.01 + "score": 0.5993 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], @@ -159,10 +141,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v1.0-reward/1766412838.146816", + "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v1.0-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -181,109 +163,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4727 + "score": 0.1606 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8184 + "score": 0.2105 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2873 + "score": 0.2938 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.2623 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3757 + "score": 0.1422 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.346 + "score": 0.0646 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5993 + "score": -0.01 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/models/pku-alignment_beaver-7b-v2.0-reward.json b/data/models/pku-alignment_beaver-7b-v2.0-reward.json index e6954fa8026e0ae406f243f530096227275cb53f..4c7c399de00819c04f2b08eddfc902f45548a34f 100644 --- a/data/models/pku-alignment_beaver-7b-v2.0-reward.json +++ b/data/models/pku-alignment_beaver-7b-v2.0-reward.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816", + "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,127 +31,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2544 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2168 + "score": 0.6366 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2562 + "score": 0.8994 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3825 + "score": 0.364 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3156 + "score": 0.6041 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2606 + "score": 0.6887 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.0944 + "score": 0.6171 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], @@ -159,10 +141,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816", + "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -181,109 +163,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6366 + "score": 0.2544 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8994 + "score": 0.2168 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.364 + "score": 0.2562 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3825 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6041 + "score": 0.3156 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6887 + "score": 0.2606 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6171 + "score": 0.0944 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/models/primeintellect_intellect-1.json b/data/models/primeintellect_intellect-1.json index 7d9ec915394b34f60309abc9ed073a5aa8bce5ab..b8fa0710a43fde559062289de677b131f6527599 100644 --- a/data/models/primeintellect_intellect-1.json +++ b/data/models/primeintellect_intellect-1.json @@ -5,7 +5,7 @@ "developer": "PrimeIntellect", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": "10.211" } @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.274 + "score": 0.276 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.25 + "score": 0.2534 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3753 + "score": 0.3339 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.112 + "score": 0.1123 } } ], @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.276 + "score": 0.274 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2534 + "score": 0.25 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3339 + "score": 0.3753 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1123 + "score": 0.112 } } ], diff --git a/data/models/princeton-nlp_llama-3-8b-prolong-512k-instruct.json b/data/models/princeton-nlp_llama-3-8b-prolong-512k-instruct.json index a2239eea28a466521ab226132351f6bc4c958619..b88fd125d78454888c7b70c3de2f72cbe2fd7732 100644 --- a/data/models/princeton-nlp_llama-3-8b-prolong-512k-instruct.json +++ b/data/models/princeton-nlp_llama-3-8b-prolong-512k-instruct.json @@ -5,7 +5,7 @@ "developer": "princeton-nlp", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": "8.03" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3978 + "score": 0.5508 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4983 + "score": 0.5028 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0582 + "score": 0.0529 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.281 + "score": 0.2861 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.425 + "score": 0.4266 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3246 + "score": 0.3231 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5508 + "score": 0.3978 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5028 + "score": 0.4983 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0529 + "score": 0.0582 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2861 + "score": 0.281 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4266 + "score": 0.425 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3231 + "score": 0.3246 } } ], diff --git a/data/models/quazim0t0_odb-14b-sce.json b/data/models/quazim0t0_odb-14b-sce.json index 0894dd13c3bc882b56c1b90b77e354b0b8fcf1b3..b2854fae2e7ea9ad5a32eb4ec453ef9c1365f1b7 100644 --- a/data/models/quazim0t0_odb-14b-sce.json +++ b/data/models/quazim0t0_odb-14b-sce.json @@ -6,8 +6,8 @@ "inference_platform": "unknown", "additional_details": { "precision": "bfloat16", - "architecture": "Unknown", - "params_billions": "0.0", + "architecture": "LlamaForCausalLM", + "params_billions": "14.66", "model_id_aliases": [ "Quazim0t0/ODB-14b-sce" ] @@ -15,7 +15,7 @@ }, "evaluations": [ { - "evaluation_id": "hfopenllm_v2/Quazim0t0_ODB-14B-sce/1773936498.240187", + "evaluation_id": "hfopenllm_v2/Quazim0t0_ODB-14b-sce/1773936498.240187", "retrieved_timestamp": "1773936498.240187", "source_metadata": { "source_name": "HF Open LLM v2", @@ -47,7 +47,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2922 + "score": 0.7016 } }, { @@ -65,7 +65,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6559 + "score": 0.6942 } }, { @@ -83,7 +83,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2545 + "score": 0.4116 } }, { @@ -101,7 +101,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2659 + "score": 0.3624 } }, { @@ -119,7 +119,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3929 + "score": 0.4571 } }, { @@ -137,7 +137,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5207 + "score": 0.5411 } } ], @@ -145,7 +145,7 @@ "generation_config": null }, { - "evaluation_id": "hfopenllm_v2/Quazim0t0_ODB-14b-sce/1773936498.240187", + "evaluation_id": "hfopenllm_v2/Quazim0t0_ODB-14B-sce/1773936498.240187", "retrieved_timestamp": "1773936498.240187", "source_metadata": { "source_name": "HF Open LLM v2", @@ -177,7 +177,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7016 + "score": 0.2922 } }, { @@ -195,7 +195,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6942 + "score": 0.6559 } }, { @@ -213,7 +213,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4116 + "score": 0.2545 } }, { @@ -231,7 +231,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3624 + "score": 0.2659 } }, { @@ -249,7 +249,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4571 + "score": 0.3929 } }, { @@ -267,7 +267,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5411 + "score": 0.5207 } } ], diff --git a/data/models/qwen_qwen2.5-0.5b-instruct.json b/data/models/qwen_qwen2.5-0.5b-instruct.json index c0e268fa987678ac81eb89b230a531c20916f2db..09847e1fd5f51b0203501d1e930a33e69907de86 100644 --- a/data/models/qwen_qwen2.5-0.5b-instruct.json +++ b/data/models/qwen_qwen2.5-0.5b-instruct.json @@ -5,9 +5,9 @@ "developer": "Qwen", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "Qwen2ForCausalLM", - "params_billions": "0.5" + "params_billions": "0.494" } }, "evaluations": [ @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3071 + "score": 0.3153 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3341 + "score": 0.3322 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.1035 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2576 + "score": 0.2592 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3329 + "score": 0.3342 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1697 + "score": 0.172 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3153 + "score": 0.3071 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3322 + "score": 0.3341 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1035 + "score": 0.0 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2592 + "score": 0.2576 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3342 + "score": 0.3329 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.172 + "score": 0.1697 } } ], diff --git a/data/models/qwen_qwen2.5-3b-instruct.json b/data/models/qwen_qwen2.5-3b-instruct.json index f61e692a8920e6654870904e7032a5b9b962252b..a7b912165d684c557d1da01d4fe87e78c2addb23 100644 --- a/data/models/qwen_qwen2.5-3b-instruct.json +++ b/data/models/qwen_qwen2.5-3b-instruct.json @@ -140,6 +140,195 @@ ], "detailed_evaluation_results": null, "generation_config": null + }, + { + "evaluation_id": "theory_of_mind/hf_Qwen_Qwen2.5-3B-Instruct/1772541652.0", + "retrieved_timestamp": "1774793718.284365", + "source_metadata": { + "source_name": "inspect_ai", + "source_type": "evaluation_run", + "source_organization_name": "unknown", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "inspect", + "version": "inspect_ai:0.3.185" + }, + "benchmark": "theory_of_mind", + "evaluation_results": [ + { + "evaluation_name": "accuracy on theory_of_mind for scorer model_graded_fact", + "source_data": { + "dataset_name": "theory_of_mind", + "source_type": "hf_dataset", + "hf_repo": "example://theory_of_mind", + "samples_number": 100, + "sample_ids": [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "87", + "88", + "89", + "90", + "91", + "92", + "93", + "94", + "95", + "96", + "97", + "98", + "99", + "100" + ], + "additional_details": { + "shuffled": "False" + } + }, + "evaluation_timestamp": "1772541652.0", + "metric_config": { + "evaluation_description": "accuracy", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.78, + "uncertainty": { + "standard_error": { + "value": 0.04163331998932266 + }, + "num_samples": 100 + } + }, + "generation_config": { + "generation_args": { + "reasoning": false, + "agentic_eval_config": { + "available_tools": [] + }, + "eval_plan": { + "name": "plan", + "steps": [ + "{\"solver\": \"generate\", \"params\": {\"tool_calls\": \"loop\", \"kwargs\": {}}, \"params_passed\": {}}" + ], + "config": {} + }, + "eval_limits": {}, + "sandbox": {} + } + } + } + ], + "detailed_evaluation_results": { + "format": "jsonl", + "file_path": "data/theory_of_mind/Qwen/Qwen2.5-3B-Instruct/30ed1a75-5bfd-4405-abce-b0fd5e0165ba_samples.jsonl", + "hash_algorithm": "sha256", + "checksum": "22c5bd6a8da6c54dfb409425283b1e136f76a225daa48c28b963f3be1f13d697", + "total_rows": 100 + }, + "generation_config": { + "generation_args": { + "reasoning": false, + "agentic_eval_config": { + "available_tools": [] + }, + "eval_plan": { + "name": "plan", + "steps": [ + "{\"solver\": \"generate\", \"params\": {\"tool_calls\": \"loop\", \"kwargs\": {}}, \"params_passed\": {}}" + ], + "config": {} + }, + "eval_limits": {}, + "sandbox": {} + } + } } ] } \ No newline at end of file diff --git a/data/models/qwen_qwen2.5-coder-7b-instruct.json b/data/models/qwen_qwen2.5-coder-7b-instruct.json index 96b0a8af52fb442affa2ad7ba1ed017d80ec2231..0cdcd4044d4f077bb1870a78d07a83c3bbc7d0d0 100644 --- a/data/models/qwen_qwen2.5-coder-7b-instruct.json +++ b/data/models/qwen_qwen2.5-coder-7b-instruct.json @@ -5,7 +5,7 @@ "developer": "Qwen", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "Qwen2ForCausalLM", "params_billions": "7.616" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6101 + "score": 0.6147 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5008 + "score": 0.4999 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3716 + "score": 0.031 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2919 + "score": 0.2936 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4073 + "score": 0.4099 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3352 + "score": 0.3354 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6147 + "score": 0.6101 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4999 + "score": 0.5008 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.031 + "score": 0.3716 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2936 + "score": 0.2919 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4099 + "score": 0.4073 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3354 + "score": 0.3352 } } ], diff --git a/data/models/ray2333_grm-gemma2-2b-rewardmodel-ft.json b/data/models/ray2333_grm-gemma2-2b-rewardmodel-ft.json index 24dd55ac376e51965493e81885df295559db7184..cd8835eaf989dbd622c51262fef9b72b10534d67 100644 --- a/data/models/ray2333_grm-gemma2-2b-rewardmodel-ft.json +++ b/data/models/ray2333_grm-gemma2-2b-rewardmodel-ft.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816", + "evaluation_id": "reward-bench/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,104 +31,128 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5966 + "score": 0.8839 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5305 + "score": 0.9302 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3125 + "score": 0.7719 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5902 + "score": 0.9216 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9222 + "score": 0.912 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench-2/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench 2", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Focus", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7455 + "score": 0.5966 }, "source_data": { "dataset_name": "RewardBench 2", @@ -137,135 +161,111 @@ } }, { - "evaluation_name": "Ties", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4788 + "score": 0.5305 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8839 + "score": 0.3125 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9302 + "score": 0.5902 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7719 + "score": 0.9222 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9216 + "score": 0.7455 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.912 + "score": 0.4788 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/models/ray2333_grm-llama3-8b-sftreg.json b/data/models/ray2333_grm-llama3-8b-sftreg.json index bd70639489991e3e8880cad041284119bd35ecc9..15f35c842dfd5169bfd85bae53f76a90d5850421 100644 --- a/data/models/ray2333_grm-llama3-8b-sftreg.json +++ b/data/models/ray2333_grm-llama3-8b-sftreg.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816", + "evaluation_id": "reward-bench-2/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8542 + "score": 0.6089 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.986 + "score": 0.6189 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6776 + "score": 0.3875 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5792 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8919 + "score": 0.7867 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9229 + "score": 0.6828 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7309 + "score": 0.5981 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816", + "evaluation_id": "reward-bench/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6089 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6189 + "score": 0.8542 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3875 + "score": 0.986 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5792 + "score": 0.6776 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7867 + "score": 0.8919 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6828 + "score": 0.9229 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5981 + "score": 0.7309 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/models/recoilme_gemma-2-ataraxy-gemmasutra-9b-slerp.json b/data/models/recoilme_gemma-2-ataraxy-gemmasutra-9b-slerp.json index 07191bcaa576a8b8b67650d88a341b91a6316705..f26fe1b9038e8edc381d5411187a5c6b2c7cbf6e 100644 --- a/data/models/recoilme_gemma-2-ataraxy-gemmasutra-9b-slerp.json +++ b/data/models/recoilme_gemma-2-ataraxy-gemmasutra-9b-slerp.json @@ -5,7 +5,7 @@ "developer": "recoilme", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "Gemma2ForCausalLM", "params_billions": "10.159" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2854 + "score": 0.7649 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5984 + "score": 0.5974 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1005 + "score": 0.0174 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3297 + "score": 0.3305 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4607 + "score": 0.4245 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4162 + "score": 0.4207 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7649 + "score": 0.2854 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5974 + "score": 0.5984 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0174 + "score": 0.1005 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3305 + "score": 0.3297 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4245 + "score": 0.4607 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4207 + "score": 0.4162 } } ], diff --git a/data/models/recoilme_recoilme-gemma-2-9b-v0.2.json b/data/models/recoilme_recoilme-gemma-2-9b-v0.2.json index 83bfdb05cda442c3bac351d424fa26cf0ae864fe..34121094195afd5811c50d769ba5e5e07669b30d 100644 --- a/data/models/recoilme_recoilme-gemma-2-9b-v0.2.json +++ b/data/models/recoilme_recoilme-gemma-2-9b-v0.2.json @@ -5,7 +5,7 @@ "developer": "recoilme", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "Gemma2ForCausalLM", "params_billions": "10.159" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7592 + "score": 0.2747 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6026 + "score": 0.6031 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0529 + "score": 0.0831 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3289 + "score": 0.3305 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4099 + "score": 0.4686 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4163 + "score": 0.4122 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2747 + "score": 0.7592 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6031 + "score": 0.6026 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0831 + "score": 0.0529 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3305 + "score": 0.3289 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4686 + "score": 0.4099 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4122 + "score": 0.4163 } } ], diff --git a/data/models/recoilme_recoilme-gemma-2-9b-v0.3.json b/data/models/recoilme_recoilme-gemma-2-9b-v0.3.json index d5cadb462419994ffdabe8b6cdbd73be57e99fa8..812871dcfb75c54e3765d2c8d03eeadc60d898aa 100644 --- a/data/models/recoilme_recoilme-gemma-2-9b-v0.3.json +++ b/data/models/recoilme_recoilme-gemma-2-9b-v0.3.json @@ -5,7 +5,7 @@ "developer": "recoilme", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "Gemma2ForCausalLM", "params_billions": "10.159" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5761 + "score": 0.7439 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.602 + "score": 0.5993 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1888 + "score": 0.0876 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3372 + "score": 0.3238 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4632 + "score": 0.4204 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4039 + "score": 0.4072 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7439 + "score": 0.5761 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5993 + "score": 0.602 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0876 + "score": 0.1888 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3238 + "score": 0.3372 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4204 + "score": 0.4632 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4072 + "score": 0.4039 } } ], diff --git a/data/models/replete-ai_replete-llm-qwen2-7b.json b/data/models/replete-ai_replete-llm-qwen2-7b.json index 627d67572ee0cd0135e173767115d5ee7360b5e6..e51b10d1bb4ff7d76603042a5cdf25b16b9bc87f 100644 --- a/data/models/replete-ai_replete-llm-qwen2-7b.json +++ b/data/models/replete-ai_replete-llm-qwen2-7b.json @@ -5,7 +5,7 @@ "developer": "Replete-AI", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "Qwen2ForCausalLM", "params_billions": "7.616" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0905 + "score": 0.0932 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2985 + "score": 0.2977 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2534 + "score": 0.2475 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3848 + "score": 0.3941 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1158 + "score": 0.1157 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0932 + "score": 0.0905 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2977 + "score": 0.2985 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2475 + "score": 0.2534 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3941 + "score": 0.3848 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.1157 + "score": 0.1158 } } ], diff --git a/data/models/sfairxc_fsfairx-llama3-rm-v0.1.json b/data/models/sfairxc_fsfairx-llama3-rm-v0.1.json index 83ff30916d3f35c6333d5f652927c45d63836756..ecef2ed14b1e2210796bfe9e4943404f7ebd3b7c 100644 --- a/data/models/sfairxc_fsfairx-llama3-rm-v0.1.json +++ b/data/models/sfairxc_fsfairx-llama3-rm-v0.1.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816", + "evaluation_id": "reward-bench-2/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8338 + "score": 0.6292 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9944 + "score": 0.5916 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6513 + "score": 0.4188 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8676 + "score": 0.7667 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8644 + "score": 0.7051 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7492 + "score": 0.6647 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816", + "evaluation_id": "reward-bench/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6292 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5916 + "score": 0.8338 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4188 + "score": 0.9944 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6284 + "score": 0.6513 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7667 + "score": 0.8676 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7051 + "score": 0.8644 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6647 + "score": 0.7492 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/models/skywork_skywork-reward-gemma-2-27b-v0.2.json b/data/models/skywork_skywork-reward-gemma-2-27b-v0.2.json index 52952d2b07ce3bbc9c007749585e45b758ae4213..4109d07feac30d842a656539e17ae764cf441c79 100644 --- a/data/models/skywork_skywork-reward-gemma-2-27b-v0.2.json +++ b/data/models/skywork_skywork-reward-gemma-2-27b-v0.2.json @@ -142,10 +142,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816", + "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -164,128 +164,104 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9426 + "score": 0.7531 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9609 + "score": 0.7674 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8991 + "score": 0.375 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9297 + "score": 0.6721 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9807 + "score": 0.9689 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7531 + "score": 0.9172 }, "source_data": { "dataset_name": "RewardBench 2", @@ -294,111 +270,135 @@ } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7674 + "score": 0.8182 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Precise IF", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.375 + "score": 0.9426 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6721 + "score": 0.9609 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9689 + "score": 0.8991 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9172 + "score": 0.9297 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8182 + "score": 0.9807 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/models/skywork_skywork-reward-gemma-2-27b.json b/data/models/skywork_skywork-reward-gemma-2-27b.json index c2b742dbc17aa9720f66d090e08eb784ec7accdc..0b4cc11a7f17a4535cc37f935fdae034a6214bce 100644 --- a/data/models/skywork_skywork-reward-gemma-2-27b.json +++ b/data/models/skywork_skywork-reward-gemma-2-27b.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Gemma-2-27B/1766412838.146816", + "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Gemma-2-27B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,104 +31,128 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7576 + "score": 0.938 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7368 + "score": 0.9581 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4031 + "score": 0.9145 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7049 + "score": 0.9189 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9422 + "score": 0.9606 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Gemma-2-27B/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench 2", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Focus", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9323 + "score": 0.7576 }, "source_data": { "dataset_name": "RewardBench 2", @@ -137,135 +161,111 @@ } }, { - "evaluation_name": "Ties", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8261 + "score": 0.7368 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Gemma-2-27B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.938 + "score": 0.4031 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9581 + "score": 0.7049 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9145 + "score": 0.9422 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9189 + "score": 0.9323 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9606 + "score": 0.8261 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/models/skywork_skywork-reward-llama-3.1-8b.json b/data/models/skywork_skywork-reward-llama-3.1-8b.json index dedd0015bc30c7a59cc8db5e1fbdb3b6b6cbc978..d700e3f49377a01e2f9d11b4773879e002a65e98 100644 --- a/data/models/skywork_skywork-reward-llama-3.1-8b.json +++ b/data/models/skywork_skywork-reward-llama-3.1-8b.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Llama-3.1-8B/1766412838.146816", + "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Llama-3.1-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,128 +31,104 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9252 + "score": 0.7314 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9581 + "score": 0.6989 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8728 + "score": 0.425 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9081 + "score": 0.6284 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.962 + "score": 0.9333 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Llama-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7314 + "score": 0.9616 }, "source_data": { "dataset_name": "RewardBench 2", @@ -161,111 +137,135 @@ } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6989 + "score": 0.741 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Llama-3.1-8B/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Precise IF", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.425 + "score": 0.9252 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6284 + "score": 0.9581 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9333 + "score": 0.8728 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9616 + "score": 0.9081 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.741 + "score": 0.962 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/models/skywork_skywork-vl-reward-7b.json b/data/models/skywork_skywork-vl-reward-7b.json index 651d1416fd84d9618565234fc2f23befa272cb51..d1caca7afd32adac0ce3eaccc5894c6b1d1db99d 100644 --- a/data/models/skywork_skywork-vl-reward-7b.json +++ b/data/models/skywork_skywork-vl-reward-7b.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench-2/Skywork_Skywork-VL-Reward-7B/1766412838.146816", + "evaluation_id": "reward-bench/Skywork_Skywork-VL-Reward-7B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,104 +31,128 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6885 + "score": 0.9007 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Factuality", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6063 + "score": 0.8994 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.35 + "score": 0.875 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6339 + "score": 0.9108 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8911 + "score": 0.9176 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": null + }, + { + "evaluation_id": "reward-bench-2/Skywork_Skywork-VL-Reward-7B/1766412838.146816", + "retrieved_timestamp": "1766412838.146816", + "source_metadata": { + "source_name": "RewardBench 2", + "source_type": "documentation", + "source_organization_name": "Allen Institute for AI", + "source_organization_url": "https://allenai.org", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "rewardbench", + "version": "0.1.3", + "additional_details": { + "subsets": "Chat, Chat Hard, Safety, Reasoning", + "hf_space": "allenai/reward-bench" + } + }, + "benchmark": "reward-bench", + "evaluation_results": [ { - "evaluation_name": "Focus", + "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8909 + "score": 0.6885 }, "source_data": { "dataset_name": "RewardBench 2", @@ -137,135 +161,111 @@ } }, { - "evaluation_name": "Ties", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7586 + "score": 0.6063 }, "source_data": { "dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results" } - } - ], - "detailed_evaluation_results": null, - "generation_config": null - }, - { - "evaluation_id": "reward-bench/Skywork_Skywork-VL-Reward-7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "rewardbench", - "version": "0.1.3", - "additional_details": { - "subsets": "Chat, Chat Hard, Safety, Reasoning", - "hf_space": "allenai/reward-bench" - } - }, - "benchmark": "reward-bench", - "evaluation_results": [ + }, { - "evaluation_name": "Score", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9007 + "score": 0.35 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Math", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Math score - measures mathematical reasoning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8994 + "score": 0.6339 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.875 + "score": 0.8911 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Safety", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9108 + "score": 0.8909 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9176 + "score": 0.7586 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], diff --git a/data/models/snowflake_snowflake-arctic-instruct.json b/data/models/snowflake_snowflake-arctic-instruct.json index 332fb7788a04f01cd7fc12206b12004a2af1d5d3..dcce9cb71bbe18fac2e40bbf0810fef5adcc4078 100644 --- a/data/models/snowflake_snowflake-arctic-instruct.json +++ b/data/models/snowflake_snowflake-arctic-instruct.json @@ -7,10 +7,10 @@ }, "evaluations": [ { - "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1774096312.00548", - "retrieved_timestamp": "1774096312.00548", + "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1774096306.427425", + "retrieved_timestamp": "1774096306.427425", "source_metadata": { - "source_name": "helm_mmlu", + "source_name": "helm_lite", "source_type": "documentation", "source_organization_name": "crfm", "evaluator_relationship": "third_party" @@ -19,438 +19,382 @@ "name": "helm", "version": "unknown" }, - "benchmark": "helm_mmlu", + "benchmark": "helm_lite", "evaluation_results": [ { - "evaluation_name": "MMLU All Subjects", + "evaluation_name": "Mean win rate", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "helm_lite", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", + "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.677, + "score": 0.338, "details": { - "description": "min=0.28, mean=0.677, max=0.912, sum=77.129 (114)", + "description": "", "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.35, mean=0.42, max=0.544, sum=47.89 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.4200856614493726\"}", - "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}", - "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=304.474, mean=706.682, max=3159.636, sum=80561.749 (114)\", \"tab\": \"General information\", \"score\": \"706.6820126388612\"}", - "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.7606242197253433\"}", + "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}" } }, "generation_config": { - "additional_details": { - "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" - } + "additional_details": {} } }, { - "evaluation_name": "Abstract Algebra", + "evaluation_name": "NarrativeQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "NarrativeQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Abstract Algebra", + "evaluation_description": "F1 on NarrativeQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.35, + "score": 0.654, "details": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", + "description": "min=0.654, mean=0.654, max=0.654, sum=0.654 (1)", "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.377, mean=0.377, max=0.377, sum=0.753 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.37665764808654784\"}", - "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Abstract Algebra - # prompt tokens": "{\"description\": \"min=397.65, mean=397.65, max=397.65, sum=795.3 (2)\", \"tab\": \"General information\", \"score\": \"397.65\"}", - "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.624, mean=0.624, max=0.624, sum=0.624 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.6239793220036466\"}", + "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", + "NarrativeQA - # train": "{\"description\": \"min=4.262, mean=4.262, max=4.262, sum=4.262 (1)\", \"tab\": \"General information\", \"score\": \"4.261971830985916\"}", + "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NarrativeQA - # prompt tokens": "{\"description\": \"min=3603.217, mean=3603.217, max=3603.217, sum=3603.217 (1)\", \"tab\": \"General information\", \"score\": \"3603.2169014084507\"}", + "NarrativeQA - # output tokens": "{\"description\": \"min=11.907, mean=11.907, max=11.907, sum=11.907 (1)\", \"tab\": \"General information\", \"score\": \"11.907042253521126\"}" } }, "generation_config": { - "additional_details": { - "subject": "\"abstract_algebra\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_abstract_algebra\"" - } + "additional_details": {} } }, { - "evaluation_name": "Anatomy", + "evaluation_name": "NaturalQuestions (closed-book)", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "NaturalQuestions (closed-book)", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Anatomy", + "evaluation_description": "F1 on NaturalQuestions (closed-book)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.652, + "score": 0.39, "details": { - "description": "min=0.652, mean=0.652, max=0.652, sum=1.304 (2)", + "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", "tab": "Accuracy", - "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.365, mean=0.365, max=0.365, sum=0.731 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3654881194785789\"}", - "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}", - "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Anatomy - # prompt tokens": "{\"description\": \"min=409.133, mean=409.133, max=409.133, sum=818.267 (2)\", \"tab\": \"General information\", \"score\": \"409.1333333333333\"}", - "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.636, mean=0.636, max=0.636, sum=0.636 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.6355201268196106\"}", + "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.469, mean=0.469, max=0.469, sum=0.469 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4687326259613037\"}", + "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (open-book) - # train": "{\"description\": \"min=4.825, mean=4.825, max=4.825, sum=4.825 (1)\", \"tab\": \"General information\", \"score\": \"4.825\"}", + "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0.028, mean=0.028, max=0.028, sum=0.028 (1)\", \"tab\": \"General information\", \"score\": \"0.028\"}", + "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=2311.514, mean=2311.514, max=2311.514, sum=2311.514 (1)\", \"tab\": \"General information\", \"score\": \"2311.514\"}", + "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=18.701, mean=18.701, max=18.701, sum=18.701 (1)\", \"tab\": \"General information\", \"score\": \"18.701\"}", + "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=166.383, mean=166.383, max=166.383, sum=166.383 (1)\", \"tab\": \"General information\", \"score\": \"166.383\"}", + "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=14.473, mean=14.473, max=14.473, sum=14.473 (1)\", \"tab\": \"General information\", \"score\": \"14.473\"}" } }, "generation_config": { "additional_details": { - "subject": "\"anatomy\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_anatomy\"" + "mode": "\"closedbook\"" } } }, { - "evaluation_name": "College Physics", + "evaluation_name": "OpenbookQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "OpenbookQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on College Physics", + "evaluation_description": "EM on OpenbookQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.461, + "score": 0.828, "details": { - "description": "min=0.461, mean=0.461, max=0.461, sum=0.922 (2)", + "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)", "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.35, mean=0.35, max=0.35, sum=0.701 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3502761268615723\"}", - "College Biology - Observed inference time (s)": "{\"description\": \"min=0.421, mean=0.421, max=0.421, sum=0.842 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.421069688267178\"}", - "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.427, mean=0.427, max=0.427, sum=0.853 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4266632032394409\"}", - "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.429, mean=0.429, max=0.429, sum=0.858 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42887043952941895\"}", - "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.434, mean=0.434, max=0.434, sum=0.869 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4343285574389331\"}", - "College Physics - Observed inference time (s)": "{\"description\": \"min=0.421, mean=0.421, max=0.421, sum=0.842 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4209739086674709\"}", - "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Chemistry - # prompt tokens": "{\"description\": \"min=622.43, mean=622.43, max=622.43, sum=1244.86 (2)\", \"tab\": \"General information\", \"score\": \"622.43\"}", - "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}", - "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Biology - # prompt tokens": "{\"description\": \"min=553.632, mean=553.632, max=553.632, sum=1107.264 (2)\", \"tab\": \"General information\", \"score\": \"553.6319444444445\"}", - "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Computer Science - # prompt tokens": "{\"description\": \"min=901.14, mean=901.14, max=901.14, sum=1802.28 (2)\", \"tab\": \"General information\", \"score\": \"901.14\"}", - "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Mathematics - # prompt tokens": "{\"description\": \"min=646.96, mean=646.96, max=646.96, sum=1293.92 (2)\", \"tab\": \"General information\", \"score\": \"646.96\"}", - "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}", - "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Medicine - # prompt tokens": "{\"description\": \"min=608.671, mean=608.671, max=608.671, sum=1217.341 (2)\", \"tab\": \"General information\", \"score\": \"608.6705202312139\"}", - "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}", - "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "College Physics - # prompt tokens": "{\"description\": \"min=551.873, mean=551.873, max=551.873, sum=1103.745 (2)\", \"tab\": \"General information\", \"score\": \"551.8725490196078\"}", - "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.284, mean=0.284, max=0.284, sum=0.284 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.2840936713218689\"}", + "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}", + "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "OpenbookQA - # prompt tokens": "{\"description\": \"min=291.574, mean=291.574, max=291.574, sum=291.574 (1)\", \"tab\": \"General information\", \"score\": \"291.574\"}", + "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"college_physics\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_college_physics\"" + "dataset": "\"openbookqa\"", + "method": "\"multiple_choice_joint\"" } } }, { - "evaluation_name": "Computer Security", + "evaluation_name": "MMLU", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MMLU", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Computer Security", + "evaluation_description": "EM on MMLU", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.84, + "score": 0.575, "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", + "description": "min=0.31, mean=0.575, max=0.88, sum=2.876 (5)", "tab": "Accuracy", - "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.412, mean=0.412, max=0.412, sum=0.825 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41247488737106325\"}", - "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Computer Security - # prompt tokens": "{\"description\": \"min=428.17, mean=428.17, max=428.17, sum=856.34 (2)\", \"tab\": \"General information\", \"score\": \"428.17\"}", - "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MMLU - Observed inference time (s)": "{\"description\": \"min=0.293, mean=0.303, max=0.317, sum=1.516 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.30325288054817606\"}", + "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", + "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU - # prompt tokens": "{\"description\": \"min=406.65, mean=531.547, max=693.675, sum=2657.735 (5)\", \"tab\": \"General information\", \"score\": \"531.5470877192982\"}", + "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"computer_security\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_computer_security\"" + "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]", + "method": "\"multiple_choice_joint\"" } } }, { - "evaluation_name": "Econometrics", + "evaluation_name": "MATH", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MATH", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Econometrics", + "evaluation_description": "Equivalent (CoT) on MATH", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5, + "score": 0.519, "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", + "description": "min=0.316, mean=0.519, max=0.785, sum=3.636 (7)", "tab": "Accuracy", - "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.436, mean=0.436, max=0.436, sum=0.873 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.436487873395284\"}", - "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}", - "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Econometrics - # prompt tokens": "{\"description\": \"min=684.675, mean=684.675, max=684.675, sum=1369.351 (2)\", \"tab\": \"General information\", \"score\": \"684.6754385964912\"}", - "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MATH - Observed inference time (s)": "{\"description\": \"min=1.482, mean=1.724, max=1.995, sum=12.068 (7)\", \"tab\": \"Efficiency\", \"score\": \"1.723981539653867\"}", + "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}", + "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}", + "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MATH - # prompt tokens": "{\"description\": \"min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)\", \"tab\": \"General information\", \"score\": \"1438.6362030100095\"}", + "MATH - # output tokens": "{\"description\": \"min=82.872, mean=98.802, max=122.233, sum=691.615 (7)\", \"tab\": \"General information\", \"score\": \"98.80208187931566\"}" } }, "generation_config": { "additional_details": { - "subject": "\"econometrics\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_econometrics\"" + "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]", + "level": "\"1\"", + "use_official_examples": "\"False\"", + "use_chain_of_thought": "\"True\"" } } }, { - "evaluation_name": "Global Facts", + "evaluation_name": "GSM8K", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "GSM8K", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Global Facts", + "evaluation_description": "EM on GSM8K", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.39, + "score": 0.768, "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", + "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)", "tab": "Accuracy", - "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.839 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41951879262924197\"}", - "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Global Facts - # prompt tokens": "{\"description\": \"min=484.54, mean=484.54, max=484.54, sum=969.08 (2)\", \"tab\": \"General information\", \"score\": \"484.54\"}", - "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "GSM8K - Observed inference time (s)": "{\"description\": \"min=2.961, mean=2.961, max=2.961, sum=2.961 (1)\", \"tab\": \"Efficiency\", \"score\": \"2.9610197002887726\"}", + "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", + "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "GSM8K - # prompt tokens": "{\"description\": \"min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)\", \"tab\": \"General information\", \"score\": \"1207.746\"}", + "GSM8K - # output tokens": "{\"description\": \"min=189.305, mean=189.305, max=189.305, sum=189.305 (1)\", \"tab\": \"General information\", \"score\": \"189.305\"}" } }, "generation_config": { "additional_details": { - "subject": "\"global_facts\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_global_facts\"" + "stop": "\"none\"" } } }, { - "evaluation_name": "Jurisprudence", + "evaluation_name": "LegalBench", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "LegalBench", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Jurisprudence", + "evaluation_description": "EM on LegalBench", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.741, + "score": 0.588, "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=1.481 (2)", + "description": "min=0.351, mean=0.588, max=0.874, sum=2.94 (5)", "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.422, mean=0.422, max=0.422, sum=0.843 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.421647725281892\"}", - "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}", - "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Jurisprudence - # prompt tokens": "{\"description\": \"min=449.898, mean=449.898, max=449.898, sum=899.796 (2)\", \"tab\": \"General information\", \"score\": \"449.89814814814815\"}", - "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.292, mean=0.346, max=0.462, sum=1.729 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.34576316386866485\"}", + "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}", + "LegalBench - # train": "{\"description\": \"min=1.81, mean=4.162, max=5, sum=20.81 (5)\", \"tab\": \"General information\", \"score\": \"4.162040816326531\"}", + "LegalBench - truncated": "{\"description\": \"min=0, mean=0.002, max=0.008, sum=0.008 (5)\", \"tab\": \"General information\", \"score\": \"0.0016326530612244899\"}", + "LegalBench - # prompt tokens": "{\"description\": \"min=239.137, mean=1024.722, max=3561.237, sum=5123.61 (5)\", \"tab\": \"General information\", \"score\": \"1024.7220443430492\"}", + "LegalBench - # output tokens": "{\"description\": \"min=2, mean=2.438, max=3.421, sum=12.188 (5)\", \"tab\": \"General information\", \"score\": \"2.4375592890361366\"}" } }, "generation_config": { "additional_details": { - "subject": "\"jurisprudence\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_jurisprudence\"" + "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]" } } }, { - "evaluation_name": "Philosophy", + "evaluation_name": "MedQA", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "MedQA", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Philosophy", + "evaluation_description": "EM on MedQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.752, + "score": 0.581, "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.505 (2)", + "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)", "tab": "Accuracy", - "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.418, mean=0.418, max=0.418, sum=0.837 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.418486426497579\"}", - "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}", - "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Philosophy - # prompt tokens": "{\"description\": \"min=372.122, mean=372.122, max=372.122, sum=744.244 (2)\", \"tab\": \"General information\", \"score\": \"372.12218649517683\"}", - "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MedQA - Observed inference time (s)": "{\"description\": \"min=0.313, mean=0.313, max=0.313, sum=0.313 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.31300480038697864\"}", + "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}", + "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MedQA - # prompt tokens": "{\"description\": \"min=1243.901, mean=1243.901, max=1243.901, sum=1243.901 (1)\", \"tab\": \"General information\", \"score\": \"1243.9005964214712\"}", + "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": { - "subject": "\"philosophy\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_philosophy\"" - } + "additional_details": {} } }, { - "evaluation_name": "Professional Psychology", + "evaluation_name": "WMT 2014", "source_data": { - "dataset_name": "helm_mmlu", + "dataset_name": "WMT 2014", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" + "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" ] }, "metric_config": { - "evaluation_description": "EM on Professional Psychology", + "evaluation_description": "BLEU-4 on WMT 2014", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.724, + "score": 0.172, "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=1.448 (2)", + "description": "min=0.09, mean=0.172, max=0.217, sum=0.86 (5)", "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.445, mean=0.445, max=0.445, sum=0.89 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4448305149288738\"}", - "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.443, mean=0.443, max=0.443, sum=0.887 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44340477683019974\"}", - "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.531, mean=0.531, max=0.531, sum=1.062 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.531202322345669\"}", - "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.423, mean=0.423, max=0.423, sum=0.847 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42342418120577446\"}", - "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}", - "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Medicine - # prompt tokens": "{\"description\": \"min=1330.647, mean=1330.647, max=1330.647, sum=2661.294 (2)\", \"tab\": \"General information\", \"score\": \"1330.6470588235295\"}", - "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}", - "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Accounting - # prompt tokens": "{\"description\": \"min=823.277, mean=823.277, max=823.277, sum=1646.553 (2)\", \"tab\": \"General information\", \"score\": \"823.2765957446809\"}", - "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}", - "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Law - # prompt tokens": "{\"description\": \"min=1915.007, mean=1915.007, max=1915.007, sum=3830.014 (2)\", \"tab\": \"General information\", \"score\": \"1915.0071707953064\"}", - "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}", - "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Professional Psychology - # prompt tokens": "{\"description\": \"min=650.078, mean=650.078, max=650.078, sum=1300.157 (2)\", \"tab\": \"General information\", \"score\": \"650.0784313725491\"}", - "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=0.65, mean=0.681, max=0.702, sum=3.405 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.681007040066764\"}", + "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}", + "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "WMT 2014 - # prompt tokens": "{\"description\": \"min=145.523, mean=160.288, max=182.972, sum=801.438 (5)\", \"tab\": \"General information\", \"score\": \"160.28751290334915\"}", + "WMT 2014 - # output tokens": "{\"description\": \"min=28.596, mean=30.59, max=31.485, sum=152.951 (5)\", \"tab\": \"General information\", \"score\": \"30.59012702630372\"}" } }, "generation_config": { "additional_details": { - "subject": "\"professional_psychology\"", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "\"mmlu_professional_psychology\"" + "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]" } } - }, + } + ], + "detailed_evaluation_results": null, + "generation_config": { + "additional_details": {} + } + }, + { + "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1774096312.00548", + "retrieved_timestamp": "1774096312.00548", + "source_metadata": { + "source_name": "helm_mmlu", + "source_type": "documentation", + "source_organization_name": "crfm", + "evaluator_relationship": "third_party" + }, + "eval_library": { + "name": "helm", + "version": "unknown" + }, + "benchmark": "helm_mmlu", + "evaluation_results": [ { - "evaluation_name": "Us Foreign Policy", + "evaluation_name": "MMLU All Subjects", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -459,36 +403,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", + "evaluation_description": "EM on MMLU All Subjects", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.88, + "score": 0.677, "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.76 (2)", + "description": "min=0.28, mean=0.677, max=0.912, sum=77.129 (114)", "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.424, mean=0.424, max=0.424, sum=0.848 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42398189067840575\"}", - "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=479.81, mean=479.81, max=479.81, sum=959.62 (2)\", \"tab\": \"General information\", \"score\": \"479.81\"}", - "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "MMLU All Subjects - Observed inference time (s)": "{\"description\": \"min=0.35, mean=0.42, max=0.544, sum=47.89 (114)\", \"tab\": \"Efficiency\", \"score\": \"0.4200856614493726\"}", + "MMLU All Subjects - # eval": "{\"description\": \"min=100, mean=246.351, max=1534, sum=28084 (114)\", \"tab\": \"General information\", \"score\": \"246.35087719298247\"}", + "MMLU All Subjects - # train": "{\"description\": \"min=5, mean=5, max=5, sum=570 (114)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "MMLU All Subjects - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (114)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "MMLU All Subjects - # prompt tokens": "{\"description\": \"min=304.474, mean=706.682, max=3159.636, sum=80561.749 (114)\", \"tab\": \"General information\", \"score\": \"706.6820126388612\"}", + "MMLU All Subjects - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=114 (114)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"us_foreign_policy\"", + "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_us_foreign_policy\"" + "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" } } }, { - "evaluation_name": "Astronomy", + "evaluation_name": "Abstract Algebra", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -497,36 +441,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Astronomy", + "evaluation_description": "EM on Abstract Algebra", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.763, + "score": 0.35, "details": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.526 (2)", + "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", "tab": "Accuracy", - "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.424, mean=0.424, max=0.424, sum=0.848 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42381788398090164\"}", - "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}", - "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Astronomy - # prompt tokens": "{\"description\": \"min=681.079, mean=681.079, max=681.079, sum=1362.158 (2)\", \"tab\": \"General information\", \"score\": \"681.078947368421\"}", - "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Abstract Algebra - Observed inference time (s)": "{\"description\": \"min=0.377, mean=0.377, max=0.377, sum=0.753 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.37665764808654784\"}", + "Abstract Algebra - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Abstract Algebra - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Abstract Algebra - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Abstract Algebra - # prompt tokens": "{\"description\": \"min=397.65, mean=397.65, max=397.65, sum=795.3 (2)\", \"tab\": \"General information\", \"score\": \"397.65\"}", + "Abstract Algebra - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"astronomy\"", + "subject": "\"abstract_algebra\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_astronomy\"" + "groups": "\"mmlu_abstract_algebra\"" } } }, { - "evaluation_name": "Business Ethics", + "evaluation_name": "Anatomy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -535,36 +479,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Business Ethics", + "evaluation_description": "EM on Anatomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.69, + "score": 0.652, "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", + "description": "min=0.652, mean=0.652, max=0.652, sum=1.304 (2)", "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.432, mean=0.432, max=0.432, sum=0.863 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4315712761878967\"}", - "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Business Ethics - # prompt tokens": "{\"description\": \"min=674.44, mean=674.44, max=674.44, sum=1348.88 (2)\", \"tab\": \"General information\", \"score\": \"674.44\"}", - "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Anatomy - Observed inference time (s)": "{\"description\": \"min=0.365, mean=0.365, max=0.365, sum=0.731 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3654881194785789\"}", + "Anatomy - # eval": "{\"description\": \"min=135, mean=135, max=135, sum=270 (2)\", \"tab\": \"General information\", \"score\": \"135.0\"}", + "Anatomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Anatomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Anatomy - # prompt tokens": "{\"description\": \"min=409.133, mean=409.133, max=409.133, sum=818.267 (2)\", \"tab\": \"General information\", \"score\": \"409.1333333333333\"}", + "Anatomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"business_ethics\"", + "subject": "\"anatomy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_business_ethics\"" + "groups": "\"mmlu_anatomy\"" } } }, { - "evaluation_name": "Clinical Knowledge", + "evaluation_name": "College Physics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -573,36 +517,66 @@ ] }, "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", + "evaluation_description": "EM on College Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.781, + "score": 0.461, "details": { - "description": "min=0.781, mean=0.781, max=0.781, sum=1.562 (2)", + "description": "min=0.461, mean=0.461, max=0.461, sum=0.922 (2)", "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.841 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4204666920428006\"}", - "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}", - "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=487.374, mean=487.374, max=487.374, sum=974.747 (2)\", \"tab\": \"General information\", \"score\": \"487.3735849056604\"}", - "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "College Chemistry - Observed inference time (s)": "{\"description\": \"min=0.35, mean=0.35, max=0.35, sum=0.701 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3502761268615723\"}", + "College Biology - Observed inference time (s)": "{\"description\": \"min=0.421, mean=0.421, max=0.421, sum=0.842 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.421069688267178\"}", + "College Computer Science - Observed inference time (s)": "{\"description\": \"min=0.427, mean=0.427, max=0.427, sum=0.853 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4266632032394409\"}", + "College Mathematics - Observed inference time (s)": "{\"description\": \"min=0.429, mean=0.429, max=0.429, sum=0.858 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42887043952941895\"}", + "College Medicine - Observed inference time (s)": "{\"description\": \"min=0.434, mean=0.434, max=0.434, sum=0.869 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4343285574389331\"}", + "College Physics - Observed inference time (s)": "{\"description\": \"min=0.421, mean=0.421, max=0.421, sum=0.842 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4209739086674709\"}", + "College Chemistry - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Chemistry - # prompt tokens": "{\"description\": \"min=622.43, mean=622.43, max=622.43, sum=1244.86 (2)\", \"tab\": \"General information\", \"score\": \"622.43\"}", + "College Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Biology - # eval": "{\"description\": \"min=144, mean=144, max=144, sum=288 (2)\", \"tab\": \"General information\", \"score\": \"144.0\"}", + "College Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Biology - # prompt tokens": "{\"description\": \"min=553.632, mean=553.632, max=553.632, sum=1107.264 (2)\", \"tab\": \"General information\", \"score\": \"553.6319444444445\"}", + "College Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Computer Science - # prompt tokens": "{\"description\": \"min=901.14, mean=901.14, max=901.14, sum=1802.28 (2)\", \"tab\": \"General information\", \"score\": \"901.14\"}", + "College Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Mathematics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "College Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Mathematics - # prompt tokens": "{\"description\": \"min=646.96, mean=646.96, max=646.96, sum=1293.92 (2)\", \"tab\": \"General information\", \"score\": \"646.96\"}", + "College Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Medicine - # eval": "{\"description\": \"min=173, mean=173, max=173, sum=346 (2)\", \"tab\": \"General information\", \"score\": \"173.0\"}", + "College Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Medicine - # prompt tokens": "{\"description\": \"min=608.671, mean=608.671, max=608.671, sum=1217.341 (2)\", \"tab\": \"General information\", \"score\": \"608.6705202312139\"}", + "College Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "College Physics - # eval": "{\"description\": \"min=102, mean=102, max=102, sum=204 (2)\", \"tab\": \"General information\", \"score\": \"102.0\"}", + "College Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "College Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "College Physics - # prompt tokens": "{\"description\": \"min=551.873, mean=551.873, max=551.873, sum=1103.745 (2)\", \"tab\": \"General information\", \"score\": \"551.8725490196078\"}", + "College Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"clinical_knowledge\"", + "subject": "\"college_physics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_clinical_knowledge\"" + "groups": "\"mmlu_college_physics\"" } } }, { - "evaluation_name": "Conceptual Physics", + "evaluation_name": "Computer Security", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -611,36 +585,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Conceptual Physics", + "evaluation_description": "EM on Computer Security", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.634, + "score": 0.84, "details": { - "description": "min=0.634, mean=0.634, max=0.634, sum=1.268 (2)", + "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.412, mean=0.412, max=0.412, sum=0.824 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4118805824442113\"}", - "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}", - "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Conceptual Physics - # prompt tokens": "{\"description\": \"min=333.153, mean=333.153, max=333.153, sum=666.306 (2)\", \"tab\": \"General information\", \"score\": \"333.1531914893617\"}", - "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Computer Security - Observed inference time (s)": "{\"description\": \"min=0.412, mean=0.412, max=0.412, sum=0.825 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41247488737106325\"}", + "Computer Security - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Computer Security - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Computer Security - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Computer Security - # prompt tokens": "{\"description\": \"min=428.17, mean=428.17, max=428.17, sum=856.34 (2)\", \"tab\": \"General information\", \"score\": \"428.17\"}", + "Computer Security - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"conceptual_physics\"", + "subject": "\"computer_security\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_conceptual_physics\"" + "groups": "\"mmlu_computer_security\"" } } }, { - "evaluation_name": "Electrical Engineering", + "evaluation_name": "Econometrics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -649,36 +623,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Electrical Engineering", + "evaluation_description": "EM on Econometrics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.662, + "score": 0.5, "details": { - "description": "min=0.662, mean=0.662, max=0.662, sum=1.324 (2)", + "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.428, mean=0.428, max=0.428, sum=0.856 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42821227435407966\"}", - "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}", - "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Electrical Engineering - # prompt tokens": "{\"description\": \"min=497.779, mean=497.779, max=497.779, sum=995.559 (2)\", \"tab\": \"General information\", \"score\": \"497.7793103448276\"}", - "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Econometrics - Observed inference time (s)": "{\"description\": \"min=0.436, mean=0.436, max=0.436, sum=0.873 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.436487873395284\"}", + "Econometrics - # eval": "{\"description\": \"min=114, mean=114, max=114, sum=228 (2)\", \"tab\": \"General information\", \"score\": \"114.0\"}", + "Econometrics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Econometrics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Econometrics - # prompt tokens": "{\"description\": \"min=684.675, mean=684.675, max=684.675, sum=1369.351 (2)\", \"tab\": \"General information\", \"score\": \"684.6754385964912\"}", + "Econometrics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"electrical_engineering\"", + "subject": "\"econometrics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_electrical_engineering\"" + "groups": "\"mmlu_econometrics\"" } } }, { - "evaluation_name": "Elementary Mathematics", + "evaluation_name": "Global Facts", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -687,36 +661,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", + "evaluation_description": "EM on Global Facts", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.481, + "score": 0.39, "details": { - "description": "min=0.481, mean=0.481, max=0.481, sum=0.963 (2)", + "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.427, mean=0.427, max=0.427, sum=0.853 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4265344634888664\"}", - "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}", - "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=609.156, mean=609.156, max=609.156, sum=1218.312 (2)\", \"tab\": \"General information\", \"score\": \"609.1560846560847\"}", - "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Global Facts - Observed inference time (s)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.839 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41951879262924197\"}", + "Global Facts - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Global Facts - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Global Facts - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Global Facts - # prompt tokens": "{\"description\": \"min=484.54, mean=484.54, max=484.54, sum=969.08 (2)\", \"tab\": \"General information\", \"score\": \"484.54\"}", + "Global Facts - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"elementary_mathematics\"", + "subject": "\"global_facts\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_elementary_mathematics\"" + "groups": "\"mmlu_global_facts\"" } } }, { - "evaluation_name": "Formal Logic", + "evaluation_name": "Jurisprudence", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -725,36 +699,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Formal Logic", + "evaluation_description": "EM on Jurisprudence", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.444, + "score": 0.741, "details": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)", + "description": "min=0.741, mean=0.741, max=0.741, sum=1.481 (2)", "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.411, mean=0.411, max=0.411, sum=0.821 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4107102117841206\"}", - "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}", - "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Formal Logic - # prompt tokens": "{\"description\": \"min=691.81, mean=691.81, max=691.81, sum=1383.619 (2)\", \"tab\": \"General information\", \"score\": \"691.8095238095239\"}", - "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Jurisprudence - Observed inference time (s)": "{\"description\": \"min=0.422, mean=0.422, max=0.422, sum=0.843 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.421647725281892\"}", + "Jurisprudence - # eval": "{\"description\": \"min=108, mean=108, max=108, sum=216 (2)\", \"tab\": \"General information\", \"score\": \"108.0\"}", + "Jurisprudence - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Jurisprudence - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Jurisprudence - # prompt tokens": "{\"description\": \"min=449.898, mean=449.898, max=449.898, sum=899.796 (2)\", \"tab\": \"General information\", \"score\": \"449.89814814814815\"}", + "Jurisprudence - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"formal_logic\"", + "subject": "\"jurisprudence\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_formal_logic\"" + "groups": "\"mmlu_jurisprudence\"" } } }, { - "evaluation_name": "High School World History", + "evaluation_name": "Philosophy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -763,114 +737,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on High School World History", + "evaluation_description": "EM on Philosophy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.827, + "score": 0.752, "details": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.654 (2)", + "description": "min=0.752, mean=0.752, max=0.752, sum=1.505 (2)", "tab": "Accuracy", - "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.424, mean=0.424, max=0.424, sum=0.847 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42357982127897204\"}", - "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.412, mean=0.412, max=0.412, sum=0.825 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41242665375394777\"}", - "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.445, mean=0.445, max=0.445, sum=0.89 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44495458364486695\"}", - "High School European History - Observed inference time (s)": "{\"description\": \"min=0.544, mean=0.544, max=0.544, sum=1.088 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5441486705433238\"}", - "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.415, mean=0.415, max=0.415, sum=0.83 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4149725003675981\"}", - "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.383, mean=0.383, max=0.383, sum=0.766 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.38312110629106433\"}", - "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.403, mean=0.403, max=0.403, sum=0.807 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4034240123553154\"}", - "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.39, mean=0.39, max=0.39, sum=0.779 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.38954139285617406\"}", - "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.399, mean=0.399, max=0.399, sum=0.798 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3992174813727371\"}", - "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.409, mean=0.409, max=0.409, sum=0.819 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40926165138648835\"}", - "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.408, mean=0.408, max=0.408, sum=0.816 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4081065694126514\"}", - "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.417, mean=0.417, max=0.417, sum=0.833 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4166152830477114\"}", - "High School US History - Observed inference time (s)": "{\"description\": \"min=0.45, mean=0.45, max=0.45, sum=0.901 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4504043985815609\"}", - "High School World History - Observed inference time (s)": "{\"description\": \"min=0.416, mean=0.416, max=0.416, sum=0.833 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4162542166086189\"}", - "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}", - "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Biology - # prompt tokens": "{\"description\": \"min=596.894, mean=596.894, max=596.894, sum=1193.787 (2)\", \"tab\": \"General information\", \"score\": \"596.8935483870968\"}", - "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}", - "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Chemistry - # prompt tokens": "{\"description\": \"min=568.665, mean=568.665, max=568.665, sum=1137.33 (2)\", \"tab\": \"General information\", \"score\": \"568.6650246305419\"}", - "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Computer Science - # prompt tokens": "{\"description\": \"min=988.57, mean=988.57, max=988.57, sum=1977.14 (2)\", \"tab\": \"General information\", \"score\": \"988.57\"}", - "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}", - "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School European History - # prompt tokens": "{\"description\": \"min=3159.636, mean=3159.636, max=3159.636, sum=6319.273 (2)\", \"tab\": \"General information\", \"score\": \"3159.6363636363635\"}", - "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}", - "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Geography - # prompt tokens": "{\"description\": \"min=436.657, mean=436.657, max=436.657, sum=873.313 (2)\", \"tab\": \"General information\", \"score\": \"436.65656565656565\"}", - "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}", - "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Government And Politics - # prompt tokens": "{\"description\": \"min=527.927, mean=527.927, max=527.927, sum=1055.855 (2)\", \"tab\": \"General information\", \"score\": \"527.9274611398964\"}", - "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}", - "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=445.662, mean=445.662, max=445.662, sum=891.323 (2)\", \"tab\": \"General information\", \"score\": \"445.66153846153844\"}", - "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}", - "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Mathematics - # prompt tokens": "{\"description\": \"min=579.181, mean=579.181, max=579.181, sum=1158.363 (2)\", \"tab\": \"General information\", \"score\": \"579.1814814814815\"}", - "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}", - "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Microeconomics - # prompt tokens": "{\"description\": \"min=449.492, mean=449.492, max=449.492, sum=898.983 (2)\", \"tab\": \"General information\", \"score\": \"449.49159663865544\"}", - "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}", - "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Physics - # prompt tokens": "{\"description\": \"min=621.788, mean=621.788, max=621.788, sum=1243.576 (2)\", \"tab\": \"General information\", \"score\": \"621.7880794701987\"}", - "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}", - "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Psychology - # prompt tokens": "{\"description\": \"min=585.919, mean=585.919, max=585.919, sum=1171.839 (2)\", \"tab\": \"General information\", \"score\": \"585.9192660550459\"}", - "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}", - "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School Statistics - # prompt tokens": "{\"description\": \"min=908.208, mean=908.208, max=908.208, sum=1816.417 (2)\", \"tab\": \"General information\", \"score\": \"908.2083333333334\"}", - "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}", - "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School US History - # prompt tokens": "{\"description\": \"min=2535.324, mean=2535.324, max=2535.324, sum=5070.647 (2)\", \"tab\": \"General information\", \"score\": \"2535.323529411765\"}", - "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}", - "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "High School World History - # prompt tokens": "{\"description\": \"min=1638.219, mean=1638.219, max=1638.219, sum=3276.439 (2)\", \"tab\": \"General information\", \"score\": \"1638.2194092827003\"}", - "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Philosophy - Observed inference time (s)": "{\"description\": \"min=0.418, mean=0.418, max=0.418, sum=0.837 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.418486426497579\"}", + "Philosophy - # eval": "{\"description\": \"min=311, mean=311, max=311, sum=622 (2)\", \"tab\": \"General information\", \"score\": \"311.0\"}", + "Philosophy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Philosophy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Philosophy - # prompt tokens": "{\"description\": \"min=372.122, mean=372.122, max=372.122, sum=744.244 (2)\", \"tab\": \"General information\", \"score\": \"372.12218649517683\"}", + "Philosophy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"high_school_world_history\"", + "subject": "\"philosophy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_high_school_world_history\"" + "groups": "\"mmlu_philosophy\"" } } }, { - "evaluation_name": "Human Sexuality", + "evaluation_name": "Professional Psychology", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -879,42 +775,54 @@ ] }, "metric_config": { - "evaluation_description": "EM on Human Sexuality", + "evaluation_description": "EM on Professional Psychology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.847, + "score": 0.724, "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.695 (2)", + "description": "min=0.724, mean=0.724, max=0.724, sum=1.448 (2)", "tab": "Accuracy", - "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.401, mean=0.401, max=0.401, sum=0.802 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4010318255745242\"}", - "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.787 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39331119843111695\"}", - "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}", - "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Human Aging - # prompt tokens": "{\"description\": \"min=361.26, mean=361.26, max=361.26, sum=722.52 (2)\", \"tab\": \"General information\", \"score\": \"361.26008968609864\"}", - "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}", - "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Human Sexuality - # prompt tokens": "{\"description\": \"min=403.382, mean=403.382, max=403.382, sum=806.763 (2)\", \"tab\": \"General information\", \"score\": \"403.381679389313\"}", - "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Professional Medicine - Observed inference time (s)": "{\"description\": \"min=0.445, mean=0.445, max=0.445, sum=0.89 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4448305149288738\"}", + "Professional Accounting - Observed inference time (s)": "{\"description\": \"min=0.443, mean=0.443, max=0.443, sum=0.887 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44340477683019974\"}", + "Professional Law - Observed inference time (s)": "{\"description\": \"min=0.531, mean=0.531, max=0.531, sum=1.062 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.531202322345669\"}", + "Professional Psychology - Observed inference time (s)": "{\"description\": \"min=0.423, mean=0.423, max=0.423, sum=0.847 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42342418120577446\"}", + "Professional Medicine - # eval": "{\"description\": \"min=272, mean=272, max=272, sum=544 (2)\", \"tab\": \"General information\", \"score\": \"272.0\"}", + "Professional Medicine - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Medicine - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Medicine - # prompt tokens": "{\"description\": \"min=1330.647, mean=1330.647, max=1330.647, sum=2661.294 (2)\", \"tab\": \"General information\", \"score\": \"1330.6470588235295\"}", + "Professional Medicine - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Professional Accounting - # eval": "{\"description\": \"min=282, mean=282, max=282, sum=564 (2)\", \"tab\": \"General information\", \"score\": \"282.0\"}", + "Professional Accounting - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Accounting - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Accounting - # prompt tokens": "{\"description\": \"min=823.277, mean=823.277, max=823.277, sum=1646.553 (2)\", \"tab\": \"General information\", \"score\": \"823.2765957446809\"}", + "Professional Accounting - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Professional Law - # eval": "{\"description\": \"min=1534, mean=1534, max=1534, sum=3068 (2)\", \"tab\": \"General information\", \"score\": \"1534.0\"}", + "Professional Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Law - # prompt tokens": "{\"description\": \"min=1915.007, mean=1915.007, max=1915.007, sum=3830.014 (2)\", \"tab\": \"General information\", \"score\": \"1915.0071707953064\"}", + "Professional Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Professional Psychology - # eval": "{\"description\": \"min=612, mean=612, max=612, sum=1224 (2)\", \"tab\": \"General information\", \"score\": \"612.0\"}", + "Professional Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Professional Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Professional Psychology - # prompt tokens": "{\"description\": \"min=650.078, mean=650.078, max=650.078, sum=1300.157 (2)\", \"tab\": \"General information\", \"score\": \"650.0784313725491\"}", + "Professional Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"human_sexuality\"", + "subject": "\"professional_psychology\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_human_sexuality\"" + "groups": "\"mmlu_professional_psychology\"" } } }, { - "evaluation_name": "International Law", + "evaluation_name": "Us Foreign Policy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -923,36 +831,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on International Law", + "evaluation_description": "EM on Us Foreign Policy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.826, + "score": 0.88, "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.653 (2)", + "description": "min=0.88, mean=0.88, max=0.88, sum=1.76 (2)", "tab": "Accuracy", - "International Law - Observed inference time (s)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.841 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42040472779392213\"}", - "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}", - "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "International Law - # prompt tokens": "{\"description\": \"min=729.463, mean=729.463, max=729.463, sum=1458.926 (2)\", \"tab\": \"General information\", \"score\": \"729.4628099173553\"}", - "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Us Foreign Policy - Observed inference time (s)": "{\"description\": \"min=0.424, mean=0.424, max=0.424, sum=0.848 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42398189067840575\"}", + "Us Foreign Policy - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Us Foreign Policy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Us Foreign Policy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Us Foreign Policy - # prompt tokens": "{\"description\": \"min=479.81, mean=479.81, max=479.81, sum=959.62 (2)\", \"tab\": \"General information\", \"score\": \"479.81\"}", + "Us Foreign Policy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"international_law\"", + "subject": "\"us_foreign_policy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_international_law\"" + "groups": "\"mmlu_us_foreign_policy\"" } } }, { - "evaluation_name": "Logical Fallacies", + "evaluation_name": "Astronomy", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -961,36 +869,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Logical Fallacies", + "evaluation_description": "EM on Astronomy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.779, + "score": 0.763, "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.558 (2)", + "description": "min=0.763, mean=0.763, max=0.763, sum=1.526 (2)", "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.404, mean=0.404, max=0.404, sum=0.809 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4043445353127696\"}", - "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}", - "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Logical Fallacies - # prompt tokens": "{\"description\": \"min=502.755, mean=502.755, max=502.755, sum=1005.509 (2)\", \"tab\": \"General information\", \"score\": \"502.7546012269939\"}", - "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Astronomy - Observed inference time (s)": "{\"description\": \"min=0.424, mean=0.424, max=0.424, sum=0.848 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42381788398090164\"}", + "Astronomy - # eval": "{\"description\": \"min=152, mean=152, max=152, sum=304 (2)\", \"tab\": \"General information\", \"score\": \"152.0\"}", + "Astronomy - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Astronomy - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Astronomy - # prompt tokens": "{\"description\": \"min=681.079, mean=681.079, max=681.079, sum=1362.158 (2)\", \"tab\": \"General information\", \"score\": \"681.078947368421\"}", + "Astronomy - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"logical_fallacies\"", + "subject": "\"astronomy\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_logical_fallacies\"" + "groups": "\"mmlu_astronomy\"" } } }, { - "evaluation_name": "Machine Learning", + "evaluation_name": "Business Ethics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -999,36 +907,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Machine Learning", + "evaluation_description": "EM on Business Ethics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.473, + "score": 0.69, "details": { - "description": "min=0.473, mean=0.473, max=0.473, sum=0.946 (2)", + "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.421, mean=0.421, max=0.421, sum=0.842 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42122456644262585\"}", - "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}", - "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Machine Learning - # prompt tokens": "{\"description\": \"min=730.402, mean=730.402, max=730.402, sum=1460.804 (2)\", \"tab\": \"General information\", \"score\": \"730.4017857142857\"}", - "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Business Ethics - Observed inference time (s)": "{\"description\": \"min=0.432, mean=0.432, max=0.432, sum=0.863 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4315712761878967\"}", + "Business Ethics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Business Ethics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Business Ethics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Business Ethics - # prompt tokens": "{\"description\": \"min=674.44, mean=674.44, max=674.44, sum=1348.88 (2)\", \"tab\": \"General information\", \"score\": \"674.44\"}", + "Business Ethics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"machine_learning\"", + "subject": "\"business_ethics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_machine_learning\"" + "groups": "\"mmlu_business_ethics\"" } } }, { - "evaluation_name": "Management", + "evaluation_name": "Clinical Knowledge", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1037,36 +945,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Management", + "evaluation_description": "EM on Clinical Knowledge", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.796, + "score": 0.781, "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)", + "description": "min=0.781, mean=0.781, max=0.781, sum=1.562 (2)", "tab": "Accuracy", - "Management - Observed inference time (s)": "{\"description\": \"min=0.392, mean=0.392, max=0.392, sum=0.785 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.392485206566968\"}", - "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}", - "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Management - # prompt tokens": "{\"description\": \"min=315.777, mean=315.777, max=315.777, sum=631.553 (2)\", \"tab\": \"General information\", \"score\": \"315.77669902912623\"}", - "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Clinical Knowledge - Observed inference time (s)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.841 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4204666920428006\"}", + "Clinical Knowledge - # eval": "{\"description\": \"min=265, mean=265, max=265, sum=530 (2)\", \"tab\": \"General information\", \"score\": \"265.0\"}", + "Clinical Knowledge - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Clinical Knowledge - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Clinical Knowledge - # prompt tokens": "{\"description\": \"min=487.374, mean=487.374, max=487.374, sum=974.747 (2)\", \"tab\": \"General information\", \"score\": \"487.3735849056604\"}", + "Clinical Knowledge - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"management\"", + "subject": "\"clinical_knowledge\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_management\"" + "groups": "\"mmlu_clinical_knowledge\"" } } }, { - "evaluation_name": "Marketing", + "evaluation_name": "Conceptual Physics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1075,36 +983,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Marketing", + "evaluation_description": "EM on Conceptual Physics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.902, + "score": 0.634, "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.803 (2)", + "description": "min=0.634, mean=0.634, max=0.634, sum=1.268 (2)", "tab": "Accuracy", - "Marketing - Observed inference time (s)": "{\"description\": \"min=0.407, mean=0.407, max=0.407, sum=0.813 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.406507401384859\"}", - "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}", - "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Marketing - # prompt tokens": "{\"description\": \"min=472.628, mean=472.628, max=472.628, sum=945.256 (2)\", \"tab\": \"General information\", \"score\": \"472.62820512820514\"}", - "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Conceptual Physics - Observed inference time (s)": "{\"description\": \"min=0.412, mean=0.412, max=0.412, sum=0.824 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4118805824442113\"}", + "Conceptual Physics - # eval": "{\"description\": \"min=235, mean=235, max=235, sum=470 (2)\", \"tab\": \"General information\", \"score\": \"235.0\"}", + "Conceptual Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Conceptual Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Conceptual Physics - # prompt tokens": "{\"description\": \"min=333.153, mean=333.153, max=333.153, sum=666.306 (2)\", \"tab\": \"General information\", \"score\": \"333.1531914893617\"}", + "Conceptual Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"marketing\"", + "subject": "\"conceptual_physics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_marketing\"" + "groups": "\"mmlu_conceptual_physics\"" } } }, { - "evaluation_name": "Medical Genetics", + "evaluation_name": "Electrical Engineering", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1113,36 +1021,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Medical Genetics", + "evaluation_description": "EM on Electrical Engineering", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.76, + "score": 0.662, "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", + "description": "min=0.662, mean=0.662, max=0.662, sum=1.324 (2)", "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.417, mean=0.417, max=0.417, sum=0.835 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41734427213668823\"}", - "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", - "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Medical Genetics - # prompt tokens": "{\"description\": \"min=408.14, mean=408.14, max=408.14, sum=816.28 (2)\", \"tab\": \"General information\", \"score\": \"408.14\"}", - "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Electrical Engineering - Observed inference time (s)": "{\"description\": \"min=0.428, mean=0.428, max=0.428, sum=0.856 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42821227435407966\"}", + "Electrical Engineering - # eval": "{\"description\": \"min=145, mean=145, max=145, sum=290 (2)\", \"tab\": \"General information\", \"score\": \"145.0\"}", + "Electrical Engineering - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Electrical Engineering - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Electrical Engineering - # prompt tokens": "{\"description\": \"min=497.779, mean=497.779, max=497.779, sum=995.559 (2)\", \"tab\": \"General information\", \"score\": \"497.7793103448276\"}", + "Electrical Engineering - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"medical_genetics\"", + "subject": "\"electrical_engineering\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_medical_genetics\"" + "groups": "\"mmlu_electrical_engineering\"" } } }, { - "evaluation_name": "Miscellaneous", + "evaluation_name": "Elementary Mathematics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1151,36 +1059,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Miscellaneous", + "evaluation_description": "EM on Elementary Mathematics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.875, + "score": 0.481, "details": { - "description": "min=0.875, mean=0.875, max=0.875, sum=1.75 (2)", + "description": "min=0.481, mean=0.481, max=0.481, sum=0.963 (2)", "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.407, mean=0.407, max=0.407, sum=0.814 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40693108880200146\"}", - "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}", - "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Miscellaneous - # prompt tokens": "{\"description\": \"min=345.913, mean=345.913, max=345.913, sum=691.826 (2)\", \"tab\": \"General information\", \"score\": \"345.9131545338442\"}", - "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Elementary Mathematics - Observed inference time (s)": "{\"description\": \"min=0.427, mean=0.427, max=0.427, sum=0.853 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4265344634888664\"}", + "Elementary Mathematics - # eval": "{\"description\": \"min=378, mean=378, max=378, sum=756 (2)\", \"tab\": \"General information\", \"score\": \"378.0\"}", + "Elementary Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Elementary Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Elementary Mathematics - # prompt tokens": "{\"description\": \"min=609.156, mean=609.156, max=609.156, sum=1218.312 (2)\", \"tab\": \"General information\", \"score\": \"609.1560846560847\"}", + "Elementary Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"miscellaneous\"", + "subject": "\"elementary_mathematics\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_miscellaneous\"" + "groups": "\"mmlu_elementary_mathematics\"" } } }, { - "evaluation_name": "Moral Scenarios", + "evaluation_name": "Formal Logic", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1189,42 +1097,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Moral Scenarios", + "evaluation_description": "EM on Formal Logic", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.28, + "score": 0.444, "details": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.561 (2)", + "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)", "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.424, mean=0.424, max=0.424, sum=0.848 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4239204674097844\"}", - "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.433, mean=0.433, max=0.433, sum=0.866 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43297034721800737\"}", - "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}", - "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Moral Disputes - # prompt tokens": "{\"description\": \"min=542.506, mean=542.506, max=542.506, sum=1085.012 (2)\", \"tab\": \"General information\", \"score\": \"542.5057803468208\"}", - "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}", - "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Moral Scenarios - # prompt tokens": "{\"description\": \"min=756.479, mean=756.479, max=756.479, sum=1512.959 (2)\", \"tab\": \"General information\", \"score\": \"756.4793296089385\"}", - "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Formal Logic - Observed inference time (s)": "{\"description\": \"min=0.411, mean=0.411, max=0.411, sum=0.821 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4107102117841206\"}", + "Formal Logic - # eval": "{\"description\": \"min=126, mean=126, max=126, sum=252 (2)\", \"tab\": \"General information\", \"score\": \"126.0\"}", + "Formal Logic - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Formal Logic - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Formal Logic - # prompt tokens": "{\"description\": \"min=691.81, mean=691.81, max=691.81, sum=1383.619 (2)\", \"tab\": \"General information\", \"score\": \"691.8095238095239\"}", + "Formal Logic - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"moral_scenarios\"", + "subject": "\"formal_logic\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_moral_scenarios\"" + "groups": "\"mmlu_formal_logic\"" } } }, { - "evaluation_name": "Nutrition", + "evaluation_name": "High School World History", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1233,36 +1135,114 @@ ] }, "metric_config": { - "evaluation_description": "EM on Nutrition", + "evaluation_description": "EM on High School World History", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.725, + "score": 0.827, "details": { - "description": "min=0.725, mean=0.725, max=0.725, sum=1.451 (2)", + "description": "min=0.827, mean=0.827, max=0.827, sum=1.654 (2)", "tab": "Accuracy", - "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.417, mean=0.417, max=0.417, sum=0.835 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41727598430284485\"}", - "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}", - "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Nutrition - # prompt tokens": "{\"description\": \"min=695.922, mean=695.922, max=695.922, sum=1391.843 (2)\", \"tab\": \"General information\", \"score\": \"695.9215686274509\"}", - "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "High School Biology - Observed inference time (s)": "{\"description\": \"min=0.424, mean=0.424, max=0.424, sum=0.847 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42357982127897204\"}", + "High School Chemistry - Observed inference time (s)": "{\"description\": \"min=0.412, mean=0.412, max=0.412, sum=0.825 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41242665375394777\"}", + "High School Computer Science - Observed inference time (s)": "{\"description\": \"min=0.445, mean=0.445, max=0.445, sum=0.89 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.44495458364486695\"}", + "High School European History - Observed inference time (s)": "{\"description\": \"min=0.544, mean=0.544, max=0.544, sum=1.088 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.5441486705433238\"}", + "High School Geography - Observed inference time (s)": "{\"description\": \"min=0.415, mean=0.415, max=0.415, sum=0.83 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4149725003675981\"}", + "High School Government And Politics - Observed inference time (s)": "{\"description\": \"min=0.383, mean=0.383, max=0.383, sum=0.766 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.38312110629106433\"}", + "High School Macroeconomics - Observed inference time (s)": "{\"description\": \"min=0.403, mean=0.403, max=0.403, sum=0.807 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4034240123553154\"}", + "High School Mathematics - Observed inference time (s)": "{\"description\": \"min=0.39, mean=0.39, max=0.39, sum=0.779 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.38954139285617406\"}", + "High School Microeconomics - Observed inference time (s)": "{\"description\": \"min=0.399, mean=0.399, max=0.399, sum=0.798 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.3992174813727371\"}", + "High School Physics - Observed inference time (s)": "{\"description\": \"min=0.409, mean=0.409, max=0.409, sum=0.819 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40926165138648835\"}", + "High School Psychology - Observed inference time (s)": "{\"description\": \"min=0.408, mean=0.408, max=0.408, sum=0.816 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4081065694126514\"}", + "High School Statistics - Observed inference time (s)": "{\"description\": \"min=0.417, mean=0.417, max=0.417, sum=0.833 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4166152830477114\"}", + "High School US History - Observed inference time (s)": "{\"description\": \"min=0.45, mean=0.45, max=0.45, sum=0.901 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4504043985815609\"}", + "High School World History - Observed inference time (s)": "{\"description\": \"min=0.416, mean=0.416, max=0.416, sum=0.833 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4162542166086189\"}", + "High School Biology - # eval": "{\"description\": \"min=310, mean=310, max=310, sum=620 (2)\", \"tab\": \"General information\", \"score\": \"310.0\"}", + "High School Biology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Biology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Biology - # prompt tokens": "{\"description\": \"min=596.894, mean=596.894, max=596.894, sum=1193.787 (2)\", \"tab\": \"General information\", \"score\": \"596.8935483870968\"}", + "High School Biology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Chemistry - # eval": "{\"description\": \"min=203, mean=203, max=203, sum=406 (2)\", \"tab\": \"General information\", \"score\": \"203.0\"}", + "High School Chemistry - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Chemistry - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Chemistry - # prompt tokens": "{\"description\": \"min=568.665, mean=568.665, max=568.665, sum=1137.33 (2)\", \"tab\": \"General information\", \"score\": \"568.6650246305419\"}", + "High School Chemistry - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Computer Science - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "High School Computer Science - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Computer Science - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Computer Science - # prompt tokens": "{\"description\": \"min=988.57, mean=988.57, max=988.57, sum=1977.14 (2)\", \"tab\": \"General information\", \"score\": \"988.57\"}", + "High School Computer Science - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School European History - # eval": "{\"description\": \"min=165, mean=165, max=165, sum=330 (2)\", \"tab\": \"General information\", \"score\": \"165.0\"}", + "High School European History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School European History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School European History - # prompt tokens": "{\"description\": \"min=3159.636, mean=3159.636, max=3159.636, sum=6319.273 (2)\", \"tab\": \"General information\", \"score\": \"3159.6363636363635\"}", + "High School European History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Geography - # eval": "{\"description\": \"min=198, mean=198, max=198, sum=396 (2)\", \"tab\": \"General information\", \"score\": \"198.0\"}", + "High School Geography - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Geography - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Geography - # prompt tokens": "{\"description\": \"min=436.657, mean=436.657, max=436.657, sum=873.313 (2)\", \"tab\": \"General information\", \"score\": \"436.65656565656565\"}", + "High School Geography - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Government And Politics - # eval": "{\"description\": \"min=193, mean=193, max=193, sum=386 (2)\", \"tab\": \"General information\", \"score\": \"193.0\"}", + "High School Government And Politics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Government And Politics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Government And Politics - # prompt tokens": "{\"description\": \"min=527.927, mean=527.927, max=527.927, sum=1055.855 (2)\", \"tab\": \"General information\", \"score\": \"527.9274611398964\"}", + "High School Government And Politics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Macroeconomics - # eval": "{\"description\": \"min=390, mean=390, max=390, sum=780 (2)\", \"tab\": \"General information\", \"score\": \"390.0\"}", + "High School Macroeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Macroeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Macroeconomics - # prompt tokens": "{\"description\": \"min=445.662, mean=445.662, max=445.662, sum=891.323 (2)\", \"tab\": \"General information\", \"score\": \"445.66153846153844\"}", + "High School Macroeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Mathematics - # eval": "{\"description\": \"min=270, mean=270, max=270, sum=540 (2)\", \"tab\": \"General information\", \"score\": \"270.0\"}", + "High School Mathematics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Mathematics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Mathematics - # prompt tokens": "{\"description\": \"min=579.181, mean=579.181, max=579.181, sum=1158.363 (2)\", \"tab\": \"General information\", \"score\": \"579.1814814814815\"}", + "High School Mathematics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Microeconomics - # eval": "{\"description\": \"min=238, mean=238, max=238, sum=476 (2)\", \"tab\": \"General information\", \"score\": \"238.0\"}", + "High School Microeconomics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Microeconomics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Microeconomics - # prompt tokens": "{\"description\": \"min=449.492, mean=449.492, max=449.492, sum=898.983 (2)\", \"tab\": \"General information\", \"score\": \"449.49159663865544\"}", + "High School Microeconomics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Physics - # eval": "{\"description\": \"min=151, mean=151, max=151, sum=302 (2)\", \"tab\": \"General information\", \"score\": \"151.0\"}", + "High School Physics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Physics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Physics - # prompt tokens": "{\"description\": \"min=621.788, mean=621.788, max=621.788, sum=1243.576 (2)\", \"tab\": \"General information\", \"score\": \"621.7880794701987\"}", + "High School Physics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Psychology - # eval": "{\"description\": \"min=545, mean=545, max=545, sum=1090 (2)\", \"tab\": \"General information\", \"score\": \"545.0\"}", + "High School Psychology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Psychology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Psychology - # prompt tokens": "{\"description\": \"min=585.919, mean=585.919, max=585.919, sum=1171.839 (2)\", \"tab\": \"General information\", \"score\": \"585.9192660550459\"}", + "High School Psychology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School Statistics - # eval": "{\"description\": \"min=216, mean=216, max=216, sum=432 (2)\", \"tab\": \"General information\", \"score\": \"216.0\"}", + "High School Statistics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School Statistics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School Statistics - # prompt tokens": "{\"description\": \"min=908.208, mean=908.208, max=908.208, sum=1816.417 (2)\", \"tab\": \"General information\", \"score\": \"908.2083333333334\"}", + "High School Statistics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School US History - # eval": "{\"description\": \"min=204, mean=204, max=204, sum=408 (2)\", \"tab\": \"General information\", \"score\": \"204.0\"}", + "High School US History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School US History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School US History - # prompt tokens": "{\"description\": \"min=2535.324, mean=2535.324, max=2535.324, sum=5070.647 (2)\", \"tab\": \"General information\", \"score\": \"2535.323529411765\"}", + "High School US History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "High School World History - # eval": "{\"description\": \"min=237, mean=237, max=237, sum=474 (2)\", \"tab\": \"General information\", \"score\": \"237.0\"}", + "High School World History - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "High School World History - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "High School World History - # prompt tokens": "{\"description\": \"min=1638.219, mean=1638.219, max=1638.219, sum=3276.439 (2)\", \"tab\": \"General information\", \"score\": \"1638.2194092827003\"}", + "High School World History - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"nutrition\"", + "subject": "\"high_school_world_history\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_nutrition\"" + "groups": "\"mmlu_high_school_world_history\"" } } }, { - "evaluation_name": "Prehistory", + "evaluation_name": "Human Sexuality", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1271,36 +1251,42 @@ ] }, "metric_config": { - "evaluation_description": "EM on Prehistory", + "evaluation_description": "EM on Human Sexuality", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.79, + "score": 0.847, "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", + "description": "min=0.847, mean=0.847, max=0.847, sum=1.695 (2)", "tab": "Accuracy", - "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.43, mean=0.43, max=0.43, sum=0.861 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4303552037403907\"}", - "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}", - "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Prehistory - # prompt tokens": "{\"description\": \"min=619.185, mean=619.185, max=619.185, sum=1238.37 (2)\", \"tab\": \"General information\", \"score\": \"619.1851851851852\"}", - "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Human Aging - Observed inference time (s)": "{\"description\": \"min=0.401, mean=0.401, max=0.401, sum=0.802 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4010318255745242\"}", + "Human Sexuality - Observed inference time (s)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.787 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39331119843111695\"}", + "Human Aging - # eval": "{\"description\": \"min=223, mean=223, max=223, sum=446 (2)\", \"tab\": \"General information\", \"score\": \"223.0\"}", + "Human Aging - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Human Aging - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Human Aging - # prompt tokens": "{\"description\": \"min=361.26, mean=361.26, max=361.26, sum=722.52 (2)\", \"tab\": \"General information\", \"score\": \"361.26008968609864\"}", + "Human Aging - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Human Sexuality - # eval": "{\"description\": \"min=131, mean=131, max=131, sum=262 (2)\", \"tab\": \"General information\", \"score\": \"131.0\"}", + "Human Sexuality - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Human Sexuality - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Human Sexuality - # prompt tokens": "{\"description\": \"min=403.382, mean=403.382, max=403.382, sum=806.763 (2)\", \"tab\": \"General information\", \"score\": \"403.381679389313\"}", + "Human Sexuality - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"prehistory\"", + "subject": "\"human_sexuality\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_prehistory\"" + "groups": "\"mmlu_human_sexuality\"" } } }, { - "evaluation_name": "Public Relations", + "evaluation_name": "International Law", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1309,36 +1295,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Public Relations", + "evaluation_description": "EM on International Law", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.664, + "score": 0.826, "details": { - "description": "min=0.664, mean=0.664, max=0.664, sum=1.327 (2)", + "description": "min=0.826, mean=0.826, max=0.826, sum=1.653 (2)", "tab": "Accuracy", - "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.428, mean=0.428, max=0.428, sum=0.855 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42750670259649104\"}", - "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}", - "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Public Relations - # prompt tokens": "{\"description\": \"min=474.827, mean=474.827, max=474.827, sum=949.655 (2)\", \"tab\": \"General information\", \"score\": \"474.8272727272727\"}", - "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "International Law - Observed inference time (s)": "{\"description\": \"min=0.42, mean=0.42, max=0.42, sum=0.841 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42040472779392213\"}", + "International Law - # eval": "{\"description\": \"min=121, mean=121, max=121, sum=242 (2)\", \"tab\": \"General information\", \"score\": \"121.0\"}", + "International Law - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "International Law - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "International Law - # prompt tokens": "{\"description\": \"min=729.463, mean=729.463, max=729.463, sum=1458.926 (2)\", \"tab\": \"General information\", \"score\": \"729.4628099173553\"}", + "International Law - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"public_relations\"", + "subject": "\"international_law\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_public_relations\"" + "groups": "\"mmlu_international_law\"" } } }, { - "evaluation_name": "Security Studies", + "evaluation_name": "Logical Fallacies", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1347,36 +1333,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Security Studies", + "evaluation_description": "EM on Logical Fallacies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.78, + "score": 0.779, "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.559 (2)", + "description": "min=0.779, mean=0.779, max=0.779, sum=1.558 (2)", "tab": "Accuracy", - "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.466, mean=0.466, max=0.466, sum=0.933 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4662662194699657\"}", - "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}", - "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Security Studies - # prompt tokens": "{\"description\": \"min=1377.531, mean=1377.531, max=1377.531, sum=2755.061 (2)\", \"tab\": \"General information\", \"score\": \"1377.530612244898\"}", - "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Logical Fallacies - Observed inference time (s)": "{\"description\": \"min=0.404, mean=0.404, max=0.404, sum=0.809 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4043445353127696\"}", + "Logical Fallacies - # eval": "{\"description\": \"min=163, mean=163, max=163, sum=326 (2)\", \"tab\": \"General information\", \"score\": \"163.0\"}", + "Logical Fallacies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Logical Fallacies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Logical Fallacies - # prompt tokens": "{\"description\": \"min=502.755, mean=502.755, max=502.755, sum=1005.509 (2)\", \"tab\": \"General information\", \"score\": \"502.7546012269939\"}", + "Logical Fallacies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"security_studies\"", + "subject": "\"logical_fallacies\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_security_studies\"" + "groups": "\"mmlu_logical_fallacies\"" } } }, { - "evaluation_name": "Sociology", + "evaluation_name": "Machine Learning", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1385,36 +1371,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Sociology", + "evaluation_description": "EM on Machine Learning", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.891, + "score": 0.473, "details": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", + "description": "min=0.473, mean=0.473, max=0.473, sum=0.946 (2)", "tab": "Accuracy", - "Sociology - Observed inference time (s)": "{\"description\": \"min=0.416, mean=0.416, max=0.416, sum=0.832 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4159522590352528\"}", - "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}", - "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Sociology - # prompt tokens": "{\"description\": \"min=508.478, mean=508.478, max=508.478, sum=1016.955 (2)\", \"tab\": \"General information\", \"score\": \"508.4776119402985\"}", - "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Machine Learning - Observed inference time (s)": "{\"description\": \"min=0.421, mean=0.421, max=0.421, sum=0.842 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42122456644262585\"}", + "Machine Learning - # eval": "{\"description\": \"min=112, mean=112, max=112, sum=224 (2)\", \"tab\": \"General information\", \"score\": \"112.0\"}", + "Machine Learning - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Machine Learning - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Machine Learning - # prompt tokens": "{\"description\": \"min=730.402, mean=730.402, max=730.402, sum=1460.804 (2)\", \"tab\": \"General information\", \"score\": \"730.4017857142857\"}", + "Machine Learning - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"sociology\"", + "subject": "\"machine_learning\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_sociology\"" + "groups": "\"mmlu_machine_learning\"" } } }, { - "evaluation_name": "Virology", + "evaluation_name": "Management", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1423,36 +1409,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on Virology", + "evaluation_description": "EM on Management", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.536, + "score": 0.796, "details": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)", + "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)", "tab": "Accuracy", - "Virology - Observed inference time (s)": "{\"description\": \"min=0.405, mean=0.405, max=0.405, sum=0.809 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40467354332108096\"}", - "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}", - "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "Virology - # prompt tokens": "{\"description\": \"min=405.108, mean=405.108, max=405.108, sum=810.217 (2)\", \"tab\": \"General information\", \"score\": \"405.10843373493975\"}", - "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Management - Observed inference time (s)": "{\"description\": \"min=0.392, mean=0.392, max=0.392, sum=0.785 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.392485206566968\"}", + "Management - # eval": "{\"description\": \"min=103, mean=103, max=103, sum=206 (2)\", \"tab\": \"General information\", \"score\": \"103.0\"}", + "Management - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Management - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Management - # prompt tokens": "{\"description\": \"min=315.777, mean=315.777, max=315.777, sum=631.553 (2)\", \"tab\": \"General information\", \"score\": \"315.77669902912623\"}", + "Management - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"virology\"", + "subject": "\"management\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_virology\"" + "groups": "\"mmlu_management\"" } } }, { - "evaluation_name": "World Religions", + "evaluation_name": "Marketing", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1461,36 +1447,36 @@ ] }, "metric_config": { - "evaluation_description": "EM on World Religions", + "evaluation_description": "EM on Marketing", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.854, + "score": 0.902, "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.708 (2)", + "description": "min=0.902, mean=0.902, max=0.902, sum=1.803 (2)", "tab": "Accuracy", - "World Religions - Observed inference time (s)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.787 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39336834455791275\"}", - "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}", - "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "World Religions - # prompt tokens": "{\"description\": \"min=304.474, mean=304.474, max=304.474, sum=608.947 (2)\", \"tab\": \"General information\", \"score\": \"304.4736842105263\"}", - "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Marketing - Observed inference time (s)": "{\"description\": \"min=0.407, mean=0.407, max=0.407, sum=0.813 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.406507401384859\"}", + "Marketing - # eval": "{\"description\": \"min=234, mean=234, max=234, sum=468 (2)\", \"tab\": \"General information\", \"score\": \"234.0\"}", + "Marketing - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Marketing - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Marketing - # prompt tokens": "{\"description\": \"min=472.628, mean=472.628, max=472.628, sum=945.256 (2)\", \"tab\": \"General information\", \"score\": \"472.62820512820514\"}", + "Marketing - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "\"world_religions\"", + "subject": "\"marketing\"", "method": "\"multiple_choice_joint\"", "eval_split": "\"test\"", - "groups": "\"mmlu_world_religions\"" + "groups": "\"mmlu_marketing\"" } } }, { - "evaluation_name": "Mean win rate", + "evaluation_name": "Medical Genetics", "source_data": { "dataset_name": "helm_mmlu", "source_type": "url", @@ -1499,404 +1485,418 @@ ] }, "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", + "evaluation_description": "EM on Medical Genetics", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.565, + "score": 0.76, "details": { - "description": "", - "tab": "Efficiency" + "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", + "tab": "Accuracy", + "Medical Genetics - Observed inference time (s)": "{\"description\": \"min=0.417, mean=0.417, max=0.417, sum=0.835 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41734427213668823\"}", + "Medical Genetics - # eval": "{\"description\": \"min=100, mean=100, max=100, sum=200 (2)\", \"tab\": \"General information\", \"score\": \"100.0\"}", + "Medical Genetics - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Medical Genetics - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Medical Genetics - # prompt tokens": "{\"description\": \"min=408.14, mean=408.14, max=408.14, sum=816.28 (2)\", \"tab\": \"General information\", \"score\": \"408.14\"}", + "Medical Genetics - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"medical_genetics\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_medical_genetics\"" + } } - } - ], - "detailed_evaluation_results": null, - "generation_config": { - "additional_details": { - "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", - "method": "\"multiple_choice_joint\"", - "eval_split": "\"test\"", - "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" - } - } - }, - { - "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1774096306.427425", - "retrieved_timestamp": "1774096306.427425", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "eval_library": { - "name": "helm", - "version": "unknown" - }, - "benchmark": "helm_lite", - "evaluation_results": [ + }, { - "evaluation_name": "Mean win rate", + "evaluation_name": "Miscellaneous", "source_data": { - "dataset_name": "helm_lite", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", + "evaluation_description": "EM on Miscellaneous", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.338, + "score": 0.875, "details": { - "description": "", + "description": "min=0.875, mean=0.875, max=0.875, sum=1.75 (2)", "tab": "Accuracy", - "Mean win rate - Efficiency": "{\"description\": \"\", \"tab\": \"Efficiency\", \"score\": \"0.7606242197253433\"}", - "Mean win rate - General information": "{\"description\": \"\", \"tab\": \"General information\", \"score\": \"\"}" + "Miscellaneous - Observed inference time (s)": "{\"description\": \"min=0.407, mean=0.407, max=0.407, sum=0.814 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40693108880200146\"}", + "Miscellaneous - # eval": "{\"description\": \"min=783, mean=783, max=783, sum=1566 (2)\", \"tab\": \"General information\", \"score\": \"783.0\"}", + "Miscellaneous - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Miscellaneous - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Miscellaneous - # prompt tokens": "{\"description\": \"min=345.913, mean=345.913, max=345.913, sum=691.826 (2)\", \"tab\": \"General information\", \"score\": \"345.9131545338442\"}", + "Miscellaneous - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"miscellaneous\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_miscellaneous\"" + } } }, { - "evaluation_name": "NarrativeQA", + "evaluation_name": "Moral Scenarios", "source_data": { - "dataset_name": "NarrativeQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "F1 on NarrativeQA", + "evaluation_description": "EM on Moral Scenarios", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.654, + "score": 0.28, "details": { - "description": "min=0.654, mean=0.654, max=0.654, sum=0.654 (1)", + "description": "min=0.28, mean=0.28, max=0.28, sum=0.561 (2)", "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": "{\"description\": \"min=0.624, mean=0.624, max=0.624, sum=0.624 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.6239793220036466\"}", - "NarrativeQA - # eval": "{\"description\": \"min=355, mean=355, max=355, sum=355 (1)\", \"tab\": \"General information\", \"score\": \"355.0\"}", - "NarrativeQA - # train": "{\"description\": \"min=4.262, mean=4.262, max=4.262, sum=4.262 (1)\", \"tab\": \"General information\", \"score\": \"4.261971830985916\"}", - "NarrativeQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NarrativeQA - # prompt tokens": "{\"description\": \"min=3603.217, mean=3603.217, max=3603.217, sum=3603.217 (1)\", \"tab\": \"General information\", \"score\": \"3603.2169014084507\"}", - "NarrativeQA - # output tokens": "{\"description\": \"min=11.907, mean=11.907, max=11.907, sum=11.907 (1)\", \"tab\": \"General information\", \"score\": \"11.907042253521126\"}" + "Moral Disputes - Observed inference time (s)": "{\"description\": \"min=0.424, mean=0.424, max=0.424, sum=0.848 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4239204674097844\"}", + "Moral Scenarios - Observed inference time (s)": "{\"description\": \"min=0.433, mean=0.433, max=0.433, sum=0.866 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.43297034721800737\"}", + "Moral Disputes - # eval": "{\"description\": \"min=346, mean=346, max=346, sum=692 (2)\", \"tab\": \"General information\", \"score\": \"346.0\"}", + "Moral Disputes - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Moral Disputes - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Moral Disputes - # prompt tokens": "{\"description\": \"min=542.506, mean=542.506, max=542.506, sum=1085.012 (2)\", \"tab\": \"General information\", \"score\": \"542.5057803468208\"}", + "Moral Disputes - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}", + "Moral Scenarios - # eval": "{\"description\": \"min=895, mean=895, max=895, sum=1790 (2)\", \"tab\": \"General information\", \"score\": \"895.0\"}", + "Moral Scenarios - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Moral Scenarios - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Moral Scenarios - # prompt tokens": "{\"description\": \"min=756.479, mean=756.479, max=756.479, sum=1512.959 (2)\", \"tab\": \"General information\", \"score\": \"756.4793296089385\"}", + "Moral Scenarios - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"moral_scenarios\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_moral_scenarios\"" + } } }, { - "evaluation_name": "NaturalQuestions (closed-book)", + "evaluation_name": "Nutrition", "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", + "evaluation_description": "EM on Nutrition", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.39, + "score": 0.725, "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", + "description": "min=0.725, mean=0.725, max=0.725, sum=1.451 (2)", "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": "{\"description\": \"min=0.636, mean=0.636, max=0.636, sum=0.636 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.6355201268196106\"}", - "NaturalQuestions (closed-book) - Observed inference time (s)": "{\"description\": \"min=0.469, mean=0.469, max=0.469, sum=0.469 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.4687326259613037\"}", - "NaturalQuestions (open-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "NaturalQuestions (open-book) - # train": "{\"description\": \"min=4.825, mean=4.825, max=4.825, sum=4.825 (1)\", \"tab\": \"General information\", \"score\": \"4.825\"}", - "NaturalQuestions (open-book) - truncated": "{\"description\": \"min=0.028, mean=0.028, max=0.028, sum=0.028 (1)\", \"tab\": \"General information\", \"score\": \"0.028\"}", - "NaturalQuestions (open-book) - # prompt tokens": "{\"description\": \"min=2311.514, mean=2311.514, max=2311.514, sum=2311.514 (1)\", \"tab\": \"General information\", \"score\": \"2311.514\"}", - "NaturalQuestions (open-book) - # output tokens": "{\"description\": \"min=18.701, mean=18.701, max=18.701, sum=18.701 (1)\", \"tab\": \"General information\", \"score\": \"18.701\"}", - "NaturalQuestions (closed-book) - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "NaturalQuestions (closed-book) - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "NaturalQuestions (closed-book) - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "NaturalQuestions (closed-book) - # prompt tokens": "{\"description\": \"min=166.383, mean=166.383, max=166.383, sum=166.383 (1)\", \"tab\": \"General information\", \"score\": \"166.383\"}", - "NaturalQuestions (closed-book) - # output tokens": "{\"description\": \"min=14.473, mean=14.473, max=14.473, sum=14.473 (1)\", \"tab\": \"General information\", \"score\": \"14.473\"}" + "Nutrition - Observed inference time (s)": "{\"description\": \"min=0.417, mean=0.417, max=0.417, sum=0.835 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.41727598430284485\"}", + "Nutrition - # eval": "{\"description\": \"min=306, mean=306, max=306, sum=612 (2)\", \"tab\": \"General information\", \"score\": \"306.0\"}", + "Nutrition - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Nutrition - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Nutrition - # prompt tokens": "{\"description\": \"min=695.922, mean=695.922, max=695.922, sum=1391.843 (2)\", \"tab\": \"General information\", \"score\": \"695.9215686274509\"}", + "Nutrition - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "mode": "\"closedbook\"" + "subject": "\"nutrition\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_nutrition\"" } } }, { - "evaluation_name": "OpenbookQA", + "evaluation_name": "Prehistory", "source_data": { - "dataset_name": "OpenbookQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on OpenbookQA", + "evaluation_description": "EM on Prehistory", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.828, + "score": 0.79, "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)", + "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": "{\"description\": \"min=0.284, mean=0.284, max=0.284, sum=0.284 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.2840936713218689\"}", - "OpenbookQA - # eval": "{\"description\": \"min=500, mean=500, max=500, sum=500 (1)\", \"tab\": \"General information\", \"score\": \"500.0\"}", - "OpenbookQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "OpenbookQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "OpenbookQA - # prompt tokens": "{\"description\": \"min=291.574, mean=291.574, max=291.574, sum=291.574 (1)\", \"tab\": \"General information\", \"score\": \"291.574\"}", - "OpenbookQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Prehistory - Observed inference time (s)": "{\"description\": \"min=0.43, mean=0.43, max=0.43, sum=0.861 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4303552037403907\"}", + "Prehistory - # eval": "{\"description\": \"min=324, mean=324, max=324, sum=648 (2)\", \"tab\": \"General information\", \"score\": \"324.0\"}", + "Prehistory - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Prehistory - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Prehistory - # prompt tokens": "{\"description\": \"min=619.185, mean=619.185, max=619.185, sum=1238.37 (2)\", \"tab\": \"General information\", \"score\": \"619.1851851851852\"}", + "Prehistory - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "dataset": "\"openbookqa\"", - "method": "\"multiple_choice_joint\"" + "subject": "\"prehistory\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_prehistory\"" } } }, { - "evaluation_name": "MMLU", + "evaluation_name": "Public Relations", "source_data": { - "dataset_name": "MMLU", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on MMLU", + "evaluation_description": "EM on Public Relations", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.575, + "score": 0.664, "details": { - "description": "min=0.31, mean=0.575, max=0.88, sum=2.876 (5)", + "description": "min=0.664, mean=0.664, max=0.664, sum=1.327 (2)", "tab": "Accuracy", - "MMLU - Observed inference time (s)": "{\"description\": \"min=0.293, mean=0.303, max=0.317, sum=1.516 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.30325288054817606\"}", - "MMLU - # eval": "{\"description\": \"min=100, mean=102.8, max=114, sum=514 (5)\", \"tab\": \"General information\", \"score\": \"102.8\"}", - "MMLU - # train": "{\"description\": \"min=5, mean=5, max=5, sum=25 (5)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MMLU - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MMLU - # prompt tokens": "{\"description\": \"min=406.65, mean=531.547, max=693.675, sum=2657.735 (5)\", \"tab\": \"General information\", \"score\": \"531.5470877192982\"}", - "MMLU - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "Public Relations - Observed inference time (s)": "{\"description\": \"min=0.428, mean=0.428, max=0.428, sum=0.855 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.42750670259649104\"}", + "Public Relations - # eval": "{\"description\": \"min=110, mean=110, max=110, sum=220 (2)\", \"tab\": \"General information\", \"score\": \"110.0\"}", + "Public Relations - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Public Relations - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Public Relations - # prompt tokens": "{\"description\": \"min=474.827, mean=474.827, max=474.827, sum=949.655 (2)\", \"tab\": \"General information\", \"score\": \"474.8272727272727\"}", + "Public Relations - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "[\"abstract_algebra\", \"college_chemistry\", \"computer_security\", \"econometrics\", \"us_foreign_policy\"]", - "method": "\"multiple_choice_joint\"" + "subject": "\"public_relations\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_public_relations\"" } } }, { - "evaluation_name": "MATH", + "evaluation_name": "Security Studies", "source_data": { - "dataset_name": "MATH", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", + "evaluation_description": "EM on Security Studies", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.519, + "score": 0.78, "details": { - "description": "min=0.316, mean=0.519, max=0.785, sum=3.636 (7)", + "description": "min=0.78, mean=0.78, max=0.78, sum=1.559 (2)", "tab": "Accuracy", - "MATH - Observed inference time (s)": "{\"description\": \"min=1.482, mean=1.724, max=1.995, sum=12.068 (7)\", \"tab\": \"Efficiency\", \"score\": \"1.723981539653867\"}", - "MATH - # eval": "{\"description\": \"min=30, mean=62.429, max=135, sum=437 (7)\", \"tab\": \"General information\", \"score\": \"62.42857142857143\"}", - "MATH - # train": "{\"description\": \"min=8, mean=8, max=8, sum=56 (7)\", \"tab\": \"General information\", \"score\": \"8.0\"}", - "MATH - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (7)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MATH - # prompt tokens": "{\"description\": \"min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)\", \"tab\": \"General information\", \"score\": \"1438.6362030100095\"}", - "MATH - # output tokens": "{\"description\": \"min=82.872, mean=98.802, max=122.233, sum=691.615 (7)\", \"tab\": \"General information\", \"score\": \"98.80208187931566\"}" + "Security Studies - Observed inference time (s)": "{\"description\": \"min=0.466, mean=0.466, max=0.466, sum=0.933 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4662662194699657\"}", + "Security Studies - # eval": "{\"description\": \"min=245, mean=245, max=245, sum=490 (2)\", \"tab\": \"General information\", \"score\": \"245.0\"}", + "Security Studies - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Security Studies - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Security Studies - # prompt tokens": "{\"description\": \"min=1377.531, mean=1377.531, max=1377.531, sum=2755.061 (2)\", \"tab\": \"General information\", \"score\": \"1377.530612244898\"}", + "Security Studies - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subject": "[\"algebra\", \"counting_and_probability\", \"geometry\", \"intermediate_algebra\", \"number_theory\", \"prealgebra\", \"precalculus\"]", - "level": "\"1\"", - "use_official_examples": "\"False\"", - "use_chain_of_thought": "\"True\"" + "subject": "\"security_studies\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_security_studies\"" } } }, { - "evaluation_name": "GSM8K", + "evaluation_name": "Sociology", "source_data": { - "dataset_name": "GSM8K", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on GSM8K", + "evaluation_description": "EM on Sociology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.768, + "score": 0.891, "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)", + "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", "tab": "Accuracy", - "GSM8K - Observed inference time (s)": "{\"description\": \"min=2.961, mean=2.961, max=2.961, sum=2.961 (1)\", \"tab\": \"Efficiency\", \"score\": \"2.9610197002887726\"}", - "GSM8K - # eval": "{\"description\": \"min=1000, mean=1000, max=1000, sum=1000 (1)\", \"tab\": \"General information\", \"score\": \"1000.0\"}", - "GSM8K - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "GSM8K - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "GSM8K - # prompt tokens": "{\"description\": \"min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)\", \"tab\": \"General information\", \"score\": \"1207.746\"}", - "GSM8K - # output tokens": "{\"description\": \"min=189.305, mean=189.305, max=189.305, sum=189.305 (1)\", \"tab\": \"General information\", \"score\": \"189.305\"}" + "Sociology - Observed inference time (s)": "{\"description\": \"min=0.416, mean=0.416, max=0.416, sum=0.832 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.4159522590352528\"}", + "Sociology - # eval": "{\"description\": \"min=201, mean=201, max=201, sum=402 (2)\", \"tab\": \"General information\", \"score\": \"201.0\"}", + "Sociology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Sociology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Sociology - # prompt tokens": "{\"description\": \"min=508.478, mean=508.478, max=508.478, sum=1016.955 (2)\", \"tab\": \"General information\", \"score\": \"508.4776119402985\"}", + "Sociology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "stop": "\"none\"" + "subject": "\"sociology\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_sociology\"" } } }, { - "evaluation_name": "LegalBench", + "evaluation_name": "Virology", "source_data": { - "dataset_name": "LegalBench", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on LegalBench", + "evaluation_description": "EM on Virology", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.588, + "score": 0.536, "details": { - "description": "min=0.351, mean=0.588, max=0.874, sum=2.94 (5)", + "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)", "tab": "Accuracy", - "LegalBench - Observed inference time (s)": "{\"description\": \"min=0.292, mean=0.346, max=0.462, sum=1.729 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.34576316386866485\"}", - "LegalBench - # eval": "{\"description\": \"min=95, mean=409.4, max=1000, sum=2047 (5)\", \"tab\": \"General information\", \"score\": \"409.4\"}", - "LegalBench - # train": "{\"description\": \"min=1.81, mean=4.162, max=5, sum=20.81 (5)\", \"tab\": \"General information\", \"score\": \"4.162040816326531\"}", - "LegalBench - truncated": "{\"description\": \"min=0, mean=0.002, max=0.008, sum=0.008 (5)\", \"tab\": \"General information\", \"score\": \"0.0016326530612244899\"}", - "LegalBench - # prompt tokens": "{\"description\": \"min=239.137, mean=1024.722, max=3561.237, sum=5123.61 (5)\", \"tab\": \"General information\", \"score\": \"1024.7220443430492\"}", - "LegalBench - # output tokens": "{\"description\": \"min=2, mean=2.438, max=3.421, sum=12.188 (5)\", \"tab\": \"General information\", \"score\": \"2.4375592890361366\"}" + "Virology - Observed inference time (s)": "{\"description\": \"min=0.405, mean=0.405, max=0.405, sum=0.809 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.40467354332108096\"}", + "Virology - # eval": "{\"description\": \"min=166, mean=166, max=166, sum=332 (2)\", \"tab\": \"General information\", \"score\": \"166.0\"}", + "Virology - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "Virology - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "Virology - # prompt tokens": "{\"description\": \"min=405.108, mean=405.108, max=405.108, sum=810.217 (2)\", \"tab\": \"General information\", \"score\": \"405.10843373493975\"}", + "Virology - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { "additional_details": { - "subset": "[\"abercrombie\", \"corporate_lobbying\", \"function_of_decision_section\", \"international_citizenship_questions\", \"proa\"]" + "subject": "\"virology\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_virology\"" } } }, { - "evaluation_name": "MedQA", + "evaluation_name": "World Religions", "source_data": { - "dataset_name": "MedQA", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "EM on MedQA", + "evaluation_description": "EM on World Religions", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.581, + "score": 0.854, "details": { - "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)", + "description": "min=0.854, mean=0.854, max=0.854, sum=1.708 (2)", "tab": "Accuracy", - "MedQA - Observed inference time (s)": "{\"description\": \"min=0.313, mean=0.313, max=0.313, sum=0.313 (1)\", \"tab\": \"Efficiency\", \"score\": \"0.31300480038697864\"}", - "MedQA - # eval": "{\"description\": \"min=503, mean=503, max=503, sum=503 (1)\", \"tab\": \"General information\", \"score\": \"503.0\"}", - "MedQA - # train": "{\"description\": \"min=5, mean=5, max=5, sum=5 (1)\", \"tab\": \"General information\", \"score\": \"5.0\"}", - "MedQA - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (1)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "MedQA - # prompt tokens": "{\"description\": \"min=1243.901, mean=1243.901, max=1243.901, sum=1243.901 (1)\", \"tab\": \"General information\", \"score\": \"1243.9005964214712\"}", - "MedQA - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=1 (1)\", \"tab\": \"General information\", \"score\": \"1.0\"}" + "World Religions - Observed inference time (s)": "{\"description\": \"min=0.393, mean=0.393, max=0.393, sum=0.787 (2)\", \"tab\": \"Efficiency\", \"score\": \"0.39336834455791275\"}", + "World Religions - # eval": "{\"description\": \"min=171, mean=171, max=171, sum=342 (2)\", \"tab\": \"General information\", \"score\": \"171.0\"}", + "World Religions - # train": "{\"description\": \"min=5, mean=5, max=5, sum=10 (2)\", \"tab\": \"General information\", \"score\": \"5.0\"}", + "World Religions - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (2)\", \"tab\": \"General information\", \"score\": \"0.0\"}", + "World Religions - # prompt tokens": "{\"description\": \"min=304.474, mean=304.474, max=304.474, sum=608.947 (2)\", \"tab\": \"General information\", \"score\": \"304.4736842105263\"}", + "World Religions - # output tokens": "{\"description\": \"min=1, mean=1, max=1, sum=2 (2)\", \"tab\": \"General information\", \"score\": \"1.0\"}" } }, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "\"world_religions\"", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "\"mmlu_world_religions\"" + } } }, { - "evaluation_name": "WMT 2014", + "evaluation_name": "Mean win rate", "source_data": { - "dataset_name": "WMT 2014", + "dataset_name": "helm_mmlu", "source_type": "url", "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" + "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" ] }, "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", + "evaluation_description": "How many models this model outperforms on average (over columns).", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.172, + "score": 0.565, "details": { - "description": "min=0.09, mean=0.172, max=0.217, sum=0.86 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": "{\"description\": \"min=0.65, mean=0.681, max=0.702, sum=3.405 (5)\", \"tab\": \"Efficiency\", \"score\": \"0.681007040066764\"}", - "WMT 2014 - # eval": "{\"description\": \"min=503, mean=568.8, max=832, sum=2844 (5)\", \"tab\": \"General information\", \"score\": \"568.8\"}", - "WMT 2014 - # train": "{\"description\": \"min=1, mean=1, max=1, sum=5 (5)\", \"tab\": \"General information\", \"score\": \"1.0\"}", - "WMT 2014 - truncated": "{\"description\": \"min=0, mean=0, max=0, sum=0 (5)\", \"tab\": \"General information\", \"score\": \"0.0\"}", - "WMT 2014 - # prompt tokens": "{\"description\": \"min=145.523, mean=160.288, max=182.972, sum=801.438 (5)\", \"tab\": \"General information\", \"score\": \"160.28751290334915\"}", - "WMT 2014 - # output tokens": "{\"description\": \"min=28.596, mean=30.59, max=31.485, sum=152.951 (5)\", \"tab\": \"General information\", \"score\": \"30.59012702630372\"}" + "description": "", + "tab": "Efficiency" } }, "generation_config": { - "additional_details": { - "language_pair": "[\"cs-en\", \"de-en\", \"fr-en\", \"hi-en\", \"ru-en\"]" - } + "additional_details": {} } } ], "detailed_evaluation_results": null, "generation_config": { - "additional_details": {} + "additional_details": { + "subject": "[\"abstract_algebra\", \"anatomy\", \"astronomy\", \"business_ethics\", \"clinical_knowledge\", \"college_biology\", \"college_chemistry\", \"college_computer_science\", \"college_mathematics\", \"college_medicine\", \"college_physics\", \"computer_security\", \"conceptual_physics\", \"econometrics\", \"electrical_engineering\", \"elementary_mathematics\", \"formal_logic\", \"global_facts\", \"high_school_biology\", \"high_school_chemistry\", \"high_school_computer_science\", \"high_school_european_history\", \"high_school_geography\", \"high_school_government_and_politics\", \"high_school_macroeconomics\", \"high_school_mathematics\", \"high_school_microeconomics\", \"high_school_physics\", \"high_school_psychology\", \"high_school_statistics\", \"high_school_us_history\", \"high_school_world_history\", \"human_aging\", \"human_sexuality\", \"international_law\", \"jurisprudence\", \"logical_fallacies\", \"machine_learning\", \"management\", \"marketing\", \"medical_genetics\", \"miscellaneous\", \"moral_disputes\", \"moral_scenarios\", \"nutrition\", \"philosophy\", \"prehistory\", \"professional_accounting\", \"professional_law\", \"professional_medicine\", \"professional_psychology\", \"public_relations\", \"security_studies\", \"sociology\", \"us_foreign_policy\", \"virology\", \"world_religions\"]", + "method": "\"multiple_choice_joint\"", + "eval_split": "\"test\"", + "groups": "[\"mmlu_abstract_algebra\", \"mmlu_anatomy\", \"mmlu_astronomy\", \"mmlu_business_ethics\", \"mmlu_clinical_knowledge\", \"mmlu_college_biology\", \"mmlu_college_chemistry\", \"mmlu_college_computer_science\", \"mmlu_college_mathematics\", \"mmlu_college_medicine\", \"mmlu_college_physics\", \"mmlu_computer_security\", \"mmlu_conceptual_physics\", \"mmlu_econometrics\", \"mmlu_electrical_engineering\", \"mmlu_elementary_mathematics\", \"mmlu_formal_logic\", \"mmlu_global_facts\", \"mmlu_high_school_biology\", \"mmlu_high_school_chemistry\", \"mmlu_high_school_computer_science\", \"mmlu_high_school_european_history\", \"mmlu_high_school_geography\", \"mmlu_high_school_government_and_politics\", \"mmlu_high_school_macroeconomics\", \"mmlu_high_school_mathematics\", \"mmlu_high_school_microeconomics\", \"mmlu_high_school_physics\", \"mmlu_high_school_psychology\", \"mmlu_high_school_statistics\", \"mmlu_high_school_us_history\", \"mmlu_high_school_world_history\", \"mmlu_human_aging\", \"mmlu_human_sexuality\", \"mmlu_international_law\", \"mmlu_jurisprudence\", \"mmlu_logical_fallacies\", \"mmlu_machine_learning\", \"mmlu_management\", \"mmlu_marketing\", \"mmlu_medical_genetics\", \"mmlu_miscellaneous\", \"mmlu_moral_disputes\", \"mmlu_moral_scenarios\", \"mmlu_nutrition\", \"mmlu_philosophy\", \"mmlu_prehistory\", \"mmlu_professional_accounting\", \"mmlu_professional_law\", \"mmlu_professional_medicine\", \"mmlu_professional_psychology\", \"mmlu_public_relations\", \"mmlu_security_studies\", \"mmlu_sociology\", \"mmlu_us_foreign_policy\", \"mmlu_virology\", \"mmlu_world_religions\"]" + } } } ] diff --git a/data/models/ucla-agi_llama-3-instruct-8b-sppo-iter3.json b/data/models/ucla-agi_llama-3-instruct-8b-sppo-iter3.json index dd0fde633a69c27555ebadb4cd9d2b225a2b1309..6c8b4b0133a49d420e13a89373c13c0888830855 100644 --- a/data/models/ucla-agi_llama-3-instruct-8b-sppo-iter3.json +++ b/data/models/ucla-agi_llama-3-instruct-8b-sppo-iter3.json @@ -5,7 +5,7 @@ "developer": "UCLA-AGI", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": "8.03" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6703 + "score": 0.6834 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5076 + "score": 0.508 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0718 + "score": 0.0959 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3647 + "score": 0.3661 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3658 + "score": 0.3644 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6834 + "score": 0.6703 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.508 + "score": 0.5076 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0959 + "score": 0.0718 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3661 + "score": 0.3647 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3644 + "score": 0.3658 } } ], diff --git a/data/models/valiantlabs_llama3.1-8b-fireplace2.json b/data/models/valiantlabs_llama3.1-8b-fireplace2.json index 0a050b1b1736e6133f5ceae5b758cb4a60844c7f..1328c2c7f3791f083d4f447f5573aa125ba6707d 100644 --- a/data/models/valiantlabs_llama3.1-8b-fireplace2.json +++ b/data/models/valiantlabs_llama3.1-8b-fireplace2.json @@ -5,7 +5,7 @@ "developer": "ValiantLabs", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": "8.03" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5483 + "score": 0.5328 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.461 + "score": 0.4613 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0582 + "score": 0.0876 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2886 + "score": 0.2894 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3433 + "score": 0.3367 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2407 + "score": 0.2424 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5328 + "score": 0.5483 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4613 + "score": 0.461 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0876 + "score": 0.0582 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2894 + "score": 0.2886 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3367 + "score": 0.3433 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2424 + "score": 0.2407 } } ], diff --git a/data/models/valiantlabs_llama3.1-8b-shiningvaliant2.json b/data/models/valiantlabs_llama3.1-8b-shiningvaliant2.json index 0736460b872bea97a209be68cef1f113fa7d9f3d..f3e37b204fa779a9e21a0521a813b464c3fe641b 100644 --- a/data/models/valiantlabs_llama3.1-8b-shiningvaliant2.json +++ b/data/models/valiantlabs_llama3.1-8b-shiningvaliant2.json @@ -5,7 +5,7 @@ "developer": "ValiantLabs", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": "8.03" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2678 + "score": 0.6496 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4429 + "score": 0.4774 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0521 + "score": 0.0566 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.302 + "score": 0.3104 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3959 + "score": 0.3909 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.2927 + "score": 0.3382 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6496 + "score": 0.2678 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4774 + "score": 0.4429 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0566 + "score": 0.0521 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3104 + "score": 0.302 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3909 + "score": 0.3959 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3382 + "score": 0.2927 } } ], diff --git a/data/models/virnect_llama-3-korean-8b.json b/data/models/virnect_llama-3-korean-8b.json index 99d18345be715ff626474cdbe5ae8e088352cbe8..10825c433e16c0218b8189ab67e3befd38673ff7 100644 --- a/data/models/virnect_llama-3-korean-8b.json +++ b/data/models/virnect_llama-3-korean-8b.json @@ -5,7 +5,7 @@ "developer": "VIRNECT", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": "8.03" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5058 + "score": 0.5021 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4908 + "score": 0.4918 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0929 + "score": 0.108 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3662 + "score": 0.3648 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3539 + "score": 0.3536 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5021 + "score": 0.5058 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4918 + "score": 0.4908 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.108 + "score": 0.0929 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3648 + "score": 0.3662 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3536 + "score": 0.3539 } } ], diff --git a/data/models/weqweasdas_hh_rlhf_rm_open_llama_3b.json b/data/models/weqweasdas_hh_rlhf_rm_open_llama_3b.json index f9d2b14f0a0f79f11e39957c0f38b89aa1e78ac9..564c2ceeb0768d947ec7e8507c351558f76e3907 100644 --- a/data/models/weqweasdas_hh_rlhf_rm_open_llama_3b.json +++ b/data/models/weqweasdas_hh_rlhf_rm_open_llama_3b.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816", + "evaluation_id": "reward-bench-2/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5027 + "score": 0.2498 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8184 + "score": 0.3642 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3728 + "score": 0.275 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.3497 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4149 + "score": 0.24 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3281 + "score": 0.2384 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6564 + "score": 0.0315 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816", + "evaluation_id": "reward-bench/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2498 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3642 + "score": 0.5027 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.275 + "score": 0.8184 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3497 + "score": 0.3728 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.24 + "score": 0.4149 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2384 + "score": 0.3281 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.0315 + "score": 0.6564 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/models/weqweasdas_rm-gemma-2b.json b/data/models/weqweasdas_rm-gemma-2b.json index 23bdb9776ed416ae7ab56cbae8b9e5655ba27e0c..b1151d30882a530616e4e5f2252a198e5a25a8a2 100644 --- a/data/models/weqweasdas_rm-gemma-2b.json +++ b/data/models/weqweasdas_rm-gemma-2b.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/weqweasdas_RM-Gemma-2B/1766412838.146816", + "evaluation_id": "reward-bench-2/weqweasdas_RM-Gemma-2B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6549 + "score": 0.3057 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9441 + "score": 0.3705 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4079 + "score": 0.2812 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.4317 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4986 + "score": 0.3311 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7637 + "score": 0.2343 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6652 + "score": 0.1851 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/weqweasdas_RM-Gemma-2B/1766412838.146816", + "evaluation_id": "reward-bench/weqweasdas_RM-Gemma-2B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3705 + "score": 0.6549 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2812 + "score": 0.9441 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.4317 + "score": 0.4079 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3311 + "score": 0.4986 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.2343 + "score": 0.7637 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.1851 + "score": 0.6652 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/models/weqweasdas_rm-mistral-7b.json b/data/models/weqweasdas_rm-mistral-7b.json index 014b8589e308fa2571a3ca85971364da616341ae..2c0c95b4657b4530753b94c6c05b68b220f49072 100644 --- a/data/models/weqweasdas_rm-mistral-7b.json +++ b/data/models/weqweasdas_rm-mistral-7b.json @@ -9,10 +9,10 @@ }, "evaluations": [ { - "evaluation_id": "reward-bench/weqweasdas_RM-Mistral-7B/1766412838.146816", + "evaluation_id": "reward-bench-2/weqweasdas_RM-Mistral-7B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench", + "source_name": "RewardBench 2", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -31,109 +31,127 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench Score", + "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7982 + "score": 0.596 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat", + "evaluation_name": "Factuality", "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", + "evaluation_description": "Factuality score - measures factual accuracy", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.9665 + "score": 0.5937 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Chat Hard", + "evaluation_name": "Precise IF", "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", + "evaluation_description": "Precise Instruction Following score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6053 + "score": 0.3438 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" + } + }, + { + "evaluation_name": "Math", + "metric_config": { + "evaluation_description": "Math score - measures mathematical reasoning", + "lower_is_better": false, + "score_type": "continuous", + "min_score": 0.0, + "max_score": 1.0 + }, + "score_details": { + "score": 0.5956 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", + "evaluation_description": "Safety score - measures safety awareness", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.8703 + "score": 0.6911 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Reasoning", + "evaluation_name": "Focus", "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", + "evaluation_description": "Focus score - measures response focus", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7736 + "score": 0.7293 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } }, { - "evaluation_name": "Prior Sets (0.5 weight)", + "evaluation_name": "Ties", "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", + "evaluation_description": "Ties score - ability to identify tie cases", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.753 + "score": 0.6226 }, "source_data": { - "dataset_name": "RewardBench", + "dataset_name": "RewardBench 2", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" + "hf_repo": "allenai/reward-bench-2-results" } } ], @@ -141,10 +159,10 @@ "generation_config": null }, { - "evaluation_id": "reward-bench-2/weqweasdas_RM-Mistral-7B/1766412838.146816", + "evaluation_id": "reward-bench/weqweasdas_RM-Mistral-7B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", "source_metadata": { - "source_name": "RewardBench 2", + "source_name": "RewardBench", "source_type": "documentation", "source_organization_name": "Allen Institute for AI", "source_organization_url": "https://allenai.org", @@ -163,127 +181,109 @@ { "evaluation_name": "Score", "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", + "evaluation_description": "Overall RewardBench Score", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5937 + "score": 0.7982 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Precise IF", + "evaluation_name": "Chat", "metric_config": { - "evaluation_description": "Precise Instruction Following score", + "evaluation_description": "Chat accuracy - includes easy chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.3438 + "score": 0.9665 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Math", + "evaluation_name": "Chat Hard", "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", + "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.5956 + "score": 0.6053 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { "evaluation_name": "Safety", "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", + "evaluation_description": "Safety accuracy - includes safety subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6911 + "score": 0.8703 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Focus", + "evaluation_name": "Reasoning", "metric_config": { - "evaluation_description": "Focus score - measures response focus", + "evaluation_description": "Reasoning accuracy - includes code and math subsets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.7293 + "score": 0.7736 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } }, { - "evaluation_name": "Ties", + "evaluation_name": "Prior Sets (0.5 weight)", "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", + "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", "lower_is_better": false, "score_type": "continuous", "min_score": 0.0, "max_score": 1.0 }, "score_details": { - "score": 0.6226 + "score": 0.753 }, "source_data": { - "dataset_name": "RewardBench 2", + "dataset_name": "RewardBench", "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" + "hf_repo": "allenai/reward-bench" } } ], diff --git a/data/models/xai_grok-3-mini.json b/data/models/xai_grok-3-mini.json index 6f7e913322e4d12afd1b4e9815b3c829b5eb051d..7fda2d0643a4e1f5eb9d98f1dc44e9ec85970d5d 100644 --- a/data/models/xai_grok-3-mini.json +++ b/data/models/xai_grok-3-mini.json @@ -10,8 +10,8 @@ }, "evaluations": [ { - "evaluation_id": "global-mmlu-lite/xai_grok-3-mini/1773936583.743359", - "retrieved_timestamp": "1773936583.743359", + "evaluation_id": "global-mmlu-lite/xai_grok-3-mini/1773936496.366405", + "retrieved_timestamp": "1773936496.366405", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", @@ -525,8 +525,8 @@ "generation_config": null }, { - "evaluation_id": "global-mmlu-lite/xai_grok-3-mini/1773936496.366405", - "retrieved_timestamp": "1773936496.366405", + "evaluation_id": "global-mmlu-lite/xai_grok-3-mini/1773936583.743359", + "retrieved_timestamp": "1773936583.743359", "source_metadata": { "source_name": "Global MMLU Lite Leaderboard", "source_type": "documentation", diff --git a/data/models/xai_grok-4.json b/data/models/xai_grok-4.json index 2396de921bef3d760583d06f283159a7a4b7cc87..8fb7b1b24e602a14c14ed4217ea5e7cd9fdf0b69 100644 --- a/data/models/xai_grok-4.json +++ b/data/models/xai_grok-4.json @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/terminus-2__grok-4/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__grok-4/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-10-31", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,7 +117,7 @@ "max_score": 100.0 }, "score_details": { - "score": 23.1, + "score": 25.4, "uncertainty": { "standard_error": { "value": 2.9 @@ -127,7 +127,7 @@ }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok 4\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok 4\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok 4\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok 4\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -158,7 +158,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__grok-4/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__grok-4/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -182,7 +182,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-10-31", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -191,7 +191,7 @@ "max_score": 100.0 }, "score_details": { - "score": 25.4, + "score": 23.1, "uncertainty": { "standard_error": { "value": 2.9 @@ -201,7 +201,7 @@ }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok 4\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok 4\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -218,7 +218,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok 4\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok 4\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/xai_grok-code-fast-1.json b/data/models/xai_grok-code-fast-1.json index 3dead5de67b9b087729b3bc778973bf9f0bdd596..80a497dcbe0e067252e23ff5d2567dcecf97e827 100644 --- a/data/models/xai_grok-code-fast-1.json +++ b/data/models/xai_grok-code-fast-1.json @@ -4,13 +4,13 @@ "id": "xai/grok-code-fast-1", "developer": "xAI", "additional_details": { - "agent_name": "Terminus 2", - "agent_organization": "Terminal Bench" + "agent_name": "Mini-SWE-Agent", + "agent_organization": "Princeton" } }, "evaluations": [ { - "evaluation_id": "terminal-bench-2.0/terminus-2__grok-code-fast-1/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/mini-swe-agent__grok-code-fast-1/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -34,7 +34,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-10-31", + "evaluation_timestamp": "2025-11-03", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -43,17 +43,17 @@ "max_score": 100.0 }, "score_details": { - "score": 14.2, + "score": 25.8, "uncertainty": { "standard_error": { - "value": 2.5 + "value": 2.6 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok Code Fast 1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok Code Fast 1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -70,7 +70,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok Code Fast 1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok Code Fast 1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -84,7 +84,7 @@ } }, { - "evaluation_id": "terminal-bench-2.0/mini-swe-agent__grok-code-fast-1/1773776901.772108", + "evaluation_id": "terminal-bench-2.0/terminus-2__grok-code-fast-1/1773776901.772108", "retrieved_timestamp": "1773776901.772108", "source_metadata": { "source_name": "Terminal-Bench 2.0", @@ -108,7 +108,7 @@ "https://www.tbench.ai/leaderboard/terminal-bench/2.0" ] }, - "evaluation_timestamp": "2025-11-03", + "evaluation_timestamp": "2025-10-31", "metric_config": { "evaluation_description": "Task resolution accuracy across 87 terminal tasks with 5 trials each", "lower_is_better": false, @@ -117,17 +117,17 @@ "max_score": 100.0 }, "score_details": { - "score": 25.8, + "score": 14.2, "uncertainty": { "standard_error": { - "value": 2.6 + "value": 2.5 }, "num_samples": 435 } }, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok Code Fast 1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok Code Fast 1\" -k 5", "agentic_eval_config": { "available_tools": [ { @@ -144,7 +144,7 @@ "detailed_evaluation_results": null, "generation_config": { "generation_args": { - "execution_command": "harbor run -d terminal-bench@2.0 -a \"Mini-SWE-Agent\" -m \"Grok Code Fast 1\" -k 5", + "execution_command": "harbor run -d terminal-bench@2.0 -a \"Terminus 2\" -m \"Grok Code Fast 1\" -k 5", "agentic_eval_config": { "available_tools": [ { diff --git a/data/models/ycros_bagelmisterytour-v2-8x7b.json b/data/models/ycros_bagelmisterytour-v2-8x7b.json index c7b7f840ab350df665db2f8b289c03a4556651c3..ba69aabd10b1f09ccc48e0969d876027b03e3a4b 100644 --- a/data/models/ycros_bagelmisterytour-v2-8x7b.json +++ b/data/models/ycros_bagelmisterytour-v2-8x7b.json @@ -5,7 +5,7 @@ "developer": "ycros", "inference_platform": "unknown", "additional_details": { - "precision": "float16", + "precision": "bfloat16", "architecture": "MixtralForCausalLM", "params_billions": "46.703" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5994 + "score": 0.6262 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5159 + "score": 0.5142 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0785 + "score": 0.0937 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3045 + "score": 0.3079 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4203 + "score": 0.4138 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3473 + "score": 0.3481 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6262 + "score": 0.5994 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5142 + "score": 0.5159 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0937 + "score": 0.0785 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3079 + "score": 0.3045 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4138 + "score": 0.4203 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3481 + "score": 0.3473 } } ], diff --git a/data/models/yoyo-ai_qwen2.5-14b-yoyo-1010.json b/data/models/yoyo-ai_qwen2.5-14b-yoyo-1010.json index e42112fd11a9b36734c8e8c4176443d8815fdc44..2a296ce898176000fe4f075772b9bc2bcb8d3b71 100644 --- a/data/models/yoyo-ai_qwen2.5-14b-yoyo-1010.json +++ b/data/models/yoyo-ai_qwen2.5-14b-yoyo-1010.json @@ -5,7 +5,7 @@ "developer": "YOYO-AI", "inference_platform": "unknown", "additional_details": { - "precision": "bfloat16", + "precision": "float16", "architecture": "Qwen2ForCausalLM", "params_billions": "14.77" } @@ -44,7 +44,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.7905 + "score": 0.5899 } }, { @@ -62,7 +62,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.6406 + "score": 0.654 } }, { @@ -80,7 +80,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.0 + "score": 0.4509 } }, { @@ -98,7 +98,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3163 + "score": 0.3834 } }, { @@ -116,7 +116,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4181 + "score": 0.4744 } }, { @@ -134,7 +134,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4944 + "score": 0.5376 } } ], @@ -174,7 +174,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5899 + "score": 0.7905 } }, { @@ -192,7 +192,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.654 + "score": 0.6406 } }, { @@ -210,7 +210,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4509 + "score": 0.0 } }, { @@ -228,7 +228,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.3834 + "score": 0.3163 } }, { @@ -246,7 +246,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.4744 + "score": 0.4181 } }, { @@ -264,7 +264,7 @@ "max_score": 1.0 }, "score_details": { - "score": 0.5376 + "score": 0.4944 } } ],