Spaces:
Running on CPU Spr
Running on CPU Spr
GitHub Actions committed on
Commit ·
49596d9
1
Parent(s): d8be99e
chore: sync EEE pipeline output [2026-04-02 05:07 UTC]
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- data/benchmarks.json +5 -1
- data/benchmarks/appworld_test_normal.json +2 -2
- data/benchmarks/browsecompplus.json +2 -2
- data/benchmarks/hfopenllm_v2.json +205 -218
- data/benchmarks/livecodebenchpro.json +3 -3
- data/benchmarks/reward-bench.json +174 -174
- data/benchmarks/swe-bench.json +2 -2
- data/benchmarks/tau-bench-2_airline.json +2 -2
- data/benchmarks/tau-bench-2_retail.json +1 -1
- data/benchmarks/tau-bench-2_telecom.json +2 -2
- data/benchmarks/terminal-bench-2.0.json +20 -20
- data/benchmarks/theory_of_mind.json +12 -0
- data/developers.json +1 -1
- data/developers/adriszmar.json +6 -6
- data/developers/ai2.json +3 -3
- data/developers/akjindal53244.json +5 -5
- data/developers/allenai.json +33 -33
- data/developers/anthropic.json +10 -10
- data/developers/cognitivecomputations.json +6 -6
- data/developers/columbia-nlp.json +6 -6
- data/developers/cpayne1303.json +5 -5
- data/developers/daemontatox.json +6 -6
- data/developers/deepmount00.json +6 -6
- data/developers/dfurman.json +6 -6
- data/developers/doppelreflex.json +6 -6
- data/developers/google.json +20 -20
- data/developers/huggingfacetb.json +6 -6
- data/developers/infly.json +6 -6
- data/developers/internlm.json +6 -6
- data/developers/jaspionjader.json +5 -5
- data/developers/leroydyer.json +6 -6
- data/developers/llmat.json +6 -6
- data/developers/lxzgordon.json +6 -6
- data/developers/meta.json +22 -22
- data/developers/minimax.json +1 -1
- data/developers/mistralai.json +16 -16
- data/developers/mlabonne.json +6 -6
- data/developers/moonshot_ai.json +1 -1
- data/developers/multiple.json +1 -1
- data/developers/nazimali.json +6 -6
- data/developers/nicolinho.json +12 -12
- data/developers/nisten.json +6 -6
- data/developers/nousresearch.json +0 -14
- data/developers/omkar1102.json +5 -5
- data/developers/openai.json +53 -53
- data/developers/openassistant.json +14 -14
- data/developers/openbmb.json +7 -7
- data/developers/pku-alignment.json +21 -21
- data/developers/primeintellect.json +4 -4
- data/developers/princeton-nlp.json +6 -6
data/benchmarks.json
CHANGED
|
@@ -45,7 +45,7 @@
|
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"benchmark": "hfopenllm_v2",
|
| 48 |
-
"model_count":
|
| 49 |
},
|
| 50 |
{
|
| 51 |
"benchmark": "la_leaderboard",
|
|
@@ -78,5 +78,9 @@
|
|
| 78 |
{
|
| 79 |
"benchmark": "terminal-bench-2.0",
|
| 80 |
"model_count": 37
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
}
|
| 82 |
]
|
|
|
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"benchmark": "hfopenllm_v2",
|
| 48 |
+
"model_count": 4493
|
| 49 |
},
|
| 50 |
{
|
| 51 |
"benchmark": "la_leaderboard",
|
|
|
|
| 78 |
{
|
| 79 |
"benchmark": "terminal-bench-2.0",
|
| 80 |
"model_count": 37
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"benchmark": "theory_of_mind",
|
| 84 |
+
"model_count": 1
|
| 85 |
}
|
| 86 |
]
|
data/benchmarks/appworld_test_normal.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
| 5 |
"name": "claude-opus-4-5",
|
| 6 |
"developer": "Anthropic",
|
| 7 |
"scores": {
|
| 8 |
-
"appworld/test_normal": 0.
|
| 9 |
}
|
| 10 |
},
|
| 11 |
{
|
|
@@ -13,7 +13,7 @@
|
|
| 13 |
"name": "gemini-3-pro-preview",
|
| 14 |
"developer": "Google",
|
| 15 |
"scores": {
|
| 16 |
-
"appworld/test_normal": 0.
|
| 17 |
}
|
| 18 |
},
|
| 19 |
{
|
|
|
|
| 5 |
"name": "claude-opus-4-5",
|
| 6 |
"developer": "Anthropic",
|
| 7 |
"scores": {
|
| 8 |
+
"appworld/test_normal": 0.7
|
| 9 |
}
|
| 10 |
},
|
| 11 |
{
|
|
|
|
| 13 |
"name": "gemini-3-pro-preview",
|
| 14 |
"developer": "Google",
|
| 15 |
"scores": {
|
| 16 |
+
"appworld/test_normal": 0.55
|
| 17 |
}
|
| 18 |
},
|
| 19 |
{
|
data/benchmarks/browsecompplus.json
CHANGED
|
@@ -13,7 +13,7 @@
|
|
| 13 |
"name": "gemini-3-pro-preview",
|
| 14 |
"developer": "Google",
|
| 15 |
"scores": {
|
| 16 |
-
"browsecompplus": 0.
|
| 17 |
}
|
| 18 |
},
|
| 19 |
{
|
|
@@ -21,7 +21,7 @@
|
|
| 21 |
"name": "gpt-5.2-2025-12-11",
|
| 22 |
"developer": "OpenAI",
|
| 23 |
"scores": {
|
| 24 |
-
"browsecompplus": 0.
|
| 25 |
}
|
| 26 |
}
|
| 27 |
]
|
|
|
|
| 13 |
"name": "gemini-3-pro-preview",
|
| 14 |
"developer": "Google",
|
| 15 |
"scores": {
|
| 16 |
+
"browsecompplus": 0.3333
|
| 17 |
}
|
| 18 |
},
|
| 19 |
{
|
|
|
|
| 21 |
"name": "gpt-5.2-2025-12-11",
|
| 22 |
"developer": "OpenAI",
|
| 23 |
"scores": {
|
| 24 |
+
"browsecompplus": 0.43
|
| 25 |
}
|
| 26 |
}
|
| 27 |
]
|
data/benchmarks/hfopenllm_v2.json
CHANGED
|
@@ -2176,12 +2176,12 @@
|
|
| 2176 |
"name": "LION-Gemma-2b-dpo-v1.0",
|
| 2177 |
"developer": "Columbia-NLP",
|
| 2178 |
"scores": {
|
| 2179 |
-
"IFEval": 0.
|
| 2180 |
-
"BBH": 0.
|
| 2181 |
-
"MATH Level 5": 0.
|
| 2182 |
-
"GPQA": 0.
|
| 2183 |
-
"MUSR": 0.
|
| 2184 |
-
"MMLU-PRO": 0.
|
| 2185 |
}
|
| 2186 |
},
|
| 2187 |
{
|
|
@@ -3229,12 +3229,12 @@
|
|
| 3229 |
"name": "PathfinderAI",
|
| 3230 |
"developer": "Daemontatox",
|
| 3231 |
"scores": {
|
| 3232 |
-
"IFEval": 0.
|
| 3233 |
-
"BBH": 0.
|
| 3234 |
-
"MATH Level 5": 0.
|
| 3235 |
-
"GPQA": 0.
|
| 3236 |
-
"MUSR": 0.
|
| 3237 |
-
"MMLU-PRO": 0.
|
| 3238 |
}
|
| 3239 |
},
|
| 3240 |
{
|
|
@@ -4321,12 +4321,12 @@
|
|
| 4321 |
"name": "Llama-3.1-8b-ITA",
|
| 4322 |
"developer": "DeepMount00",
|
| 4323 |
"scores": {
|
| 4324 |
-
"IFEval": 0.
|
| 4325 |
-
"BBH": 0.
|
| 4326 |
-
"MATH Level 5": 0.
|
| 4327 |
-
"GPQA": 0.
|
| 4328 |
-
"MUSR": 0.
|
| 4329 |
-
"MMLU-PRO": 0.
|
| 4330 |
}
|
| 4331 |
},
|
| 4332 |
{
|
|
@@ -4646,12 +4646,12 @@
|
|
| 4646 |
"name": "MN-12B-LilithFrame",
|
| 4647 |
"developer": "DoppelReflEx",
|
| 4648 |
"scores": {
|
| 4649 |
-
"IFEval": 0.
|
| 4650 |
-
"BBH": 0.
|
| 4651 |
-
"MATH Level 5": 0.
|
| 4652 |
-
"GPQA": 0.
|
| 4653 |
-
"MUSR": 0.
|
| 4654 |
-
"MMLU-PRO": 0.
|
| 4655 |
}
|
| 4656 |
},
|
| 4657 |
{
|
|
@@ -9144,12 +9144,12 @@
|
|
| 9144 |
"name": "SmolLM2-135M-Instruct",
|
| 9145 |
"developer": "HuggingFaceTB",
|
| 9146 |
"scores": {
|
| 9147 |
-
"IFEval": 0.
|
| 9148 |
-
"BBH": 0.
|
| 9149 |
-
"MATH Level 5": 0.
|
| 9150 |
-
"GPQA": 0.
|
| 9151 |
-
"MUSR": 0.
|
| 9152 |
-
"MMLU-PRO": 0.
|
| 9153 |
}
|
| 9154 |
},
|
| 9155 |
{
|
|
@@ -13057,12 +13057,12 @@
|
|
| 13057 |
"name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_XA",
|
| 13058 |
"developer": "LeroyDyer",
|
| 13059 |
"scores": {
|
| 13060 |
-
"IFEval": 0.
|
| 13061 |
-
"BBH": 0.
|
| 13062 |
-
"MATH Level 5": 0.
|
| 13063 |
-
"GPQA": 0.
|
| 13064 |
-
"MUSR": 0.
|
| 13065 |
-
"MMLU-PRO": 0.
|
| 13066 |
}
|
| 13067 |
},
|
| 13068 |
{
|
|
@@ -16874,19 +16874,6 @@
|
|
| 16874 |
"MMLU-PRO": 0.232
|
| 16875 |
}
|
| 16876 |
},
|
| 16877 |
-
{
|
| 16878 |
-
"model_id": "NousResearch/Yarn-Llama-2-7b-128k",
|
| 16879 |
-
"name": "Yarn-Llama-2-7b-128k",
|
| 16880 |
-
"developer": "NousResearch",
|
| 16881 |
-
"scores": {
|
| 16882 |
-
"IFEval": 0.1485,
|
| 16883 |
-
"BBH": 0.3248,
|
| 16884 |
-
"MATH Level 5": 0.0151,
|
| 16885 |
-
"GPQA": 0.2601,
|
| 16886 |
-
"MUSR": 0.3967,
|
| 16887 |
-
"MMLU-PRO": 0.1791
|
| 16888 |
-
}
|
| 16889 |
-
},
|
| 16890 |
{
|
| 16891 |
"model_id": "NousResearch/Yarn-Llama-2-7b-64k",
|
| 16892 |
"name": "Yarn-Llama-2-7b-64k",
|
|
@@ -17204,12 +17191,12 @@
|
|
| 17204 |
"name": "code-yi",
|
| 17205 |
"developer": "Omkar1102",
|
| 17206 |
"scores": {
|
| 17207 |
-
"IFEval": 0.
|
| 17208 |
-
"BBH": 0.
|
| 17209 |
"MATH Level 5": 0.0,
|
| 17210 |
-
"GPQA": 0.
|
| 17211 |
-
"MUSR": 0.
|
| 17212 |
-
"MMLU-PRO": 0.
|
| 17213 |
}
|
| 17214 |
},
|
| 17215 |
{
|
|
@@ -18141,11 +18128,11 @@
|
|
| 18141 |
"developer": "PrimeIntellect",
|
| 18142 |
"scores": {
|
| 18143 |
"IFEval": 0.1757,
|
| 18144 |
-
"BBH": 0.
|
| 18145 |
"MATH Level 5": 0.0,
|
| 18146 |
-
"GPQA": 0.
|
| 18147 |
-
"MUSR": 0.
|
| 18148 |
-
"MMLU-PRO": 0.
|
| 18149 |
}
|
| 18150 |
},
|
| 18151 |
{
|
|
@@ -18712,12 +18699,12 @@
|
|
| 18712 |
"name": "ODB-14B-sce",
|
| 18713 |
"developer": "Quazim0t0",
|
| 18714 |
"scores": {
|
| 18715 |
-
"IFEval": 0.
|
| 18716 |
-
"BBH": 0.
|
| 18717 |
-
"MATH Level 5": 0.
|
| 18718 |
-
"GPQA": 0.
|
| 18719 |
-
"MUSR": 0.
|
| 18720 |
-
"MMLU-PRO": 0.
|
| 18721 |
}
|
| 18722 |
},
|
| 18723 |
{
|
|
@@ -19466,12 +19453,12 @@
|
|
| 19466 |
"name": "Qwen2.5-0.5B-Instruct",
|
| 19467 |
"developer": "Qwen",
|
| 19468 |
"scores": {
|
| 19469 |
-
"IFEval": 0.
|
| 19470 |
-
"BBH": 0.
|
| 19471 |
-
"MATH Level 5": 0.
|
| 19472 |
-
"GPQA": 0.
|
| 19473 |
-
"MUSR": 0.
|
| 19474 |
-
"MMLU-PRO": 0.
|
| 19475 |
}
|
| 19476 |
},
|
| 19477 |
{
|
|
@@ -19726,12 +19713,12 @@
|
|
| 19726 |
"name": "Qwen2.5-Coder-7B-Instruct",
|
| 19727 |
"developer": "Qwen",
|
| 19728 |
"scores": {
|
| 19729 |
-
"IFEval": 0.
|
| 19730 |
-
"BBH": 0.
|
| 19731 |
-
"MATH Level 5": 0.
|
| 19732 |
-
"GPQA": 0.
|
| 19733 |
-
"MUSR": 0.
|
| 19734 |
-
"MMLU-PRO": 0.
|
| 19735 |
}
|
| 19736 |
},
|
| 19737 |
{
|
|
@@ -19986,12 +19973,12 @@
|
|
| 19986 |
"name": "Replete-LLM-Qwen2-7b",
|
| 19987 |
"developer": "Replete-AI",
|
| 19988 |
"scores": {
|
| 19989 |
-
"IFEval": 0.
|
| 19990 |
-
"BBH": 0.
|
| 19991 |
"MATH Level 5": 0.0,
|
| 19992 |
-
"GPQA": 0.
|
| 19993 |
-
"MUSR": 0.
|
| 19994 |
-
"MMLU-PRO": 0.
|
| 19995 |
}
|
| 19996 |
},
|
| 19997 |
{
|
|
@@ -24653,12 +24640,12 @@
|
|
| 24653 |
"name": "Llama-3-Instruct-8B-SPPO-Iter3",
|
| 24654 |
"developer": "UCLA-AGI",
|
| 24655 |
"scores": {
|
| 24656 |
-
"IFEval": 0.
|
| 24657 |
-
"BBH": 0.
|
| 24658 |
-
"MATH Level 5": 0.
|
| 24659 |
"GPQA": 0.2651,
|
| 24660 |
-
"MUSR": 0.
|
| 24661 |
-
"MMLU-PRO": 0.
|
| 24662 |
}
|
| 24663 |
},
|
| 24664 |
{
|
|
@@ -25004,12 +24991,12 @@
|
|
| 25004 |
"name": "llama-3-Korean-8B",
|
| 25005 |
"developer": "VIRNECT",
|
| 25006 |
"scores": {
|
| 25007 |
-
"IFEval": 0.
|
| 25008 |
-
"BBH": 0.
|
| 25009 |
-
"MATH Level 5": 0.
|
| 25010 |
"GPQA": 0.271,
|
| 25011 |
-
"MUSR": 0.
|
| 25012 |
-
"MMLU-PRO": 0.
|
| 25013 |
}
|
| 25014 |
},
|
| 25015 |
{
|
|
@@ -25108,12 +25095,12 @@
|
|
| 25108 |
"name": "Llama3.1-8B-Fireplace2",
|
| 25109 |
"developer": "ValiantLabs",
|
| 25110 |
"scores": {
|
| 25111 |
-
"IFEval": 0.
|
| 25112 |
-
"BBH": 0.
|
| 25113 |
-
"MATH Level 5": 0.
|
| 25114 |
-
"GPQA": 0.
|
| 25115 |
-
"MUSR": 0.
|
| 25116 |
-
"MMLU-PRO": 0.
|
| 25117 |
}
|
| 25118 |
},
|
| 25119 |
{
|
|
@@ -25121,12 +25108,12 @@
|
|
| 25121 |
"name": "Llama3.1-8B-ShiningValiant2",
|
| 25122 |
"developer": "ValiantLabs",
|
| 25123 |
"scores": {
|
| 25124 |
-
"IFEval": 0.
|
| 25125 |
-
"BBH": 0.
|
| 25126 |
-
"MATH Level 5": 0.
|
| 25127 |
-
"GPQA": 0.
|
| 25128 |
-
"MUSR": 0.
|
| 25129 |
-
"MMLU-PRO": 0.
|
| 25130 |
}
|
| 25131 |
},
|
| 25132 |
{
|
|
@@ -25654,12 +25641,12 @@
|
|
| 25654 |
"name": "Qwen2.5-14B-YOYO-1010",
|
| 25655 |
"developer": "YOYO-AI",
|
| 25656 |
"scores": {
|
| 25657 |
-
"IFEval": 0.
|
| 25658 |
-
"BBH": 0.
|
| 25659 |
-
"MATH Level 5": 0.
|
| 25660 |
-
"GPQA": 0.
|
| 25661 |
-
"MUSR": 0.
|
| 25662 |
-
"MMLU-PRO": 0.
|
| 25663 |
}
|
| 25664 |
},
|
| 25665 |
{
|
|
@@ -26603,12 +26590,12 @@
|
|
| 26603 |
"name": "QAIMath-Qwen2.5-7B-TIES",
|
| 26604 |
"developer": "adriszmar",
|
| 26605 |
"scores": {
|
| 26606 |
-
"IFEval": 0.
|
| 26607 |
-
"BBH": 0.
|
| 26608 |
-
"MATH Level 5": 0.
|
| 26609 |
-
"GPQA": 0.
|
| 26610 |
-
"MUSR": 0.
|
| 26611 |
-
"MMLU-PRO": 0.
|
| 26612 |
}
|
| 26613 |
},
|
| 26614 |
{
|
|
@@ -26889,12 +26876,12 @@
|
|
| 26889 |
"name": "Llama-3.1-Storm-8B",
|
| 26890 |
"developer": "akjindal53244",
|
| 26891 |
"scores": {
|
| 26892 |
-
"IFEval": 0.
|
| 26893 |
-
"BBH": 0.
|
| 26894 |
-
"MATH Level 5": 0.
|
| 26895 |
-
"GPQA": 0.
|
| 26896 |
"MUSR": 0.4028,
|
| 26897 |
-
"MMLU-PRO": 0.
|
| 26898 |
}
|
| 26899 |
},
|
| 26900 |
{
|
|
@@ -26915,12 +26902,12 @@
|
|
| 26915 |
"name": "Llama-3.1-Tulu-3-70B",
|
| 26916 |
"developer": "allenai",
|
| 26917 |
"scores": {
|
| 26918 |
-
"IFEval": 0.
|
| 26919 |
-
"BBH": 0.
|
| 26920 |
-
"MATH Level 5": 0.
|
| 26921 |
"GPQA": 0.3733,
|
| 26922 |
-
"MUSR": 0.
|
| 26923 |
-
"MMLU-PRO": 0.
|
| 26924 |
}
|
| 26925 |
},
|
| 26926 |
{
|
|
@@ -31647,12 +31634,12 @@
|
|
| 31647 |
"name": "dolphin-2.9.2-Phi-3-Medium-abliterated",
|
| 31648 |
"developer": "cognitivecomputations",
|
| 31649 |
"scores": {
|
| 31650 |
-
"IFEval": 0.
|
| 31651 |
-
"BBH": 0.
|
| 31652 |
-
"MATH Level 5": 0.
|
| 31653 |
-
"GPQA": 0.
|
| 31654 |
-
"MUSR": 0.
|
| 31655 |
-
"MMLU-PRO": 0.
|
| 31656 |
}
|
| 31657 |
},
|
| 31658 |
{
|
|
@@ -31790,12 +31777,12 @@
|
|
| 31790 |
"name": "llama-43m-beta",
|
| 31791 |
"developer": "cpayne1303",
|
| 31792 |
"scores": {
|
| 31793 |
-
"IFEval": 0.
|
| 31794 |
-
"BBH": 0.
|
| 31795 |
-
"MATH Level 5": 0.
|
| 31796 |
"GPQA": 0.2685,
|
| 31797 |
-
"MUSR": 0.
|
| 31798 |
-
"MMLU-PRO": 0.
|
| 31799 |
}
|
| 31800 |
},
|
| 31801 |
{
|
|
@@ -32167,12 +32154,12 @@
|
|
| 32167 |
"name": "Llama-3-8B-Orpo-v0.1",
|
| 32168 |
"developer": "dfurman",
|
| 32169 |
"scores": {
|
| 32170 |
-
"IFEval": 0.
|
| 32171 |
-
"BBH": 0.
|
| 32172 |
-
"MATH Level 5": 0.
|
| 32173 |
-
"GPQA": 0.
|
| 32174 |
-
"MUSR": 0.
|
| 32175 |
-
"MMLU-PRO": 0.
|
| 32176 |
}
|
| 32177 |
},
|
| 32178 |
{
|
|
@@ -34663,12 +34650,12 @@
|
|
| 34663 |
"name": "gemma-2-2b",
|
| 34664 |
"developer": "Google",
|
| 34665 |
"scores": {
|
| 34666 |
-
"IFEval": 0.
|
| 34667 |
-
"BBH": 0.
|
| 34668 |
-
"MATH Level 5": 0.
|
| 34669 |
"GPQA": 0.2626,
|
| 34670 |
-
"MUSR": 0.
|
| 34671 |
-
"MMLU-PRO": 0.
|
| 34672 |
}
|
| 34673 |
},
|
| 34674 |
{
|
|
@@ -34689,12 +34676,12 @@
|
|
| 34689 |
"name": "gemma-2-2b-jpn-it",
|
| 34690 |
"developer": "Google",
|
| 34691 |
"scores": {
|
| 34692 |
-
"IFEval": 0.
|
| 34693 |
-
"BBH": 0.
|
| 34694 |
-
"MATH Level 5": 0.
|
| 34695 |
-
"GPQA": 0.
|
| 34696 |
-
"MUSR": 0.
|
| 34697 |
-
"MMLU-PRO": 0.
|
| 34698 |
}
|
| 34699 |
},
|
| 34700 |
{
|
|
@@ -37705,12 +37692,12 @@
|
|
| 37705 |
"name": "Kosmos-EVAA-Fusion-8B",
|
| 37706 |
"developer": "jaspionjader",
|
| 37707 |
"scores": {
|
| 37708 |
-
"IFEval": 0.
|
| 37709 |
-
"BBH": 0.
|
| 37710 |
-
"MATH Level 5": 0.
|
| 37711 |
-
"GPQA": 0.
|
| 37712 |
"MUSR": 0.4277,
|
| 37713 |
-
"MMLU-PRO": 0.
|
| 37714 |
}
|
| 37715 |
},
|
| 37716 |
{
|
|
@@ -42359,12 +42346,12 @@
|
|
| 42359 |
"name": "Mistral-v0.3-7B-ORPO",
|
| 42360 |
"developer": "llmat",
|
| 42361 |
"scores": {
|
| 42362 |
-
"IFEval": 0.
|
| 42363 |
-
"BBH": 0.
|
| 42364 |
-
"MATH Level 5": 0.
|
| 42365 |
-
"GPQA": 0.
|
| 42366 |
-
"MUSR": 0.
|
| 42367 |
-
"MMLU-PRO": 0.
|
| 42368 |
}
|
| 42369 |
},
|
| 42370 |
{
|
|
@@ -44478,12 +44465,12 @@
|
|
| 44478 |
"name": "Mixtral-8x7B-v0.1",
|
| 44479 |
"developer": "mistralai",
|
| 44480 |
"scores": {
|
| 44481 |
-
"IFEval": 0.
|
| 44482 |
-
"BBH": 0.
|
| 44483 |
-
"MATH Level 5": 0.
|
| 44484 |
-
"GPQA": 0.
|
| 44485 |
-
"MUSR": 0.
|
| 44486 |
-
"MMLU-PRO": 0.
|
| 44487 |
}
|
| 44488 |
},
|
| 44489 |
{
|
|
@@ -44738,12 +44725,12 @@
|
|
| 44738 |
"name": "NeuralDaredevil-8B-abliterated",
|
| 44739 |
"developer": "mlabonne",
|
| 44740 |
"scores": {
|
| 44741 |
-
"IFEval": 0.
|
| 44742 |
-
"BBH": 0.
|
| 44743 |
-
"MATH Level 5": 0.
|
| 44744 |
-
"GPQA": 0.
|
| 44745 |
-
"MUSR": 0.
|
| 44746 |
-
"MMLU-PRO": 0.
|
| 44747 |
}
|
| 44748 |
},
|
| 44749 |
{
|
|
@@ -45076,12 +45063,12 @@
|
|
| 45076 |
"name": "Mistral-Nemo-Kurdish-Instruct",
|
| 45077 |
"developer": "nazimali",
|
| 45078 |
"scores": {
|
| 45079 |
-
"IFEval": 0.
|
| 45080 |
-
"BBH": 0.
|
| 45081 |
-
"MATH Level 5": 0.
|
| 45082 |
-
"GPQA": 0.
|
| 45083 |
-
"MUSR": 0.
|
| 45084 |
-
"MMLU-PRO": 0.
|
| 45085 |
}
|
| 45086 |
},
|
| 45087 |
{
|
|
@@ -46779,12 +46766,12 @@
|
|
| 46779 |
"name": "franqwenstein-35b",
|
| 46780 |
"developer": "nisten",
|
| 46781 |
"scores": {
|
| 46782 |
-
"IFEval": 0.
|
| 46783 |
-
"BBH": 0.
|
| 46784 |
-
"MATH Level 5": 0.
|
| 46785 |
-
"GPQA": 0.
|
| 46786 |
-
"MUSR": 0.
|
| 46787 |
-
"MMLU-PRO": 0.
|
| 46788 |
}
|
| 46789 |
},
|
| 46790 |
{
|
|
@@ -48729,12 +48716,12 @@
|
|
| 48729 |
"name": "Llama-3-8B-ProLong-512k-Instruct",
|
| 48730 |
"developer": "princeton-nlp",
|
| 48731 |
"scores": {
|
| 48732 |
-
"IFEval": 0.
|
| 48733 |
-
"BBH": 0.
|
| 48734 |
-
"MATH Level 5": 0.
|
| 48735 |
-
"GPQA": 0.
|
| 48736 |
-
"MUSR": 0.
|
| 48737 |
-
"MMLU-PRO": 0.
|
| 48738 |
}
|
| 48739 |
},
|
| 48740 |
{
|
|
@@ -51303,12 +51290,12 @@
|
|
| 51303 |
"name": "Gemma-2-Ataraxy-Gemmasutra-9B-slerp",
|
| 51304 |
"developer": "recoilme",
|
| 51305 |
"scores": {
|
| 51306 |
-
"IFEval": 0.
|
| 51307 |
-
"BBH": 0.
|
| 51308 |
-
"MATH Level 5": 0.
|
| 51309 |
-
"GPQA": 0.
|
| 51310 |
-
"MUSR": 0.
|
| 51311 |
-
"MMLU-PRO": 0.
|
| 51312 |
}
|
| 51313 |
},
|
| 51314 |
{
|
|
@@ -51329,12 +51316,12 @@
|
|
| 51329 |
"name": "recoilme-gemma-2-9B-v0.2",
|
| 51330 |
"developer": "recoilme",
|
| 51331 |
"scores": {
|
| 51332 |
-
"IFEval": 0.
|
| 51333 |
-
"BBH": 0.
|
| 51334 |
-
"MATH Level 5": 0.
|
| 51335 |
-
"GPQA": 0.
|
| 51336 |
-
"MUSR": 0.
|
| 51337 |
-
"MMLU-PRO": 0.
|
| 51338 |
}
|
| 51339 |
},
|
| 51340 |
{
|
|
@@ -51342,12 +51329,12 @@
|
|
| 51342 |
"name": "recoilme-gemma-2-9B-v0.3",
|
| 51343 |
"developer": "recoilme",
|
| 51344 |
"scores": {
|
| 51345 |
-
"IFEval": 0.
|
| 51346 |
-
"BBH": 0.
|
| 51347 |
-
"MATH Level 5": 0.
|
| 51348 |
-
"GPQA": 0.
|
| 51349 |
-
"MUSR": 0.
|
| 51350 |
-
"MMLU-PRO": 0.
|
| 51351 |
}
|
| 51352 |
},
|
| 51353 |
{
|
|
@@ -56997,12 +56984,12 @@
|
|
| 56997 |
"name": "BagelMIsteryTour-v2-8x7B",
|
| 56998 |
"developer": "ycros",
|
| 56999 |
"scores": {
|
| 57000 |
-
"IFEval": 0.
|
| 57001 |
-
"BBH": 0.
|
| 57002 |
-
"MATH Level 5": 0.
|
| 57003 |
-
"GPQA": 0.
|
| 57004 |
-
"MUSR": 0.
|
| 57005 |
-
"MMLU-PRO": 0.
|
| 57006 |
}
|
| 57007 |
},
|
| 57008 |
{
|
|
|
|
| 2176 |
"name": "LION-Gemma-2b-dpo-v1.0",
|
| 2177 |
"developer": "Columbia-NLP",
|
| 2178 |
"scores": {
|
| 2179 |
+
"IFEval": 0.3102,
|
| 2180 |
+
"BBH": 0.3881,
|
| 2181 |
+
"MATH Level 5": 0.0536,
|
| 2182 |
+
"GPQA": 0.2534,
|
| 2183 |
+
"MUSR": 0.4081,
|
| 2184 |
+
"MMLU-PRO": 0.1665
|
| 2185 |
}
|
| 2186 |
},
|
| 2187 |
{
|
|
|
|
| 3229 |
"name": "PathfinderAI",
|
| 3230 |
"developer": "Daemontatox",
|
| 3231 |
"scores": {
|
| 3232 |
+
"IFEval": 0.4855,
|
| 3233 |
+
"BBH": 0.6627,
|
| 3234 |
+
"MATH Level 5": 0.4841,
|
| 3235 |
+
"GPQA": 0.3096,
|
| 3236 |
+
"MUSR": 0.4256,
|
| 3237 |
+
"MMLU-PRO": 0.5542
|
| 3238 |
}
|
| 3239 |
},
|
| 3240 |
{
|
|
|
|
| 4321 |
"name": "Llama-3.1-8b-ITA",
|
| 4322 |
"developer": "DeepMount00",
|
| 4323 |
"scores": {
|
| 4324 |
+
"IFEval": 0.5365,
|
| 4325 |
+
"BBH": 0.517,
|
| 4326 |
+
"MATH Level 5": 0.1707,
|
| 4327 |
+
"GPQA": 0.3062,
|
| 4328 |
+
"MUSR": 0.4487,
|
| 4329 |
+
"MMLU-PRO": 0.396
|
| 4330 |
}
|
| 4331 |
},
|
| 4332 |
{
|
|
|
|
| 4646 |
"name": "MN-12B-LilithFrame",
|
| 4647 |
"developer": "DoppelReflEx",
|
| 4648 |
"scores": {
|
| 4649 |
+
"IFEval": 0.436,
|
| 4650 |
+
"BBH": 0.4956,
|
| 4651 |
+
"MATH Level 5": 0.0589,
|
| 4652 |
+
"GPQA": 0.3205,
|
| 4653 |
+
"MUSR": 0.3843,
|
| 4654 |
+
"MMLU-PRO": 0.3237
|
| 4655 |
}
|
| 4656 |
},
|
| 4657 |
{
|
|
|
|
| 9144 |
"name": "SmolLM2-135M-Instruct",
|
| 9145 |
"developer": "HuggingFaceTB",
|
| 9146 |
"scores": {
|
| 9147 |
+
"IFEval": 0.2883,
|
| 9148 |
+
"BBH": 0.3124,
|
| 9149 |
+
"MATH Level 5": 0.003,
|
| 9150 |
+
"GPQA": 0.2357,
|
| 9151 |
+
"MUSR": 0.3662,
|
| 9152 |
+
"MMLU-PRO": 0.1115
|
| 9153 |
}
|
| 9154 |
},
|
| 9155 |
{
|
|
|
|
| 13057 |
"name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_XA",
|
| 13058 |
"developer": "LeroyDyer",
|
| 13059 |
"scores": {
|
| 13060 |
+
"IFEval": 0.3798,
|
| 13061 |
+
"BBH": 0.4483,
|
| 13062 |
+
"MATH Level 5": 0.04,
|
| 13063 |
+
"GPQA": 0.3129,
|
| 13064 |
+
"MUSR": 0.4148,
|
| 13065 |
+
"MMLU-PRO": 0.2389
|
| 13066 |
}
|
| 13067 |
},
|
| 13068 |
{
|
|
|
|
| 16874 |
"MMLU-PRO": 0.232
|
| 16875 |
}
|
| 16876 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16877 |
{
|
| 16878 |
"model_id": "NousResearch/Yarn-Llama-2-7b-64k",
|
| 16879 |
"name": "Yarn-Llama-2-7b-64k",
|
|
|
|
| 17191 |
"name": "code-yi",
|
| 17192 |
"developer": "Omkar1102",
|
| 17193 |
"scores": {
|
| 17194 |
+
"IFEval": 0.2148,
|
| 17195 |
+
"BBH": 0.276,
|
| 17196 |
"MATH Level 5": 0.0,
|
| 17197 |
+
"GPQA": 0.2508,
|
| 17198 |
+
"MUSR": 0.3802,
|
| 17199 |
+
"MMLU-PRO": 0.1126
|
| 17200 |
}
|
| 17201 |
},
|
| 17202 |
{
|
|
|
|
| 18128 |
"developer": "PrimeIntellect",
|
| 18129 |
"scores": {
|
| 18130 |
"IFEval": 0.1757,
|
| 18131 |
+
"BBH": 0.274,
|
| 18132 |
"MATH Level 5": 0.0,
|
| 18133 |
+
"GPQA": 0.25,
|
| 18134 |
+
"MUSR": 0.3753,
|
| 18135 |
+
"MMLU-PRO": 0.112
|
| 18136 |
}
|
| 18137 |
},
|
| 18138 |
{
|
|
|
|
| 18699 |
"name": "ODB-14B-sce",
|
| 18700 |
"developer": "Quazim0t0",
|
| 18701 |
"scores": {
|
| 18702 |
+
"IFEval": 0.2922,
|
| 18703 |
+
"BBH": 0.6559,
|
| 18704 |
+
"MATH Level 5": 0.2545,
|
| 18705 |
+
"GPQA": 0.2659,
|
| 18706 |
+
"MUSR": 0.3929,
|
| 18707 |
+
"MMLU-PRO": 0.5207
|
| 18708 |
}
|
| 18709 |
},
|
| 18710 |
{
|
|
|
|
| 19453 |
"name": "Qwen2.5-0.5B-Instruct",
|
| 19454 |
"developer": "Qwen",
|
| 19455 |
"scores": {
|
| 19456 |
+
"IFEval": 0.3071,
|
| 19457 |
+
"BBH": 0.3341,
|
| 19458 |
+
"MATH Level 5": 0.0,
|
| 19459 |
+
"GPQA": 0.2576,
|
| 19460 |
+
"MUSR": 0.3329,
|
| 19461 |
+
"MMLU-PRO": 0.1697
|
| 19462 |
}
|
| 19463 |
},
|
| 19464 |
{
|
|
|
|
| 19713 |
"name": "Qwen2.5-Coder-7B-Instruct",
|
| 19714 |
"developer": "Qwen",
|
| 19715 |
"scores": {
|
| 19716 |
+
"IFEval": 0.6101,
|
| 19717 |
+
"BBH": 0.5008,
|
| 19718 |
+
"MATH Level 5": 0.3716,
|
| 19719 |
+
"GPQA": 0.2919,
|
| 19720 |
+
"MUSR": 0.4073,
|
| 19721 |
+
"MMLU-PRO": 0.3352
|
| 19722 |
}
|
| 19723 |
},
|
| 19724 |
{
|
|
|
|
| 19973 |
"name": "Replete-LLM-Qwen2-7b",
|
| 19974 |
"developer": "Replete-AI",
|
| 19975 |
"scores": {
|
| 19976 |
+
"IFEval": 0.0905,
|
| 19977 |
+
"BBH": 0.2985,
|
| 19978 |
"MATH Level 5": 0.0,
|
| 19979 |
+
"GPQA": 0.2534,
|
| 19980 |
+
"MUSR": 0.3848,
|
| 19981 |
+
"MMLU-PRO": 0.1158
|
| 19982 |
}
|
| 19983 |
},
|
| 19984 |
{
|
|
|
|
| 24640 |
"name": "Llama-3-Instruct-8B-SPPO-Iter3",
|
| 24641 |
"developer": "UCLA-AGI",
|
| 24642 |
"scores": {
|
| 24643 |
+
"IFEval": 0.6703,
|
| 24644 |
+
"BBH": 0.5076,
|
| 24645 |
+
"MATH Level 5": 0.0718,
|
| 24646 |
"GPQA": 0.2651,
|
| 24647 |
+
"MUSR": 0.3647,
|
| 24648 |
+
"MMLU-PRO": 0.3658
|
| 24649 |
}
|
| 24650 |
},
|
| 24651 |
{
|
|
|
|
| 24991 |
"name": "llama-3-Korean-8B",
|
| 24992 |
"developer": "VIRNECT",
|
| 24993 |
"scores": {
|
| 24994 |
+
"IFEval": 0.5058,
|
| 24995 |
+
"BBH": 0.4908,
|
| 24996 |
+
"MATH Level 5": 0.0929,
|
| 24997 |
"GPQA": 0.271,
|
| 24998 |
+
"MUSR": 0.3662,
|
| 24999 |
+
"MMLU-PRO": 0.3539
|
| 25000 |
}
|
| 25001 |
},
|
| 25002 |
{
|
|
|
|
| 25095 |
"name": "Llama3.1-8B-Fireplace2",
|
| 25096 |
"developer": "ValiantLabs",
|
| 25097 |
"scores": {
|
| 25098 |
+
"IFEval": 0.5483,
|
| 25099 |
+
"BBH": 0.461,
|
| 25100 |
+
"MATH Level 5": 0.0582,
|
| 25101 |
+
"GPQA": 0.2886,
|
| 25102 |
+
"MUSR": 0.3433,
|
| 25103 |
+
"MMLU-PRO": 0.2407
|
| 25104 |
}
|
| 25105 |
},
|
| 25106 |
{
|
|
|
|
| 25108 |
"name": "Llama3.1-8B-ShiningValiant2",
|
| 25109 |
"developer": "ValiantLabs",
|
| 25110 |
"scores": {
|
| 25111 |
+
"IFEval": 0.2678,
|
| 25112 |
+
"BBH": 0.4429,
|
| 25113 |
+
"MATH Level 5": 0.0521,
|
| 25114 |
+
"GPQA": 0.302,
|
| 25115 |
+
"MUSR": 0.3959,
|
| 25116 |
+
"MMLU-PRO": 0.2927
|
| 25117 |
}
|
| 25118 |
},
|
| 25119 |
{
|
|
|
|
| 25641 |
"name": "Qwen2.5-14B-YOYO-1010",
|
| 25642 |
"developer": "YOYO-AI",
|
| 25643 |
"scores": {
|
| 25644 |
+
"IFEval": 0.7905,
|
| 25645 |
+
"BBH": 0.6406,
|
| 25646 |
+
"MATH Level 5": 0.0,
|
| 25647 |
+
"GPQA": 0.3163,
|
| 25648 |
+
"MUSR": 0.4181,
|
| 25649 |
+
"MMLU-PRO": 0.4944
|
| 25650 |
}
|
| 25651 |
},
|
| 25652 |
{
|
|
|
|
| 26590 |
"name": "QAIMath-Qwen2.5-7B-TIES",
|
| 26591 |
"developer": "adriszmar",
|
| 26592 |
"scores": {
|
| 26593 |
+
"IFEval": 0.1746,
|
| 26594 |
+
"BBH": 0.3126,
|
| 26595 |
+
"MATH Level 5": 0.0,
|
| 26596 |
+
"GPQA": 0.245,
|
| 26597 |
+
"MUSR": 0.4096,
|
| 26598 |
+
"MMLU-PRO": 0.1087
|
| 26599 |
}
|
| 26600 |
},
|
| 26601 |
{
|
|
|
|
| 26876 |
"name": "Llama-3.1-Storm-8B",
|
| 26877 |
"developer": "akjindal53244",
|
| 26878 |
"scores": {
|
| 26879 |
+
"IFEval": 0.8033,
|
| 26880 |
+
"BBH": 0.5196,
|
| 26881 |
+
"MATH Level 5": 0.1624,
|
| 26882 |
+
"GPQA": 0.3096,
|
| 26883 |
"MUSR": 0.4028,
|
| 26884 |
+
"MMLU-PRO": 0.3812
|
| 26885 |
}
|
| 26886 |
},
|
| 26887 |
{
|
|
|
|
| 26902 |
"name": "Llama-3.1-Tulu-3-70B",
|
| 26903 |
"developer": "allenai",
|
| 26904 |
"scores": {
|
| 26905 |
+
"IFEval": 0.8291,
|
| 26906 |
+
"BBH": 0.6164,
|
| 26907 |
+
"MATH Level 5": 0.4502,
|
| 26908 |
"GPQA": 0.3733,
|
| 26909 |
+
"MUSR": 0.4948,
|
| 26910 |
+
"MMLU-PRO": 0.4645
|
| 26911 |
}
|
| 26912 |
},
|
| 26913 |
{
|
|
|
|
| 31634 |
"name": "dolphin-2.9.2-Phi-3-Medium-abliterated",
|
| 31635 |
"developer": "cognitivecomputations",
|
| 31636 |
"scores": {
|
| 31637 |
+
"IFEval": 0.3613,
|
| 31638 |
+
"BBH": 0.6123,
|
| 31639 |
+
"MATH Level 5": 0.1239,
|
| 31640 |
+
"GPQA": 0.328,
|
| 31641 |
+
"MUSR": 0.4112,
|
| 31642 |
+
"MMLU-PRO": 0.4494
|
| 31643 |
}
|
| 31644 |
},
|
| 31645 |
{
|
|
|
|
| 31777 |
"name": "llama-43m-beta",
|
| 31778 |
"developer": "cpayne1303",
|
| 31779 |
"scores": {
|
| 31780 |
+
"IFEval": 0.1949,
|
| 31781 |
+
"BBH": 0.2965,
|
| 31782 |
+
"MATH Level 5": 0.0045,
|
| 31783 |
"GPQA": 0.2685,
|
| 31784 |
+
"MUSR": 0.3885,
|
| 31785 |
+
"MMLU-PRO": 0.1111
|
| 31786 |
}
|
| 31787 |
},
|
| 31788 |
{
|
|
|
|
| 32154 |
"name": "Llama-3-8B-Orpo-v0.1",
|
| 32155 |
"developer": "dfurman",
|
| 32156 |
"scores": {
|
| 32157 |
+
"IFEval": 0.3,
|
| 32158 |
+
"BBH": 0.3853,
|
| 32159 |
+
"MATH Level 5": 0.0415,
|
| 32160 |
+
"GPQA": 0.2617,
|
| 32161 |
+
"MUSR": 0.3579,
|
| 32162 |
+
"MMLU-PRO": 0.2281
|
| 32163 |
}
|
| 32164 |
},
|
| 32165 |
{
|
|
|
|
| 34650 |
"name": "gemma-2-2b",
|
| 34651 |
"developer": "Google",
|
| 34652 |
"scores": {
|
| 34653 |
+
"IFEval": 0.1993,
|
| 34654 |
+
"BBH": 0.3656,
|
| 34655 |
+
"MATH Level 5": 0.0287,
|
| 34656 |
"GPQA": 0.2626,
|
| 34657 |
+
"MUSR": 0.4232,
|
| 34658 |
+
"MMLU-PRO": 0.218
|
| 34659 |
}
|
| 34660 |
},
|
| 34661 |
{
|
|
|
|
| 34676 |
"name": "gemma-2-2b-jpn-it",
|
| 34677 |
"developer": "Google",
|
| 34678 |
"scores": {
|
| 34679 |
+
"IFEval": 0.5288,
|
| 34680 |
+
"BBH": 0.4178,
|
| 34681 |
+
"MATH Level 5": 0.0476,
|
| 34682 |
+
"GPQA": 0.2752,
|
| 34683 |
+
"MUSR": 0.3728,
|
| 34684 |
+
"MMLU-PRO": 0.2467
|
| 34685 |
}
|
| 34686 |
},
|
| 34687 |
{
|
|
|
|
| 37692 |
"name": "Kosmos-EVAA-Fusion-8B",
|
| 37693 |
"developer": "jaspionjader",
|
| 37694 |
"scores": {
|
| 37695 |
+
"IFEval": 0.4418,
|
| 37696 |
+
"BBH": 0.5406,
|
| 37697 |
+
"MATH Level 5": 0.1352,
|
| 37698 |
+
"GPQA": 0.3062,
|
| 37699 |
"MUSR": 0.4277,
|
| 37700 |
+
"MMLU-PRO": 0.386
|
| 37701 |
}
|
| 37702 |
},
|
| 37703 |
{
|
|
|
|
| 42346 |
"name": "Mistral-v0.3-7B-ORPO",
|
| 42347 |
"developer": "llmat",
|
| 42348 |
"scores": {
|
| 42349 |
+
"IFEval": 0.377,
|
| 42350 |
+
"BBH": 0.3978,
|
| 42351 |
+
"MATH Level 5": 0.0242,
|
| 42352 |
+
"GPQA": 0.2668,
|
| 42353 |
+
"MUSR": 0.3555,
|
| 42354 |
+
"MMLU-PRO": 0.2278
|
| 42355 |
}
|
| 42356 |
},
|
| 42357 |
{
|
|
|
|
| 44465 |
"name": "Mixtral-8x7B-v0.1",
|
| 44466 |
"developer": "mistralai",
|
| 44467 |
"scores": {
|
| 44468 |
+
"IFEval": 0.2415,
|
| 44469 |
+
"BBH": 0.5087,
|
| 44470 |
+
"MATH Level 5": 0.102,
|
| 44471 |
+
"GPQA": 0.3138,
|
| 44472 |
+
"MUSR": 0.4321,
|
| 44473 |
+
"MMLU-PRO": 0.385
|
| 44474 |
}
|
| 44475 |
},
|
| 44476 |
{
|
|
|
|
| 44725 |
"name": "NeuralDaredevil-8B-abliterated",
|
| 44726 |
"developer": "mlabonne",
|
| 44727 |
"scores": {
|
| 44728 |
+
"IFEval": 0.7561,
|
| 44729 |
+
"BBH": 0.5111,
|
| 44730 |
+
"MATH Level 5": 0.0906,
|
| 44731 |
+
"GPQA": 0.3062,
|
| 44732 |
+
"MUSR": 0.4019,
|
| 44733 |
+
"MMLU-PRO": 0.3841
|
| 44734 |
}
|
| 44735 |
},
|
| 44736 |
{
|
|
|
|
| 45063 |
"name": "Mistral-Nemo-Kurdish-Instruct",
|
| 45064 |
"developer": "nazimali",
|
| 45065 |
"scores": {
|
| 45066 |
+
"IFEval": 0.486,
|
| 45067 |
+
"BBH": 0.4721,
|
| 45068 |
+
"MATH Level 5": 0.0846,
|
| 45069 |
+
"GPQA": 0.2844,
|
| 45070 |
+
"MUSR": 0.4006,
|
| 45071 |
+
"MMLU-PRO": 0.3087
|
| 45072 |
}
|
| 45073 |
},
|
| 45074 |
{
|
|
|
|
| 46766 |
"name": "franqwenstein-35b",
|
| 46767 |
"developer": "nisten",
|
| 46768 |
"scores": {
|
| 46769 |
+
"IFEval": 0.3799,
|
| 46770 |
+
"BBH": 0.6647,
|
| 46771 |
+
"MATH Level 5": 0.3406,
|
| 46772 |
+
"GPQA": 0.4035,
|
| 46773 |
+
"MUSR": 0.494,
|
| 46774 |
+
"MMLU-PRO": 0.5731
|
| 46775 |
}
|
| 46776 |
},
|
| 46777 |
{
|
|
|
|
| 48716 |
"name": "Llama-3-8B-ProLong-512k-Instruct",
|
| 48717 |
"developer": "princeton-nlp",
|
| 48718 |
"scores": {
|
| 48719 |
+
"IFEval": 0.3978,
|
| 48720 |
+
"BBH": 0.4983,
|
| 48721 |
+
"MATH Level 5": 0.0582,
|
| 48722 |
+
"GPQA": 0.281,
|
| 48723 |
+
"MUSR": 0.425,
|
| 48724 |
+
"MMLU-PRO": 0.3246
|
| 48725 |
}
|
| 48726 |
},
|
| 48727 |
{
|
|
|
|
| 51290 |
"name": "Gemma-2-Ataraxy-Gemmasutra-9B-slerp",
|
| 51291 |
"developer": "recoilme",
|
| 51292 |
"scores": {
|
| 51293 |
+
"IFEval": 0.2854,
|
| 51294 |
+
"BBH": 0.5984,
|
| 51295 |
+
"MATH Level 5": 0.1005,
|
| 51296 |
+
"GPQA": 0.3297,
|
| 51297 |
+
"MUSR": 0.4607,
|
| 51298 |
+
"MMLU-PRO": 0.4162
|
| 51299 |
}
|
| 51300 |
},
|
| 51301 |
{
|
|
|
|
| 51316 |
"name": "recoilme-gemma-2-9B-v0.2",
|
| 51317 |
"developer": "recoilme",
|
| 51318 |
"scores": {
|
| 51319 |
+
"IFEval": 0.7592,
|
| 51320 |
+
"BBH": 0.6026,
|
| 51321 |
+
"MATH Level 5": 0.0529,
|
| 51322 |
+
"GPQA": 0.3289,
|
| 51323 |
+
"MUSR": 0.4099,
|
| 51324 |
+
"MMLU-PRO": 0.4163
|
| 51325 |
}
|
| 51326 |
},
|
| 51327 |
{
|
|
|
|
| 51329 |
"name": "recoilme-gemma-2-9B-v0.3",
|
| 51330 |
"developer": "recoilme",
|
| 51331 |
"scores": {
|
| 51332 |
+
"IFEval": 0.5761,
|
| 51333 |
+
"BBH": 0.602,
|
| 51334 |
+
"MATH Level 5": 0.1888,
|
| 51335 |
+
"GPQA": 0.3372,
|
| 51336 |
+
"MUSR": 0.4632,
|
| 51337 |
+
"MMLU-PRO": 0.4039
|
| 51338 |
}
|
| 51339 |
},
|
| 51340 |
{
|
|
|
|
| 56984 |
"name": "BagelMIsteryTour-v2-8x7B",
|
| 56985 |
"developer": "ycros",
|
| 56986 |
"scores": {
|
| 56987 |
+
"IFEval": 0.5994,
|
| 56988 |
+
"BBH": 0.5159,
|
| 56989 |
+
"MATH Level 5": 0.0785,
|
| 56990 |
+
"GPQA": 0.3045,
|
| 56991 |
+
"MUSR": 0.4203,
|
| 56992 |
+
"MMLU-PRO": 0.3473
|
| 56993 |
}
|
| 56994 |
},
|
| 56995 |
{
|
data/benchmarks/livecodebenchpro.json
CHANGED
|
@@ -205,9 +205,9 @@
|
|
| 205 |
"name": "gpt-5-2025-08-07",
|
| 206 |
"developer": "OpenAI",
|
| 207 |
"scores": {
|
| 208 |
-
"Hard Problems": 0.
|
| 209 |
-
"Medium Problems": 0.
|
| 210 |
-
"Easy Problems": 0.
|
| 211 |
}
|
| 212 |
},
|
| 213 |
{
|
|
|
|
| 205 |
"name": "gpt-5-2025-08-07",
|
| 206 |
"developer": "OpenAI",
|
| 207 |
"scores": {
|
| 208 |
+
"Hard Problems": 0.04225352112676056,
|
| 209 |
+
"Medium Problems": 0.4084507042253521,
|
| 210 |
+
"Easy Problems": 0.8873239436619719
|
| 211 |
}
|
| 212 |
},
|
| 213 |
{
|
data/benchmarks/reward-bench.json
CHANGED
|
@@ -453,16 +453,16 @@
|
|
| 453 |
"name": "LxzGordon/URM-LLaMa-3.1-8B",
|
| 454 |
"developer": "LxzGordon",
|
| 455 |
"scores": {
|
| 456 |
-
"Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
"Factuality": 0.6884,
|
| 458 |
"Precise IF": 0.45,
|
| 459 |
"Math": 0.6393,
|
| 460 |
-
"Safety": 0.9108,
|
| 461 |
"Focus": 0.9758,
|
| 462 |
-
"Ties": 0.7653
|
| 463 |
-
"Chat": 0.9553,
|
| 464 |
-
"Chat Hard": 0.8816,
|
| 465 |
-
"Reasoning": 0.9698
|
| 466 |
}
|
| 467 |
},
|
| 468 |
{
|
|
@@ -555,17 +555,17 @@
|
|
| 555 |
"name": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1",
|
| 556 |
"developer": "OpenAssistant",
|
| 557 |
"scores": {
|
| 558 |
-
"Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 559 |
"Factuality": 0.3979,
|
| 560 |
"Precise IF": 0.2875,
|
| 561 |
"Math": 0.377,
|
| 562 |
-
"Safety": 0.5446,
|
| 563 |
"Focus": 0.1535,
|
| 564 |
-
"Ties": 0.047
|
| 565 |
-
"Chat": 0.9246,
|
| 566 |
-
"Chat Hard": 0.3728,
|
| 567 |
-
"Reasoning": 0.5855,
|
| 568 |
-
"Prior Sets (0.5 weight)": 0.6801
|
| 569 |
}
|
| 570 |
},
|
| 571 |
{
|
|
@@ -573,17 +573,17 @@
|
|
| 573 |
"name": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5",
|
| 574 |
"developer": "OpenAssistant",
|
| 575 |
"scores": {
|
| 576 |
-
"Score": 0.
|
| 577 |
-
"Chat": 0.8855,
|
| 578 |
-
"Chat Hard": 0.4868,
|
| 579 |
-
"Safety": 0.3244,
|
| 580 |
-
"Reasoning": 0.7752,
|
| 581 |
-
"Prior Sets (0.5 weight)": 0.6533,
|
| 582 |
"Factuality": 0.3179,
|
| 583 |
"Precise IF": 0.2625,
|
| 584 |
"Math": 0.3934,
|
|
|
|
| 585 |
"Focus": 0.2707,
|
| 586 |
-
"Ties": 0.0198
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
}
|
| 588 |
},
|
| 589 |
{
|
|
@@ -609,17 +609,17 @@
|
|
| 609 |
"name": "PKU-Alignment/beaver-7b-v1.0-cost",
|
| 610 |
"developer": "PKU-Alignment",
|
| 611 |
"scores": {
|
| 612 |
-
"Score": 0.
|
| 613 |
-
"Chat": 0.6173,
|
| 614 |
-
"Chat Hard": 0.4232,
|
| 615 |
-
"Safety": 0.7589,
|
| 616 |
-
"Reasoning": 0.5482,
|
| 617 |
-
"Prior Sets (0.5 weight)": 0.57,
|
| 618 |
"Factuality": 0.3263,
|
| 619 |
"Precise IF": 0.2313,
|
| 620 |
"Math": 0.3989,
|
|
|
|
| 621 |
"Focus": 0.2939,
|
| 622 |
-
"Ties": -0.01
|
|
|
|
|
|
|
|
|
|
|
|
|
| 623 |
}
|
| 624 |
},
|
| 625 |
{
|
|
@@ -627,17 +627,17 @@
|
|
| 627 |
"name": "PKU-Alignment/beaver-7b-v1.0-reward",
|
| 628 |
"developer": "PKU-Alignment",
|
| 629 |
"scores": {
|
| 630 |
-
"Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 631 |
"Factuality": 0.2105,
|
| 632 |
"Precise IF": 0.2938,
|
| 633 |
"Math": 0.2623,
|
| 634 |
-
"Safety": 0.3757,
|
| 635 |
"Focus": 0.0646,
|
| 636 |
-
"Ties": -0.01
|
| 637 |
-
"Chat": 0.8184,
|
| 638 |
-
"Chat Hard": 0.2873,
|
| 639 |
-
"Reasoning": 0.346,
|
| 640 |
-
"Prior Sets (0.5 weight)": 0.5993
|
| 641 |
}
|
| 642 |
},
|
| 643 |
{
|
|
@@ -663,17 +663,17 @@
|
|
| 663 |
"name": "PKU-Alignment/beaver-7b-v2.0-reward",
|
| 664 |
"developer": "PKU-Alignment",
|
| 665 |
"scores": {
|
| 666 |
-
"Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 667 |
"Factuality": 0.2168,
|
| 668 |
"Precise IF": 0.2562,
|
| 669 |
"Math": 0.3825,
|
| 670 |
-
"Safety": 0.6041,
|
| 671 |
"Focus": 0.2606,
|
| 672 |
-
"Ties": 0.0944
|
| 673 |
-
"Chat": 0.8994,
|
| 674 |
-
"Chat Hard": 0.364,
|
| 675 |
-
"Reasoning": 0.6887,
|
| 676 |
-
"Prior Sets (0.5 weight)": 0.6171
|
| 677 |
}
|
| 678 |
},
|
| 679 |
{
|
|
@@ -921,16 +921,16 @@
|
|
| 921 |
"name": "Ray2333/GRM-gemma2-2B-rewardmodel-ft",
|
| 922 |
"developer": "Ray2333",
|
| 923 |
"scores": {
|
| 924 |
-
"Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 925 |
"Factuality": 0.5305,
|
| 926 |
"Precise IF": 0.3125,
|
| 927 |
"Math": 0.5902,
|
| 928 |
-
"Safety": 0.9216,
|
| 929 |
"Focus": 0.7455,
|
| 930 |
-
"Ties": 0.4788
|
| 931 |
-
"Chat": 0.9302,
|
| 932 |
-
"Chat Hard": 0.7719,
|
| 933 |
-
"Reasoning": 0.912
|
| 934 |
}
|
| 935 |
},
|
| 936 |
{
|
|
@@ -956,17 +956,17 @@
|
|
| 956 |
"name": "Ray2333/GRM-llama3-8B-sftreg",
|
| 957 |
"developer": "Ray2333",
|
| 958 |
"scores": {
|
| 959 |
-
"Score": 0.
|
| 960 |
-
"Chat": 0.986,
|
| 961 |
-
"Chat Hard": 0.6776,
|
| 962 |
-
"Safety": 0.7867,
|
| 963 |
-
"Reasoning": 0.9229,
|
| 964 |
-
"Prior Sets (0.5 weight)": 0.7309,
|
| 965 |
"Factuality": 0.6189,
|
| 966 |
"Precise IF": 0.3875,
|
| 967 |
"Math": 0.5792,
|
|
|
|
| 968 |
"Focus": 0.6828,
|
| 969 |
-
"Ties": 0.5981
|
|
|
|
|
|
|
|
|
|
|
|
|
| 970 |
}
|
| 971 |
},
|
| 972 |
{
|
|
@@ -1139,16 +1139,16 @@
|
|
| 1139 |
"name": "Skywork/Skywork-Reward-Gemma-2-27B",
|
| 1140 |
"developer": "Skywork",
|
| 1141 |
"scores": {
|
| 1142 |
-
"Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1143 |
"Factuality": 0.7368,
|
| 1144 |
"Precise IF": 0.4031,
|
| 1145 |
"Math": 0.7049,
|
| 1146 |
-
"Safety": 0.9189,
|
| 1147 |
"Focus": 0.9323,
|
| 1148 |
-
"Ties": 0.8261
|
| 1149 |
-
"Chat": 0.9581,
|
| 1150 |
-
"Chat Hard": 0.9145,
|
| 1151 |
-
"Reasoning": 0.9606
|
| 1152 |
}
|
| 1153 |
},
|
| 1154 |
{
|
|
@@ -1156,16 +1156,16 @@
|
|
| 1156 |
"name": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2",
|
| 1157 |
"developer": "Skywork",
|
| 1158 |
"scores": {
|
| 1159 |
-
"Score": 0.
|
| 1160 |
-
"Chat": 0.9609,
|
| 1161 |
-
"Chat Hard": 0.8991,
|
| 1162 |
-
"Safety": 0.9689,
|
| 1163 |
-
"Reasoning": 0.9807,
|
| 1164 |
"Factuality": 0.7674,
|
| 1165 |
"Precise IF": 0.375,
|
| 1166 |
"Math": 0.6721,
|
|
|
|
| 1167 |
"Focus": 0.9172,
|
| 1168 |
-
"Ties": 0.8182
|
|
|
|
|
|
|
|
|
|
| 1169 |
}
|
| 1170 |
},
|
| 1171 |
{
|
|
@@ -1173,16 +1173,16 @@
|
|
| 1173 |
"name": "Skywork/Skywork-Reward-Llama-3.1-8B",
|
| 1174 |
"developer": "Skywork",
|
| 1175 |
"scores": {
|
| 1176 |
-
"Score": 0.
|
| 1177 |
-
"Chat": 0.9581,
|
| 1178 |
-
"Chat Hard": 0.8728,
|
| 1179 |
-
"Safety": 0.9333,
|
| 1180 |
-
"Reasoning": 0.962,
|
| 1181 |
"Factuality": 0.6989,
|
| 1182 |
"Precise IF": 0.425,
|
| 1183 |
"Math": 0.6284,
|
|
|
|
| 1184 |
"Focus": 0.9616,
|
| 1185 |
-
"Ties": 0.741
|
|
|
|
|
|
|
|
|
|
| 1186 |
}
|
| 1187 |
},
|
| 1188 |
{
|
|
@@ -1305,16 +1305,16 @@
|
|
| 1305 |
"name": "Skywork/Skywork-VL-Reward-7B",
|
| 1306 |
"developer": "Skywork",
|
| 1307 |
"scores": {
|
| 1308 |
-
"Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1309 |
"Factuality": 0.6063,
|
| 1310 |
"Precise IF": 0.35,
|
| 1311 |
"Math": 0.6339,
|
| 1312 |
-
"Safety": 0.9108,
|
| 1313 |
"Focus": 0.8909,
|
| 1314 |
-
"Ties": 0.7586
|
| 1315 |
-
"Chat": 0.8994,
|
| 1316 |
-
"Chat Hard": 0.875,
|
| 1317 |
-
"Reasoning": 0.9176
|
| 1318 |
}
|
| 1319 |
},
|
| 1320 |
{
|
|
@@ -1379,9 +1379,9 @@
|
|
| 1379 |
"name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...",
|
| 1380 |
"developer": "AI2",
|
| 1381 |
"scores": {
|
| 1382 |
-
"Score": 0.
|
| 1383 |
-
"Chat": 0.
|
| 1384 |
-
"Chat Hard": 0.
|
| 1385 |
"Safety": 0.7757
|
| 1386 |
}
|
| 1387 |
},
|
|
@@ -1423,17 +1423,17 @@
|
|
| 1423 |
"name": "allenai/Llama-3.1-70B-Instruct-RM-RB2",
|
| 1424 |
"developer": "allenai",
|
| 1425 |
"scores": {
|
| 1426 |
-
"Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1427 |
"Factuality": 0.8126,
|
| 1428 |
"Precise IF": 0.4188,
|
| 1429 |
"Math": 0.6995,
|
| 1430 |
-
"Safety": 0.9095,
|
| 1431 |
"Focus": 0.8646,
|
| 1432 |
-
"Ties": 0.8835
|
| 1433 |
-
"Chat": 0.9665,
|
| 1434 |
-
"Chat Hard": 0.8355,
|
| 1435 |
-
"Reasoning": 0.8969,
|
| 1436 |
-
"Prior Sets (0.5 weight)": 0.0
|
| 1437 |
}
|
| 1438 |
},
|
| 1439 |
{
|
|
@@ -1459,17 +1459,17 @@
|
|
| 1459 |
"name": "allenai/Llama-3.1-8B-Instruct-RM-RB2",
|
| 1460 |
"developer": "allenai",
|
| 1461 |
"scores": {
|
| 1462 |
-
"Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1463 |
"Factuality": 0.7432,
|
| 1464 |
"Precise IF": 0.4437,
|
| 1465 |
"Math": 0.6175,
|
| 1466 |
-
"Safety": 0.8932,
|
| 1467 |
"Focus": 0.9071,
|
| 1468 |
-
"Ties": 0.7638
|
| 1469 |
-
"Chat": 0.9581,
|
| 1470 |
-
"Chat Hard": 0.8158,
|
| 1471 |
-
"Reasoning": 0.887,
|
| 1472 |
-
"Prior Sets (0.5 weight)": 0.0
|
| 1473 |
}
|
| 1474 |
},
|
| 1475 |
{
|
|
@@ -1477,17 +1477,17 @@
|
|
| 1477 |
"name": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2",
|
| 1478 |
"developer": "allenai",
|
| 1479 |
"scores": {
|
| 1480 |
-
"Score": 0.
|
| 1481 |
-
"Chat": 0.9693,
|
| 1482 |
-
"Chat Hard": 0.8268,
|
| 1483 |
-
"Safety": 0.8689,
|
| 1484 |
-
"Reasoning": 0.8583,
|
| 1485 |
-
"Prior Sets (0.5 weight)": 0.0,
|
| 1486 |
"Factuality": 0.8084,
|
| 1487 |
"Precise IF": 0.3688,
|
| 1488 |
"Math": 0.6776,
|
|
|
|
| 1489 |
"Focus": 0.7778,
|
| 1490 |
-
"Ties": 0.8308
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1491 |
}
|
| 1492 |
},
|
| 1493 |
{
|
|
@@ -1495,17 +1495,17 @@
|
|
| 1495 |
"name": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2",
|
| 1496 |
"developer": "allenai",
|
| 1497 |
"scores": {
|
| 1498 |
-
"Score": 0.
|
| 1499 |
-
"Chat": 0.9553,
|
| 1500 |
-
"Chat Hard": 0.761,
|
| 1501 |
-
"Safety": 0.86,
|
| 1502 |
-
"Reasoning": 0.7898,
|
| 1503 |
-
"Prior Sets (0.5 weight)": 0.0,
|
| 1504 |
"Factuality": 0.7516,
|
| 1505 |
"Precise IF": 0.3875,
|
| 1506 |
"Math": 0.6284,
|
|
|
|
| 1507 |
"Focus": 0.8545,
|
| 1508 |
-
"Ties": 0.6397
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1509 |
}
|
| 1510 |
},
|
| 1511 |
{
|
|
@@ -3784,16 +3784,16 @@
|
|
| 3784 |
"name": "infly/INF-ORM-Llama3.1-70B",
|
| 3785 |
"developer": "infly",
|
| 3786 |
"scores": {
|
| 3787 |
-
"Score": 0.
|
| 3788 |
-
"Chat": 0.9665,
|
| 3789 |
-
"Chat Hard": 0.9101,
|
| 3790 |
-
"Safety": 0.9644,
|
| 3791 |
-
"Reasoning": 0.9912,
|
| 3792 |
"Factuality": 0.7411,
|
| 3793 |
"Precise IF": 0.4188,
|
| 3794 |
"Math": 0.6995,
|
|
|
|
| 3795 |
"Focus": 0.903,
|
| 3796 |
-
"Ties": 0.8622
|
|
|
|
|
|
|
|
|
|
| 3797 |
}
|
| 3798 |
},
|
| 3799 |
{
|
|
@@ -3835,16 +3835,16 @@
|
|
| 3835 |
"name": "internlm/internlm2-7b-reward",
|
| 3836 |
"developer": "internlm",
|
| 3837 |
"scores": {
|
| 3838 |
-
"Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3839 |
"Factuality": 0.4211,
|
| 3840 |
"Precise IF": 0.4,
|
| 3841 |
"Math": 0.5628,
|
| 3842 |
-
"Safety": 0.8716,
|
| 3843 |
"Focus": 0.7051,
|
| 3844 |
-
"Ties": 0.5164
|
| 3845 |
-
"Chat": 0.9916,
|
| 3846 |
-
"Chat Hard": 0.6952,
|
| 3847 |
-
"Reasoning": 0.9453
|
| 3848 |
}
|
| 3849 |
},
|
| 3850 |
{
|
|
@@ -4014,16 +4014,16 @@
|
|
| 4014 |
"name": "nicolinho/QRM-Gemma-2-27B",
|
| 4015 |
"developer": "nicolinho",
|
| 4016 |
"scores": {
|
| 4017 |
-
"Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4018 |
"Factuality": 0.7853,
|
| 4019 |
"Precise IF": 0.3719,
|
| 4020 |
"Math": 0.6995,
|
| 4021 |
-
"Safety": 0.927,
|
| 4022 |
"Focus": 0.9535,
|
| 4023 |
-
"Ties": 0.8321
|
| 4024 |
-
"Chat": 0.9665,
|
| 4025 |
-
"Chat Hard": 0.9013,
|
| 4026 |
-
"Reasoning": 0.9826
|
| 4027 |
}
|
| 4028 |
},
|
| 4029 |
{
|
|
@@ -4055,16 +4055,16 @@
|
|
| 4055 |
"name": "nicolinho/QRM-Llama3.1-8B-v2",
|
| 4056 |
"developer": "nicolinho",
|
| 4057 |
"scores": {
|
| 4058 |
-
"Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4059 |
"Factuality": 0.6653,
|
| 4060 |
"Precise IF": 0.4062,
|
| 4061 |
"Math": 0.612,
|
| 4062 |
-
"Safety": 0.9257,
|
| 4063 |
"Focus": 0.8909,
|
| 4064 |
-
"Ties": 0.7234
|
| 4065 |
-
"Chat": 0.9637,
|
| 4066 |
-
"Chat Hard": 0.8684,
|
| 4067 |
-
"Reasoning": 0.9677
|
| 4068 |
}
|
| 4069 |
},
|
| 4070 |
{
|
|
@@ -4202,16 +4202,16 @@
|
|
| 4202 |
"name": "GPT-4o 2024-08-06",
|
| 4203 |
"developer": "OpenAI",
|
| 4204 |
"scores": {
|
| 4205 |
-
"Score": 0.
|
| 4206 |
-
"Chat": 0.9609,
|
| 4207 |
-
"Chat Hard": 0.761,
|
| 4208 |
-
"Safety": 0.8619,
|
| 4209 |
-
"Reasoning": 0.8661,
|
| 4210 |
"Factuality": 0.5684,
|
| 4211 |
"Precise IF": 0.3312,
|
| 4212 |
"Math": 0.623,
|
|
|
|
| 4213 |
"Focus": 0.7293,
|
| 4214 |
-
"Ties": 0.7819
|
|
|
|
|
|
|
|
|
|
| 4215 |
}
|
| 4216 |
},
|
| 4217 |
{
|
|
@@ -4249,17 +4249,17 @@
|
|
| 4249 |
"name": "openbmb/Eurus-RM-7b",
|
| 4250 |
"developer": "openbmb",
|
| 4251 |
"scores": {
|
| 4252 |
-
"Score": 0.
|
| 4253 |
-
"Chat": 0.9804,
|
| 4254 |
-
"Chat Hard": 0.6557,
|
| 4255 |
-
"Safety": 0.6267,
|
| 4256 |
-
"Reasoning": 0.8633,
|
| 4257 |
-
"Prior Sets (0.5 weight)": 0.7172,
|
| 4258 |
"Factuality": 0.6,
|
| 4259 |
"Precise IF": 0.3438,
|
| 4260 |
"Math": 0.5683,
|
|
|
|
| 4261 |
"Focus": 0.7475,
|
| 4262 |
-
"Ties": 0.5972
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4263 |
}
|
| 4264 |
},
|
| 4265 |
{
|
|
@@ -4370,17 +4370,17 @@
|
|
| 4370 |
"name": "sfairXC/FsfairX-LLaMA3-RM-v0.1",
|
| 4371 |
"developer": "sfairXC",
|
| 4372 |
"scores": {
|
| 4373 |
-
"Score": 0.
|
| 4374 |
-
"Chat": 0.9944,
|
| 4375 |
-
"Chat Hard": 0.6513,
|
| 4376 |
-
"Safety": 0.7667,
|
| 4377 |
-
"Reasoning": 0.8644,
|
| 4378 |
-
"Prior Sets (0.5 weight)": 0.7492,
|
| 4379 |
"Factuality": 0.5916,
|
| 4380 |
"Precise IF": 0.4188,
|
| 4381 |
"Math": 0.6284,
|
|
|
|
| 4382 |
"Focus": 0.7051,
|
| 4383 |
-
"Ties": 0.6647
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4384 |
}
|
| 4385 |
},
|
| 4386 |
{
|
|
@@ -4492,17 +4492,17 @@
|
|
| 4492 |
"name": "weqweasdas/RM-Gemma-2B",
|
| 4493 |
"developer": "weqweasdas",
|
| 4494 |
"scores": {
|
| 4495 |
-
"Score": 0.
|
| 4496 |
-
"Chat": 0.9441,
|
| 4497 |
-
"Chat Hard": 0.4079,
|
| 4498 |
-
"Safety": 0.3311,
|
| 4499 |
-
"Reasoning": 0.7637,
|
| 4500 |
-
"Prior Sets (0.5 weight)": 0.6652,
|
| 4501 |
"Factuality": 0.3705,
|
| 4502 |
"Precise IF": 0.2812,
|
| 4503 |
"Math": 0.4317,
|
|
|
|
| 4504 |
"Focus": 0.2343,
|
| 4505 |
-
"Ties": 0.1851
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4506 |
}
|
| 4507 |
},
|
| 4508 |
{
|
|
@@ -4541,17 +4541,17 @@
|
|
| 4541 |
"name": "weqweasdas/RM-Mistral-7B",
|
| 4542 |
"developer": "weqweasdas",
|
| 4543 |
"scores": {
|
| 4544 |
-
"Score": 0.
|
| 4545 |
-
"Chat": 0.9665,
|
| 4546 |
-
"Chat Hard": 0.6053,
|
| 4547 |
-
"Safety": 0.6911,
|
| 4548 |
-
"Reasoning": 0.7736,
|
| 4549 |
-
"Prior Sets (0.5 weight)": 0.753,
|
| 4550 |
"Factuality": 0.5937,
|
| 4551 |
"Precise IF": 0.3438,
|
| 4552 |
"Math": 0.5956,
|
|
|
|
| 4553 |
"Focus": 0.7293,
|
| 4554 |
-
"Ties": 0.6226
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4555 |
}
|
| 4556 |
},
|
| 4557 |
{
|
|
@@ -4559,17 +4559,17 @@
|
|
| 4559 |
"name": "weqweasdas/hh_rlhf_rm_open_llama_3b",
|
| 4560 |
"developer": "weqweasdas",
|
| 4561 |
"scores": {
|
| 4562 |
-
"Score": 0.
|
| 4563 |
-
"Chat": 0.8184,
|
| 4564 |
-
"Chat Hard": 0.3728,
|
| 4565 |
-
"Safety": 0.24,
|
| 4566 |
-
"Reasoning": 0.3281,
|
| 4567 |
-
"Prior Sets (0.5 weight)": 0.6564,
|
| 4568 |
"Factuality": 0.3642,
|
| 4569 |
"Precise IF": 0.275,
|
| 4570 |
"Math": 0.3497,
|
|
|
|
| 4571 |
"Focus": 0.2384,
|
| 4572 |
-
"Ties": 0.0315
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4573 |
}
|
| 4574 |
}
|
| 4575 |
]
|
|
|
|
| 453 |
"name": "LxzGordon/URM-LLaMa-3.1-8B",
|
| 454 |
"developer": "LxzGordon",
|
| 455 |
"scores": {
|
| 456 |
+
"Score": 0.7394,
|
| 457 |
+
"Chat": 0.9553,
|
| 458 |
+
"Chat Hard": 0.8816,
|
| 459 |
+
"Safety": 0.9178,
|
| 460 |
+
"Reasoning": 0.9698,
|
| 461 |
"Factuality": 0.6884,
|
| 462 |
"Precise IF": 0.45,
|
| 463 |
"Math": 0.6393,
|
|
|
|
| 464 |
"Focus": 0.9758,
|
| 465 |
+
"Ties": 0.7653
|
|
|
|
|
|
|
|
|
|
| 466 |
}
|
| 467 |
},
|
| 468 |
{
|
|
|
|
| 555 |
"name": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1",
|
| 556 |
"developer": "OpenAssistant",
|
| 557 |
"scores": {
|
| 558 |
+
"Score": 0.2653,
|
| 559 |
+
"Chat": 0.9246,
|
| 560 |
+
"Chat Hard": 0.3728,
|
| 561 |
+
"Safety": 0.3289,
|
| 562 |
+
"Reasoning": 0.5855,
|
| 563 |
+
"Prior Sets (0.5 weight)": 0.6801,
|
| 564 |
"Factuality": 0.3979,
|
| 565 |
"Precise IF": 0.2875,
|
| 566 |
"Math": 0.377,
|
|
|
|
| 567 |
"Focus": 0.1535,
|
| 568 |
+
"Ties": 0.047
|
|
|
|
|
|
|
|
|
|
|
|
|
| 569 |
}
|
| 570 |
},
|
| 571 |
{
|
|
|
|
| 573 |
"name": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5",
|
| 574 |
"developer": "OpenAssistant",
|
| 575 |
"scores": {
|
| 576 |
+
"Score": 0.6901,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 577 |
"Factuality": 0.3179,
|
| 578 |
"Precise IF": 0.2625,
|
| 579 |
"Math": 0.3934,
|
| 580 |
+
"Safety": 0.6311,
|
| 581 |
"Focus": 0.2707,
|
| 582 |
+
"Ties": 0.0198,
|
| 583 |
+
"Chat": 0.8855,
|
| 584 |
+
"Chat Hard": 0.4868,
|
| 585 |
+
"Reasoning": 0.7752,
|
| 586 |
+
"Prior Sets (0.5 weight)": 0.6533
|
| 587 |
}
|
| 588 |
},
|
| 589 |
{
|
|
|
|
| 609 |
"name": "PKU-Alignment/beaver-7b-v1.0-cost",
|
| 610 |
"developer": "PKU-Alignment",
|
| 611 |
"scores": {
|
| 612 |
+
"Score": 0.5798,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
"Factuality": 0.3263,
|
| 614 |
"Precise IF": 0.2313,
|
| 615 |
"Math": 0.3989,
|
| 616 |
+
"Safety": 0.7351,
|
| 617 |
"Focus": 0.2939,
|
| 618 |
+
"Ties": -0.01,
|
| 619 |
+
"Chat": 0.6173,
|
| 620 |
+
"Chat Hard": 0.4232,
|
| 621 |
+
"Reasoning": 0.5482,
|
| 622 |
+
"Prior Sets (0.5 weight)": 0.57
|
| 623 |
}
|
| 624 |
},
|
| 625 |
{
|
|
|
|
| 627 |
"name": "PKU-Alignment/beaver-7b-v1.0-reward",
|
| 628 |
"developer": "PKU-Alignment",
|
| 629 |
"scores": {
|
| 630 |
+
"Score": 0.1606,
|
| 631 |
+
"Chat": 0.8184,
|
| 632 |
+
"Chat Hard": 0.2873,
|
| 633 |
+
"Safety": 0.1422,
|
| 634 |
+
"Reasoning": 0.346,
|
| 635 |
+
"Prior Sets (0.5 weight)": 0.5993,
|
| 636 |
"Factuality": 0.2105,
|
| 637 |
"Precise IF": 0.2938,
|
| 638 |
"Math": 0.2623,
|
|
|
|
| 639 |
"Focus": 0.0646,
|
| 640 |
+
"Ties": -0.01
|
|
|
|
|
|
|
|
|
|
|
|
|
| 641 |
}
|
| 642 |
},
|
| 643 |
{
|
|
|
|
| 663 |
"name": "PKU-Alignment/beaver-7b-v2.0-reward",
|
| 664 |
"developer": "PKU-Alignment",
|
| 665 |
"scores": {
|
| 666 |
+
"Score": 0.2544,
|
| 667 |
+
"Chat": 0.8994,
|
| 668 |
+
"Chat Hard": 0.364,
|
| 669 |
+
"Safety": 0.3156,
|
| 670 |
+
"Reasoning": 0.6887,
|
| 671 |
+
"Prior Sets (0.5 weight)": 0.6171,
|
| 672 |
"Factuality": 0.2168,
|
| 673 |
"Precise IF": 0.2562,
|
| 674 |
"Math": 0.3825,
|
|
|
|
| 675 |
"Focus": 0.2606,
|
| 676 |
+
"Ties": 0.0944
|
|
|
|
|
|
|
|
|
|
|
|
|
| 677 |
}
|
| 678 |
},
|
| 679 |
{
|
|
|
|
| 921 |
"name": "Ray2333/GRM-gemma2-2B-rewardmodel-ft",
|
| 922 |
"developer": "Ray2333",
|
| 923 |
"scores": {
|
| 924 |
+
"Score": 0.5966,
|
| 925 |
+
"Chat": 0.9302,
|
| 926 |
+
"Chat Hard": 0.7719,
|
| 927 |
+
"Safety": 0.9222,
|
| 928 |
+
"Reasoning": 0.912,
|
| 929 |
"Factuality": 0.5305,
|
| 930 |
"Precise IF": 0.3125,
|
| 931 |
"Math": 0.5902,
|
|
|
|
| 932 |
"Focus": 0.7455,
|
| 933 |
+
"Ties": 0.4788
|
|
|
|
|
|
|
|
|
|
| 934 |
}
|
| 935 |
},
|
| 936 |
{
|
|
|
|
| 956 |
"name": "Ray2333/GRM-llama3-8B-sftreg",
|
| 957 |
"developer": "Ray2333",
|
| 958 |
"scores": {
|
| 959 |
+
"Score": 0.8542,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 960 |
"Factuality": 0.6189,
|
| 961 |
"Precise IF": 0.3875,
|
| 962 |
"Math": 0.5792,
|
| 963 |
+
"Safety": 0.8919,
|
| 964 |
"Focus": 0.6828,
|
| 965 |
+
"Ties": 0.5981,
|
| 966 |
+
"Chat": 0.986,
|
| 967 |
+
"Chat Hard": 0.6776,
|
| 968 |
+
"Reasoning": 0.9229,
|
| 969 |
+
"Prior Sets (0.5 weight)": 0.7309
|
| 970 |
}
|
| 971 |
},
|
| 972 |
{
|
|
|
|
| 1139 |
"name": "Skywork/Skywork-Reward-Gemma-2-27B",
|
| 1140 |
"developer": "Skywork",
|
| 1141 |
"scores": {
|
| 1142 |
+
"Score": 0.7576,
|
| 1143 |
+
"Chat": 0.9581,
|
| 1144 |
+
"Chat Hard": 0.9145,
|
| 1145 |
+
"Safety": 0.9422,
|
| 1146 |
+
"Reasoning": 0.9606,
|
| 1147 |
"Factuality": 0.7368,
|
| 1148 |
"Precise IF": 0.4031,
|
| 1149 |
"Math": 0.7049,
|
|
|
|
| 1150 |
"Focus": 0.9323,
|
| 1151 |
+
"Ties": 0.8261
|
|
|
|
|
|
|
|
|
|
| 1152 |
}
|
| 1153 |
},
|
| 1154 |
{
|
|
|
|
| 1156 |
"name": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2",
|
| 1157 |
"developer": "Skywork",
|
| 1158 |
"scores": {
|
| 1159 |
+
"Score": 0.9426,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1160 |
"Factuality": 0.7674,
|
| 1161 |
"Precise IF": 0.375,
|
| 1162 |
"Math": 0.6721,
|
| 1163 |
+
"Safety": 0.9297,
|
| 1164 |
"Focus": 0.9172,
|
| 1165 |
+
"Ties": 0.8182,
|
| 1166 |
+
"Chat": 0.9609,
|
| 1167 |
+
"Chat Hard": 0.8991,
|
| 1168 |
+
"Reasoning": 0.9807
|
| 1169 |
}
|
| 1170 |
},
|
| 1171 |
{
|
|
|
|
| 1173 |
"name": "Skywork/Skywork-Reward-Llama-3.1-8B",
|
| 1174 |
"developer": "Skywork",
|
| 1175 |
"scores": {
|
| 1176 |
+
"Score": 0.9252,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1177 |
"Factuality": 0.6989,
|
| 1178 |
"Precise IF": 0.425,
|
| 1179 |
"Math": 0.6284,
|
| 1180 |
+
"Safety": 0.9081,
|
| 1181 |
"Focus": 0.9616,
|
| 1182 |
+
"Ties": 0.741,
|
| 1183 |
+
"Chat": 0.9581,
|
| 1184 |
+
"Chat Hard": 0.8728,
|
| 1185 |
+
"Reasoning": 0.962
|
| 1186 |
}
|
| 1187 |
},
|
| 1188 |
{
|
|
|
|
| 1305 |
"name": "Skywork/Skywork-VL-Reward-7B",
|
| 1306 |
"developer": "Skywork",
|
| 1307 |
"scores": {
|
| 1308 |
+
"Score": 0.6885,
|
| 1309 |
+
"Chat": 0.8994,
|
| 1310 |
+
"Chat Hard": 0.875,
|
| 1311 |
+
"Safety": 0.8911,
|
| 1312 |
+
"Reasoning": 0.9176,
|
| 1313 |
"Factuality": 0.6063,
|
| 1314 |
"Precise IF": 0.35,
|
| 1315 |
"Math": 0.6339,
|
|
|
|
| 1316 |
"Focus": 0.8909,
|
| 1317 |
+
"Ties": 0.7586
|
|
|
|
|
|
|
|
|
|
| 1318 |
}
|
| 1319 |
},
|
| 1320 |
{
|
|
|
|
| 1379 |
"name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...",
|
| 1380 |
"developer": "AI2",
|
| 1381 |
"scores": {
|
| 1382 |
+
"Score": 0.6924,
|
| 1383 |
+
"Chat": 0.9441,
|
| 1384 |
+
"Chat Hard": 0.3575,
|
| 1385 |
"Safety": 0.7757
|
| 1386 |
}
|
| 1387 |
},
|
|
|
|
| 1423 |
"name": "allenai/Llama-3.1-70B-Instruct-RM-RB2",
|
| 1424 |
"developer": "allenai",
|
| 1425 |
"scores": {
|
| 1426 |
+
"Score": 0.7606,
|
| 1427 |
+
"Chat": 0.9665,
|
| 1428 |
+
"Chat Hard": 0.8355,
|
| 1429 |
+
"Safety": 0.8844,
|
| 1430 |
+
"Reasoning": 0.8969,
|
| 1431 |
+
"Prior Sets (0.5 weight)": 0.0,
|
| 1432 |
"Factuality": 0.8126,
|
| 1433 |
"Precise IF": 0.4188,
|
| 1434 |
"Math": 0.6995,
|
|
|
|
| 1435 |
"Focus": 0.8646,
|
| 1436 |
+
"Ties": 0.8835
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1437 |
}
|
| 1438 |
},
|
| 1439 |
{
|
|
|
|
| 1459 |
"name": "allenai/Llama-3.1-8B-Instruct-RM-RB2",
|
| 1460 |
"developer": "allenai",
|
| 1461 |
"scores": {
|
| 1462 |
+
"Score": 0.7285,
|
| 1463 |
+
"Chat": 0.9581,
|
| 1464 |
+
"Chat Hard": 0.8158,
|
| 1465 |
+
"Safety": 0.8956,
|
| 1466 |
+
"Reasoning": 0.887,
|
| 1467 |
+
"Prior Sets (0.5 weight)": 0.0,
|
| 1468 |
"Factuality": 0.7432,
|
| 1469 |
"Precise IF": 0.4437,
|
| 1470 |
"Math": 0.6175,
|
|
|
|
| 1471 |
"Focus": 0.9071,
|
| 1472 |
+
"Ties": 0.7638
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1473 |
}
|
| 1474 |
},
|
| 1475 |
{
|
|
|
|
| 1477 |
"name": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2",
|
| 1478 |
"developer": "allenai",
|
| 1479 |
"scores": {
|
| 1480 |
+
"Score": 0.8892,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1481 |
"Factuality": 0.8084,
|
| 1482 |
"Precise IF": 0.3688,
|
| 1483 |
"Math": 0.6776,
|
| 1484 |
+
"Safety": 0.9027,
|
| 1485 |
"Focus": 0.7778,
|
| 1486 |
+
"Ties": 0.8308,
|
| 1487 |
+
"Chat": 0.9693,
|
| 1488 |
+
"Chat Hard": 0.8268,
|
| 1489 |
+
"Reasoning": 0.8583,
|
| 1490 |
+
"Prior Sets (0.5 weight)": 0.0
|
| 1491 |
}
|
| 1492 |
},
|
| 1493 |
{
|
|
|
|
| 1495 |
"name": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2",
|
| 1496 |
"developer": "allenai",
|
| 1497 |
"scores": {
|
| 1498 |
+
"Score": 0.8431,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1499 |
"Factuality": 0.7516,
|
| 1500 |
"Precise IF": 0.3875,
|
| 1501 |
"Math": 0.6284,
|
| 1502 |
+
"Safety": 0.8662,
|
| 1503 |
"Focus": 0.8545,
|
| 1504 |
+
"Ties": 0.6397,
|
| 1505 |
+
"Chat": 0.9553,
|
| 1506 |
+
"Chat Hard": 0.761,
|
| 1507 |
+
"Reasoning": 0.7898,
|
| 1508 |
+
"Prior Sets (0.5 weight)": 0.0
|
| 1509 |
}
|
| 1510 |
},
|
| 1511 |
{
|
|
|
|
| 3784 |
"name": "infly/INF-ORM-Llama3.1-70B",
|
| 3785 |
"developer": "infly",
|
| 3786 |
"scores": {
|
| 3787 |
+
"Score": 0.9511,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3788 |
"Factuality": 0.7411,
|
| 3789 |
"Precise IF": 0.4188,
|
| 3790 |
"Math": 0.6995,
|
| 3791 |
+
"Safety": 0.9365,
|
| 3792 |
"Focus": 0.903,
|
| 3793 |
+
"Ties": 0.8622,
|
| 3794 |
+
"Chat": 0.9665,
|
| 3795 |
+
"Chat Hard": 0.9101,
|
| 3796 |
+
"Reasoning": 0.9912
|
| 3797 |
}
|
| 3798 |
},
|
| 3799 |
{
|
|
|
|
| 3835 |
"name": "internlm/internlm2-7b-reward",
|
| 3836 |
"developer": "internlm",
|
| 3837 |
"scores": {
|
| 3838 |
+
"Score": 0.5335,
|
| 3839 |
+
"Chat": 0.9916,
|
| 3840 |
+
"Chat Hard": 0.6952,
|
| 3841 |
+
"Safety": 0.5956,
|
| 3842 |
+
"Reasoning": 0.9453,
|
| 3843 |
"Factuality": 0.4211,
|
| 3844 |
"Precise IF": 0.4,
|
| 3845 |
"Math": 0.5628,
|
|
|
|
| 3846 |
"Focus": 0.7051,
|
| 3847 |
+
"Ties": 0.5164
|
|
|
|
|
|
|
|
|
|
| 3848 |
}
|
| 3849 |
},
|
| 3850 |
{
|
|
|
|
| 4014 |
"name": "nicolinho/QRM-Gemma-2-27B",
|
| 4015 |
"developer": "nicolinho",
|
| 4016 |
"scores": {
|
| 4017 |
+
"Score": 0.7667,
|
| 4018 |
+
"Chat": 0.9665,
|
| 4019 |
+
"Chat Hard": 0.9013,
|
| 4020 |
+
"Safety": 0.9578,
|
| 4021 |
+
"Reasoning": 0.9826,
|
| 4022 |
"Factuality": 0.7853,
|
| 4023 |
"Precise IF": 0.3719,
|
| 4024 |
"Math": 0.6995,
|
|
|
|
| 4025 |
"Focus": 0.9535,
|
| 4026 |
+
"Ties": 0.8321
|
|
|
|
|
|
|
|
|
|
| 4027 |
}
|
| 4028 |
},
|
| 4029 |
{
|
|
|
|
| 4055 |
"name": "nicolinho/QRM-Llama3.1-8B-v2",
|
| 4056 |
"developer": "nicolinho",
|
| 4057 |
"scores": {
|
| 4058 |
+
"Score": 0.7074,
|
| 4059 |
+
"Chat": 0.9637,
|
| 4060 |
+
"Chat Hard": 0.8684,
|
| 4061 |
+
"Safety": 0.9467,
|
| 4062 |
+
"Reasoning": 0.9677,
|
| 4063 |
"Factuality": 0.6653,
|
| 4064 |
"Precise IF": 0.4062,
|
| 4065 |
"Math": 0.612,
|
|
|
|
| 4066 |
"Focus": 0.8909,
|
| 4067 |
+
"Ties": 0.7234
|
|
|
|
|
|
|
|
|
|
| 4068 |
}
|
| 4069 |
},
|
| 4070 |
{
|
|
|
|
| 4202 |
"name": "GPT-4o 2024-08-06",
|
| 4203 |
"developer": "OpenAI",
|
| 4204 |
"scores": {
|
| 4205 |
+
"Score": 0.8673,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4206 |
"Factuality": 0.5684,
|
| 4207 |
"Precise IF": 0.3312,
|
| 4208 |
"Math": 0.623,
|
| 4209 |
+
"Safety": 0.8811,
|
| 4210 |
"Focus": 0.7293,
|
| 4211 |
+
"Ties": 0.7819,
|
| 4212 |
+
"Chat": 0.9609,
|
| 4213 |
+
"Chat Hard": 0.761,
|
| 4214 |
+
"Reasoning": 0.8661
|
| 4215 |
}
|
| 4216 |
},
|
| 4217 |
{
|
|
|
|
| 4249 |
"name": "openbmb/Eurus-RM-7b",
|
| 4250 |
"developer": "openbmb",
|
| 4251 |
"scores": {
|
| 4252 |
+
"Score": 0.8159,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4253 |
"Factuality": 0.6,
|
| 4254 |
"Precise IF": 0.3438,
|
| 4255 |
"Math": 0.5683,
|
| 4256 |
+
"Safety": 0.8135,
|
| 4257 |
"Focus": 0.7475,
|
| 4258 |
+
"Ties": 0.5972,
|
| 4259 |
+
"Chat": 0.9804,
|
| 4260 |
+
"Chat Hard": 0.6557,
|
| 4261 |
+
"Reasoning": 0.8633,
|
| 4262 |
+
"Prior Sets (0.5 weight)": 0.7172
|
| 4263 |
}
|
| 4264 |
},
|
| 4265 |
{
|
|
|
|
| 4370 |
"name": "sfairXC/FsfairX-LLaMA3-RM-v0.1",
|
| 4371 |
"developer": "sfairXC",
|
| 4372 |
"scores": {
|
| 4373 |
+
"Score": 0.8338,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4374 |
"Factuality": 0.5916,
|
| 4375 |
"Precise IF": 0.4188,
|
| 4376 |
"Math": 0.6284,
|
| 4377 |
+
"Safety": 0.8676,
|
| 4378 |
"Focus": 0.7051,
|
| 4379 |
+
"Ties": 0.6647,
|
| 4380 |
+
"Chat": 0.9944,
|
| 4381 |
+
"Chat Hard": 0.6513,
|
| 4382 |
+
"Reasoning": 0.8644,
|
| 4383 |
+
"Prior Sets (0.5 weight)": 0.7492
|
| 4384 |
}
|
| 4385 |
},
|
| 4386 |
{
|
|
|
|
| 4492 |
"name": "weqweasdas/RM-Gemma-2B",
|
| 4493 |
"developer": "weqweasdas",
|
| 4494 |
"scores": {
|
| 4495 |
+
"Score": 0.6549,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4496 |
"Factuality": 0.3705,
|
| 4497 |
"Precise IF": 0.2812,
|
| 4498 |
"Math": 0.4317,
|
| 4499 |
+
"Safety": 0.4986,
|
| 4500 |
"Focus": 0.2343,
|
| 4501 |
+
"Ties": 0.1851,
|
| 4502 |
+
"Chat": 0.9441,
|
| 4503 |
+
"Chat Hard": 0.4079,
|
| 4504 |
+
"Reasoning": 0.7637,
|
| 4505 |
+
"Prior Sets (0.5 weight)": 0.6652
|
| 4506 |
}
|
| 4507 |
},
|
| 4508 |
{
|
|
|
|
| 4541 |
"name": "weqweasdas/RM-Mistral-7B",
|
| 4542 |
"developer": "weqweasdas",
|
| 4543 |
"scores": {
|
| 4544 |
+
"Score": 0.7982,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4545 |
"Factuality": 0.5937,
|
| 4546 |
"Precise IF": 0.3438,
|
| 4547 |
"Math": 0.5956,
|
| 4548 |
+
"Safety": 0.8703,
|
| 4549 |
"Focus": 0.7293,
|
| 4550 |
+
"Ties": 0.6226,
|
| 4551 |
+
"Chat": 0.9665,
|
| 4552 |
+
"Chat Hard": 0.6053,
|
| 4553 |
+
"Reasoning": 0.7736,
|
| 4554 |
+
"Prior Sets (0.5 weight)": 0.753
|
| 4555 |
}
|
| 4556 |
},
|
| 4557 |
{
|
|
|
|
| 4559 |
"name": "weqweasdas/hh_rlhf_rm_open_llama_3b",
|
| 4560 |
"developer": "weqweasdas",
|
| 4561 |
"scores": {
|
| 4562 |
+
"Score": 0.5027,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4563 |
"Factuality": 0.3642,
|
| 4564 |
"Precise IF": 0.275,
|
| 4565 |
"Math": 0.3497,
|
| 4566 |
+
"Safety": 0.4149,
|
| 4567 |
"Focus": 0.2384,
|
| 4568 |
+
"Ties": 0.0315,
|
| 4569 |
+
"Chat": 0.8184,
|
| 4570 |
+
"Chat Hard": 0.3728,
|
| 4571 |
+
"Reasoning": 0.3281,
|
| 4572 |
+
"Prior Sets (0.5 weight)": 0.6564
|
| 4573 |
}
|
| 4574 |
}
|
| 4575 |
]
|
data/benchmarks/swe-bench.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
| 5 |
"name": "claude-opus-4-5",
|
| 6 |
"developer": "Anthropic",
|
| 7 |
"scores": {
|
| 8 |
-
"swe-bench": 0.
|
| 9 |
}
|
| 10 |
},
|
| 11 |
{
|
|
@@ -13,7 +13,7 @@
|
|
| 13 |
"name": "gemini-3-pro-preview",
|
| 14 |
"developer": "Google",
|
| 15 |
"scores": {
|
| 16 |
-
"swe-bench": 0.
|
| 17 |
}
|
| 18 |
},
|
| 19 |
{
|
|
|
|
| 5 |
"name": "claude-opus-4-5",
|
| 6 |
"developer": "Anthropic",
|
| 7 |
"scores": {
|
| 8 |
+
"swe-bench": 0.6061
|
| 9 |
}
|
| 10 |
},
|
| 11 |
{
|
|
|
|
| 13 |
"name": "gemini-3-pro-preview",
|
| 14 |
"developer": "Google",
|
| 15 |
"scores": {
|
| 16 |
+
"swe-bench": 0.71
|
| 17 |
}
|
| 18 |
},
|
| 19 |
{
|
data/benchmarks/tau-bench-2_airline.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
| 5 |
"name": "claude-opus-4-5",
|
| 6 |
"developer": "Anthropic",
|
| 7 |
"scores": {
|
| 8 |
-
"tau-bench-2/airline": 0.
|
| 9 |
}
|
| 10 |
},
|
| 11 |
{
|
|
@@ -13,7 +13,7 @@
|
|
| 13 |
"name": "gemini-3-pro-preview",
|
| 14 |
"developer": "Google",
|
| 15 |
"scores": {
|
| 16 |
-
"tau-bench-2/airline": 0.
|
| 17 |
}
|
| 18 |
},
|
| 19 |
{
|
|
|
|
| 5 |
"name": "claude-opus-4-5",
|
| 6 |
"developer": "Anthropic",
|
| 7 |
"scores": {
|
| 8 |
+
"tau-bench-2/airline": 0.66
|
| 9 |
}
|
| 10 |
},
|
| 11 |
{
|
|
|
|
| 13 |
"name": "gemini-3-pro-preview",
|
| 14 |
"developer": "Google",
|
| 15 |
"scores": {
|
| 16 |
+
"tau-bench-2/airline": 0.68
|
| 17 |
}
|
| 18 |
},
|
| 19 |
{
|
data/benchmarks/tau-bench-2_retail.json
CHANGED
|
@@ -21,7 +21,7 @@
|
|
| 21 |
"name": "gpt-5.2-2025-12-11",
|
| 22 |
"developer": "OpenAI",
|
| 23 |
"scores": {
|
| 24 |
-
"tau-bench-2/retail": 0.
|
| 25 |
}
|
| 26 |
}
|
| 27 |
]
|
|
|
|
| 21 |
"name": "gpt-5.2-2025-12-11",
|
| 22 |
"developer": "OpenAI",
|
| 23 |
"scores": {
|
| 24 |
+
"tau-bench-2/retail": 0.73
|
| 25 |
}
|
| 26 |
}
|
| 27 |
]
|
data/benchmarks/tau-bench-2_telecom.json
CHANGED
|
@@ -5,7 +5,7 @@
|
|
| 5 |
"name": "claude-opus-4-5",
|
| 6 |
"developer": "Anthropic",
|
| 7 |
"scores": {
|
| 8 |
-
"tau-bench-2/telecom": 0.
|
| 9 |
}
|
| 10 |
},
|
| 11 |
{
|
|
@@ -21,7 +21,7 @@
|
|
| 21 |
"name": "gpt-5.2-2025-12-11",
|
| 22 |
"developer": "OpenAI",
|
| 23 |
"scores": {
|
| 24 |
-
"tau-bench-2/telecom": 0.
|
| 25 |
}
|
| 26 |
}
|
| 27 |
]
|
|
|
|
| 5 |
"name": "claude-opus-4-5",
|
| 6 |
"developer": "Anthropic",
|
| 7 |
"scores": {
|
| 8 |
+
"tau-bench-2/telecom": 0.84
|
| 9 |
}
|
| 10 |
},
|
| 11 |
{
|
|
|
|
| 21 |
"name": "gpt-5.2-2025-12-11",
|
| 22 |
"developer": "OpenAI",
|
| 23 |
"scores": {
|
| 24 |
+
"tau-bench-2/telecom": 0.71
|
| 25 |
}
|
| 26 |
}
|
| 27 |
]
|
data/benchmarks/terminal-bench-2.0.json
CHANGED
|
@@ -21,7 +21,7 @@
|
|
| 21 |
"name": "Claude Opus 4.1",
|
| 22 |
"developer": "Anthropic",
|
| 23 |
"scores": {
|
| 24 |
-
"terminal-bench-2.0":
|
| 25 |
}
|
| 26 |
},
|
| 27 |
{
|
|
@@ -29,7 +29,7 @@
|
|
| 29 |
"name": "Claude Opus 4.5",
|
| 30 |
"developer": "Anthropic",
|
| 31 |
"scores": {
|
| 32 |
-
"terminal-bench-2.0":
|
| 33 |
}
|
| 34 |
},
|
| 35 |
{
|
|
@@ -37,7 +37,7 @@
|
|
| 37 |
"name": "Claude Opus 4.6",
|
| 38 |
"developer": "Anthropic",
|
| 39 |
"scores": {
|
| 40 |
-
"terminal-bench-2.0":
|
| 41 |
}
|
| 42 |
},
|
| 43 |
{
|
|
@@ -45,7 +45,7 @@
|
|
| 45 |
"name": "Claude Sonnet 4.5",
|
| 46 |
"developer": "Anthropic",
|
| 47 |
"scores": {
|
| 48 |
-
"terminal-bench-2.0":
|
| 49 |
}
|
| 50 |
},
|
| 51 |
{
|
|
@@ -61,7 +61,7 @@
|
|
| 61 |
"name": "Gemini 2.5 Flash",
|
| 62 |
"developer": "Google",
|
| 63 |
"scores": {
|
| 64 |
-
"terminal-bench-2.0":
|
| 65 |
}
|
| 66 |
},
|
| 67 |
{
|
|
@@ -77,7 +77,7 @@
|
|
| 77 |
"name": "Gemini 3 Flash",
|
| 78 |
"developer": "Google",
|
| 79 |
"scores": {
|
| 80 |
-
"terminal-bench-2.0":
|
| 81 |
}
|
| 82 |
},
|
| 83 |
{
|
|
@@ -109,7 +109,7 @@
|
|
| 109 |
"name": "MiniMax M2.1",
|
| 110 |
"developer": "MiniMax",
|
| 111 |
"scores": {
|
| 112 |
-
"terminal-bench-2.0":
|
| 113 |
}
|
| 114 |
},
|
| 115 |
{
|
|
@@ -125,7 +125,7 @@
|
|
| 125 |
"name": "Kimi K2 Instruct",
|
| 126 |
"developer": "Moonshot AI",
|
| 127 |
"scores": {
|
| 128 |
-
"terminal-bench-2.0":
|
| 129 |
}
|
| 130 |
},
|
| 131 |
{
|
|
@@ -149,7 +149,7 @@
|
|
| 149 |
"name": "Multiple",
|
| 150 |
"developer": "Multiple",
|
| 151 |
"scores": {
|
| 152 |
-
"terminal-bench-2.0":
|
| 153 |
}
|
| 154 |
},
|
| 155 |
{
|
|
@@ -157,7 +157,7 @@
|
|
| 157 |
"name": "GPT-5",
|
| 158 |
"developer": "OpenAI",
|
| 159 |
"scores": {
|
| 160 |
-
"terminal-bench-2.0":
|
| 161 |
}
|
| 162 |
},
|
| 163 |
{
|
|
@@ -165,7 +165,7 @@
|
|
| 165 |
"name": "GPT-5-Codex",
|
| 166 |
"developer": "OpenAI",
|
| 167 |
"scores": {
|
| 168 |
-
"terminal-bench-2.0":
|
| 169 |
}
|
| 170 |
},
|
| 171 |
{
|
|
@@ -173,7 +173,7 @@
|
|
| 173 |
"name": "GPT-5-Mini",
|
| 174 |
"developer": "OpenAI",
|
| 175 |
"scores": {
|
| 176 |
-
"terminal-bench-2.0":
|
| 177 |
}
|
| 178 |
},
|
| 179 |
{
|
|
@@ -181,7 +181,7 @@
|
|
| 181 |
"name": "GPT-5-Nano",
|
| 182 |
"developer": "OpenAI",
|
| 183 |
"scores": {
|
| 184 |
-
"terminal-bench-2.0":
|
| 185 |
}
|
| 186 |
},
|
| 187 |
{
|
|
@@ -197,7 +197,7 @@
|
|
| 197 |
"name": "GPT-5.1-Codex",
|
| 198 |
"developer": "OpenAI",
|
| 199 |
"scores": {
|
| 200 |
-
"terminal-bench-2.0":
|
| 201 |
}
|
| 202 |
},
|
| 203 |
{
|
|
@@ -221,7 +221,7 @@
|
|
| 221 |
"name": "GPT-5.2",
|
| 222 |
"developer": "OpenAI",
|
| 223 |
"scores": {
|
| 224 |
-
"terminal-bench-2.0":
|
| 225 |
}
|
| 226 |
},
|
| 227 |
{
|
|
@@ -237,7 +237,7 @@
|
|
| 237 |
"name": "GPT-5.3-Codex",
|
| 238 |
"developer": "OpenAI",
|
| 239 |
"scores": {
|
| 240 |
-
"terminal-bench-2.0":
|
| 241 |
}
|
| 242 |
},
|
| 243 |
{
|
|
@@ -245,7 +245,7 @@
|
|
| 245 |
"name": "GPT-OSS-120B",
|
| 246 |
"developer": "OpenAI",
|
| 247 |
"scores": {
|
| 248 |
-
"terminal-bench-2.0":
|
| 249 |
}
|
| 250 |
},
|
| 251 |
{
|
|
@@ -253,7 +253,7 @@
|
|
| 253 |
"name": "GPT-OSS-20B",
|
| 254 |
"developer": "OpenAI",
|
| 255 |
"scores": {
|
| 256 |
-
"terminal-bench-2.0": 3.
|
| 257 |
}
|
| 258 |
},
|
| 259 |
{
|
|
@@ -261,7 +261,7 @@
|
|
| 261 |
"name": "Grok 4",
|
| 262 |
"developer": "xAI",
|
| 263 |
"scores": {
|
| 264 |
-
"terminal-bench-2.0":
|
| 265 |
}
|
| 266 |
},
|
| 267 |
{
|
|
@@ -269,7 +269,7 @@
|
|
| 269 |
"name": "Grok Code Fast 1",
|
| 270 |
"developer": "xAI",
|
| 271 |
"scores": {
|
| 272 |
-
"terminal-bench-2.0":
|
| 273 |
}
|
| 274 |
},
|
| 275 |
{
|
|
|
|
| 21 |
"name": "Claude Opus 4.1",
|
| 22 |
"developer": "Anthropic",
|
| 23 |
"scores": {
|
| 24 |
+
"terminal-bench-2.0": 35.1
|
| 25 |
}
|
| 26 |
},
|
| 27 |
{
|
|
|
|
| 29 |
"name": "Claude Opus 4.5",
|
| 30 |
"developer": "Anthropic",
|
| 31 |
"scores": {
|
| 32 |
+
"terminal-bench-2.0": 52.1
|
| 33 |
}
|
| 34 |
},
|
| 35 |
{
|
|
|
|
| 37 |
"name": "Claude Opus 4.6",
|
| 38 |
"developer": "Anthropic",
|
| 39 |
"scores": {
|
| 40 |
+
"terminal-bench-2.0": 62.9
|
| 41 |
}
|
| 42 |
},
|
| 43 |
{
|
|
|
|
| 45 |
"name": "Claude Sonnet 4.5",
|
| 46 |
"developer": "Anthropic",
|
| 47 |
"scores": {
|
| 48 |
+
"terminal-bench-2.0": 42.6
|
| 49 |
}
|
| 50 |
},
|
| 51 |
{
|
|
|
|
| 61 |
"name": "Gemini 2.5 Flash",
|
| 62 |
"developer": "Google",
|
| 63 |
"scores": {
|
| 64 |
+
"terminal-bench-2.0": 16.9
|
| 65 |
}
|
| 66 |
},
|
| 67 |
{
|
|
|
|
| 77 |
"name": "Gemini 3 Flash",
|
| 78 |
"developer": "Google",
|
| 79 |
"scores": {
|
| 80 |
+
"terminal-bench-2.0": 47.4
|
| 81 |
}
|
| 82 |
},
|
| 83 |
{
|
|
|
|
| 109 |
"name": "MiniMax M2.1",
|
| 110 |
"developer": "MiniMax",
|
| 111 |
"scores": {
|
| 112 |
+
"terminal-bench-2.0": 36.6
|
| 113 |
}
|
| 114 |
},
|
| 115 |
{
|
|
|
|
| 125 |
"name": "Kimi K2 Instruct",
|
| 126 |
"developer": "Moonshot AI",
|
| 127 |
"scores": {
|
| 128 |
+
"terminal-bench-2.0": 27.8
|
| 129 |
}
|
| 130 |
},
|
| 131 |
{
|
|
|
|
| 149 |
"name": "Multiple",
|
| 150 |
"developer": "Multiple",
|
| 151 |
"scores": {
|
| 152 |
+
"terminal-bench-2.0": 72.4
|
| 153 |
}
|
| 154 |
},
|
| 155 |
{
|
|
|
|
| 157 |
"name": "GPT-5",
|
| 158 |
"developer": "OpenAI",
|
| 159 |
"scores": {
|
| 160 |
+
"terminal-bench-2.0": 49.6
|
| 161 |
}
|
| 162 |
},
|
| 163 |
{
|
|
|
|
| 165 |
"name": "GPT-5-Codex",
|
| 166 |
"developer": "OpenAI",
|
| 167 |
"scores": {
|
| 168 |
+
"terminal-bench-2.0": 43.4
|
| 169 |
}
|
| 170 |
},
|
| 171 |
{
|
|
|
|
| 173 |
"name": "GPT-5-Mini",
|
| 174 |
"developer": "OpenAI",
|
| 175 |
"scores": {
|
| 176 |
+
"terminal-bench-2.0": 24.0
|
| 177 |
}
|
| 178 |
},
|
| 179 |
{
|
|
|
|
| 181 |
"name": "GPT-5-Nano",
|
| 182 |
"developer": "OpenAI",
|
| 183 |
"scores": {
|
| 184 |
+
"terminal-bench-2.0": 11.5
|
| 185 |
}
|
| 186 |
},
|
| 187 |
{
|
|
|
|
| 197 |
"name": "GPT-5.1-Codex",
|
| 198 |
"developer": "OpenAI",
|
| 199 |
"scores": {
|
| 200 |
+
"terminal-bench-2.0": 57.8
|
| 201 |
}
|
| 202 |
},
|
| 203 |
{
|
|
|
|
| 221 |
"name": "GPT-5.2",
|
| 222 |
"developer": "OpenAI",
|
| 223 |
"scores": {
|
| 224 |
+
"terminal-bench-2.0": 62.9
|
| 225 |
}
|
| 226 |
},
|
| 227 |
{
|
|
|
|
| 237 |
"name": "GPT-5.3-Codex",
|
| 238 |
"developer": "OpenAI",
|
| 239 |
"scores": {
|
| 240 |
+
"terminal-bench-2.0": 77.3
|
| 241 |
}
|
| 242 |
},
|
| 243 |
{
|
|
|
|
| 245 |
"name": "GPT-OSS-120B",
|
| 246 |
"developer": "OpenAI",
|
| 247 |
"scores": {
|
| 248 |
+
"terminal-bench-2.0": 18.7
|
| 249 |
}
|
| 250 |
},
|
| 251 |
{
|
|
|
|
| 253 |
"name": "GPT-OSS-20B",
|
| 254 |
"developer": "OpenAI",
|
| 255 |
"scores": {
|
| 256 |
+
"terminal-bench-2.0": 3.4
|
| 257 |
}
|
| 258 |
},
|
| 259 |
{
|
|
|
|
| 261 |
"name": "Grok 4",
|
| 262 |
"developer": "xAI",
|
| 263 |
"scores": {
|
| 264 |
+
"terminal-bench-2.0": 23.1
|
| 265 |
}
|
| 266 |
},
|
| 267 |
{
|
|
|
|
| 269 |
"name": "Grok Code Fast 1",
|
| 270 |
"developer": "xAI",
|
| 271 |
"scores": {
|
| 272 |
+
"terminal-bench-2.0": 14.2
|
| 273 |
}
|
| 274 |
},
|
| 275 |
{
|
data/benchmarks/theory_of_mind.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"models": [
|
| 3 |
+
{
|
| 4 |
+
"model_id": "Qwen/Qwen2.5-3B-Instruct",
|
| 5 |
+
"name": "Qwen2.5-3B-Instruct",
|
| 6 |
+
"developer": "Qwen",
|
| 7 |
+
"scores": {
|
| 8 |
+
"accuracy on theory_of_mind for scorer model_graded_fact": 0.78
|
| 9 |
+
}
|
| 10 |
+
}
|
| 11 |
+
]
|
| 12 |
+
}
|
data/developers.json
CHANGED
|
@@ -1917,7 +1917,7 @@
|
|
| 1917 |
},
|
| 1918 |
{
|
| 1919 |
"developer": "NousResearch",
|
| 1920 |
-
"model_count":
|
| 1921 |
},
|
| 1922 |
{
|
| 1923 |
"developer": "Novaciano",
|
|
|
|
| 1917 |
},
|
| 1918 |
{
|
| 1919 |
"developer": "NousResearch",
|
| 1920 |
+
"model_count": 18
|
| 1921 |
},
|
| 1922 |
{
|
| 1923 |
"developer": "Novaciano",
|
data/developers/adriszmar.json
CHANGED
|
@@ -7,12 +7,12 @@
|
|
| 7 |
"developer": "adriszmar",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
-
"hfopenllm_v2/IFEval": 0.
|
| 11 |
-
"hfopenllm_v2/BBH": 0.
|
| 12 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 13 |
-
"hfopenllm_v2/GPQA": 0.
|
| 14 |
-
"hfopenllm_v2/MUSR": 0.
|
| 15 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 16 |
}
|
| 17 |
}
|
| 18 |
]
|
|
|
|
| 7 |
"developer": "adriszmar",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
+
"hfopenllm_v2/IFEval": 0.1746,
|
| 11 |
+
"hfopenllm_v2/BBH": 0.3126,
|
| 12 |
+
"hfopenllm_v2/MATH Level 5": 0.0,
|
| 13 |
+
"hfopenllm_v2/GPQA": 0.245,
|
| 14 |
+
"hfopenllm_v2/MUSR": 0.4096,
|
| 15 |
+
"hfopenllm_v2/MMLU-PRO": 0.1087
|
| 16 |
}
|
| 17 |
}
|
| 18 |
]
|
data/developers/ai2.json
CHANGED
|
@@ -43,9 +43,9 @@
|
|
| 43 |
"developer": "AI2",
|
| 44 |
"evaluator_relationship": null,
|
| 45 |
"benchmark_scores": {
|
| 46 |
-
"reward-bench/Score": 0.
|
| 47 |
-
"reward-bench/Chat": 0.
|
| 48 |
-
"reward-bench/Chat Hard": 0.
|
| 49 |
"reward-bench/Safety": 0.7757
|
| 50 |
}
|
| 51 |
},
|
|
|
|
| 43 |
"developer": "AI2",
|
| 44 |
"evaluator_relationship": null,
|
| 45 |
"benchmark_scores": {
|
| 46 |
+
"reward-bench/Score": 0.6924,
|
| 47 |
+
"reward-bench/Chat": 0.9441,
|
| 48 |
+
"reward-bench/Chat Hard": 0.3575,
|
| 49 |
"reward-bench/Safety": 0.7757
|
| 50 |
}
|
| 51 |
},
|
data/developers/akjindal53244.json
CHANGED
|
@@ -7,12 +7,12 @@
|
|
| 7 |
"developer": "akjindal53244",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
-
"hfopenllm_v2/IFEval": 0.
|
| 11 |
-
"hfopenllm_v2/BBH": 0.
|
| 12 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 13 |
-
"hfopenllm_v2/GPQA": 0.
|
| 14 |
"hfopenllm_v2/MUSR": 0.4028,
|
| 15 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 16 |
}
|
| 17 |
}
|
| 18 |
]
|
|
|
|
| 7 |
"developer": "akjindal53244",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
+
"hfopenllm_v2/IFEval": 0.8033,
|
| 11 |
+
"hfopenllm_v2/BBH": 0.5196,
|
| 12 |
+
"hfopenllm_v2/MATH Level 5": 0.1624,
|
| 13 |
+
"hfopenllm_v2/GPQA": 0.3096,
|
| 14 |
"hfopenllm_v2/MUSR": 0.4028,
|
| 15 |
+
"hfopenllm_v2/MMLU-PRO": 0.3812
|
| 16 |
}
|
| 17 |
}
|
| 18 |
]
|
data/developers/allenai.json
CHANGED
|
@@ -63,17 +63,17 @@
|
|
| 63 |
"developer": "allenai",
|
| 64 |
"evaluator_relationship": null,
|
| 65 |
"benchmark_scores": {
|
| 66 |
-
"reward-bench/Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
"reward-bench/Factuality": 0.8126,
|
| 68 |
"reward-bench/Precise IF": 0.4188,
|
| 69 |
"reward-bench/Math": 0.6995,
|
| 70 |
-
"reward-bench/Safety": 0.9095,
|
| 71 |
"reward-bench/Focus": 0.8646,
|
| 72 |
-
"reward-bench/Ties": 0.8835
|
| 73 |
-
"reward-bench/Chat": 0.9665,
|
| 74 |
-
"reward-bench/Chat Hard": 0.8355,
|
| 75 |
-
"reward-bench/Reasoning": 0.8969,
|
| 76 |
-
"reward-bench/Prior Sets (0.5 weight)": 0.0
|
| 77 |
}
|
| 78 |
},
|
| 79 |
{
|
|
@@ -101,17 +101,17 @@
|
|
| 101 |
"developer": "allenai",
|
| 102 |
"evaluator_relationship": null,
|
| 103 |
"benchmark_scores": {
|
| 104 |
-
"reward-bench/Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
"reward-bench/Factuality": 0.7432,
|
| 106 |
"reward-bench/Precise IF": 0.4437,
|
| 107 |
"reward-bench/Math": 0.6175,
|
| 108 |
-
"reward-bench/Safety": 0.8932,
|
| 109 |
"reward-bench/Focus": 0.9071,
|
| 110 |
-
"reward-bench/Ties": 0.7638
|
| 111 |
-
"reward-bench/Chat": 0.9581,
|
| 112 |
-
"reward-bench/Chat Hard": 0.8158,
|
| 113 |
-
"reward-bench/Reasoning": 0.887,
|
| 114 |
-
"reward-bench/Prior Sets (0.5 weight)": 0.0
|
| 115 |
}
|
| 116 |
},
|
| 117 |
{
|
|
@@ -120,12 +120,12 @@
|
|
| 120 |
"developer": "allenai",
|
| 121 |
"evaluator_relationship": null,
|
| 122 |
"benchmark_scores": {
|
| 123 |
-
"hfopenllm_v2/IFEval": 0.
|
| 124 |
-
"hfopenllm_v2/BBH": 0.
|
| 125 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 126 |
"hfopenllm_v2/GPQA": 0.3733,
|
| 127 |
-
"hfopenllm_v2/MUSR": 0.
|
| 128 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 129 |
}
|
| 130 |
},
|
| 131 |
{
|
|
@@ -162,17 +162,17 @@
|
|
| 162 |
"developer": "allenai",
|
| 163 |
"evaluator_relationship": null,
|
| 164 |
"benchmark_scores": {
|
| 165 |
-
"reward-bench/Score": 0.
|
| 166 |
-
"reward-bench/Chat": 0.9693,
|
| 167 |
-
"reward-bench/Chat Hard": 0.8268,
|
| 168 |
-
"reward-bench/Safety": 0.8689,
|
| 169 |
-
"reward-bench/Reasoning": 0.8583,
|
| 170 |
-
"reward-bench/Prior Sets (0.5 weight)": 0.0,
|
| 171 |
"reward-bench/Factuality": 0.8084,
|
| 172 |
"reward-bench/Precise IF": 0.3688,
|
| 173 |
"reward-bench/Math": 0.6776,
|
|
|
|
| 174 |
"reward-bench/Focus": 0.7778,
|
| 175 |
-
"reward-bench/Ties": 0.8308
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
}
|
| 177 |
},
|
| 178 |
{
|
|
@@ -209,17 +209,17 @@
|
|
| 209 |
"developer": "allenai",
|
| 210 |
"evaluator_relationship": null,
|
| 211 |
"benchmark_scores": {
|
| 212 |
-
"reward-bench/Score": 0.
|
| 213 |
-
"reward-bench/Chat": 0.9553,
|
| 214 |
-
"reward-bench/Chat Hard": 0.761,
|
| 215 |
-
"reward-bench/Safety": 0.86,
|
| 216 |
-
"reward-bench/Reasoning": 0.7898,
|
| 217 |
-
"reward-bench/Prior Sets (0.5 weight)": 0.0,
|
| 218 |
"reward-bench/Factuality": 0.7516,
|
| 219 |
"reward-bench/Precise IF": 0.3875,
|
| 220 |
"reward-bench/Math": 0.6284,
|
|
|
|
| 221 |
"reward-bench/Focus": 0.8545,
|
| 222 |
-
"reward-bench/Ties": 0.6397
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
}
|
| 224 |
},
|
| 225 |
{
|
|
|
|
| 63 |
"developer": "allenai",
|
| 64 |
"evaluator_relationship": null,
|
| 65 |
"benchmark_scores": {
|
| 66 |
+
"reward-bench/Score": 0.7606,
|
| 67 |
+
"reward-bench/Chat": 0.9665,
|
| 68 |
+
"reward-bench/Chat Hard": 0.8355,
|
| 69 |
+
"reward-bench/Safety": 0.8844,
|
| 70 |
+
"reward-bench/Reasoning": 0.8969,
|
| 71 |
+
"reward-bench/Prior Sets (0.5 weight)": 0.0,
|
| 72 |
"reward-bench/Factuality": 0.8126,
|
| 73 |
"reward-bench/Precise IF": 0.4188,
|
| 74 |
"reward-bench/Math": 0.6995,
|
|
|
|
| 75 |
"reward-bench/Focus": 0.8646,
|
| 76 |
+
"reward-bench/Ties": 0.8835
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
}
|
| 78 |
},
|
| 79 |
{
|
|
|
|
| 101 |
"developer": "allenai",
|
| 102 |
"evaluator_relationship": null,
|
| 103 |
"benchmark_scores": {
|
| 104 |
+
"reward-bench/Score": 0.7285,
|
| 105 |
+
"reward-bench/Chat": 0.9581,
|
| 106 |
+
"reward-bench/Chat Hard": 0.8158,
|
| 107 |
+
"reward-bench/Safety": 0.8956,
|
| 108 |
+
"reward-bench/Reasoning": 0.887,
|
| 109 |
+
"reward-bench/Prior Sets (0.5 weight)": 0.0,
|
| 110 |
"reward-bench/Factuality": 0.7432,
|
| 111 |
"reward-bench/Precise IF": 0.4437,
|
| 112 |
"reward-bench/Math": 0.6175,
|
|
|
|
| 113 |
"reward-bench/Focus": 0.9071,
|
| 114 |
+
"reward-bench/Ties": 0.7638
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
}
|
| 116 |
},
|
| 117 |
{
|
|
|
|
| 120 |
"developer": "allenai",
|
| 121 |
"evaluator_relationship": null,
|
| 122 |
"benchmark_scores": {
|
| 123 |
+
"hfopenllm_v2/IFEval": 0.8291,
|
| 124 |
+
"hfopenllm_v2/BBH": 0.6164,
|
| 125 |
+
"hfopenllm_v2/MATH Level 5": 0.4502,
|
| 126 |
"hfopenllm_v2/GPQA": 0.3733,
|
| 127 |
+
"hfopenllm_v2/MUSR": 0.4948,
|
| 128 |
+
"hfopenllm_v2/MMLU-PRO": 0.4645
|
| 129 |
}
|
| 130 |
},
|
| 131 |
{
|
|
|
|
| 162 |
"developer": "allenai",
|
| 163 |
"evaluator_relationship": null,
|
| 164 |
"benchmark_scores": {
|
| 165 |
+
"reward-bench/Score": 0.8892,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
"reward-bench/Factuality": 0.8084,
|
| 167 |
"reward-bench/Precise IF": 0.3688,
|
| 168 |
"reward-bench/Math": 0.6776,
|
| 169 |
+
"reward-bench/Safety": 0.9027,
|
| 170 |
"reward-bench/Focus": 0.7778,
|
| 171 |
+
"reward-bench/Ties": 0.8308,
|
| 172 |
+
"reward-bench/Chat": 0.9693,
|
| 173 |
+
"reward-bench/Chat Hard": 0.8268,
|
| 174 |
+
"reward-bench/Reasoning": 0.8583,
|
| 175 |
+
"reward-bench/Prior Sets (0.5 weight)": 0.0
|
| 176 |
}
|
| 177 |
},
|
| 178 |
{
|
|
|
|
| 209 |
"developer": "allenai",
|
| 210 |
"evaluator_relationship": null,
|
| 211 |
"benchmark_scores": {
|
| 212 |
+
"reward-bench/Score": 0.8431,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
"reward-bench/Factuality": 0.7516,
|
| 214 |
"reward-bench/Precise IF": 0.3875,
|
| 215 |
"reward-bench/Math": 0.6284,
|
| 216 |
+
"reward-bench/Safety": 0.8662,
|
| 217 |
"reward-bench/Focus": 0.8545,
|
| 218 |
+
"reward-bench/Ties": 0.6397,
|
| 219 |
+
"reward-bench/Chat": 0.9553,
|
| 220 |
+
"reward-bench/Chat Hard": 0.761,
|
| 221 |
+
"reward-bench/Reasoning": 0.7898,
|
| 222 |
+
"reward-bench/Prior Sets (0.5 weight)": 0.0
|
| 223 |
}
|
| 224 |
},
|
| 225 |
{
|
data/developers/anthropic.json
CHANGED
|
@@ -650,12 +650,12 @@
|
|
| 650 |
"developer": "Anthropic",
|
| 651 |
"evaluator_relationship": null,
|
| 652 |
"benchmark_scores": {
|
| 653 |
-
"appworld_test_normal/appworld/test_normal": 0.
|
| 654 |
"browsecompplus/browsecompplus": 0.61,
|
| 655 |
-
"swe-bench/swe-bench": 0.
|
| 656 |
-
"tau-bench-2_airline/tau-bench-2/airline": 0.
|
| 657 |
"tau-bench-2_retail/tau-bench-2/retail": 0.78,
|
| 658 |
-
"tau-bench-2_telecom/tau-bench-2/telecom": 0.
|
| 659 |
}
|
| 660 |
},
|
| 661 |
{
|
|
@@ -664,7 +664,7 @@
|
|
| 664 |
"developer": "Anthropic",
|
| 665 |
"evaluator_relationship": null,
|
| 666 |
"benchmark_scores": {
|
| 667 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 668 |
}
|
| 669 |
},
|
| 670 |
{
|
|
@@ -673,7 +673,7 @@
|
|
| 673 |
"developer": "Anthropic",
|
| 674 |
"evaluator_relationship": null,
|
| 675 |
"benchmark_scores": {
|
| 676 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 677 |
}
|
| 678 |
},
|
| 679 |
{
|
|
@@ -682,7 +682,7 @@
|
|
| 682 |
"developer": "Anthropic",
|
| 683 |
"evaluator_relationship": null,
|
| 684 |
"benchmark_scores": {
|
| 685 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 686 |
}
|
| 687 |
},
|
| 688 |
{
|
|
@@ -756,7 +756,7 @@
|
|
| 756 |
"developer": "Anthropic",
|
| 757 |
"evaluator_relationship": null,
|
| 758 |
"benchmark_scores": {
|
| 759 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 760 |
}
|
| 761 |
},
|
| 762 |
{
|
|
@@ -800,8 +800,6 @@
|
|
| 800 |
"developer": "Anthropic",
|
| 801 |
"evaluator_relationship": null,
|
| 802 |
"benchmark_scores": {
|
| 803 |
-
"ace/Overall Score": 0.478,
|
| 804 |
-
"ace/Gaming Score": 0.391,
|
| 805 |
"apex-agents/Overall Pass@1": 0.184,
|
| 806 |
"apex-agents/Overall Pass@8": 0.34,
|
| 807 |
"apex-agents/Overall Mean Score": 0.348,
|
|
@@ -809,6 +807,8 @@
|
|
| 809 |
"apex-agents/Management Consulting Pass@1": 0.132,
|
| 810 |
"apex-agents/Corporate Law Pass@1": 0.202,
|
| 811 |
"apex-agents/Corporate Lawyer Mean Score": 0.471,
|
|
|
|
|
|
|
| 812 |
"apex-v1/Medicine (MD) Score": 0.65
|
| 813 |
}
|
| 814 |
},
|
|
|
|
| 650 |
"developer": "Anthropic",
|
| 651 |
"evaluator_relationship": null,
|
| 652 |
"benchmark_scores": {
|
| 653 |
+
"appworld_test_normal/appworld/test_normal": 0.7,
|
| 654 |
"browsecompplus/browsecompplus": 0.61,
|
| 655 |
+
"swe-bench/swe-bench": 0.6061,
|
| 656 |
+
"tau-bench-2_airline/tau-bench-2/airline": 0.66,
|
| 657 |
"tau-bench-2_retail/tau-bench-2/retail": 0.78,
|
| 658 |
+
"tau-bench-2_telecom/tau-bench-2/telecom": 0.84
|
| 659 |
}
|
| 660 |
},
|
| 661 |
{
|
|
|
|
| 664 |
"developer": "Anthropic",
|
| 665 |
"evaluator_relationship": null,
|
| 666 |
"benchmark_scores": {
|
| 667 |
+
"terminal-bench-2.0/terminal-bench-2.0": 35.1
|
| 668 |
}
|
| 669 |
},
|
| 670 |
{
|
|
|
|
| 673 |
"developer": "Anthropic",
|
| 674 |
"evaluator_relationship": null,
|
| 675 |
"benchmark_scores": {
|
| 676 |
+
"terminal-bench-2.0/terminal-bench-2.0": 52.1
|
| 677 |
}
|
| 678 |
},
|
| 679 |
{
|
|
|
|
| 682 |
"developer": "Anthropic",
|
| 683 |
"evaluator_relationship": null,
|
| 684 |
"benchmark_scores": {
|
| 685 |
+
"terminal-bench-2.0/terminal-bench-2.0": 62.9
|
| 686 |
}
|
| 687 |
},
|
| 688 |
{
|
|
|
|
| 756 |
"developer": "Anthropic",
|
| 757 |
"evaluator_relationship": null,
|
| 758 |
"benchmark_scores": {
|
| 759 |
+
"terminal-bench-2.0/terminal-bench-2.0": 42.6
|
| 760 |
}
|
| 761 |
},
|
| 762 |
{
|
|
|
|
| 800 |
"developer": "Anthropic",
|
| 801 |
"evaluator_relationship": null,
|
| 802 |
"benchmark_scores": {
|
|
|
|
|
|
|
| 803 |
"apex-agents/Overall Pass@1": 0.184,
|
| 804 |
"apex-agents/Overall Pass@8": 0.34,
|
| 805 |
"apex-agents/Overall Mean Score": 0.348,
|
|
|
|
| 807 |
"apex-agents/Management Consulting Pass@1": 0.132,
|
| 808 |
"apex-agents/Corporate Law Pass@1": 0.202,
|
| 809 |
"apex-agents/Corporate Lawyer Mean Score": 0.471,
|
| 810 |
+
"ace/Overall Score": 0.478,
|
| 811 |
+
"ace/Gaming Score": 0.391,
|
| 812 |
"apex-v1/Medicine (MD) Score": 0.65
|
| 813 |
}
|
| 814 |
},
|
data/developers/cognitivecomputations.json
CHANGED
|
@@ -77,12 +77,12 @@
|
|
| 77 |
"developer": "cognitivecomputations",
|
| 78 |
"evaluator_relationship": null,
|
| 79 |
"benchmark_scores": {
|
| 80 |
-
"hfopenllm_v2/IFEval": 0.
|
| 81 |
-
"hfopenllm_v2/BBH": 0.
|
| 82 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 83 |
-
"hfopenllm_v2/GPQA": 0.
|
| 84 |
-
"hfopenllm_v2/MUSR": 0.
|
| 85 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 86 |
}
|
| 87 |
},
|
| 88 |
{
|
|
|
|
| 77 |
"developer": "cognitivecomputations",
|
| 78 |
"evaluator_relationship": null,
|
| 79 |
"benchmark_scores": {
|
| 80 |
+
"hfopenllm_v2/IFEval": 0.3613,
|
| 81 |
+
"hfopenllm_v2/BBH": 0.6123,
|
| 82 |
+
"hfopenllm_v2/MATH Level 5": 0.1239,
|
| 83 |
+
"hfopenllm_v2/GPQA": 0.328,
|
| 84 |
+
"hfopenllm_v2/MUSR": 0.4112,
|
| 85 |
+
"hfopenllm_v2/MMLU-PRO": 0.4494
|
| 86 |
}
|
| 87 |
},
|
| 88 |
{
|
data/developers/columbia-nlp.json
CHANGED
|
@@ -7,12 +7,12 @@
|
|
| 7 |
"developer": "Columbia-NLP",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
-
"hfopenllm_v2/IFEval": 0.
|
| 11 |
-
"hfopenllm_v2/BBH": 0.
|
| 12 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 13 |
-
"hfopenllm_v2/GPQA": 0.
|
| 14 |
-
"hfopenllm_v2/MUSR": 0.
|
| 15 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 16 |
}
|
| 17 |
},
|
| 18 |
{
|
|
|
|
| 7 |
"developer": "Columbia-NLP",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
+
"hfopenllm_v2/IFEval": 0.3102,
|
| 11 |
+
"hfopenllm_v2/BBH": 0.3881,
|
| 12 |
+
"hfopenllm_v2/MATH Level 5": 0.0536,
|
| 13 |
+
"hfopenllm_v2/GPQA": 0.2534,
|
| 14 |
+
"hfopenllm_v2/MUSR": 0.4081,
|
| 15 |
+
"hfopenllm_v2/MMLU-PRO": 0.1665
|
| 16 |
}
|
| 17 |
},
|
| 18 |
{
|
data/developers/cpayne1303.json
CHANGED
|
@@ -35,12 +35,12 @@
|
|
| 35 |
"developer": "cpayne1303",
|
| 36 |
"evaluator_relationship": null,
|
| 37 |
"benchmark_scores": {
|
| 38 |
-
"hfopenllm_v2/IFEval": 0.
|
| 39 |
-
"hfopenllm_v2/BBH": 0.
|
| 40 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 41 |
"hfopenllm_v2/GPQA": 0.2685,
|
| 42 |
-
"hfopenllm_v2/MUSR": 0.
|
| 43 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 44 |
}
|
| 45 |
},
|
| 46 |
{
|
|
|
|
| 35 |
"developer": "cpayne1303",
|
| 36 |
"evaluator_relationship": null,
|
| 37 |
"benchmark_scores": {
|
| 38 |
+
"hfopenllm_v2/IFEval": 0.1949,
|
| 39 |
+
"hfopenllm_v2/BBH": 0.2965,
|
| 40 |
+
"hfopenllm_v2/MATH Level 5": 0.0045,
|
| 41 |
"hfopenllm_v2/GPQA": 0.2685,
|
| 42 |
+
"hfopenllm_v2/MUSR": 0.3885,
|
| 43 |
+
"hfopenllm_v2/MMLU-PRO": 0.1111
|
| 44 |
}
|
| 45 |
},
|
| 46 |
{
|
data/developers/daemontatox.json
CHANGED
|
@@ -231,12 +231,12 @@
|
|
| 231 |
"developer": "Daemontatox",
|
| 232 |
"evaluator_relationship": null,
|
| 233 |
"benchmark_scores": {
|
| 234 |
-
"hfopenllm_v2/IFEval": 0.
|
| 235 |
-
"hfopenllm_v2/BBH": 0.
|
| 236 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 237 |
-
"hfopenllm_v2/GPQA": 0.
|
| 238 |
-
"hfopenllm_v2/MUSR": 0.
|
| 239 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 240 |
}
|
| 241 |
},
|
| 242 |
{
|
|
|
|
| 231 |
"developer": "Daemontatox",
|
| 232 |
"evaluator_relationship": null,
|
| 233 |
"benchmark_scores": {
|
| 234 |
+
"hfopenllm_v2/IFEval": 0.4855,
|
| 235 |
+
"hfopenllm_v2/BBH": 0.6627,
|
| 236 |
+
"hfopenllm_v2/MATH Level 5": 0.4841,
|
| 237 |
+
"hfopenllm_v2/GPQA": 0.3096,
|
| 238 |
+
"hfopenllm_v2/MUSR": 0.4256,
|
| 239 |
+
"hfopenllm_v2/MMLU-PRO": 0.5542
|
| 240 |
}
|
| 241 |
},
|
| 242 |
{
|
data/developers/deepmount00.json
CHANGED
|
@@ -63,12 +63,12 @@
|
|
| 63 |
"developer": "DeepMount00",
|
| 64 |
"evaluator_relationship": null,
|
| 65 |
"benchmark_scores": {
|
| 66 |
-
"hfopenllm_v2/IFEval": 0.
|
| 67 |
-
"hfopenllm_v2/BBH": 0.
|
| 68 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 69 |
-
"hfopenllm_v2/GPQA": 0.
|
| 70 |
-
"hfopenllm_v2/MUSR": 0.
|
| 71 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 72 |
}
|
| 73 |
},
|
| 74 |
{
|
|
|
|
| 63 |
"developer": "DeepMount00",
|
| 64 |
"evaluator_relationship": null,
|
| 65 |
"benchmark_scores": {
|
| 66 |
+
"hfopenllm_v2/IFEval": 0.5365,
|
| 67 |
+
"hfopenllm_v2/BBH": 0.517,
|
| 68 |
+
"hfopenllm_v2/MATH Level 5": 0.1707,
|
| 69 |
+
"hfopenllm_v2/GPQA": 0.3062,
|
| 70 |
+
"hfopenllm_v2/MUSR": 0.4487,
|
| 71 |
+
"hfopenllm_v2/MMLU-PRO": 0.396
|
| 72 |
}
|
| 73 |
},
|
| 74 |
{
|
data/developers/dfurman.json
CHANGED
|
@@ -35,12 +35,12 @@
|
|
| 35 |
"developer": "dfurman",
|
| 36 |
"evaluator_relationship": null,
|
| 37 |
"benchmark_scores": {
|
| 38 |
-
"hfopenllm_v2/IFEval": 0.
|
| 39 |
-
"hfopenllm_v2/BBH": 0.
|
| 40 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 41 |
-
"hfopenllm_v2/GPQA": 0.
|
| 42 |
-
"hfopenllm_v2/MUSR": 0.
|
| 43 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 44 |
}
|
| 45 |
},
|
| 46 |
{
|
|
|
|
| 35 |
"developer": "dfurman",
|
| 36 |
"evaluator_relationship": null,
|
| 37 |
"benchmark_scores": {
|
| 38 |
+
"hfopenllm_v2/IFEval": 0.3,
|
| 39 |
+
"hfopenllm_v2/BBH": 0.3853,
|
| 40 |
+
"hfopenllm_v2/MATH Level 5": 0.0415,
|
| 41 |
+
"hfopenllm_v2/GPQA": 0.2617,
|
| 42 |
+
"hfopenllm_v2/MUSR": 0.3579,
|
| 43 |
+
"hfopenllm_v2/MMLU-PRO": 0.2281
|
| 44 |
}
|
| 45 |
},
|
| 46 |
{
|
data/developers/doppelreflex.json
CHANGED
|
@@ -175,12 +175,12 @@
|
|
| 175 |
"developer": "DoppelReflEx",
|
| 176 |
"evaluator_relationship": null,
|
| 177 |
"benchmark_scores": {
|
| 178 |
-
"hfopenllm_v2/IFEval": 0.
|
| 179 |
-
"hfopenllm_v2/BBH": 0.
|
| 180 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 181 |
-
"hfopenllm_v2/GPQA": 0.
|
| 182 |
-
"hfopenllm_v2/MUSR": 0.
|
| 183 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 184 |
}
|
| 185 |
},
|
| 186 |
{
|
|
|
|
| 175 |
"developer": "DoppelReflEx",
|
| 176 |
"evaluator_relationship": null,
|
| 177 |
"benchmark_scores": {
|
| 178 |
+
"hfopenllm_v2/IFEval": 0.436,
|
| 179 |
+
"hfopenllm_v2/BBH": 0.4956,
|
| 180 |
+
"hfopenllm_v2/MATH Level 5": 0.0589,
|
| 181 |
+
"hfopenllm_v2/GPQA": 0.3205,
|
| 182 |
+
"hfopenllm_v2/MUSR": 0.3843,
|
| 183 |
+
"hfopenllm_v2/MMLU-PRO": 0.3237
|
| 184 |
}
|
| 185 |
},
|
| 186 |
{
|
data/developers/google.json
CHANGED
|
@@ -139,6 +139,7 @@
|
|
| 139 |
"developer": "Google",
|
| 140 |
"evaluator_relationship": null,
|
| 141 |
"benchmark_scores": {
|
|
|
|
| 142 |
"apex-agents/Overall Pass@1": 0.24,
|
| 143 |
"apex-agents/Overall Pass@8": 0.367,
|
| 144 |
"apex-agents/Overall Mean Score": 0.395,
|
|
@@ -146,7 +147,6 @@
|
|
| 146 |
"apex-agents/Management Consulting Pass@1": 0.193,
|
| 147 |
"apex-agents/Corporate Law Pass@1": 0.259,
|
| 148 |
"apex-agents/Corporate Lawyer Mean Score": 0.524,
|
| 149 |
-
"ace/Gaming Score": 0.415,
|
| 150 |
"apex-v1/Overall Score": 0.64,
|
| 151 |
"apex-v1/Consulting Score": 0.64
|
| 152 |
}
|
|
@@ -157,6 +157,8 @@
|
|
| 157 |
"developer": "Google",
|
| 158 |
"evaluator_relationship": null,
|
| 159 |
"benchmark_scores": {
|
|
|
|
|
|
|
| 160 |
"apex-agents/Overall Pass@1": 0.184,
|
| 161 |
"apex-agents/Overall Pass@8": 0.373,
|
| 162 |
"apex-agents/Overall Mean Score": 0.341,
|
|
@@ -164,8 +166,6 @@
|
|
| 164 |
"apex-agents/Management Consulting Pass@1": 0.124,
|
| 165 |
"apex-agents/Corporate Law Pass@1": 0.239,
|
| 166 |
"apex-agents/Corporate Lawyer Mean Score": 0.487,
|
| 167 |
-
"ace/Overall Score": 0.47,
|
| 168 |
-
"ace/Gaming Score": 0.509,
|
| 169 |
"apex-v1/Overall Score": 0.643,
|
| 170 |
"apex-v1/Consulting Score": 0.64,
|
| 171 |
"apex-v1/Investment Banking Score": 0.63
|
|
@@ -723,7 +723,7 @@
|
|
| 723 |
"reward-bench/Safety": 0.909,
|
| 724 |
"reward-bench/Focus": 0.841,
|
| 725 |
"reward-bench/Ties": 0.809,
|
| 726 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 727 |
}
|
| 728 |
},
|
| 729 |
{
|
|
@@ -861,7 +861,7 @@
|
|
| 861 |
"developer": "Google",
|
| 862 |
"evaluator_relationship": null,
|
| 863 |
"benchmark_scores": {
|
| 864 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 865 |
}
|
| 866 |
},
|
| 867 |
{
|
|
@@ -879,8 +879,8 @@
|
|
| 879 |
"developer": "Google",
|
| 880 |
"evaluator_relationship": null,
|
| 881 |
"benchmark_scores": {
|
| 882 |
-
"appworld_test_normal/appworld/test_normal": 0.
|
| 883 |
-
"browsecompplus/browsecompplus": 0.
|
| 884 |
"global-mmlu-lite/Global MMLU Lite": 0.9453,
|
| 885 |
"global-mmlu-lite/Culturally Sensitive": 0.9397,
|
| 886 |
"global-mmlu-lite/Culturally Agnostic": 0.9509,
|
|
@@ -900,8 +900,8 @@
|
|
| 900 |
"global-mmlu-lite/Yoruba": 0.9425,
|
| 901 |
"global-mmlu-lite/Chinese": 0.9475,
|
| 902 |
"global-mmlu-lite/Burmese": 0.9425,
|
| 903 |
-
"swe-bench/swe-bench": 0.
|
| 904 |
-
"tau-bench-2_airline/tau-bench-2/airline": 0.
|
| 905 |
"tau-bench-2_retail/tau-bench-2/retail": 0.73,
|
| 906 |
"tau-bench-2_telecom/tau-bench-2/telecom": 0.73
|
| 907 |
}
|
|
@@ -1028,12 +1028,12 @@
|
|
| 1028 |
"developer": "Google",
|
| 1029 |
"evaluator_relationship": null,
|
| 1030 |
"benchmark_scores": {
|
| 1031 |
-
"hfopenllm_v2/IFEval": 0.
|
| 1032 |
-
"hfopenllm_v2/BBH": 0.
|
| 1033 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 1034 |
"hfopenllm_v2/GPQA": 0.2626,
|
| 1035 |
-
"hfopenllm_v2/MUSR": 0.
|
| 1036 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 1037 |
}
|
| 1038 |
},
|
| 1039 |
{
|
|
@@ -1056,12 +1056,12 @@
|
|
| 1056 |
"developer": "Google",
|
| 1057 |
"evaluator_relationship": null,
|
| 1058 |
"benchmark_scores": {
|
| 1059 |
-
"hfopenllm_v2/IFEval": 0.
|
| 1060 |
-
"hfopenllm_v2/BBH": 0.
|
| 1061 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 1062 |
-
"hfopenllm_v2/GPQA": 0.
|
| 1063 |
-
"hfopenllm_v2/MUSR": 0.
|
| 1064 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 1065 |
}
|
| 1066 |
},
|
| 1067 |
{
|
|
|
|
| 139 |
"developer": "Google",
|
| 140 |
"evaluator_relationship": null,
|
| 141 |
"benchmark_scores": {
|
| 142 |
+
"ace/Gaming Score": 0.415,
|
| 143 |
"apex-agents/Overall Pass@1": 0.24,
|
| 144 |
"apex-agents/Overall Pass@8": 0.367,
|
| 145 |
"apex-agents/Overall Mean Score": 0.395,
|
|
|
|
| 147 |
"apex-agents/Management Consulting Pass@1": 0.193,
|
| 148 |
"apex-agents/Corporate Law Pass@1": 0.259,
|
| 149 |
"apex-agents/Corporate Lawyer Mean Score": 0.524,
|
|
|
|
| 150 |
"apex-v1/Overall Score": 0.64,
|
| 151 |
"apex-v1/Consulting Score": 0.64
|
| 152 |
}
|
|
|
|
| 157 |
"developer": "Google",
|
| 158 |
"evaluator_relationship": null,
|
| 159 |
"benchmark_scores": {
|
| 160 |
+
"ace/Overall Score": 0.47,
|
| 161 |
+
"ace/Gaming Score": 0.509,
|
| 162 |
"apex-agents/Overall Pass@1": 0.184,
|
| 163 |
"apex-agents/Overall Pass@8": 0.373,
|
| 164 |
"apex-agents/Overall Mean Score": 0.341,
|
|
|
|
| 166 |
"apex-agents/Management Consulting Pass@1": 0.124,
|
| 167 |
"apex-agents/Corporate Law Pass@1": 0.239,
|
| 168 |
"apex-agents/Corporate Lawyer Mean Score": 0.487,
|
|
|
|
|
|
|
| 169 |
"apex-v1/Overall Score": 0.643,
|
| 170 |
"apex-v1/Consulting Score": 0.64,
|
| 171 |
"apex-v1/Investment Banking Score": 0.63
|
|
|
|
| 723 |
"reward-bench/Safety": 0.909,
|
| 724 |
"reward-bench/Focus": 0.841,
|
| 725 |
"reward-bench/Ties": 0.809,
|
| 726 |
+
"terminal-bench-2.0/terminal-bench-2.0": 16.9
|
| 727 |
}
|
| 728 |
},
|
| 729 |
{
|
|
|
|
| 861 |
"developer": "Google",
|
| 862 |
"evaluator_relationship": null,
|
| 863 |
"benchmark_scores": {
|
| 864 |
+
"terminal-bench-2.0/terminal-bench-2.0": 47.4
|
| 865 |
}
|
| 866 |
},
|
| 867 |
{
|
|
|
|
| 879 |
"developer": "Google",
|
| 880 |
"evaluator_relationship": null,
|
| 881 |
"benchmark_scores": {
|
| 882 |
+
"appworld_test_normal/appworld/test_normal": 0.55,
|
| 883 |
+
"browsecompplus/browsecompplus": 0.3333,
|
| 884 |
"global-mmlu-lite/Global MMLU Lite": 0.9453,
|
| 885 |
"global-mmlu-lite/Culturally Sensitive": 0.9397,
|
| 886 |
"global-mmlu-lite/Culturally Agnostic": 0.9509,
|
|
|
|
| 900 |
"global-mmlu-lite/Yoruba": 0.9425,
|
| 901 |
"global-mmlu-lite/Chinese": 0.9475,
|
| 902 |
"global-mmlu-lite/Burmese": 0.9425,
|
| 903 |
+
"swe-bench/swe-bench": 0.71,
|
| 904 |
+
"tau-bench-2_airline/tau-bench-2/airline": 0.68,
|
| 905 |
"tau-bench-2_retail/tau-bench-2/retail": 0.73,
|
| 906 |
"tau-bench-2_telecom/tau-bench-2/telecom": 0.73
|
| 907 |
}
|
|
|
|
| 1028 |
"developer": "Google",
|
| 1029 |
"evaluator_relationship": null,
|
| 1030 |
"benchmark_scores": {
|
| 1031 |
+
"hfopenllm_v2/IFEval": 0.1993,
|
| 1032 |
+
"hfopenllm_v2/BBH": 0.3656,
|
| 1033 |
+
"hfopenllm_v2/MATH Level 5": 0.0287,
|
| 1034 |
"hfopenllm_v2/GPQA": 0.2626,
|
| 1035 |
+
"hfopenllm_v2/MUSR": 0.4232,
|
| 1036 |
+
"hfopenllm_v2/MMLU-PRO": 0.218
|
| 1037 |
}
|
| 1038 |
},
|
| 1039 |
{
|
|
|
|
| 1056 |
"developer": "Google",
|
| 1057 |
"evaluator_relationship": null,
|
| 1058 |
"benchmark_scores": {
|
| 1059 |
+
"hfopenllm_v2/IFEval": 0.5288,
|
| 1060 |
+
"hfopenllm_v2/BBH": 0.4178,
|
| 1061 |
+
"hfopenllm_v2/MATH Level 5": 0.0476,
|
| 1062 |
+
"hfopenllm_v2/GPQA": 0.2752,
|
| 1063 |
+
"hfopenllm_v2/MUSR": 0.3728,
|
| 1064 |
+
"hfopenllm_v2/MMLU-PRO": 0.2467
|
| 1065 |
}
|
| 1066 |
},
|
| 1067 |
{
|
data/developers/huggingfacetb.json
CHANGED
|
@@ -133,12 +133,12 @@
|
|
| 133 |
"developer": "HuggingFaceTB",
|
| 134 |
"evaluator_relationship": null,
|
| 135 |
"benchmark_scores": {
|
| 136 |
-
"hfopenllm_v2/IFEval": 0.
|
| 137 |
-
"hfopenllm_v2/BBH": 0.
|
| 138 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 139 |
-
"hfopenllm_v2/GPQA": 0.
|
| 140 |
-
"hfopenllm_v2/MUSR": 0.
|
| 141 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 142 |
}
|
| 143 |
},
|
| 144 |
{
|
|
|
|
| 133 |
"developer": "HuggingFaceTB",
|
| 134 |
"evaluator_relationship": null,
|
| 135 |
"benchmark_scores": {
|
| 136 |
+
"hfopenllm_v2/IFEval": 0.2883,
|
| 137 |
+
"hfopenllm_v2/BBH": 0.3124,
|
| 138 |
+
"hfopenllm_v2/MATH Level 5": 0.003,
|
| 139 |
+
"hfopenllm_v2/GPQA": 0.2357,
|
| 140 |
+
"hfopenllm_v2/MUSR": 0.3662,
|
| 141 |
+
"hfopenllm_v2/MMLU-PRO": 0.1115
|
| 142 |
}
|
| 143 |
},
|
| 144 |
{
|
data/developers/infly.json
CHANGED
|
@@ -7,16 +7,16 @@
|
|
| 7 |
"developer": "infly",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
-
"reward-bench/Score": 0.
|
| 11 |
-
"reward-bench/Chat": 0.9665,
|
| 12 |
-
"reward-bench/Chat Hard": 0.9101,
|
| 13 |
-
"reward-bench/Safety": 0.9644,
|
| 14 |
-
"reward-bench/Reasoning": 0.9912,
|
| 15 |
"reward-bench/Factuality": 0.7411,
|
| 16 |
"reward-bench/Precise IF": 0.4188,
|
| 17 |
"reward-bench/Math": 0.6995,
|
|
|
|
| 18 |
"reward-bench/Focus": 0.903,
|
| 19 |
-
"reward-bench/Ties": 0.8622
|
|
|
|
|
|
|
|
|
|
| 20 |
}
|
| 21 |
}
|
| 22 |
]
|
|
|
|
| 7 |
"developer": "infly",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
+
"reward-bench/Score": 0.9511,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
"reward-bench/Factuality": 0.7411,
|
| 12 |
"reward-bench/Precise IF": 0.4188,
|
| 13 |
"reward-bench/Math": 0.6995,
|
| 14 |
+
"reward-bench/Safety": 0.9365,
|
| 15 |
"reward-bench/Focus": 0.903,
|
| 16 |
+
"reward-bench/Ties": 0.8622,
|
| 17 |
+
"reward-bench/Chat": 0.9665,
|
| 18 |
+
"reward-bench/Chat Hard": 0.9101,
|
| 19 |
+
"reward-bench/Reasoning": 0.9912
|
| 20 |
}
|
| 21 |
}
|
| 22 |
]
|
data/developers/internlm.json
CHANGED
|
@@ -71,16 +71,16 @@
|
|
| 71 |
"developer": "internlm",
|
| 72 |
"evaluator_relationship": null,
|
| 73 |
"benchmark_scores": {
|
| 74 |
-
"reward-bench/Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
"reward-bench/Factuality": 0.4211,
|
| 76 |
"reward-bench/Precise IF": 0.4,
|
| 77 |
"reward-bench/Math": 0.5628,
|
| 78 |
-
"reward-bench/Safety": 0.8716,
|
| 79 |
"reward-bench/Focus": 0.7051,
|
| 80 |
-
"reward-bench/Ties": 0.5164
|
| 81 |
-
"reward-bench/Chat": 0.9916,
|
| 82 |
-
"reward-bench/Chat Hard": 0.6952,
|
| 83 |
-
"reward-bench/Reasoning": 0.9453
|
| 84 |
}
|
| 85 |
},
|
| 86 |
{
|
|
|
|
| 71 |
"developer": "internlm",
|
| 72 |
"evaluator_relationship": null,
|
| 73 |
"benchmark_scores": {
|
| 74 |
+
"reward-bench/Score": 0.5335,
|
| 75 |
+
"reward-bench/Chat": 0.9916,
|
| 76 |
+
"reward-bench/Chat Hard": 0.6952,
|
| 77 |
+
"reward-bench/Safety": 0.5956,
|
| 78 |
+
"reward-bench/Reasoning": 0.9453,
|
| 79 |
"reward-bench/Factuality": 0.4211,
|
| 80 |
"reward-bench/Precise IF": 0.4,
|
| 81 |
"reward-bench/Math": 0.5628,
|
|
|
|
| 82 |
"reward-bench/Focus": 0.7051,
|
| 83 |
+
"reward-bench/Ties": 0.5164
|
|
|
|
|
|
|
|
|
|
| 84 |
}
|
| 85 |
},
|
| 86 |
{
|
data/developers/jaspionjader.json
CHANGED
|
@@ -1477,12 +1477,12 @@
|
|
| 1477 |
"developer": "jaspionjader",
|
| 1478 |
"evaluator_relationship": null,
|
| 1479 |
"benchmark_scores": {
|
| 1480 |
-
"hfopenllm_v2/IFEval": 0.
|
| 1481 |
-
"hfopenllm_v2/BBH": 0.
|
| 1482 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 1483 |
-
"hfopenllm_v2/GPQA": 0.
|
| 1484 |
"hfopenllm_v2/MUSR": 0.4277,
|
| 1485 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 1486 |
}
|
| 1487 |
},
|
| 1488 |
{
|
|
|
|
| 1477 |
"developer": "jaspionjader",
|
| 1478 |
"evaluator_relationship": null,
|
| 1479 |
"benchmark_scores": {
|
| 1480 |
+
"hfopenllm_v2/IFEval": 0.4418,
|
| 1481 |
+
"hfopenllm_v2/BBH": 0.5406,
|
| 1482 |
+
"hfopenllm_v2/MATH Level 5": 0.1352,
|
| 1483 |
+
"hfopenllm_v2/GPQA": 0.3062,
|
| 1484 |
"hfopenllm_v2/MUSR": 0.4277,
|
| 1485 |
+
"hfopenllm_v2/MMLU-PRO": 0.386
|
| 1486 |
}
|
| 1487 |
},
|
| 1488 |
{
|
data/developers/leroydyer.json
CHANGED
|
@@ -707,12 +707,12 @@
|
|
| 707 |
"developer": "LeroyDyer",
|
| 708 |
"evaluator_relationship": null,
|
| 709 |
"benchmark_scores": {
|
| 710 |
-
"hfopenllm_v2/IFEval": 0.
|
| 711 |
-
"hfopenllm_v2/BBH": 0.
|
| 712 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 713 |
-
"hfopenllm_v2/GPQA": 0.
|
| 714 |
-
"hfopenllm_v2/MUSR": 0.
|
| 715 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 716 |
}
|
| 717 |
},
|
| 718 |
{
|
|
|
|
| 707 |
"developer": "LeroyDyer",
|
| 708 |
"evaluator_relationship": null,
|
| 709 |
"benchmark_scores": {
|
| 710 |
+
"hfopenllm_v2/IFEval": 0.3798,
|
| 711 |
+
"hfopenllm_v2/BBH": 0.4483,
|
| 712 |
+
"hfopenllm_v2/MATH Level 5": 0.04,
|
| 713 |
+
"hfopenllm_v2/GPQA": 0.3129,
|
| 714 |
+
"hfopenllm_v2/MUSR": 0.4148,
|
| 715 |
+
"hfopenllm_v2/MMLU-PRO": 0.2389
|
| 716 |
}
|
| 717 |
},
|
| 718 |
{
|
data/developers/llmat.json
CHANGED
|
@@ -7,12 +7,12 @@
|
|
| 7 |
"developer": "llmat",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
-
"hfopenllm_v2/IFEval": 0.
|
| 11 |
-
"hfopenllm_v2/BBH": 0.
|
| 12 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 13 |
-
"hfopenllm_v2/GPQA": 0.
|
| 14 |
-
"hfopenllm_v2/MUSR": 0.
|
| 15 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 16 |
}
|
| 17 |
}
|
| 18 |
]
|
|
|
|
| 7 |
"developer": "llmat",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
+
"hfopenllm_v2/IFEval": 0.377,
|
| 11 |
+
"hfopenllm_v2/BBH": 0.3978,
|
| 12 |
+
"hfopenllm_v2/MATH Level 5": 0.0242,
|
| 13 |
+
"hfopenllm_v2/GPQA": 0.2668,
|
| 14 |
+
"hfopenllm_v2/MUSR": 0.3555,
|
| 15 |
+
"hfopenllm_v2/MMLU-PRO": 0.2278
|
| 16 |
}
|
| 17 |
}
|
| 18 |
]
|
data/developers/lxzgordon.json
CHANGED
|
@@ -20,16 +20,16 @@
|
|
| 20 |
"developer": "LxzGordon",
|
| 21 |
"evaluator_relationship": null,
|
| 22 |
"benchmark_scores": {
|
| 23 |
-
"reward-bench/Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
"reward-bench/Factuality": 0.6884,
|
| 25 |
"reward-bench/Precise IF": 0.45,
|
| 26 |
"reward-bench/Math": 0.6393,
|
| 27 |
-
"reward-bench/Safety": 0.9108,
|
| 28 |
"reward-bench/Focus": 0.9758,
|
| 29 |
-
"reward-bench/Ties": 0.7653
|
| 30 |
-
"reward-bench/Chat": 0.9553,
|
| 31 |
-
"reward-bench/Chat Hard": 0.8816,
|
| 32 |
-
"reward-bench/Reasoning": 0.9698
|
| 33 |
}
|
| 34 |
}
|
| 35 |
]
|
|
|
|
| 20 |
"developer": "LxzGordon",
|
| 21 |
"evaluator_relationship": null,
|
| 22 |
"benchmark_scores": {
|
| 23 |
+
"reward-bench/Score": 0.7394,
|
| 24 |
+
"reward-bench/Chat": 0.9553,
|
| 25 |
+
"reward-bench/Chat Hard": 0.8816,
|
| 26 |
+
"reward-bench/Safety": 0.9178,
|
| 27 |
+
"reward-bench/Reasoning": 0.9698,
|
| 28 |
"reward-bench/Factuality": 0.6884,
|
| 29 |
"reward-bench/Precise IF": 0.45,
|
| 30 |
"reward-bench/Math": 0.6393,
|
|
|
|
| 31 |
"reward-bench/Focus": 0.9758,
|
| 32 |
+
"reward-bench/Ties": 0.7653
|
|
|
|
|
|
|
|
|
|
| 33 |
}
|
| 34 |
}
|
| 35 |
]
|
data/developers/meta.json
CHANGED
|
@@ -471,6 +471,16 @@
|
|
| 471 |
"helm_capabilities/IFEval": 0.743,
|
| 472 |
"helm_capabilities/WildBench": 0.686,
|
| 473 |
"helm_capabilities/Omni-MATH": 0.137,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
"helm_mmlu/MMLU All Subjects": 0.561,
|
| 475 |
"helm_mmlu/Abstract Algebra": 0.26,
|
| 476 |
"helm_mmlu/Anatomy": 0.459,
|
|
@@ -506,17 +516,7 @@
|
|
| 506 |
"helm_mmlu/Sociology": 0.701,
|
| 507 |
"helm_mmlu/Virology": 0.446,
|
| 508 |
"helm_mmlu/World Religions": 0.789,
|
| 509 |
-
"helm_mmlu/Mean win rate": 0.475
|
| 510 |
-
"helm_lite/Mean win rate": 0.303,
|
| 511 |
-
"helm_lite/NarrativeQA": 0.756,
|
| 512 |
-
"helm_lite/NaturalQuestions (closed-book)": 0.209,
|
| 513 |
-
"helm_lite/OpenbookQA": 0.74,
|
| 514 |
-
"helm_lite/MMLU": 0.5,
|
| 515 |
-
"helm_lite/MATH": 0.703,
|
| 516 |
-
"helm_lite/GSM8K": 0.798,
|
| 517 |
-
"helm_lite/LegalBench": 0.342,
|
| 518 |
-
"helm_lite/MedQA": 0.245,
|
| 519 |
-
"helm_lite/WMT 2014": 0.181
|
| 520 |
}
|
| 521 |
},
|
| 522 |
{
|
|
@@ -579,6 +579,16 @@
|
|
| 579 |
"developer": "Meta",
|
| 580 |
"evaluator_relationship": null,
|
| 581 |
"benchmark_scores": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 582 |
"helm_mmlu/MMLU All Subjects": 0.803,
|
| 583 |
"helm_mmlu/Abstract Algebra": 0.52,
|
| 584 |
"helm_mmlu/Anatomy": 0.8,
|
|
@@ -614,17 +624,7 @@
|
|
| 614 |
"helm_mmlu/Sociology": 0.92,
|
| 615 |
"helm_mmlu/Virology": 0.584,
|
| 616 |
"helm_mmlu/World Religions": 0.901,
|
| 617 |
-
"helm_mmlu/Mean win rate": 0.773
|
| 618 |
-
"helm_lite/Mean win rate": 0.819,
|
| 619 |
-
"helm_lite/NarrativeQA": 0.777,
|
| 620 |
-
"helm_lite/NaturalQuestions (closed-book)": 0.457,
|
| 621 |
-
"helm_lite/OpenbookQA": 0.942,
|
| 622 |
-
"helm_lite/MMLU": 0.703,
|
| 623 |
-
"helm_lite/MATH": 0.791,
|
| 624 |
-
"helm_lite/GSM8K": 0.936,
|
| 625 |
-
"helm_lite/LegalBench": 0.68,
|
| 626 |
-
"helm_lite/MedQA": 0.769,
|
| 627 |
-
"helm_lite/WMT 2014": 0.224
|
| 628 |
}
|
| 629 |
},
|
| 630 |
{
|
|
|
|
| 471 |
"helm_capabilities/IFEval": 0.743,
|
| 472 |
"helm_capabilities/WildBench": 0.686,
|
| 473 |
"helm_capabilities/Omni-MATH": 0.137,
|
| 474 |
+
"helm_lite/Mean win rate": 0.303,
|
| 475 |
+
"helm_lite/NarrativeQA": 0.756,
|
| 476 |
+
"helm_lite/NaturalQuestions (closed-book)": 0.209,
|
| 477 |
+
"helm_lite/OpenbookQA": 0.74,
|
| 478 |
+
"helm_lite/MMLU": 0.5,
|
| 479 |
+
"helm_lite/MATH": 0.703,
|
| 480 |
+
"helm_lite/GSM8K": 0.798,
|
| 481 |
+
"helm_lite/LegalBench": 0.342,
|
| 482 |
+
"helm_lite/MedQA": 0.245,
|
| 483 |
+
"helm_lite/WMT 2014": 0.181,
|
| 484 |
"helm_mmlu/MMLU All Subjects": 0.561,
|
| 485 |
"helm_mmlu/Abstract Algebra": 0.26,
|
| 486 |
"helm_mmlu/Anatomy": 0.459,
|
|
|
|
| 516 |
"helm_mmlu/Sociology": 0.701,
|
| 517 |
"helm_mmlu/Virology": 0.446,
|
| 518 |
"helm_mmlu/World Religions": 0.789,
|
| 519 |
+
"helm_mmlu/Mean win rate": 0.475
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
}
|
| 521 |
},
|
| 522 |
{
|
|
|
|
| 579 |
"developer": "Meta",
|
| 580 |
"evaluator_relationship": null,
|
| 581 |
"benchmark_scores": {
|
| 582 |
+
"helm_lite/Mean win rate": 0.819,
|
| 583 |
+
"helm_lite/NarrativeQA": 0.777,
|
| 584 |
+
"helm_lite/NaturalQuestions (closed-book)": 0.457,
|
| 585 |
+
"helm_lite/OpenbookQA": 0.942,
|
| 586 |
+
"helm_lite/MMLU": 0.703,
|
| 587 |
+
"helm_lite/MATH": 0.791,
|
| 588 |
+
"helm_lite/GSM8K": 0.936,
|
| 589 |
+
"helm_lite/LegalBench": 0.68,
|
| 590 |
+
"helm_lite/MedQA": 0.769,
|
| 591 |
+
"helm_lite/WMT 2014": 0.224,
|
| 592 |
"helm_mmlu/MMLU All Subjects": 0.803,
|
| 593 |
"helm_mmlu/Abstract Algebra": 0.52,
|
| 594 |
"helm_mmlu/Anatomy": 0.8,
|
|
|
|
| 624 |
"helm_mmlu/Sociology": 0.92,
|
| 625 |
"helm_mmlu/Virology": 0.584,
|
| 626 |
"helm_mmlu/World Religions": 0.901,
|
| 627 |
+
"helm_mmlu/Mean win rate": 0.773
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 628 |
}
|
| 629 |
},
|
| 630 |
{
|
data/developers/minimax.json
CHANGED
|
@@ -25,7 +25,7 @@
|
|
| 25 |
"developer": "MiniMax",
|
| 26 |
"evaluator_relationship": null,
|
| 27 |
"benchmark_scores": {
|
| 28 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 29 |
}
|
| 30 |
},
|
| 31 |
{
|
|
|
|
| 25 |
"developer": "MiniMax",
|
| 26 |
"evaluator_relationship": null,
|
| 27 |
"benchmark_scores": {
|
| 28 |
+
"terminal-bench-2.0/terminal-bench-2.0": 36.6
|
| 29 |
}
|
| 30 |
},
|
| 31 |
{
|
data/developers/mistralai.json
CHANGED
|
@@ -69,6 +69,16 @@
|
|
| 69 |
"helm_capabilities/IFEval": 0.567,
|
| 70 |
"helm_capabilities/WildBench": 0.66,
|
| 71 |
"helm_capabilities/Omni-MATH": 0.072,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
"helm_mmlu/MMLU All Subjects": 0.599,
|
| 73 |
"helm_mmlu/Abstract Algebra": 0.27,
|
| 74 |
"helm_mmlu/Anatomy": 0.585,
|
|
@@ -105,16 +115,6 @@
|
|
| 105 |
"helm_mmlu/Virology": 0.47,
|
| 106 |
"helm_mmlu/World Religions": 0.825,
|
| 107 |
"helm_mmlu/Mean win rate": 0.509,
|
| 108 |
-
"helm_lite/Mean win rate": 0.196,
|
| 109 |
-
"helm_lite/NarrativeQA": 0.716,
|
| 110 |
-
"helm_lite/NaturalQuestions (closed-book)": 0.253,
|
| 111 |
-
"helm_lite/OpenbookQA": 0.79,
|
| 112 |
-
"helm_lite/MMLU": 0.51,
|
| 113 |
-
"helm_lite/MATH": 0.289,
|
| 114 |
-
"helm_lite/GSM8K": 0.538,
|
| 115 |
-
"helm_lite/LegalBench": 0.331,
|
| 116 |
-
"helm_lite/MedQA": 0.517,
|
| 117 |
-
"helm_lite/WMT 2014": 0.142,
|
| 118 |
"hfopenllm_v2/IFEval": 0.5465,
|
| 119 |
"hfopenllm_v2/BBH": 0.4722,
|
| 120 |
"hfopenllm_v2/MATH Level 5": 0.0385,
|
|
@@ -718,12 +718,12 @@
|
|
| 718 |
"developer": "mistralai",
|
| 719 |
"evaluator_relationship": null,
|
| 720 |
"benchmark_scores": {
|
| 721 |
-
"hfopenllm_v2/IFEval": 0.
|
| 722 |
-
"hfopenllm_v2/BBH": 0.
|
| 723 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 724 |
-
"hfopenllm_v2/GPQA": 0.
|
| 725 |
-
"hfopenllm_v2/MUSR": 0.
|
| 726 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 727 |
}
|
| 728 |
},
|
| 729 |
{
|
|
|
|
| 69 |
"helm_capabilities/IFEval": 0.567,
|
| 70 |
"helm_capabilities/WildBench": 0.66,
|
| 71 |
"helm_capabilities/Omni-MATH": 0.072,
|
| 72 |
+
"helm_lite/Mean win rate": 0.196,
|
| 73 |
+
"helm_lite/NarrativeQA": 0.716,
|
| 74 |
+
"helm_lite/NaturalQuestions (closed-book)": 0.253,
|
| 75 |
+
"helm_lite/OpenbookQA": 0.79,
|
| 76 |
+
"helm_lite/MMLU": 0.51,
|
| 77 |
+
"helm_lite/MATH": 0.289,
|
| 78 |
+
"helm_lite/GSM8K": 0.538,
|
| 79 |
+
"helm_lite/LegalBench": 0.331,
|
| 80 |
+
"helm_lite/MedQA": 0.517,
|
| 81 |
+
"helm_lite/WMT 2014": 0.142,
|
| 82 |
"helm_mmlu/MMLU All Subjects": 0.599,
|
| 83 |
"helm_mmlu/Abstract Algebra": 0.27,
|
| 84 |
"helm_mmlu/Anatomy": 0.585,
|
|
|
|
| 115 |
"helm_mmlu/Virology": 0.47,
|
| 116 |
"helm_mmlu/World Religions": 0.825,
|
| 117 |
"helm_mmlu/Mean win rate": 0.509,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
"hfopenllm_v2/IFEval": 0.5465,
|
| 119 |
"hfopenllm_v2/BBH": 0.4722,
|
| 120 |
"hfopenllm_v2/MATH Level 5": 0.0385,
|
|
|
|
| 718 |
"developer": "mistralai",
|
| 719 |
"evaluator_relationship": null,
|
| 720 |
"benchmark_scores": {
|
| 721 |
+
"hfopenllm_v2/IFEval": 0.2415,
|
| 722 |
+
"hfopenllm_v2/BBH": 0.5087,
|
| 723 |
+
"hfopenllm_v2/MATH Level 5": 0.102,
|
| 724 |
+
"hfopenllm_v2/GPQA": 0.3138,
|
| 725 |
+
"hfopenllm_v2/MUSR": 0.4321,
|
| 726 |
+
"hfopenllm_v2/MMLU-PRO": 0.385
|
| 727 |
}
|
| 728 |
},
|
| 729 |
{
|
data/developers/mlabonne.json
CHANGED
|
@@ -161,12 +161,12 @@
|
|
| 161 |
"developer": "mlabonne",
|
| 162 |
"evaluator_relationship": null,
|
| 163 |
"benchmark_scores": {
|
| 164 |
-
"hfopenllm_v2/IFEval": 0.
|
| 165 |
-
"hfopenllm_v2/BBH": 0.
|
| 166 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 167 |
-
"hfopenllm_v2/GPQA": 0.
|
| 168 |
-
"hfopenllm_v2/MUSR": 0.
|
| 169 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 170 |
}
|
| 171 |
},
|
| 172 |
{
|
|
|
|
| 161 |
"developer": "mlabonne",
|
| 162 |
"evaluator_relationship": null,
|
| 163 |
"benchmark_scores": {
|
| 164 |
+
"hfopenllm_v2/IFEval": 0.7561,
|
| 165 |
+
"hfopenllm_v2/BBH": 0.5111,
|
| 166 |
+
"hfopenllm_v2/MATH Level 5": 0.0906,
|
| 167 |
+
"hfopenllm_v2/GPQA": 0.3062,
|
| 168 |
+
"hfopenllm_v2/MUSR": 0.4019,
|
| 169 |
+
"hfopenllm_v2/MMLU-PRO": 0.3841
|
| 170 |
}
|
| 171 |
},
|
| 172 |
{
|
data/developers/moonshot_ai.json
CHANGED
|
@@ -7,7 +7,7 @@
|
|
| 7 |
"developer": "Moonshot AI",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 11 |
}
|
| 12 |
},
|
| 13 |
{
|
|
|
|
| 7 |
"developer": "Moonshot AI",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
+
"terminal-bench-2.0/terminal-bench-2.0": 27.8
|
| 11 |
}
|
| 12 |
},
|
| 13 |
{
|
data/developers/multiple.json
CHANGED
|
@@ -7,7 +7,7 @@
|
|
| 7 |
"developer": "Multiple",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 11 |
}
|
| 12 |
}
|
| 13 |
]
|
|
|
|
| 7 |
"developer": "Multiple",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
+
"terminal-bench-2.0/terminal-bench-2.0": 72.4
|
| 11 |
}
|
| 12 |
}
|
| 13 |
]
|
data/developers/nazimali.json
CHANGED
|
@@ -21,12 +21,12 @@
|
|
| 21 |
"developer": "nazimali",
|
| 22 |
"evaluator_relationship": null,
|
| 23 |
"benchmark_scores": {
|
| 24 |
-
"hfopenllm_v2/IFEval": 0.
|
| 25 |
-
"hfopenllm_v2/BBH": 0.
|
| 26 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 27 |
-
"hfopenllm_v2/GPQA": 0.
|
| 28 |
-
"hfopenllm_v2/MUSR": 0.
|
| 29 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 30 |
}
|
| 31 |
}
|
| 32 |
]
|
|
|
|
| 21 |
"developer": "nazimali",
|
| 22 |
"evaluator_relationship": null,
|
| 23 |
"benchmark_scores": {
|
| 24 |
+
"hfopenllm_v2/IFEval": 0.486,
|
| 25 |
+
"hfopenllm_v2/BBH": 0.4721,
|
| 26 |
+
"hfopenllm_v2/MATH Level 5": 0.0846,
|
| 27 |
+
"hfopenllm_v2/GPQA": 0.2844,
|
| 28 |
+
"hfopenllm_v2/MUSR": 0.4006,
|
| 29 |
+
"hfopenllm_v2/MMLU-PRO": 0.3087
|
| 30 |
}
|
| 31 |
}
|
| 32 |
]
|
data/developers/nicolinho.json
CHANGED
|
@@ -7,16 +7,16 @@
|
|
| 7 |
"developer": "nicolinho",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
-
"reward-bench/Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
"reward-bench/Factuality": 0.7853,
|
| 12 |
"reward-bench/Precise IF": 0.3719,
|
| 13 |
"reward-bench/Math": 0.6995,
|
| 14 |
-
"reward-bench/Safety": 0.927,
|
| 15 |
"reward-bench/Focus": 0.9535,
|
| 16 |
-
"reward-bench/Ties": 0.8321
|
| 17 |
-
"reward-bench/Chat": 0.9665,
|
| 18 |
-
"reward-bench/Chat Hard": 0.9013,
|
| 19 |
-
"reward-bench/Reasoning": 0.9826
|
| 20 |
}
|
| 21 |
},
|
| 22 |
{
|
|
@@ -51,16 +51,16 @@
|
|
| 51 |
"developer": "nicolinho",
|
| 52 |
"evaluator_relationship": null,
|
| 53 |
"benchmark_scores": {
|
| 54 |
-
"reward-bench/Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
"reward-bench/Factuality": 0.6653,
|
| 56 |
"reward-bench/Precise IF": 0.4062,
|
| 57 |
"reward-bench/Math": 0.612,
|
| 58 |
-
"reward-bench/Safety": 0.9257,
|
| 59 |
"reward-bench/Focus": 0.8909,
|
| 60 |
-
"reward-bench/Ties": 0.7234
|
| 61 |
-
"reward-bench/Chat": 0.9637,
|
| 62 |
-
"reward-bench/Chat Hard": 0.8684,
|
| 63 |
-
"reward-bench/Reasoning": 0.9677
|
| 64 |
}
|
| 65 |
}
|
| 66 |
]
|
|
|
|
| 7 |
"developer": "nicolinho",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
+
"reward-bench/Score": 0.7667,
|
| 11 |
+
"reward-bench/Chat": 0.9665,
|
| 12 |
+
"reward-bench/Chat Hard": 0.9013,
|
| 13 |
+
"reward-bench/Safety": 0.9578,
|
| 14 |
+
"reward-bench/Reasoning": 0.9826,
|
| 15 |
"reward-bench/Factuality": 0.7853,
|
| 16 |
"reward-bench/Precise IF": 0.3719,
|
| 17 |
"reward-bench/Math": 0.6995,
|
|
|
|
| 18 |
"reward-bench/Focus": 0.9535,
|
| 19 |
+
"reward-bench/Ties": 0.8321
|
|
|
|
|
|
|
|
|
|
| 20 |
}
|
| 21 |
},
|
| 22 |
{
|
|
|
|
| 51 |
"developer": "nicolinho",
|
| 52 |
"evaluator_relationship": null,
|
| 53 |
"benchmark_scores": {
|
| 54 |
+
"reward-bench/Score": 0.7074,
|
| 55 |
+
"reward-bench/Chat": 0.9637,
|
| 56 |
+
"reward-bench/Chat Hard": 0.8684,
|
| 57 |
+
"reward-bench/Safety": 0.9467,
|
| 58 |
+
"reward-bench/Reasoning": 0.9677,
|
| 59 |
"reward-bench/Factuality": 0.6653,
|
| 60 |
"reward-bench/Precise IF": 0.4062,
|
| 61 |
"reward-bench/Math": 0.612,
|
|
|
|
| 62 |
"reward-bench/Focus": 0.8909,
|
| 63 |
+
"reward-bench/Ties": 0.7234
|
|
|
|
|
|
|
|
|
|
| 64 |
}
|
| 65 |
}
|
| 66 |
]
|
data/developers/nisten.json
CHANGED
|
@@ -7,12 +7,12 @@
|
|
| 7 |
"developer": "nisten",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
-
"hfopenllm_v2/IFEval": 0.
|
| 11 |
-
"hfopenllm_v2/BBH": 0.
|
| 12 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 13 |
-
"hfopenllm_v2/GPQA": 0.
|
| 14 |
-
"hfopenllm_v2/MUSR": 0.
|
| 15 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 16 |
}
|
| 17 |
},
|
| 18 |
{
|
|
|
|
| 7 |
"developer": "nisten",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
+
"hfopenllm_v2/IFEval": 0.3799,
|
| 11 |
+
"hfopenllm_v2/BBH": 0.6647,
|
| 12 |
+
"hfopenllm_v2/MATH Level 5": 0.3406,
|
| 13 |
+
"hfopenllm_v2/GPQA": 0.4035,
|
| 14 |
+
"hfopenllm_v2/MUSR": 0.494,
|
| 15 |
+
"hfopenllm_v2/MMLU-PRO": 0.5731
|
| 16 |
}
|
| 17 |
},
|
| 18 |
{
|
data/developers/nousresearch.json
CHANGED
|
@@ -200,20 +200,6 @@
|
|
| 200 |
"hfopenllm_v2/MMLU-PRO": 0.232
|
| 201 |
}
|
| 202 |
},
|
| 203 |
-
{
|
| 204 |
-
"id": "NousResearch/Yarn-Llama-2-7b-128k",
|
| 205 |
-
"name": "Yarn-Llama-2-7b-128k",
|
| 206 |
-
"developer": "NousResearch",
|
| 207 |
-
"evaluator_relationship": null,
|
| 208 |
-
"benchmark_scores": {
|
| 209 |
-
"hfopenllm_v2/IFEval": 0.1485,
|
| 210 |
-
"hfopenllm_v2/BBH": 0.3248,
|
| 211 |
-
"hfopenllm_v2/MATH Level 5": 0.0151,
|
| 212 |
-
"hfopenllm_v2/GPQA": 0.2601,
|
| 213 |
-
"hfopenllm_v2/MUSR": 0.3967,
|
| 214 |
-
"hfopenllm_v2/MMLU-PRO": 0.1791
|
| 215 |
-
}
|
| 216 |
-
},
|
| 217 |
{
|
| 218 |
"id": "NousResearch/Yarn-Llama-2-7b-64k",
|
| 219 |
"name": "Yarn-Llama-2-7b-64k",
|
|
|
|
| 200 |
"hfopenllm_v2/MMLU-PRO": 0.232
|
| 201 |
}
|
| 202 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
{
|
| 204 |
"id": "NousResearch/Yarn-Llama-2-7b-64k",
|
| 205 |
"name": "Yarn-Llama-2-7b-64k",
|
data/developers/omkar1102.json
CHANGED
|
@@ -7,12 +7,12 @@
|
|
| 7 |
"developer": "Omkar1102",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
-
"hfopenllm_v2/IFEval": 0.
|
| 11 |
-
"hfopenllm_v2/BBH": 0.
|
| 12 |
"hfopenllm_v2/MATH Level 5": 0.0,
|
| 13 |
-
"hfopenllm_v2/GPQA": 0.
|
| 14 |
-
"hfopenllm_v2/MUSR": 0.
|
| 15 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 16 |
}
|
| 17 |
}
|
| 18 |
]
|
|
|
|
| 7 |
"developer": "Omkar1102",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
+
"hfopenllm_v2/IFEval": 0.2148,
|
| 11 |
+
"hfopenllm_v2/BBH": 0.276,
|
| 12 |
"hfopenllm_v2/MATH Level 5": 0.0,
|
| 13 |
+
"hfopenllm_v2/GPQA": 0.2508,
|
| 14 |
+
"hfopenllm_v2/MUSR": 0.3802,
|
| 15 |
+
"hfopenllm_v2/MMLU-PRO": 0.1126
|
| 16 |
}
|
| 17 |
}
|
| 18 |
]
|
data/developers/openai.json
CHANGED
|
@@ -163,16 +163,16 @@
|
|
| 163 |
"developer": "OpenAI",
|
| 164 |
"evaluator_relationship": null,
|
| 165 |
"benchmark_scores": {
|
|
|
|
|
|
|
|
|
|
| 166 |
"apex-agents/Overall Pass@1": 0.23,
|
| 167 |
"apex-agents/Overall Pass@8": 0.4,
|
| 168 |
"apex-agents/Overall Mean Score": 0.387,
|
| 169 |
"apex-agents/Investment Banking Pass@1": 0.273,
|
| 170 |
"apex-agents/Management Consulting Pass@1": 0.227,
|
| 171 |
"apex-agents/Corporate Law Pass@1": 0.189,
|
| 172 |
-
"apex-agents/Corporate Lawyer Mean Score": 0.443
|
| 173 |
-
"ace/Overall Score": 0.515,
|
| 174 |
-
"ace/Food Score": 0.65,
|
| 175 |
-
"ace/Gaming Score": 0.578
|
| 176 |
}
|
| 177 |
},
|
| 178 |
{
|
|
@@ -300,13 +300,6 @@
|
|
| 300 |
"developer": "OpenAI",
|
| 301 |
"evaluator_relationship": null,
|
| 302 |
"benchmark_scores": {
|
| 303 |
-
"helm_instruct/Mean win rate": 0.689,
|
| 304 |
-
"helm_instruct/Anthropic RLHF dataset": 4.964,
|
| 305 |
-
"helm_instruct/Best ChatGPT Prompts": 4.986,
|
| 306 |
-
"helm_instruct/Koala test dataset": 4.987,
|
| 307 |
-
"helm_instruct/Open Assistant": 4.987,
|
| 308 |
-
"helm_instruct/Self Instruct": 4.99,
|
| 309 |
-
"helm_instruct/Vicuna": 4.992,
|
| 310 |
"helm_classic/Mean win rate": 0.783,
|
| 311 |
"helm_classic/MMLU": 0.391,
|
| 312 |
"helm_classic/BoolQ": 0.87,
|
|
@@ -322,6 +315,13 @@
|
|
| 322 |
"helm_classic/IMDB": 0.943,
|
| 323 |
"helm_classic/CivilComments": 0.696,
|
| 324 |
"helm_classic/RAFT": 0.748,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
"helm_lite/Mean win rate": 0.358,
|
| 326 |
"helm_lite/NarrativeQA": 0.655,
|
| 327 |
"helm_lite/NaturalQuestions (closed-book)": 0.335,
|
|
@@ -405,6 +405,16 @@
|
|
| 405 |
"developer": "OpenAI",
|
| 406 |
"evaluator_relationship": null,
|
| 407 |
"benchmark_scores": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
"helm_mmlu/MMLU All Subjects": 0.824,
|
| 409 |
"helm_mmlu/Abstract Algebra": 0.63,
|
| 410 |
"helm_mmlu/Anatomy": 0.8,
|
|
@@ -440,17 +450,7 @@
|
|
| 440 |
"helm_mmlu/Sociology": 0.93,
|
| 441 |
"helm_mmlu/Virology": 0.596,
|
| 442 |
"helm_mmlu/World Religions": 0.877,
|
| 443 |
-
"helm_mmlu/Mean win rate": 0.517
|
| 444 |
-
"helm_lite/Mean win rate": 0.867,
|
| 445 |
-
"helm_lite/NarrativeQA": 0.768,
|
| 446 |
-
"helm_lite/NaturalQuestions (closed-book)": 0.457,
|
| 447 |
-
"helm_lite/OpenbookQA": 0.96,
|
| 448 |
-
"helm_lite/MMLU": 0.735,
|
| 449 |
-
"helm_lite/MATH": 0.802,
|
| 450 |
-
"helm_lite/GSM8K": 0.932,
|
| 451 |
-
"helm_lite/LegalBench": 0.713,
|
| 452 |
-
"helm_lite/MedQA": 0.815,
|
| 453 |
-
"helm_lite/WMT 2014": 0.211
|
| 454 |
}
|
| 455 |
},
|
| 456 |
{
|
|
@@ -513,6 +513,16 @@
|
|
| 513 |
"developer": "OpenAI",
|
| 514 |
"evaluator_relationship": null,
|
| 515 |
"benchmark_scores": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
"helm_mmlu/MMLU All Subjects": 0.813,
|
| 517 |
"helm_mmlu/Abstract Algebra": 0.56,
|
| 518 |
"helm_mmlu/Anatomy": 0.822,
|
|
@@ -549,16 +559,6 @@
|
|
| 549 |
"helm_mmlu/Virology": 0.602,
|
| 550 |
"helm_mmlu/World Religions": 0.848,
|
| 551 |
"helm_mmlu/Mean win rate": 0.351,
|
| 552 |
-
"helm_lite/Mean win rate": 0.864,
|
| 553 |
-
"helm_lite/NarrativeQA": 0.761,
|
| 554 |
-
"helm_lite/NaturalQuestions (closed-book)": 0.482,
|
| 555 |
-
"helm_lite/OpenbookQA": 0.97,
|
| 556 |
-
"helm_lite/MMLU": 0.711,
|
| 557 |
-
"helm_lite/MATH": 0.833,
|
| 558 |
-
"helm_lite/GSM8K": 0.824,
|
| 559 |
-
"helm_lite/LegalBench": 0.727,
|
| 560 |
-
"helm_lite/MedQA": 0.783,
|
| 561 |
-
"helm_lite/WMT 2014": 0.218,
|
| 562 |
"reward-bench/Score": 0.8395,
|
| 563 |
"reward-bench/Chat": 0.9525,
|
| 564 |
"reward-bench/Chat Hard": 0.7544,
|
|
@@ -772,16 +772,16 @@
|
|
| 772 |
"helm_mmlu/Virology": 0.578,
|
| 773 |
"helm_mmlu/World Religions": 0.883,
|
| 774 |
"helm_mmlu/Mean win rate": 0.52,
|
| 775 |
-
"reward-bench/Score": 0.
|
| 776 |
-
"reward-bench/Chat": 0.9609,
|
| 777 |
-
"reward-bench/Chat Hard": 0.761,
|
| 778 |
-
"reward-bench/Safety": 0.8619,
|
| 779 |
-
"reward-bench/Reasoning": 0.8661,
|
| 780 |
"reward-bench/Factuality": 0.5684,
|
| 781 |
"reward-bench/Precise IF": 0.3312,
|
| 782 |
"reward-bench/Math": 0.623,
|
|
|
|
| 783 |
"reward-bench/Focus": 0.7293,
|
| 784 |
-
"reward-bench/Ties": 0.7819
|
|
|
|
|
|
|
|
|
|
| 785 |
}
|
| 786 |
},
|
| 787 |
{
|
|
@@ -877,7 +877,7 @@
|
|
| 877 |
"developer": "OpenAI",
|
| 878 |
"evaluator_relationship": null,
|
| 879 |
"benchmark_scores": {
|
| 880 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 881 |
}
|
| 882 |
},
|
| 883 |
{
|
|
@@ -911,9 +911,9 @@
|
|
| 911 |
"helm_capabilities/IFEval": 0.875,
|
| 912 |
"helm_capabilities/WildBench": 0.857,
|
| 913 |
"helm_capabilities/Omni-MATH": 0.647,
|
| 914 |
-
"livecodebenchpro/Hard Problems": 0.
|
| 915 |
-
"livecodebenchpro/Medium Problems": 0.
|
| 916 |
-
"livecodebenchpro/Easy Problems": 0.
|
| 917 |
}
|
| 918 |
},
|
| 919 |
{
|
|
@@ -922,7 +922,7 @@
|
|
| 922 |
"developer": "OpenAI",
|
| 923 |
"evaluator_relationship": null,
|
| 924 |
"benchmark_scores": {
|
| 925 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 926 |
}
|
| 927 |
},
|
| 928 |
{
|
|
@@ -931,7 +931,7 @@
|
|
| 931 |
"developer": "OpenAI",
|
| 932 |
"evaluator_relationship": null,
|
| 933 |
"benchmark_scores": {
|
| 934 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 935 |
}
|
| 936 |
},
|
| 937 |
{
|
|
@@ -954,7 +954,7 @@
|
|
| 954 |
"developer": "OpenAI",
|
| 955 |
"evaluator_relationship": null,
|
| 956 |
"benchmark_scores": {
|
| 957 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 958 |
}
|
| 959 |
},
|
| 960 |
{
|
|
@@ -986,7 +986,7 @@
|
|
| 986 |
"developer": "OpenAI",
|
| 987 |
"evaluator_relationship": null,
|
| 988 |
"benchmark_scores": {
|
| 989 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 990 |
}
|
| 991 |
},
|
| 992 |
{
|
|
@@ -1013,7 +1013,7 @@
|
|
| 1013 |
"developer": "OpenAI",
|
| 1014 |
"evaluator_relationship": null,
|
| 1015 |
"benchmark_scores": {
|
| 1016 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 1017 |
}
|
| 1018 |
},
|
| 1019 |
{
|
|
@@ -1023,14 +1023,14 @@
|
|
| 1023 |
"evaluator_relationship": null,
|
| 1024 |
"benchmark_scores": {
|
| 1025 |
"appworld_test_normal/appworld/test_normal": 0.0,
|
| 1026 |
-
"browsecompplus/browsecompplus": 0.
|
| 1027 |
"livecodebenchpro/Hard Problems": 0.1594,
|
| 1028 |
"livecodebenchpro/Medium Problems": 0.5211,
|
| 1029 |
"livecodebenchpro/Easy Problems": 0.9014,
|
| 1030 |
"swe-bench/swe-bench": 0.5455,
|
| 1031 |
"tau-bench-2_airline/tau-bench-2/airline": 0.6,
|
| 1032 |
-
"tau-bench-2_retail/tau-bench-2/retail": 0.
|
| 1033 |
-
"tau-bench-2_telecom/tau-bench-2/telecom": 0.
|
| 1034 |
}
|
| 1035 |
},
|
| 1036 |
{
|
|
@@ -1048,7 +1048,7 @@
|
|
| 1048 |
"developer": "OpenAI",
|
| 1049 |
"evaluator_relationship": null,
|
| 1050 |
"benchmark_scores": {
|
| 1051 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 1052 |
}
|
| 1053 |
},
|
| 1054 |
{
|
|
@@ -1112,7 +1112,7 @@
|
|
| 1112 |
"livecodebenchpro/Hard Problems": 0.0,
|
| 1113 |
"livecodebenchpro/Medium Problems": 0.11267605633802817,
|
| 1114 |
"livecodebenchpro/Easy Problems": 0.6619718309859155,
|
| 1115 |
-
"terminal-bench-2.0/terminal-bench-2.0":
|
| 1116 |
}
|
| 1117 |
},
|
| 1118 |
{
|
|
@@ -1130,7 +1130,7 @@
|
|
| 1130 |
"livecodebenchpro/Hard Problems": 0.0,
|
| 1131 |
"livecodebenchpro/Medium Problems": 0.056338028169014086,
|
| 1132 |
"livecodebenchpro/Easy Problems": 0.5070422535211268,
|
| 1133 |
-
"terminal-bench-2.0/terminal-bench-2.0": 3.
|
| 1134 |
}
|
| 1135 |
},
|
| 1136 |
{
|
|
|
|
| 163 |
"developer": "OpenAI",
|
| 164 |
"evaluator_relationship": null,
|
| 165 |
"benchmark_scores": {
|
| 166 |
+
"ace/Overall Score": 0.515,
|
| 167 |
+
"ace/Food Score": 0.65,
|
| 168 |
+
"ace/Gaming Score": 0.578,
|
| 169 |
"apex-agents/Overall Pass@1": 0.23,
|
| 170 |
"apex-agents/Overall Pass@8": 0.4,
|
| 171 |
"apex-agents/Overall Mean Score": 0.387,
|
| 172 |
"apex-agents/Investment Banking Pass@1": 0.273,
|
| 173 |
"apex-agents/Management Consulting Pass@1": 0.227,
|
| 174 |
"apex-agents/Corporate Law Pass@1": 0.189,
|
| 175 |
+
"apex-agents/Corporate Lawyer Mean Score": 0.443
|
|
|
|
|
|
|
|
|
|
| 176 |
}
|
| 177 |
},
|
| 178 |
{
|
|
|
|
| 300 |
"developer": "OpenAI",
|
| 301 |
"evaluator_relationship": null,
|
| 302 |
"benchmark_scores": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
"helm_classic/Mean win rate": 0.783,
|
| 304 |
"helm_classic/MMLU": 0.391,
|
| 305 |
"helm_classic/BoolQ": 0.87,
|
|
|
|
| 315 |
"helm_classic/IMDB": 0.943,
|
| 316 |
"helm_classic/CivilComments": 0.696,
|
| 317 |
"helm_classic/RAFT": 0.748,
|
| 318 |
+
"helm_instruct/Mean win rate": 0.689,
|
| 319 |
+
"helm_instruct/Anthropic RLHF dataset": 4.964,
|
| 320 |
+
"helm_instruct/Best ChatGPT Prompts": 4.986,
|
| 321 |
+
"helm_instruct/Koala test dataset": 4.987,
|
| 322 |
+
"helm_instruct/Open Assistant": 4.987,
|
| 323 |
+
"helm_instruct/Self Instruct": 4.99,
|
| 324 |
+
"helm_instruct/Vicuna": 4.992,
|
| 325 |
"helm_lite/Mean win rate": 0.358,
|
| 326 |
"helm_lite/NarrativeQA": 0.655,
|
| 327 |
"helm_lite/NaturalQuestions (closed-book)": 0.335,
|
|
|
|
| 405 |
"developer": "OpenAI",
|
| 406 |
"evaluator_relationship": null,
|
| 407 |
"benchmark_scores": {
|
| 408 |
+
"helm_lite/Mean win rate": 0.867,
|
| 409 |
+
"helm_lite/NarrativeQA": 0.768,
|
| 410 |
+
"helm_lite/NaturalQuestions (closed-book)": 0.457,
|
| 411 |
+
"helm_lite/OpenbookQA": 0.96,
|
| 412 |
+
"helm_lite/MMLU": 0.735,
|
| 413 |
+
"helm_lite/MATH": 0.802,
|
| 414 |
+
"helm_lite/GSM8K": 0.932,
|
| 415 |
+
"helm_lite/LegalBench": 0.713,
|
| 416 |
+
"helm_lite/MedQA": 0.815,
|
| 417 |
+
"helm_lite/WMT 2014": 0.211,
|
| 418 |
"helm_mmlu/MMLU All Subjects": 0.824,
|
| 419 |
"helm_mmlu/Abstract Algebra": 0.63,
|
| 420 |
"helm_mmlu/Anatomy": 0.8,
|
|
|
|
| 450 |
"helm_mmlu/Sociology": 0.93,
|
| 451 |
"helm_mmlu/Virology": 0.596,
|
| 452 |
"helm_mmlu/World Religions": 0.877,
|
| 453 |
+
"helm_mmlu/Mean win rate": 0.517
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
}
|
| 455 |
},
|
| 456 |
{
|
|
|
|
| 513 |
"developer": "OpenAI",
|
| 514 |
"evaluator_relationship": null,
|
| 515 |
"benchmark_scores": {
|
| 516 |
+
"helm_lite/Mean win rate": 0.864,
|
| 517 |
+
"helm_lite/NarrativeQA": 0.761,
|
| 518 |
+
"helm_lite/NaturalQuestions (closed-book)": 0.482,
|
| 519 |
+
"helm_lite/OpenbookQA": 0.97,
|
| 520 |
+
"helm_lite/MMLU": 0.711,
|
| 521 |
+
"helm_lite/MATH": 0.833,
|
| 522 |
+
"helm_lite/GSM8K": 0.824,
|
| 523 |
+
"helm_lite/LegalBench": 0.727,
|
| 524 |
+
"helm_lite/MedQA": 0.783,
|
| 525 |
+
"helm_lite/WMT 2014": 0.218,
|
| 526 |
"helm_mmlu/MMLU All Subjects": 0.813,
|
| 527 |
"helm_mmlu/Abstract Algebra": 0.56,
|
| 528 |
"helm_mmlu/Anatomy": 0.822,
|
|
|
|
| 559 |
"helm_mmlu/Virology": 0.602,
|
| 560 |
"helm_mmlu/World Religions": 0.848,
|
| 561 |
"helm_mmlu/Mean win rate": 0.351,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 562 |
"reward-bench/Score": 0.8395,
|
| 563 |
"reward-bench/Chat": 0.9525,
|
| 564 |
"reward-bench/Chat Hard": 0.7544,
|
|
|
|
| 772 |
"helm_mmlu/Virology": 0.578,
|
| 773 |
"helm_mmlu/World Religions": 0.883,
|
| 774 |
"helm_mmlu/Mean win rate": 0.52,
|
| 775 |
+
"reward-bench/Score": 0.8673,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 776 |
"reward-bench/Factuality": 0.5684,
|
| 777 |
"reward-bench/Precise IF": 0.3312,
|
| 778 |
"reward-bench/Math": 0.623,
|
| 779 |
+
"reward-bench/Safety": 0.8811,
|
| 780 |
"reward-bench/Focus": 0.7293,
|
| 781 |
+
"reward-bench/Ties": 0.7819,
|
| 782 |
+
"reward-bench/Chat": 0.9609,
|
| 783 |
+
"reward-bench/Chat Hard": 0.761,
|
| 784 |
+
"reward-bench/Reasoning": 0.8661
|
| 785 |
}
|
| 786 |
},
|
| 787 |
{
|
|
|
|
| 877 |
"developer": "OpenAI",
|
| 878 |
"evaluator_relationship": null,
|
| 879 |
"benchmark_scores": {
|
| 880 |
+
"terminal-bench-2.0/terminal-bench-2.0": 49.6
|
| 881 |
}
|
| 882 |
},
|
| 883 |
{
|
|
|
|
| 911 |
"helm_capabilities/IFEval": 0.875,
|
| 912 |
"helm_capabilities/WildBench": 0.857,
|
| 913 |
"helm_capabilities/Omni-MATH": 0.647,
|
| 914 |
+
"livecodebenchpro/Hard Problems": 0.04225352112676056,
|
| 915 |
+
"livecodebenchpro/Medium Problems": 0.4084507042253521,
|
| 916 |
+
"livecodebenchpro/Easy Problems": 0.8873239436619719
|
| 917 |
}
|
| 918 |
},
|
| 919 |
{
|
|
|
|
| 922 |
"developer": "OpenAI",
|
| 923 |
"evaluator_relationship": null,
|
| 924 |
"benchmark_scores": {
|
| 925 |
+
"terminal-bench-2.0/terminal-bench-2.0": 43.4
|
| 926 |
}
|
| 927 |
},
|
| 928 |
{
|
|
|
|
| 931 |
"developer": "OpenAI",
|
| 932 |
"evaluator_relationship": null,
|
| 933 |
"benchmark_scores": {
|
| 934 |
+
"terminal-bench-2.0/terminal-bench-2.0": 24.0
|
| 935 |
}
|
| 936 |
},
|
| 937 |
{
|
|
|
|
| 954 |
"developer": "OpenAI",
|
| 955 |
"evaluator_relationship": null,
|
| 956 |
"benchmark_scores": {
|
| 957 |
+
"terminal-bench-2.0/terminal-bench-2.0": 11.5
|
| 958 |
}
|
| 959 |
},
|
| 960 |
{
|
|
|
|
| 986 |
"developer": "OpenAI",
|
| 987 |
"evaluator_relationship": null,
|
| 988 |
"benchmark_scores": {
|
| 989 |
+
"terminal-bench-2.0/terminal-bench-2.0": 57.8
|
| 990 |
}
|
| 991 |
},
|
| 992 |
{
|
|
|
|
| 1013 |
"developer": "OpenAI",
|
| 1014 |
"evaluator_relationship": null,
|
| 1015 |
"benchmark_scores": {
|
| 1016 |
+
"terminal-bench-2.0/terminal-bench-2.0": 62.9
|
| 1017 |
}
|
| 1018 |
},
|
| 1019 |
{
|
|
|
|
| 1023 |
"evaluator_relationship": null,
|
| 1024 |
"benchmark_scores": {
|
| 1025 |
"appworld_test_normal/appworld/test_normal": 0.0,
|
| 1026 |
+
"browsecompplus/browsecompplus": 0.43,
|
| 1027 |
"livecodebenchpro/Hard Problems": 0.1594,
|
| 1028 |
"livecodebenchpro/Medium Problems": 0.5211,
|
| 1029 |
"livecodebenchpro/Easy Problems": 0.9014,
|
| 1030 |
"swe-bench/swe-bench": 0.5455,
|
| 1031 |
"tau-bench-2_airline/tau-bench-2/airline": 0.6,
|
| 1032 |
+
"tau-bench-2_retail/tau-bench-2/retail": 0.73,
|
| 1033 |
+
"tau-bench-2_telecom/tau-bench-2/telecom": 0.71
|
| 1034 |
}
|
| 1035 |
},
|
| 1036 |
{
|
|
|
|
| 1048 |
"developer": "OpenAI",
|
| 1049 |
"evaluator_relationship": null,
|
| 1050 |
"benchmark_scores": {
|
| 1051 |
+
"terminal-bench-2.0/terminal-bench-2.0": 77.3
|
| 1052 |
}
|
| 1053 |
},
|
| 1054 |
{
|
|
|
|
| 1112 |
"livecodebenchpro/Hard Problems": 0.0,
|
| 1113 |
"livecodebenchpro/Medium Problems": 0.11267605633802817,
|
| 1114 |
"livecodebenchpro/Easy Problems": 0.6619718309859155,
|
| 1115 |
+
"terminal-bench-2.0/terminal-bench-2.0": 18.7
|
| 1116 |
}
|
| 1117 |
},
|
| 1118 |
{
|
|
|
|
| 1130 |
"livecodebenchpro/Hard Problems": 0.0,
|
| 1131 |
"livecodebenchpro/Medium Problems": 0.056338028169014086,
|
| 1132 |
"livecodebenchpro/Easy Problems": 0.5070422535211268,
|
| 1133 |
+
"terminal-bench-2.0/terminal-bench-2.0": 3.4
|
| 1134 |
}
|
| 1135 |
},
|
| 1136 |
{
|
data/developers/openassistant.json
CHANGED
|
@@ -7,17 +7,17 @@
|
|
| 7 |
"developer": "OpenAssistant",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
-
"reward-bench/Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
"reward-bench/Factuality": 0.3979,
|
| 12 |
"reward-bench/Precise IF": 0.2875,
|
| 13 |
"reward-bench/Math": 0.377,
|
| 14 |
-
"reward-bench/Safety": 0.5446,
|
| 15 |
"reward-bench/Focus": 0.1535,
|
| 16 |
-
"reward-bench/Ties": 0.047
|
| 17 |
-
"reward-bench/Chat": 0.9246,
|
| 18 |
-
"reward-bench/Chat Hard": 0.3728,
|
| 19 |
-
"reward-bench/Reasoning": 0.5855,
|
| 20 |
-
"reward-bench/Prior Sets (0.5 weight)": 0.6801
|
| 21 |
}
|
| 22 |
},
|
| 23 |
{
|
|
@@ -26,17 +26,17 @@
|
|
| 26 |
"developer": "OpenAssistant",
|
| 27 |
"evaluator_relationship": null,
|
| 28 |
"benchmark_scores": {
|
| 29 |
-
"reward-bench/Score": 0.
|
| 30 |
-
"reward-bench/Chat": 0.8855,
|
| 31 |
-
"reward-bench/Chat Hard": 0.4868,
|
| 32 |
-
"reward-bench/Safety": 0.3244,
|
| 33 |
-
"reward-bench/Reasoning": 0.7752,
|
| 34 |
-
"reward-bench/Prior Sets (0.5 weight)": 0.6533,
|
| 35 |
"reward-bench/Factuality": 0.3179,
|
| 36 |
"reward-bench/Precise IF": 0.2625,
|
| 37 |
"reward-bench/Math": 0.3934,
|
|
|
|
| 38 |
"reward-bench/Focus": 0.2707,
|
| 39 |
-
"reward-bench/Ties": 0.0198
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
}
|
| 41 |
},
|
| 42 |
{
|
|
|
|
| 7 |
"developer": "OpenAssistant",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
+
"reward-bench/Score": 0.2653,
|
| 11 |
+
"reward-bench/Chat": 0.9246,
|
| 12 |
+
"reward-bench/Chat Hard": 0.3728,
|
| 13 |
+
"reward-bench/Safety": 0.3289,
|
| 14 |
+
"reward-bench/Reasoning": 0.5855,
|
| 15 |
+
"reward-bench/Prior Sets (0.5 weight)": 0.6801,
|
| 16 |
"reward-bench/Factuality": 0.3979,
|
| 17 |
"reward-bench/Precise IF": 0.2875,
|
| 18 |
"reward-bench/Math": 0.377,
|
|
|
|
| 19 |
"reward-bench/Focus": 0.1535,
|
| 20 |
+
"reward-bench/Ties": 0.047
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
}
|
| 22 |
},
|
| 23 |
{
|
|
|
|
| 26 |
"developer": "OpenAssistant",
|
| 27 |
"evaluator_relationship": null,
|
| 28 |
"benchmark_scores": {
|
| 29 |
+
"reward-bench/Score": 0.6901,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
"reward-bench/Factuality": 0.3179,
|
| 31 |
"reward-bench/Precise IF": 0.2625,
|
| 32 |
"reward-bench/Math": 0.3934,
|
| 33 |
+
"reward-bench/Safety": 0.6311,
|
| 34 |
"reward-bench/Focus": 0.2707,
|
| 35 |
+
"reward-bench/Ties": 0.0198,
|
| 36 |
+
"reward-bench/Chat": 0.8855,
|
| 37 |
+
"reward-bench/Chat Hard": 0.4868,
|
| 38 |
+
"reward-bench/Reasoning": 0.7752,
|
| 39 |
+
"reward-bench/Prior Sets (0.5 weight)": 0.6533
|
| 40 |
}
|
| 41 |
},
|
| 42 |
{
|
data/developers/openbmb.json
CHANGED
|
@@ -21,17 +21,17 @@
|
|
| 21 |
"developer": "openbmb",
|
| 22 |
"evaluator_relationship": null,
|
| 23 |
"benchmark_scores": {
|
| 24 |
-
"reward-bench/Score": 0.
|
| 25 |
-
"reward-bench/Chat": 0.9804,
|
| 26 |
-
"reward-bench/Chat Hard": 0.6557,
|
| 27 |
-
"reward-bench/Safety": 0.6267,
|
| 28 |
-
"reward-bench/Reasoning": 0.8633,
|
| 29 |
-
"reward-bench/Prior Sets (0.5 weight)": 0.7172,
|
| 30 |
"reward-bench/Factuality": 0.6,
|
| 31 |
"reward-bench/Precise IF": 0.3438,
|
| 32 |
"reward-bench/Math": 0.5683,
|
|
|
|
| 33 |
"reward-bench/Focus": 0.7475,
|
| 34 |
-
"reward-bench/Ties": 0.5972
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
}
|
| 36 |
},
|
| 37 |
{
|
|
|
|
| 21 |
"developer": "openbmb",
|
| 22 |
"evaluator_relationship": null,
|
| 23 |
"benchmark_scores": {
|
| 24 |
+
"reward-bench/Score": 0.8159,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
"reward-bench/Factuality": 0.6,
|
| 26 |
"reward-bench/Precise IF": 0.3438,
|
| 27 |
"reward-bench/Math": 0.5683,
|
| 28 |
+
"reward-bench/Safety": 0.8135,
|
| 29 |
"reward-bench/Focus": 0.7475,
|
| 30 |
+
"reward-bench/Ties": 0.5972,
|
| 31 |
+
"reward-bench/Chat": 0.9804,
|
| 32 |
+
"reward-bench/Chat Hard": 0.6557,
|
| 33 |
+
"reward-bench/Reasoning": 0.8633,
|
| 34 |
+
"reward-bench/Prior Sets (0.5 weight)": 0.7172
|
| 35 |
}
|
| 36 |
},
|
| 37 |
{
|
data/developers/pku-alignment.json
CHANGED
|
@@ -7,17 +7,17 @@
|
|
| 7 |
"developer": "PKU-Alignment",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
-
"reward-bench/Score": 0.
|
| 11 |
-
"reward-bench/Chat": 0.6173,
|
| 12 |
-
"reward-bench/Chat Hard": 0.4232,
|
| 13 |
-
"reward-bench/Safety": 0.7589,
|
| 14 |
-
"reward-bench/Reasoning": 0.5482,
|
| 15 |
-
"reward-bench/Prior Sets (0.5 weight)": 0.57,
|
| 16 |
"reward-bench/Factuality": 0.3263,
|
| 17 |
"reward-bench/Precise IF": 0.2313,
|
| 18 |
"reward-bench/Math": 0.3989,
|
|
|
|
| 19 |
"reward-bench/Focus": 0.2939,
|
| 20 |
-
"reward-bench/Ties": -0.01
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
}
|
| 22 |
},
|
| 23 |
{
|
|
@@ -26,17 +26,17 @@
|
|
| 26 |
"developer": "PKU-Alignment",
|
| 27 |
"evaluator_relationship": null,
|
| 28 |
"benchmark_scores": {
|
| 29 |
-
"reward-bench/Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
"reward-bench/Factuality": 0.2105,
|
| 31 |
"reward-bench/Precise IF": 0.2938,
|
| 32 |
"reward-bench/Math": 0.2623,
|
| 33 |
-
"reward-bench/Safety": 0.3757,
|
| 34 |
"reward-bench/Focus": 0.0646,
|
| 35 |
-
"reward-bench/Ties": -0.01
|
| 36 |
-
"reward-bench/Chat": 0.8184,
|
| 37 |
-
"reward-bench/Chat Hard": 0.2873,
|
| 38 |
-
"reward-bench/Reasoning": 0.346,
|
| 39 |
-
"reward-bench/Prior Sets (0.5 weight)": 0.5993
|
| 40 |
}
|
| 41 |
},
|
| 42 |
{
|
|
@@ -64,17 +64,17 @@
|
|
| 64 |
"developer": "PKU-Alignment",
|
| 65 |
"evaluator_relationship": null,
|
| 66 |
"benchmark_scores": {
|
| 67 |
-
"reward-bench/Score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
"reward-bench/Factuality": 0.2168,
|
| 69 |
"reward-bench/Precise IF": 0.2562,
|
| 70 |
"reward-bench/Math": 0.3825,
|
| 71 |
-
"reward-bench/Safety": 0.6041,
|
| 72 |
"reward-bench/Focus": 0.2606,
|
| 73 |
-
"reward-bench/Ties": 0.0944
|
| 74 |
-
"reward-bench/Chat": 0.8994,
|
| 75 |
-
"reward-bench/Chat Hard": 0.364,
|
| 76 |
-
"reward-bench/Reasoning": 0.6887,
|
| 77 |
-
"reward-bench/Prior Sets (0.5 weight)": 0.6171
|
| 78 |
}
|
| 79 |
}
|
| 80 |
]
|
|
|
|
| 7 |
"developer": "PKU-Alignment",
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
+
"reward-bench/Score": 0.5798,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
"reward-bench/Factuality": 0.3263,
|
| 12 |
"reward-bench/Precise IF": 0.2313,
|
| 13 |
"reward-bench/Math": 0.3989,
|
| 14 |
+
"reward-bench/Safety": 0.7351,
|
| 15 |
"reward-bench/Focus": 0.2939,
|
| 16 |
+
"reward-bench/Ties": -0.01,
|
| 17 |
+
"reward-bench/Chat": 0.6173,
|
| 18 |
+
"reward-bench/Chat Hard": 0.4232,
|
| 19 |
+
"reward-bench/Reasoning": 0.5482,
|
| 20 |
+
"reward-bench/Prior Sets (0.5 weight)": 0.57
|
| 21 |
}
|
| 22 |
},
|
| 23 |
{
|
|
|
|
| 26 |
"developer": "PKU-Alignment",
|
| 27 |
"evaluator_relationship": null,
|
| 28 |
"benchmark_scores": {
|
| 29 |
+
"reward-bench/Score": 0.1606,
|
| 30 |
+
"reward-bench/Chat": 0.8184,
|
| 31 |
+
"reward-bench/Chat Hard": 0.2873,
|
| 32 |
+
"reward-bench/Safety": 0.1422,
|
| 33 |
+
"reward-bench/Reasoning": 0.346,
|
| 34 |
+
"reward-bench/Prior Sets (0.5 weight)": 0.5993,
|
| 35 |
"reward-bench/Factuality": 0.2105,
|
| 36 |
"reward-bench/Precise IF": 0.2938,
|
| 37 |
"reward-bench/Math": 0.2623,
|
|
|
|
| 38 |
"reward-bench/Focus": 0.0646,
|
| 39 |
+
"reward-bench/Ties": -0.01
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
}
|
| 41 |
},
|
| 42 |
{
|
|
|
|
| 64 |
"developer": "PKU-Alignment",
|
| 65 |
"evaluator_relationship": null,
|
| 66 |
"benchmark_scores": {
|
| 67 |
+
"reward-bench/Score": 0.2544,
|
| 68 |
+
"reward-bench/Chat": 0.8994,
|
| 69 |
+
"reward-bench/Chat Hard": 0.364,
|
| 70 |
+
"reward-bench/Safety": 0.3156,
|
| 71 |
+
"reward-bench/Reasoning": 0.6887,
|
| 72 |
+
"reward-bench/Prior Sets (0.5 weight)": 0.6171,
|
| 73 |
"reward-bench/Factuality": 0.2168,
|
| 74 |
"reward-bench/Precise IF": 0.2562,
|
| 75 |
"reward-bench/Math": 0.3825,
|
|
|
|
| 76 |
"reward-bench/Focus": 0.2606,
|
| 77 |
+
"reward-bench/Ties": 0.0944
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
}
|
| 79 |
}
|
| 80 |
]
|
data/developers/primeintellect.json
CHANGED
|
@@ -8,11 +8,11 @@
|
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
"hfopenllm_v2/IFEval": 0.1757,
|
| 11 |
-
"hfopenllm_v2/BBH": 0.
|
| 12 |
"hfopenllm_v2/MATH Level 5": 0.0,
|
| 13 |
-
"hfopenllm_v2/GPQA": 0.
|
| 14 |
-
"hfopenllm_v2/MUSR": 0.
|
| 15 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 16 |
}
|
| 17 |
},
|
| 18 |
{
|
|
|
|
| 8 |
"evaluator_relationship": null,
|
| 9 |
"benchmark_scores": {
|
| 10 |
"hfopenllm_v2/IFEval": 0.1757,
|
| 11 |
+
"hfopenllm_v2/BBH": 0.274,
|
| 12 |
"hfopenllm_v2/MATH Level 5": 0.0,
|
| 13 |
+
"hfopenllm_v2/GPQA": 0.25,
|
| 14 |
+
"hfopenllm_v2/MUSR": 0.3753,
|
| 15 |
+
"hfopenllm_v2/MMLU-PRO": 0.112
|
| 16 |
}
|
| 17 |
},
|
| 18 |
{
|
data/developers/princeton-nlp.json
CHANGED
|
@@ -49,12 +49,12 @@
|
|
| 49 |
"developer": "princeton-nlp",
|
| 50 |
"evaluator_relationship": null,
|
| 51 |
"benchmark_scores": {
|
| 52 |
-
"hfopenllm_v2/IFEval": 0.
|
| 53 |
-
"hfopenllm_v2/BBH": 0.
|
| 54 |
-
"hfopenllm_v2/MATH Level 5": 0.
|
| 55 |
-
"hfopenllm_v2/GPQA": 0.
|
| 56 |
-
"hfopenllm_v2/MUSR": 0.
|
| 57 |
-
"hfopenllm_v2/MMLU-PRO": 0.
|
| 58 |
}
|
| 59 |
},
|
| 60 |
{
|
|
|
|
| 49 |
"developer": "princeton-nlp",
|
| 50 |
"evaluator_relationship": null,
|
| 51 |
"benchmark_scores": {
|
| 52 |
+
"hfopenllm_v2/IFEval": 0.3978,
|
| 53 |
+
"hfopenllm_v2/BBH": 0.4983,
|
| 54 |
+
"hfopenllm_v2/MATH Level 5": 0.0582,
|
| 55 |
+
"hfopenllm_v2/GPQA": 0.281,
|
| 56 |
+
"hfopenllm_v2/MUSR": 0.425,
|
| 57 |
+
"hfopenllm_v2/MMLU-PRO": 0.3246
|
| 58 |
}
|
| 59 |
},
|
| 60 |
{
|