GitHub Actions committed on
Commit
49596d9
·
1 Parent(s): d8be99e

chore: sync EEE pipeline output [2026-04-02 05:07 UTC]

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. data/benchmarks.json +5 -1
  2. data/benchmarks/appworld_test_normal.json +2 -2
  3. data/benchmarks/browsecompplus.json +2 -2
  4. data/benchmarks/hfopenllm_v2.json +205 -218
  5. data/benchmarks/livecodebenchpro.json +3 -3
  6. data/benchmarks/reward-bench.json +174 -174
  7. data/benchmarks/swe-bench.json +2 -2
  8. data/benchmarks/tau-bench-2_airline.json +2 -2
  9. data/benchmarks/tau-bench-2_retail.json +1 -1
  10. data/benchmarks/tau-bench-2_telecom.json +2 -2
  11. data/benchmarks/terminal-bench-2.0.json +20 -20
  12. data/benchmarks/theory_of_mind.json +12 -0
  13. data/developers.json +1 -1
  14. data/developers/adriszmar.json +6 -6
  15. data/developers/ai2.json +3 -3
  16. data/developers/akjindal53244.json +5 -5
  17. data/developers/allenai.json +33 -33
  18. data/developers/anthropic.json +10 -10
  19. data/developers/cognitivecomputations.json +6 -6
  20. data/developers/columbia-nlp.json +6 -6
  21. data/developers/cpayne1303.json +5 -5
  22. data/developers/daemontatox.json +6 -6
  23. data/developers/deepmount00.json +6 -6
  24. data/developers/dfurman.json +6 -6
  25. data/developers/doppelreflex.json +6 -6
  26. data/developers/google.json +20 -20
  27. data/developers/huggingfacetb.json +6 -6
  28. data/developers/infly.json +6 -6
  29. data/developers/internlm.json +6 -6
  30. data/developers/jaspionjader.json +5 -5
  31. data/developers/leroydyer.json +6 -6
  32. data/developers/llmat.json +6 -6
  33. data/developers/lxzgordon.json +6 -6
  34. data/developers/meta.json +22 -22
  35. data/developers/minimax.json +1 -1
  36. data/developers/mistralai.json +16 -16
  37. data/developers/mlabonne.json +6 -6
  38. data/developers/moonshot_ai.json +1 -1
  39. data/developers/multiple.json +1 -1
  40. data/developers/nazimali.json +6 -6
  41. data/developers/nicolinho.json +12 -12
  42. data/developers/nisten.json +6 -6
  43. data/developers/nousresearch.json +0 -14
  44. data/developers/omkar1102.json +5 -5
  45. data/developers/openai.json +53 -53
  46. data/developers/openassistant.json +14 -14
  47. data/developers/openbmb.json +7 -7
  48. data/developers/pku-alignment.json +21 -21
  49. data/developers/primeintellect.json +4 -4
  50. data/developers/princeton-nlp.json +6 -6
data/benchmarks.json CHANGED
@@ -45,7 +45,7 @@
45
  },
46
  {
47
  "benchmark": "hfopenllm_v2",
48
- "model_count": 4494
49
  },
50
  {
51
  "benchmark": "la_leaderboard",
@@ -78,5 +78,9 @@
78
  {
79
  "benchmark": "terminal-bench-2.0",
80
  "model_count": 37
 
 
 
 
81
  }
82
  ]
 
45
  },
46
  {
47
  "benchmark": "hfopenllm_v2",
48
+ "model_count": 4493
49
  },
50
  {
51
  "benchmark": "la_leaderboard",
 
78
  {
79
  "benchmark": "terminal-bench-2.0",
80
  "model_count": 37
81
+ },
82
+ {
83
+ "benchmark": "theory_of_mind",
84
+ "model_count": 1
85
  }
86
  ]
data/benchmarks/appworld_test_normal.json CHANGED
@@ -5,7 +5,7 @@
5
  "name": "claude-opus-4-5",
6
  "developer": "Anthropic",
7
  "scores": {
8
- "appworld/test_normal": 0.68
9
  }
10
  },
11
  {
@@ -13,7 +13,7 @@
13
  "name": "gemini-3-pro-preview",
14
  "developer": "Google",
15
  "scores": {
16
- "appworld/test_normal": 0.13
17
  }
18
  },
19
  {
 
5
  "name": "claude-opus-4-5",
6
  "developer": "Anthropic",
7
  "scores": {
8
+ "appworld/test_normal": 0.7
9
  }
10
  },
11
  {
 
13
  "name": "gemini-3-pro-preview",
14
  "developer": "Google",
15
  "scores": {
16
+ "appworld/test_normal": 0.55
17
  }
18
  },
19
  {
data/benchmarks/browsecompplus.json CHANGED
@@ -13,7 +13,7 @@
13
  "name": "gemini-3-pro-preview",
14
  "developer": "Google",
15
  "scores": {
16
- "browsecompplus": 0.48
17
  }
18
  },
19
  {
@@ -21,7 +21,7 @@
21
  "name": "gpt-5.2-2025-12-11",
22
  "developer": "OpenAI",
23
  "scores": {
24
- "browsecompplus": 0.48
25
  }
26
  }
27
  ]
 
13
  "name": "gemini-3-pro-preview",
14
  "developer": "Google",
15
  "scores": {
16
+ "browsecompplus": 0.3333
17
  }
18
  },
19
  {
 
21
  "name": "gpt-5.2-2025-12-11",
22
  "developer": "OpenAI",
23
  "scores": {
24
+ "browsecompplus": 0.43
25
  }
26
  }
27
  ]
data/benchmarks/hfopenllm_v2.json CHANGED
@@ -2176,12 +2176,12 @@
2176
  "name": "LION-Gemma-2b-dpo-v1.0",
2177
  "developer": "Columbia-NLP",
2178
  "scores": {
2179
- "IFEval": 0.3278,
2180
- "BBH": 0.392,
2181
- "MATH Level 5": 0.0431,
2182
- "GPQA": 0.2492,
2183
- "MUSR": 0.412,
2184
- "MMLU-PRO": 0.1666
2185
  }
2186
  },
2187
  {
@@ -3229,12 +3229,12 @@
3229
  "name": "PathfinderAI",
3230
  "developer": "Daemontatox",
3231
  "scores": {
3232
- "IFEval": 0.3745,
3233
- "BBH": 0.6668,
3234
- "MATH Level 5": 0.4758,
3235
- "GPQA": 0.3943,
3236
- "MUSR": 0.4858,
3237
- "MMLU-PRO": 0.5593
3238
  }
3239
  },
3240
  {
@@ -4321,12 +4321,12 @@
4321
  "name": "Llama-3.1-8b-ITA",
4322
  "developer": "DeepMount00",
4323
  "scores": {
4324
- "IFEval": 0.7917,
4325
- "BBH": 0.5109,
4326
- "MATH Level 5": 0.1088,
4327
- "GPQA": 0.2878,
4328
- "MUSR": 0.4136,
4329
- "MMLU-PRO": 0.3876
4330
  }
4331
  },
4332
  {
@@ -4646,12 +4646,12 @@
4646
  "name": "MN-12B-LilithFrame",
4647
  "developer": "DoppelReflEx",
4648
  "scores": {
4649
- "IFEval": 0.451,
4650
- "BBH": 0.4944,
4651
- "MATH Level 5": 0.1156,
4652
- "GPQA": 0.3196,
4653
- "MUSR": 0.3896,
4654
- "MMLU-PRO": 0.3256
4655
  }
4656
  },
4657
  {
@@ -9144,12 +9144,12 @@
9144
  "name": "SmolLM2-135M-Instruct",
9145
  "developer": "HuggingFaceTB",
9146
  "scores": {
9147
- "IFEval": 0.0593,
9148
- "BBH": 0.3135,
9149
- "MATH Level 5": 0.0144,
9150
- "GPQA": 0.2341,
9151
- "MUSR": 0.3871,
9152
- "MMLU-PRO": 0.1092
9153
  }
9154
  },
9155
  {
@@ -13057,12 +13057,12 @@
13057
  "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_XA",
13058
  "developer": "LeroyDyer",
13059
  "scores": {
13060
- "IFEval": 0.3579,
13061
- "BBH": 0.4477,
13062
- "MATH Level 5": 0.0423,
13063
- "GPQA": 0.3096,
13064
- "MUSR": 0.4134,
13065
- "MMLU-PRO": 0.2376
13066
  }
13067
  },
13068
  {
@@ -16874,19 +16874,6 @@
16874
  "MMLU-PRO": 0.232
16875
  }
16876
  },
16877
- {
16878
- "model_id": "NousResearch/Yarn-Llama-2-7b-128k",
16879
- "name": "Yarn-Llama-2-7b-128k",
16880
- "developer": "NousResearch",
16881
- "scores": {
16882
- "IFEval": 0.1485,
16883
- "BBH": 0.3248,
16884
- "MATH Level 5": 0.0151,
16885
- "GPQA": 0.2601,
16886
- "MUSR": 0.3967,
16887
- "MMLU-PRO": 0.1791
16888
- }
16889
- },
16890
  {
16891
  "model_id": "NousResearch/Yarn-Llama-2-7b-64k",
16892
  "name": "Yarn-Llama-2-7b-64k",
@@ -17204,12 +17191,12 @@
17204
  "name": "code-yi",
17205
  "developer": "Omkar1102",
17206
  "scores": {
17207
- "IFEval": 0.2254,
17208
- "BBH": 0.275,
17209
  "MATH Level 5": 0.0,
17210
- "GPQA": 0.2576,
17211
- "MUSR": 0.3762,
17212
- "MMLU-PRO": 0.1123
17213
  }
17214
  },
17215
  {
@@ -18141,11 +18128,11 @@
18141
  "developer": "PrimeIntellect",
18142
  "scores": {
18143
  "IFEval": 0.1757,
18144
- "BBH": 0.276,
18145
  "MATH Level 5": 0.0,
18146
- "GPQA": 0.2534,
18147
- "MUSR": 0.3339,
18148
- "MMLU-PRO": 0.1123
18149
  }
18150
  },
18151
  {
@@ -18712,12 +18699,12 @@
18712
  "name": "ODB-14B-sce",
18713
  "developer": "Quazim0t0",
18714
  "scores": {
18715
- "IFEval": 0.7016,
18716
- "BBH": 0.6942,
18717
- "MATH Level 5": 0.4116,
18718
- "GPQA": 0.3624,
18719
- "MUSR": 0.4571,
18720
- "MMLU-PRO": 0.5411
18721
  }
18722
  },
18723
  {
@@ -19466,12 +19453,12 @@
19466
  "name": "Qwen2.5-0.5B-Instruct",
19467
  "developer": "Qwen",
19468
  "scores": {
19469
- "IFEval": 0.3153,
19470
- "BBH": 0.3322,
19471
- "MATH Level 5": 0.1035,
19472
- "GPQA": 0.2592,
19473
- "MUSR": 0.3342,
19474
- "MMLU-PRO": 0.172
19475
  }
19476
  },
19477
  {
@@ -19726,12 +19713,12 @@
19726
  "name": "Qwen2.5-Coder-7B-Instruct",
19727
  "developer": "Qwen",
19728
  "scores": {
19729
- "IFEval": 0.6147,
19730
- "BBH": 0.4999,
19731
- "MATH Level 5": 0.031,
19732
- "GPQA": 0.2936,
19733
- "MUSR": 0.4099,
19734
- "MMLU-PRO": 0.3354
19735
  }
19736
  },
19737
  {
@@ -19986,12 +19973,12 @@
19986
  "name": "Replete-LLM-Qwen2-7b",
19987
  "developer": "Replete-AI",
19988
  "scores": {
19989
- "IFEval": 0.0932,
19990
- "BBH": 0.2977,
19991
  "MATH Level 5": 0.0,
19992
- "GPQA": 0.2475,
19993
- "MUSR": 0.3941,
19994
- "MMLU-PRO": 0.1157
19995
  }
19996
  },
19997
  {
@@ -24653,12 +24640,12 @@
24653
  "name": "Llama-3-Instruct-8B-SPPO-Iter3",
24654
  "developer": "UCLA-AGI",
24655
  "scores": {
24656
- "IFEval": 0.6834,
24657
- "BBH": 0.508,
24658
- "MATH Level 5": 0.0959,
24659
  "GPQA": 0.2651,
24660
- "MUSR": 0.3661,
24661
- "MMLU-PRO": 0.3644
24662
  }
24663
  },
24664
  {
@@ -25004,12 +24991,12 @@
25004
  "name": "llama-3-Korean-8B",
25005
  "developer": "VIRNECT",
25006
  "scores": {
25007
- "IFEval": 0.5021,
25008
- "BBH": 0.4918,
25009
- "MATH Level 5": 0.108,
25010
  "GPQA": 0.271,
25011
- "MUSR": 0.3648,
25012
- "MMLU-PRO": 0.3536
25013
  }
25014
  },
25015
  {
@@ -25108,12 +25095,12 @@
25108
  "name": "Llama3.1-8B-Fireplace2",
25109
  "developer": "ValiantLabs",
25110
  "scores": {
25111
- "IFEval": 0.5328,
25112
- "BBH": 0.4613,
25113
- "MATH Level 5": 0.0876,
25114
- "GPQA": 0.2894,
25115
- "MUSR": 0.3367,
25116
- "MMLU-PRO": 0.2424
25117
  }
25118
  },
25119
  {
@@ -25121,12 +25108,12 @@
25121
  "name": "Llama3.1-8B-ShiningValiant2",
25122
  "developer": "ValiantLabs",
25123
  "scores": {
25124
- "IFEval": 0.6496,
25125
- "BBH": 0.4774,
25126
- "MATH Level 5": 0.0566,
25127
- "GPQA": 0.3104,
25128
- "MUSR": 0.3909,
25129
- "MMLU-PRO": 0.3382
25130
  }
25131
  },
25132
  {
@@ -25654,12 +25641,12 @@
25654
  "name": "Qwen2.5-14B-YOYO-1010",
25655
  "developer": "YOYO-AI",
25656
  "scores": {
25657
- "IFEval": 0.5899,
25658
- "BBH": 0.654,
25659
- "MATH Level 5": 0.4509,
25660
- "GPQA": 0.3834,
25661
- "MUSR": 0.4744,
25662
- "MMLU-PRO": 0.5376
25663
  }
25664
  },
25665
  {
@@ -26603,12 +26590,12 @@
26603
  "name": "QAIMath-Qwen2.5-7B-TIES",
26604
  "developer": "adriszmar",
26605
  "scores": {
26606
- "IFEval": 0.1685,
26607
- "BBH": 0.3124,
26608
- "MATH Level 5": 0.0015,
26609
- "GPQA": 0.2492,
26610
- "MUSR": 0.3963,
26611
- "MMLU-PRO": 0.1066
26612
  }
26613
  },
26614
  {
@@ -26889,12 +26876,12 @@
26889
  "name": "Llama-3.1-Storm-8B",
26890
  "developer": "akjindal53244",
26891
  "scores": {
26892
- "IFEval": 0.8051,
26893
- "BBH": 0.5189,
26894
- "MATH Level 5": 0.1722,
26895
- "GPQA": 0.3263,
26896
  "MUSR": 0.4028,
26897
- "MMLU-PRO": 0.3803
26898
  }
26899
  },
26900
  {
@@ -26915,12 +26902,12 @@
26915
  "name": "Llama-3.1-Tulu-3-70B",
26916
  "developer": "allenai",
26917
  "scores": {
26918
- "IFEval": 0.8379,
26919
- "BBH": 0.6157,
26920
- "MATH Level 5": 0.3829,
26921
  "GPQA": 0.3733,
26922
- "MUSR": 0.4988,
26923
- "MMLU-PRO": 0.4656
26924
  }
26925
  },
26926
  {
@@ -31647,12 +31634,12 @@
31647
  "name": "dolphin-2.9.2-Phi-3-Medium-abliterated",
31648
  "developer": "cognitivecomputations",
31649
  "scores": {
31650
- "IFEval": 0.4124,
31651
- "BBH": 0.6383,
31652
- "MATH Level 5": 0.182,
31653
- "GPQA": 0.3289,
31654
- "MUSR": 0.4349,
31655
- "MMLU-PRO": 0.4525
31656
  }
31657
  },
31658
  {
@@ -31790,12 +31777,12 @@
31790
  "name": "llama-43m-beta",
31791
  "developer": "cpayne1303",
31792
  "scores": {
31793
- "IFEval": 0.1916,
31794
- "BBH": 0.2977,
31795
- "MATH Level 5": 0.0,
31796
  "GPQA": 0.2685,
31797
- "MUSR": 0.3872,
31798
- "MMLU-PRO": 0.1132
31799
  }
31800
  },
31801
  {
@@ -32167,12 +32154,12 @@
32167
  "name": "Llama-3-8B-Orpo-v0.1",
32168
  "developer": "dfurman",
32169
  "scores": {
32170
- "IFEval": 0.2835,
32171
- "BBH": 0.3842,
32172
- "MATH Level 5": 0.0521,
32173
- "GPQA": 0.2609,
32174
- "MUSR": 0.3566,
32175
- "MMLU-PRO": 0.2298
32176
  }
32177
  },
32178
  {
@@ -34663,12 +34650,12 @@
34663
  "name": "gemma-2-2b",
34664
  "developer": "Google",
34665
  "scores": {
34666
- "IFEval": 0.2018,
34667
- "BBH": 0.3709,
34668
- "MATH Level 5": 0.0302,
34669
  "GPQA": 0.2626,
34670
- "MUSR": 0.4219,
34671
- "MMLU-PRO": 0.2217
34672
  }
34673
  },
34674
  {
@@ -34689,12 +34676,12 @@
34689
  "name": "gemma-2-2b-jpn-it",
34690
  "developer": "Google",
34691
  "scores": {
34692
- "IFEval": 0.5078,
34693
- "BBH": 0.4226,
34694
- "MATH Level 5": 0.0347,
34695
- "GPQA": 0.2852,
34696
- "MUSR": 0.3964,
34697
- "MMLU-PRO": 0.2578
34698
  }
34699
  },
34700
  {
@@ -37705,12 +37692,12 @@
37705
  "name": "Kosmos-EVAA-Fusion-8B",
37706
  "developer": "jaspionjader",
37707
  "scores": {
37708
- "IFEval": 0.4345,
37709
- "BBH": 0.5419,
37710
- "MATH Level 5": 0.1292,
37711
- "GPQA": 0.3087,
37712
  "MUSR": 0.4277,
37713
- "MMLU-PRO": 0.3854
37714
  }
37715
  },
37716
  {
@@ -42359,12 +42346,12 @@
42359
  "name": "Mistral-v0.3-7B-ORPO",
42360
  "developer": "llmat",
42361
  "scores": {
42362
- "IFEval": 0.364,
42363
- "BBH": 0.4005,
42364
- "MATH Level 5": 0.0015,
42365
- "GPQA": 0.2693,
42366
- "MUSR": 0.3529,
42367
- "MMLU-PRO": 0.2301
42368
  }
42369
  },
42370
  {
@@ -44478,12 +44465,12 @@
44478
  "name": "Mixtral-8x7B-v0.1",
44479
  "developer": "mistralai",
44480
  "scores": {
44481
- "IFEval": 0.2326,
44482
- "BBH": 0.5098,
44483
- "MATH Level 5": 0.0937,
44484
- "GPQA": 0.3205,
44485
- "MUSR": 0.4413,
44486
- "MMLU-PRO": 0.3871
44487
  }
44488
  },
44489
  {
@@ -44738,12 +44725,12 @@
44738
  "name": "NeuralDaredevil-8B-abliterated",
44739
  "developer": "mlabonne",
44740
  "scores": {
44741
- "IFEval": 0.4162,
44742
- "BBH": 0.5124,
44743
- "MATH Level 5": 0.0853,
44744
- "GPQA": 0.3029,
44745
- "MUSR": 0.415,
44746
- "MMLU-PRO": 0.3802
44747
  }
44748
  },
44749
  {
@@ -45076,12 +45063,12 @@
45076
  "name": "Mistral-Nemo-Kurdish-Instruct",
45077
  "developer": "nazimali",
45078
  "scores": {
45079
- "IFEval": 0.4964,
45080
- "BBH": 0.4699,
45081
- "MATH Level 5": 0.0045,
45082
- "GPQA": 0.2827,
45083
- "MUSR": 0.3979,
45084
- "MMLU-PRO": 0.3063
45085
  }
45086
  },
45087
  {
@@ -46779,12 +46766,12 @@
46779
  "name": "franqwenstein-35b",
46780
  "developer": "nisten",
46781
  "scores": {
46782
- "IFEval": 0.3914,
46783
- "BBH": 0.6591,
46784
- "MATH Level 5": 0.3044,
46785
- "GPQA": 0.3591,
46786
- "MUSR": 0.4681,
46787
- "MMLU-PRO": 0.5611
46788
  }
46789
  },
46790
  {
@@ -48729,12 +48716,12 @@
48729
  "name": "Llama-3-8B-ProLong-512k-Instruct",
48730
  "developer": "princeton-nlp",
48731
  "scores": {
48732
- "IFEval": 0.5508,
48733
- "BBH": 0.5028,
48734
- "MATH Level 5": 0.0529,
48735
- "GPQA": 0.2861,
48736
- "MUSR": 0.4266,
48737
- "MMLU-PRO": 0.3231
48738
  }
48739
  },
48740
  {
@@ -51303,12 +51290,12 @@
51303
  "name": "Gemma-2-Ataraxy-Gemmasutra-9B-slerp",
51304
  "developer": "recoilme",
51305
  "scores": {
51306
- "IFEval": 0.7649,
51307
- "BBH": 0.5974,
51308
- "MATH Level 5": 0.0174,
51309
- "GPQA": 0.3305,
51310
- "MUSR": 0.4245,
51311
- "MMLU-PRO": 0.4207
51312
  }
51313
  },
51314
  {
@@ -51329,12 +51316,12 @@
51329
  "name": "recoilme-gemma-2-9B-v0.2",
51330
  "developer": "recoilme",
51331
  "scores": {
51332
- "IFEval": 0.2747,
51333
- "BBH": 0.6031,
51334
- "MATH Level 5": 0.0831,
51335
- "GPQA": 0.3305,
51336
- "MUSR": 0.4686,
51337
- "MMLU-PRO": 0.4122
51338
  }
51339
  },
51340
  {
@@ -51342,12 +51329,12 @@
51342
  "name": "recoilme-gemma-2-9B-v0.3",
51343
  "developer": "recoilme",
51344
  "scores": {
51345
- "IFEval": 0.7439,
51346
- "BBH": 0.5993,
51347
- "MATH Level 5": 0.0876,
51348
- "GPQA": 0.3238,
51349
- "MUSR": 0.4204,
51350
- "MMLU-PRO": 0.4072
51351
  }
51352
  },
51353
  {
@@ -56997,12 +56984,12 @@
56997
  "name": "BagelMIsteryTour-v2-8x7B",
56998
  "developer": "ycros",
56999
  "scores": {
57000
- "IFEval": 0.6262,
57001
- "BBH": 0.5142,
57002
- "MATH Level 5": 0.0937,
57003
- "GPQA": 0.3079,
57004
- "MUSR": 0.4138,
57005
- "MMLU-PRO": 0.3481
57006
  }
57007
  },
57008
  {
 
2176
  "name": "LION-Gemma-2b-dpo-v1.0",
2177
  "developer": "Columbia-NLP",
2178
  "scores": {
2179
+ "IFEval": 0.3102,
2180
+ "BBH": 0.3881,
2181
+ "MATH Level 5": 0.0536,
2182
+ "GPQA": 0.2534,
2183
+ "MUSR": 0.4081,
2184
+ "MMLU-PRO": 0.1665
2185
  }
2186
  },
2187
  {
 
3229
  "name": "PathfinderAI",
3230
  "developer": "Daemontatox",
3231
  "scores": {
3232
+ "IFEval": 0.4855,
3233
+ "BBH": 0.6627,
3234
+ "MATH Level 5": 0.4841,
3235
+ "GPQA": 0.3096,
3236
+ "MUSR": 0.4256,
3237
+ "MMLU-PRO": 0.5542
3238
  }
3239
  },
3240
  {
 
4321
  "name": "Llama-3.1-8b-ITA",
4322
  "developer": "DeepMount00",
4323
  "scores": {
4324
+ "IFEval": 0.5365,
4325
+ "BBH": 0.517,
4326
+ "MATH Level 5": 0.1707,
4327
+ "GPQA": 0.3062,
4328
+ "MUSR": 0.4487,
4329
+ "MMLU-PRO": 0.396
4330
  }
4331
  },
4332
  {
 
4646
  "name": "MN-12B-LilithFrame",
4647
  "developer": "DoppelReflEx",
4648
  "scores": {
4649
+ "IFEval": 0.436,
4650
+ "BBH": 0.4956,
4651
+ "MATH Level 5": 0.0589,
4652
+ "GPQA": 0.3205,
4653
+ "MUSR": 0.3843,
4654
+ "MMLU-PRO": 0.3237
4655
  }
4656
  },
4657
  {
 
9144
  "name": "SmolLM2-135M-Instruct",
9145
  "developer": "HuggingFaceTB",
9146
  "scores": {
9147
+ "IFEval": 0.2883,
9148
+ "BBH": 0.3124,
9149
+ "MATH Level 5": 0.003,
9150
+ "GPQA": 0.2357,
9151
+ "MUSR": 0.3662,
9152
+ "MMLU-PRO": 0.1115
9153
  }
9154
  },
9155
  {
 
13057
  "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_XA",
13058
  "developer": "LeroyDyer",
13059
  "scores": {
13060
+ "IFEval": 0.3798,
13061
+ "BBH": 0.4483,
13062
+ "MATH Level 5": 0.04,
13063
+ "GPQA": 0.3129,
13064
+ "MUSR": 0.4148,
13065
+ "MMLU-PRO": 0.2389
13066
  }
13067
  },
13068
  {
 
16874
  "MMLU-PRO": 0.232
16875
  }
16876
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
16877
  {
16878
  "model_id": "NousResearch/Yarn-Llama-2-7b-64k",
16879
  "name": "Yarn-Llama-2-7b-64k",
 
17191
  "name": "code-yi",
17192
  "developer": "Omkar1102",
17193
  "scores": {
17194
+ "IFEval": 0.2148,
17195
+ "BBH": 0.276,
17196
  "MATH Level 5": 0.0,
17197
+ "GPQA": 0.2508,
17198
+ "MUSR": 0.3802,
17199
+ "MMLU-PRO": 0.1126
17200
  }
17201
  },
17202
  {
 
18128
  "developer": "PrimeIntellect",
18129
  "scores": {
18130
  "IFEval": 0.1757,
18131
+ "BBH": 0.274,
18132
  "MATH Level 5": 0.0,
18133
+ "GPQA": 0.25,
18134
+ "MUSR": 0.3753,
18135
+ "MMLU-PRO": 0.112
18136
  }
18137
  },
18138
  {
 
18699
  "name": "ODB-14B-sce",
18700
  "developer": "Quazim0t0",
18701
  "scores": {
18702
+ "IFEval": 0.2922,
18703
+ "BBH": 0.6559,
18704
+ "MATH Level 5": 0.2545,
18705
+ "GPQA": 0.2659,
18706
+ "MUSR": 0.3929,
18707
+ "MMLU-PRO": 0.5207
18708
  }
18709
  },
18710
  {
 
19453
  "name": "Qwen2.5-0.5B-Instruct",
19454
  "developer": "Qwen",
19455
  "scores": {
19456
+ "IFEval": 0.3071,
19457
+ "BBH": 0.3341,
19458
+ "MATH Level 5": 0.0,
19459
+ "GPQA": 0.2576,
19460
+ "MUSR": 0.3329,
19461
+ "MMLU-PRO": 0.1697
19462
  }
19463
  },
19464
  {
 
19713
  "name": "Qwen2.5-Coder-7B-Instruct",
19714
  "developer": "Qwen",
19715
  "scores": {
19716
+ "IFEval": 0.6101,
19717
+ "BBH": 0.5008,
19718
+ "MATH Level 5": 0.3716,
19719
+ "GPQA": 0.2919,
19720
+ "MUSR": 0.4073,
19721
+ "MMLU-PRO": 0.3352
19722
  }
19723
  },
19724
  {
 
19973
  "name": "Replete-LLM-Qwen2-7b",
19974
  "developer": "Replete-AI",
19975
  "scores": {
19976
+ "IFEval": 0.0905,
19977
+ "BBH": 0.2985,
19978
  "MATH Level 5": 0.0,
19979
+ "GPQA": 0.2534,
19980
+ "MUSR": 0.3848,
19981
+ "MMLU-PRO": 0.1158
19982
  }
19983
  },
19984
  {
 
24640
  "name": "Llama-3-Instruct-8B-SPPO-Iter3",
24641
  "developer": "UCLA-AGI",
24642
  "scores": {
24643
+ "IFEval": 0.6703,
24644
+ "BBH": 0.5076,
24645
+ "MATH Level 5": 0.0718,
24646
  "GPQA": 0.2651,
24647
+ "MUSR": 0.3647,
24648
+ "MMLU-PRO": 0.3658
24649
  }
24650
  },
24651
  {
 
24991
  "name": "llama-3-Korean-8B",
24992
  "developer": "VIRNECT",
24993
  "scores": {
24994
+ "IFEval": 0.5058,
24995
+ "BBH": 0.4908,
24996
+ "MATH Level 5": 0.0929,
24997
  "GPQA": 0.271,
24998
+ "MUSR": 0.3662,
24999
+ "MMLU-PRO": 0.3539
25000
  }
25001
  },
25002
  {
 
25095
  "name": "Llama3.1-8B-Fireplace2",
25096
  "developer": "ValiantLabs",
25097
  "scores": {
25098
+ "IFEval": 0.5483,
25099
+ "BBH": 0.461,
25100
+ "MATH Level 5": 0.0582,
25101
+ "GPQA": 0.2886,
25102
+ "MUSR": 0.3433,
25103
+ "MMLU-PRO": 0.2407
25104
  }
25105
  },
25106
  {
 
25108
  "name": "Llama3.1-8B-ShiningValiant2",
25109
  "developer": "ValiantLabs",
25110
  "scores": {
25111
+ "IFEval": 0.2678,
25112
+ "BBH": 0.4429,
25113
+ "MATH Level 5": 0.0521,
25114
+ "GPQA": 0.302,
25115
+ "MUSR": 0.3959,
25116
+ "MMLU-PRO": 0.2927
25117
  }
25118
  },
25119
  {
 
25641
  "name": "Qwen2.5-14B-YOYO-1010",
25642
  "developer": "YOYO-AI",
25643
  "scores": {
25644
+ "IFEval": 0.7905,
25645
+ "BBH": 0.6406,
25646
+ "MATH Level 5": 0.0,
25647
+ "GPQA": 0.3163,
25648
+ "MUSR": 0.4181,
25649
+ "MMLU-PRO": 0.4944
25650
  }
25651
  },
25652
  {
 
26590
  "name": "QAIMath-Qwen2.5-7B-TIES",
26591
  "developer": "adriszmar",
26592
  "scores": {
26593
+ "IFEval": 0.1746,
26594
+ "BBH": 0.3126,
26595
+ "MATH Level 5": 0.0,
26596
+ "GPQA": 0.245,
26597
+ "MUSR": 0.4096,
26598
+ "MMLU-PRO": 0.1087
26599
  }
26600
  },
26601
  {
 
26876
  "name": "Llama-3.1-Storm-8B",
26877
  "developer": "akjindal53244",
26878
  "scores": {
26879
+ "IFEval": 0.8033,
26880
+ "BBH": 0.5196,
26881
+ "MATH Level 5": 0.1624,
26882
+ "GPQA": 0.3096,
26883
  "MUSR": 0.4028,
26884
+ "MMLU-PRO": 0.3812
26885
  }
26886
  },
26887
  {
 
26902
  "name": "Llama-3.1-Tulu-3-70B",
26903
  "developer": "allenai",
26904
  "scores": {
26905
+ "IFEval": 0.8291,
26906
+ "BBH": 0.6164,
26907
+ "MATH Level 5": 0.4502,
26908
  "GPQA": 0.3733,
26909
+ "MUSR": 0.4948,
26910
+ "MMLU-PRO": 0.4645
26911
  }
26912
  },
26913
  {
 
31634
  "name": "dolphin-2.9.2-Phi-3-Medium-abliterated",
31635
  "developer": "cognitivecomputations",
31636
  "scores": {
31637
+ "IFEval": 0.3613,
31638
+ "BBH": 0.6123,
31639
+ "MATH Level 5": 0.1239,
31640
+ "GPQA": 0.328,
31641
+ "MUSR": 0.4112,
31642
+ "MMLU-PRO": 0.4494
31643
  }
31644
  },
31645
  {
 
31777
  "name": "llama-43m-beta",
31778
  "developer": "cpayne1303",
31779
  "scores": {
31780
+ "IFEval": 0.1949,
31781
+ "BBH": 0.2965,
31782
+ "MATH Level 5": 0.0045,
31783
  "GPQA": 0.2685,
31784
+ "MUSR": 0.3885,
31785
+ "MMLU-PRO": 0.1111
31786
  }
31787
  },
31788
  {
 
32154
  "name": "Llama-3-8B-Orpo-v0.1",
32155
  "developer": "dfurman",
32156
  "scores": {
32157
+ "IFEval": 0.3,
32158
+ "BBH": 0.3853,
32159
+ "MATH Level 5": 0.0415,
32160
+ "GPQA": 0.2617,
32161
+ "MUSR": 0.3579,
32162
+ "MMLU-PRO": 0.2281
32163
  }
32164
  },
32165
  {
 
34650
  "name": "gemma-2-2b",
34651
  "developer": "Google",
34652
  "scores": {
34653
+ "IFEval": 0.1993,
34654
+ "BBH": 0.3656,
34655
+ "MATH Level 5": 0.0287,
34656
  "GPQA": 0.2626,
34657
+ "MUSR": 0.4232,
34658
+ "MMLU-PRO": 0.218
34659
  }
34660
  },
34661
  {
 
34676
  "name": "gemma-2-2b-jpn-it",
34677
  "developer": "Google",
34678
  "scores": {
34679
+ "IFEval": 0.5288,
34680
+ "BBH": 0.4178,
34681
+ "MATH Level 5": 0.0476,
34682
+ "GPQA": 0.2752,
34683
+ "MUSR": 0.3728,
34684
+ "MMLU-PRO": 0.2467
34685
  }
34686
  },
34687
  {
 
37692
  "name": "Kosmos-EVAA-Fusion-8B",
37693
  "developer": "jaspionjader",
37694
  "scores": {
37695
+ "IFEval": 0.4418,
37696
+ "BBH": 0.5406,
37697
+ "MATH Level 5": 0.1352,
37698
+ "GPQA": 0.3062,
37699
  "MUSR": 0.4277,
37700
+ "MMLU-PRO": 0.386
37701
  }
37702
  },
37703
  {
 
42346
  "name": "Mistral-v0.3-7B-ORPO",
42347
  "developer": "llmat",
42348
  "scores": {
42349
+ "IFEval": 0.377,
42350
+ "BBH": 0.3978,
42351
+ "MATH Level 5": 0.0242,
42352
+ "GPQA": 0.2668,
42353
+ "MUSR": 0.3555,
42354
+ "MMLU-PRO": 0.2278
42355
  }
42356
  },
42357
  {
 
44465
  "name": "Mixtral-8x7B-v0.1",
44466
  "developer": "mistralai",
44467
  "scores": {
44468
+ "IFEval": 0.2415,
44469
+ "BBH": 0.5087,
44470
+ "MATH Level 5": 0.102,
44471
+ "GPQA": 0.3138,
44472
+ "MUSR": 0.4321,
44473
+ "MMLU-PRO": 0.385
44474
  }
44475
  },
44476
  {
 
44725
  "name": "NeuralDaredevil-8B-abliterated",
44726
  "developer": "mlabonne",
44727
  "scores": {
44728
+ "IFEval": 0.7561,
44729
+ "BBH": 0.5111,
44730
+ "MATH Level 5": 0.0906,
44731
+ "GPQA": 0.3062,
44732
+ "MUSR": 0.4019,
44733
+ "MMLU-PRO": 0.3841
44734
  }
44735
  },
44736
  {
 
45063
  "name": "Mistral-Nemo-Kurdish-Instruct",
45064
  "developer": "nazimali",
45065
  "scores": {
45066
+ "IFEval": 0.486,
45067
+ "BBH": 0.4721,
45068
+ "MATH Level 5": 0.0846,
45069
+ "GPQA": 0.2844,
45070
+ "MUSR": 0.4006,
45071
+ "MMLU-PRO": 0.3087
45072
  }
45073
  },
45074
  {
 
46766
  "name": "franqwenstein-35b",
46767
  "developer": "nisten",
46768
  "scores": {
46769
+ "IFEval": 0.3799,
46770
+ "BBH": 0.6647,
46771
+ "MATH Level 5": 0.3406,
46772
+ "GPQA": 0.4035,
46773
+ "MUSR": 0.494,
46774
+ "MMLU-PRO": 0.5731
46775
  }
46776
  },
46777
  {
 
48716
  "name": "Llama-3-8B-ProLong-512k-Instruct",
48717
  "developer": "princeton-nlp",
48718
  "scores": {
48719
+ "IFEval": 0.3978,
48720
+ "BBH": 0.4983,
48721
+ "MATH Level 5": 0.0582,
48722
+ "GPQA": 0.281,
48723
+ "MUSR": 0.425,
48724
+ "MMLU-PRO": 0.3246
48725
  }
48726
  },
48727
  {
 
51290
  "name": "Gemma-2-Ataraxy-Gemmasutra-9B-slerp",
51291
  "developer": "recoilme",
51292
  "scores": {
51293
+ "IFEval": 0.2854,
51294
+ "BBH": 0.5984,
51295
+ "MATH Level 5": 0.1005,
51296
+ "GPQA": 0.3297,
51297
+ "MUSR": 0.4607,
51298
+ "MMLU-PRO": 0.4162
51299
  }
51300
  },
51301
  {
 
51316
  "name": "recoilme-gemma-2-9B-v0.2",
51317
  "developer": "recoilme",
51318
  "scores": {
51319
+ "IFEval": 0.7592,
51320
+ "BBH": 0.6026,
51321
+ "MATH Level 5": 0.0529,
51322
+ "GPQA": 0.3289,
51323
+ "MUSR": 0.4099,
51324
+ "MMLU-PRO": 0.4163
51325
  }
51326
  },
51327
  {
 
51329
  "name": "recoilme-gemma-2-9B-v0.3",
51330
  "developer": "recoilme",
51331
  "scores": {
51332
+ "IFEval": 0.5761,
51333
+ "BBH": 0.602,
51334
+ "MATH Level 5": 0.1888,
51335
+ "GPQA": 0.3372,
51336
+ "MUSR": 0.4632,
51337
+ "MMLU-PRO": 0.4039
51338
  }
51339
  },
51340
  {
 
56984
  "name": "BagelMIsteryTour-v2-8x7B",
56985
  "developer": "ycros",
56986
  "scores": {
56987
+ "IFEval": 0.5994,
56988
+ "BBH": 0.5159,
56989
+ "MATH Level 5": 0.0785,
56990
+ "GPQA": 0.3045,
56991
+ "MUSR": 0.4203,
56992
+ "MMLU-PRO": 0.3473
56993
  }
56994
  },
56995
  {
data/benchmarks/livecodebenchpro.json CHANGED
@@ -205,9 +205,9 @@
205
  "name": "gpt-5-2025-08-07",
206
  "developer": "OpenAI",
207
  "scores": {
208
- "Hard Problems": 0.0423,
209
- "Medium Problems": 0.4085,
210
- "Easy Problems": 0.9014
211
  }
212
  },
213
  {
 
205
  "name": "gpt-5-2025-08-07",
206
  "developer": "OpenAI",
207
  "scores": {
208
+ "Hard Problems": 0.04225352112676056,
209
+ "Medium Problems": 0.4084507042253521,
210
+ "Easy Problems": 0.8873239436619719
211
  }
212
  },
213
  {
data/benchmarks/reward-bench.json CHANGED
@@ -453,16 +453,16 @@
453
  "name": "LxzGordon/URM-LLaMa-3.1-8B",
454
  "developer": "LxzGordon",
455
  "scores": {
456
- "Score": 0.9294,
 
 
 
 
457
  "Factuality": 0.6884,
458
  "Precise IF": 0.45,
459
  "Math": 0.6393,
460
- "Safety": 0.9108,
461
  "Focus": 0.9758,
462
- "Ties": 0.7653,
463
- "Chat": 0.9553,
464
- "Chat Hard": 0.8816,
465
- "Reasoning": 0.9698
466
  }
467
  },
468
  {
@@ -555,17 +555,17 @@
555
  "name": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1",
556
  "developer": "OpenAssistant",
557
  "scores": {
558
- "Score": 0.615,
 
 
 
 
 
559
  "Factuality": 0.3979,
560
  "Precise IF": 0.2875,
561
  "Math": 0.377,
562
- "Safety": 0.5446,
563
  "Focus": 0.1535,
564
- "Ties": 0.047,
565
- "Chat": 0.9246,
566
- "Chat Hard": 0.3728,
567
- "Reasoning": 0.5855,
568
- "Prior Sets (0.5 weight)": 0.6801
569
  }
570
  },
571
  {
@@ -573,17 +573,17 @@
573
  "name": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5",
574
  "developer": "OpenAssistant",
575
  "scores": {
576
- "Score": 0.2648,
577
- "Chat": 0.8855,
578
- "Chat Hard": 0.4868,
579
- "Safety": 0.3244,
580
- "Reasoning": 0.7752,
581
- "Prior Sets (0.5 weight)": 0.6533,
582
  "Factuality": 0.3179,
583
  "Precise IF": 0.2625,
584
  "Math": 0.3934,
 
585
  "Focus": 0.2707,
586
- "Ties": 0.0198
 
 
 
 
587
  }
588
  },
589
  {
@@ -609,17 +609,17 @@
609
  "name": "PKU-Alignment/beaver-7b-v1.0-cost",
610
  "developer": "PKU-Alignment",
611
  "scores": {
612
- "Score": 0.3332,
613
- "Chat": 0.6173,
614
- "Chat Hard": 0.4232,
615
- "Safety": 0.7589,
616
- "Reasoning": 0.5482,
617
- "Prior Sets (0.5 weight)": 0.57,
618
  "Factuality": 0.3263,
619
  "Precise IF": 0.2313,
620
  "Math": 0.3989,
 
621
  "Focus": 0.2939,
622
- "Ties": -0.01
 
 
 
 
623
  }
624
  },
625
  {
@@ -627,17 +627,17 @@
627
  "name": "PKU-Alignment/beaver-7b-v1.0-reward",
628
  "developer": "PKU-Alignment",
629
  "scores": {
630
- "Score": 0.4727,
 
 
 
 
 
631
  "Factuality": 0.2105,
632
  "Precise IF": 0.2938,
633
  "Math": 0.2623,
634
- "Safety": 0.3757,
635
  "Focus": 0.0646,
636
- "Ties": -0.01,
637
- "Chat": 0.8184,
638
- "Chat Hard": 0.2873,
639
- "Reasoning": 0.346,
640
- "Prior Sets (0.5 weight)": 0.5993
641
  }
642
  },
643
  {
@@ -663,17 +663,17 @@
663
  "name": "PKU-Alignment/beaver-7b-v2.0-reward",
664
  "developer": "PKU-Alignment",
665
  "scores": {
666
- "Score": 0.6366,
 
 
 
 
 
667
  "Factuality": 0.2168,
668
  "Precise IF": 0.2562,
669
  "Math": 0.3825,
670
- "Safety": 0.6041,
671
  "Focus": 0.2606,
672
- "Ties": 0.0944,
673
- "Chat": 0.8994,
674
- "Chat Hard": 0.364,
675
- "Reasoning": 0.6887,
676
- "Prior Sets (0.5 weight)": 0.6171
677
  }
678
  },
679
  {
@@ -921,16 +921,16 @@
921
  "name": "Ray2333/GRM-gemma2-2B-rewardmodel-ft",
922
  "developer": "Ray2333",
923
  "scores": {
924
- "Score": 0.8839,
 
 
 
 
925
  "Factuality": 0.5305,
926
  "Precise IF": 0.3125,
927
  "Math": 0.5902,
928
- "Safety": 0.9216,
929
  "Focus": 0.7455,
930
- "Ties": 0.4788,
931
- "Chat": 0.9302,
932
- "Chat Hard": 0.7719,
933
- "Reasoning": 0.912
934
  }
935
  },
936
  {
@@ -956,17 +956,17 @@
956
  "name": "Ray2333/GRM-llama3-8B-sftreg",
957
  "developer": "Ray2333",
958
  "scores": {
959
- "Score": 0.6089,
960
- "Chat": 0.986,
961
- "Chat Hard": 0.6776,
962
- "Safety": 0.7867,
963
- "Reasoning": 0.9229,
964
- "Prior Sets (0.5 weight)": 0.7309,
965
  "Factuality": 0.6189,
966
  "Precise IF": 0.3875,
967
  "Math": 0.5792,
 
968
  "Focus": 0.6828,
969
- "Ties": 0.5981
 
 
 
 
970
  }
971
  },
972
  {
@@ -1139,16 +1139,16 @@
1139
  "name": "Skywork/Skywork-Reward-Gemma-2-27B",
1140
  "developer": "Skywork",
1141
  "scores": {
1142
- "Score": 0.938,
 
 
 
 
1143
  "Factuality": 0.7368,
1144
  "Precise IF": 0.4031,
1145
  "Math": 0.7049,
1146
- "Safety": 0.9189,
1147
  "Focus": 0.9323,
1148
- "Ties": 0.8261,
1149
- "Chat": 0.9581,
1150
- "Chat Hard": 0.9145,
1151
- "Reasoning": 0.9606
1152
  }
1153
  },
1154
  {
@@ -1156,16 +1156,16 @@
1156
  "name": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2",
1157
  "developer": "Skywork",
1158
  "scores": {
1159
- "Score": 0.7531,
1160
- "Chat": 0.9609,
1161
- "Chat Hard": 0.8991,
1162
- "Safety": 0.9689,
1163
- "Reasoning": 0.9807,
1164
  "Factuality": 0.7674,
1165
  "Precise IF": 0.375,
1166
  "Math": 0.6721,
 
1167
  "Focus": 0.9172,
1168
- "Ties": 0.8182
 
 
 
1169
  }
1170
  },
1171
  {
@@ -1173,16 +1173,16 @@
1173
  "name": "Skywork/Skywork-Reward-Llama-3.1-8B",
1174
  "developer": "Skywork",
1175
  "scores": {
1176
- "Score": 0.7314,
1177
- "Chat": 0.9581,
1178
- "Chat Hard": 0.8728,
1179
- "Safety": 0.9333,
1180
- "Reasoning": 0.962,
1181
  "Factuality": 0.6989,
1182
  "Precise IF": 0.425,
1183
  "Math": 0.6284,
 
1184
  "Focus": 0.9616,
1185
- "Ties": 0.741
 
 
 
1186
  }
1187
  },
1188
  {
@@ -1305,16 +1305,16 @@
1305
  "name": "Skywork/Skywork-VL-Reward-7B",
1306
  "developer": "Skywork",
1307
  "scores": {
1308
- "Score": 0.9007,
 
 
 
 
1309
  "Factuality": 0.6063,
1310
  "Precise IF": 0.35,
1311
  "Math": 0.6339,
1312
- "Safety": 0.9108,
1313
  "Focus": 0.8909,
1314
- "Ties": 0.7586,
1315
- "Chat": 0.8994,
1316
- "Chat Hard": 0.875,
1317
- "Reasoning": 0.9176
1318
  }
1319
  },
1320
  {
@@ -1379,9 +1379,9 @@
1379
  "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...",
1380
  "developer": "AI2",
1381
  "scores": {
1382
- "Score": 0.7008,
1383
- "Chat": 0.9385,
1384
- "Chat Hard": 0.3882,
1385
  "Safety": 0.7757
1386
  }
1387
  },
@@ -1423,17 +1423,17 @@
1423
  "name": "allenai/Llama-3.1-70B-Instruct-RM-RB2",
1424
  "developer": "allenai",
1425
  "scores": {
1426
- "Score": 0.9021,
 
 
 
 
 
1427
  "Factuality": 0.8126,
1428
  "Precise IF": 0.4188,
1429
  "Math": 0.6995,
1430
- "Safety": 0.9095,
1431
  "Focus": 0.8646,
1432
- "Ties": 0.8835,
1433
- "Chat": 0.9665,
1434
- "Chat Hard": 0.8355,
1435
- "Reasoning": 0.8969,
1436
- "Prior Sets (0.5 weight)": 0.0
1437
  }
1438
  },
1439
  {
@@ -1459,17 +1459,17 @@
1459
  "name": "allenai/Llama-3.1-8B-Instruct-RM-RB2",
1460
  "developer": "allenai",
1461
  "scores": {
1462
- "Score": 0.8885,
 
 
 
 
 
1463
  "Factuality": 0.7432,
1464
  "Precise IF": 0.4437,
1465
  "Math": 0.6175,
1466
- "Safety": 0.8932,
1467
  "Focus": 0.9071,
1468
- "Ties": 0.7638,
1469
- "Chat": 0.9581,
1470
- "Chat Hard": 0.8158,
1471
- "Reasoning": 0.887,
1472
- "Prior Sets (0.5 weight)": 0.0
1473
  }
1474
  },
1475
  {
@@ -1477,17 +1477,17 @@
1477
  "name": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2",
1478
  "developer": "allenai",
1479
  "scores": {
1480
- "Score": 0.722,
1481
- "Chat": 0.9693,
1482
- "Chat Hard": 0.8268,
1483
- "Safety": 0.8689,
1484
- "Reasoning": 0.8583,
1485
- "Prior Sets (0.5 weight)": 0.0,
1486
  "Factuality": 0.8084,
1487
  "Precise IF": 0.3688,
1488
  "Math": 0.6776,
 
1489
  "Focus": 0.7778,
1490
- "Ties": 0.8308
 
 
 
 
1491
  }
1492
  },
1493
  {
@@ -1495,17 +1495,17 @@
1495
  "name": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2",
1496
  "developer": "allenai",
1497
  "scores": {
1498
- "Score": 0.687,
1499
- "Chat": 0.9553,
1500
- "Chat Hard": 0.761,
1501
- "Safety": 0.86,
1502
- "Reasoning": 0.7898,
1503
- "Prior Sets (0.5 weight)": 0.0,
1504
  "Factuality": 0.7516,
1505
  "Precise IF": 0.3875,
1506
  "Math": 0.6284,
 
1507
  "Focus": 0.8545,
1508
- "Ties": 0.6397
 
 
 
 
1509
  }
1510
  },
1511
  {
@@ -3784,16 +3784,16 @@
3784
  "name": "infly/INF-ORM-Llama3.1-70B",
3785
  "developer": "infly",
3786
  "scores": {
3787
- "Score": 0.7648,
3788
- "Chat": 0.9665,
3789
- "Chat Hard": 0.9101,
3790
- "Safety": 0.9644,
3791
- "Reasoning": 0.9912,
3792
  "Factuality": 0.7411,
3793
  "Precise IF": 0.4188,
3794
  "Math": 0.6995,
 
3795
  "Focus": 0.903,
3796
- "Ties": 0.8622
 
 
 
3797
  }
3798
  },
3799
  {
@@ -3835,16 +3835,16 @@
3835
  "name": "internlm/internlm2-7b-reward",
3836
  "developer": "internlm",
3837
  "scores": {
3838
- "Score": 0.8759,
 
 
 
 
3839
  "Factuality": 0.4211,
3840
  "Precise IF": 0.4,
3841
  "Math": 0.5628,
3842
- "Safety": 0.8716,
3843
  "Focus": 0.7051,
3844
- "Ties": 0.5164,
3845
- "Chat": 0.9916,
3846
- "Chat Hard": 0.6952,
3847
- "Reasoning": 0.9453
3848
  }
3849
  },
3850
  {
@@ -4014,16 +4014,16 @@
4014
  "name": "nicolinho/QRM-Gemma-2-27B",
4015
  "developer": "nicolinho",
4016
  "scores": {
4017
- "Score": 0.9444,
 
 
 
 
4018
  "Factuality": 0.7853,
4019
  "Precise IF": 0.3719,
4020
  "Math": 0.6995,
4021
- "Safety": 0.927,
4022
  "Focus": 0.9535,
4023
- "Ties": 0.8321,
4024
- "Chat": 0.9665,
4025
- "Chat Hard": 0.9013,
4026
- "Reasoning": 0.9826
4027
  }
4028
  },
4029
  {
@@ -4055,16 +4055,16 @@
4055
  "name": "nicolinho/QRM-Llama3.1-8B-v2",
4056
  "developer": "nicolinho",
4057
  "scores": {
4058
- "Score": 0.9314,
 
 
 
 
4059
  "Factuality": 0.6653,
4060
  "Precise IF": 0.4062,
4061
  "Math": 0.612,
4062
- "Safety": 0.9257,
4063
  "Focus": 0.8909,
4064
- "Ties": 0.7234,
4065
- "Chat": 0.9637,
4066
- "Chat Hard": 0.8684,
4067
- "Reasoning": 0.9677
4068
  }
4069
  },
4070
  {
@@ -4202,16 +4202,16 @@
4202
  "name": "GPT-4o 2024-08-06",
4203
  "developer": "OpenAI",
4204
  "scores": {
4205
- "Score": 0.6493,
4206
- "Chat": 0.9609,
4207
- "Chat Hard": 0.761,
4208
- "Safety": 0.8619,
4209
- "Reasoning": 0.8661,
4210
  "Factuality": 0.5684,
4211
  "Precise IF": 0.3312,
4212
  "Math": 0.623,
 
4213
  "Focus": 0.7293,
4214
- "Ties": 0.7819
 
 
 
4215
  }
4216
  },
4217
  {
@@ -4249,17 +4249,17 @@
4249
  "name": "openbmb/Eurus-RM-7b",
4250
  "developer": "openbmb",
4251
  "scores": {
4252
- "Score": 0.5806,
4253
- "Chat": 0.9804,
4254
- "Chat Hard": 0.6557,
4255
- "Safety": 0.6267,
4256
- "Reasoning": 0.8633,
4257
- "Prior Sets (0.5 weight)": 0.7172,
4258
  "Factuality": 0.6,
4259
  "Precise IF": 0.3438,
4260
  "Math": 0.5683,
 
4261
  "Focus": 0.7475,
4262
- "Ties": 0.5972
 
 
 
 
4263
  }
4264
  },
4265
  {
@@ -4370,17 +4370,17 @@
4370
  "name": "sfairXC/FsfairX-LLaMA3-RM-v0.1",
4371
  "developer": "sfairXC",
4372
  "scores": {
4373
- "Score": 0.6292,
4374
- "Chat": 0.9944,
4375
- "Chat Hard": 0.6513,
4376
- "Safety": 0.7667,
4377
- "Reasoning": 0.8644,
4378
- "Prior Sets (0.5 weight)": 0.7492,
4379
  "Factuality": 0.5916,
4380
  "Precise IF": 0.4188,
4381
  "Math": 0.6284,
 
4382
  "Focus": 0.7051,
4383
- "Ties": 0.6647
 
 
 
 
4384
  }
4385
  },
4386
  {
@@ -4492,17 +4492,17 @@
4492
  "name": "weqweasdas/RM-Gemma-2B",
4493
  "developer": "weqweasdas",
4494
  "scores": {
4495
- "Score": 0.3057,
4496
- "Chat": 0.9441,
4497
- "Chat Hard": 0.4079,
4498
- "Safety": 0.3311,
4499
- "Reasoning": 0.7637,
4500
- "Prior Sets (0.5 weight)": 0.6652,
4501
  "Factuality": 0.3705,
4502
  "Precise IF": 0.2812,
4503
  "Math": 0.4317,
 
4504
  "Focus": 0.2343,
4505
- "Ties": 0.1851
 
 
 
 
4506
  }
4507
  },
4508
  {
@@ -4541,17 +4541,17 @@
4541
  "name": "weqweasdas/RM-Mistral-7B",
4542
  "developer": "weqweasdas",
4543
  "scores": {
4544
- "Score": 0.596,
4545
- "Chat": 0.9665,
4546
- "Chat Hard": 0.6053,
4547
- "Safety": 0.6911,
4548
- "Reasoning": 0.7736,
4549
- "Prior Sets (0.5 weight)": 0.753,
4550
  "Factuality": 0.5937,
4551
  "Precise IF": 0.3438,
4552
  "Math": 0.5956,
 
4553
  "Focus": 0.7293,
4554
- "Ties": 0.6226
 
 
 
 
4555
  }
4556
  },
4557
  {
@@ -4559,17 +4559,17 @@
4559
  "name": "weqweasdas/hh_rlhf_rm_open_llama_3b",
4560
  "developer": "weqweasdas",
4561
  "scores": {
4562
- "Score": 0.2498,
4563
- "Chat": 0.8184,
4564
- "Chat Hard": 0.3728,
4565
- "Safety": 0.24,
4566
- "Reasoning": 0.3281,
4567
- "Prior Sets (0.5 weight)": 0.6564,
4568
  "Factuality": 0.3642,
4569
  "Precise IF": 0.275,
4570
  "Math": 0.3497,
 
4571
  "Focus": 0.2384,
4572
- "Ties": 0.0315
 
 
 
 
4573
  }
4574
  }
4575
  ]
 
453
  "name": "LxzGordon/URM-LLaMa-3.1-8B",
454
  "developer": "LxzGordon",
455
  "scores": {
456
+ "Score": 0.7394,
457
+ "Chat": 0.9553,
458
+ "Chat Hard": 0.8816,
459
+ "Safety": 0.9178,
460
+ "Reasoning": 0.9698,
461
  "Factuality": 0.6884,
462
  "Precise IF": 0.45,
463
  "Math": 0.6393,
 
464
  "Focus": 0.9758,
465
+ "Ties": 0.7653
 
 
 
466
  }
467
  },
468
  {
 
555
  "name": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1",
556
  "developer": "OpenAssistant",
557
  "scores": {
558
+ "Score": 0.2653,
559
+ "Chat": 0.9246,
560
+ "Chat Hard": 0.3728,
561
+ "Safety": 0.3289,
562
+ "Reasoning": 0.5855,
563
+ "Prior Sets (0.5 weight)": 0.6801,
564
  "Factuality": 0.3979,
565
  "Precise IF": 0.2875,
566
  "Math": 0.377,
 
567
  "Focus": 0.1535,
568
+ "Ties": 0.047
 
 
 
 
569
  }
570
  },
571
  {
 
573
  "name": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5",
574
  "developer": "OpenAssistant",
575
  "scores": {
576
+ "Score": 0.6901,
 
 
 
 
 
577
  "Factuality": 0.3179,
578
  "Precise IF": 0.2625,
579
  "Math": 0.3934,
580
+ "Safety": 0.6311,
581
  "Focus": 0.2707,
582
+ "Ties": 0.0198,
583
+ "Chat": 0.8855,
584
+ "Chat Hard": 0.4868,
585
+ "Reasoning": 0.7752,
586
+ "Prior Sets (0.5 weight)": 0.6533
587
  }
588
  },
589
  {
 
609
  "name": "PKU-Alignment/beaver-7b-v1.0-cost",
610
  "developer": "PKU-Alignment",
611
  "scores": {
612
+ "Score": 0.5798,
 
 
 
 
 
613
  "Factuality": 0.3263,
614
  "Precise IF": 0.2313,
615
  "Math": 0.3989,
616
+ "Safety": 0.7351,
617
  "Focus": 0.2939,
618
+ "Ties": -0.01,
619
+ "Chat": 0.6173,
620
+ "Chat Hard": 0.4232,
621
+ "Reasoning": 0.5482,
622
+ "Prior Sets (0.5 weight)": 0.57
623
  }
624
  },
625
  {
 
627
  "name": "PKU-Alignment/beaver-7b-v1.0-reward",
628
  "developer": "PKU-Alignment",
629
  "scores": {
630
+ "Score": 0.1606,
631
+ "Chat": 0.8184,
632
+ "Chat Hard": 0.2873,
633
+ "Safety": 0.1422,
634
+ "Reasoning": 0.346,
635
+ "Prior Sets (0.5 weight)": 0.5993,
636
  "Factuality": 0.2105,
637
  "Precise IF": 0.2938,
638
  "Math": 0.2623,
 
639
  "Focus": 0.0646,
640
+ "Ties": -0.01
 
 
 
 
641
  }
642
  },
643
  {
 
663
  "name": "PKU-Alignment/beaver-7b-v2.0-reward",
664
  "developer": "PKU-Alignment",
665
  "scores": {
666
+ "Score": 0.2544,
667
+ "Chat": 0.8994,
668
+ "Chat Hard": 0.364,
669
+ "Safety": 0.3156,
670
+ "Reasoning": 0.6887,
671
+ "Prior Sets (0.5 weight)": 0.6171,
672
  "Factuality": 0.2168,
673
  "Precise IF": 0.2562,
674
  "Math": 0.3825,
 
675
  "Focus": 0.2606,
676
+ "Ties": 0.0944
 
 
 
 
677
  }
678
  },
679
  {
 
921
  "name": "Ray2333/GRM-gemma2-2B-rewardmodel-ft",
922
  "developer": "Ray2333",
923
  "scores": {
924
+ "Score": 0.5966,
925
+ "Chat": 0.9302,
926
+ "Chat Hard": 0.7719,
927
+ "Safety": 0.9222,
928
+ "Reasoning": 0.912,
929
  "Factuality": 0.5305,
930
  "Precise IF": 0.3125,
931
  "Math": 0.5902,
 
932
  "Focus": 0.7455,
933
+ "Ties": 0.4788
 
 
 
934
  }
935
  },
936
  {
 
956
  "name": "Ray2333/GRM-llama3-8B-sftreg",
957
  "developer": "Ray2333",
958
  "scores": {
959
+ "Score": 0.8542,
 
 
 
 
 
960
  "Factuality": 0.6189,
961
  "Precise IF": 0.3875,
962
  "Math": 0.5792,
963
+ "Safety": 0.8919,
964
  "Focus": 0.6828,
965
+ "Ties": 0.5981,
966
+ "Chat": 0.986,
967
+ "Chat Hard": 0.6776,
968
+ "Reasoning": 0.9229,
969
+ "Prior Sets (0.5 weight)": 0.7309
970
  }
971
  },
972
  {
 
1139
  "name": "Skywork/Skywork-Reward-Gemma-2-27B",
1140
  "developer": "Skywork",
1141
  "scores": {
1142
+ "Score": 0.7576,
1143
+ "Chat": 0.9581,
1144
+ "Chat Hard": 0.9145,
1145
+ "Safety": 0.9422,
1146
+ "Reasoning": 0.9606,
1147
  "Factuality": 0.7368,
1148
  "Precise IF": 0.4031,
1149
  "Math": 0.7049,
 
1150
  "Focus": 0.9323,
1151
+ "Ties": 0.8261
 
 
 
1152
  }
1153
  },
1154
  {
 
1156
  "name": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2",
1157
  "developer": "Skywork",
1158
  "scores": {
1159
+ "Score": 0.9426,
 
 
 
 
1160
  "Factuality": 0.7674,
1161
  "Precise IF": 0.375,
1162
  "Math": 0.6721,
1163
+ "Safety": 0.9297,
1164
  "Focus": 0.9172,
1165
+ "Ties": 0.8182,
1166
+ "Chat": 0.9609,
1167
+ "Chat Hard": 0.8991,
1168
+ "Reasoning": 0.9807
1169
  }
1170
  },
1171
  {
 
1173
  "name": "Skywork/Skywork-Reward-Llama-3.1-8B",
1174
  "developer": "Skywork",
1175
  "scores": {
1176
+ "Score": 0.9252,
 
 
 
 
1177
  "Factuality": 0.6989,
1178
  "Precise IF": 0.425,
1179
  "Math": 0.6284,
1180
+ "Safety": 0.9081,
1181
  "Focus": 0.9616,
1182
+ "Ties": 0.741,
1183
+ "Chat": 0.9581,
1184
+ "Chat Hard": 0.8728,
1185
+ "Reasoning": 0.962
1186
  }
1187
  },
1188
  {
 
1305
  "name": "Skywork/Skywork-VL-Reward-7B",
1306
  "developer": "Skywork",
1307
  "scores": {
1308
+ "Score": 0.6885,
1309
+ "Chat": 0.8994,
1310
+ "Chat Hard": 0.875,
1311
+ "Safety": 0.8911,
1312
+ "Reasoning": 0.9176,
1313
  "Factuality": 0.6063,
1314
  "Precise IF": 0.35,
1315
  "Math": 0.6339,
 
1316
  "Focus": 0.8909,
1317
+ "Ties": 0.7586
 
 
 
1318
  }
1319
  },
1320
  {
 
1379
  "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...",
1380
  "developer": "AI2",
1381
  "scores": {
1382
+ "Score": 0.6924,
1383
+ "Chat": 0.9441,
1384
+ "Chat Hard": 0.3575,
1385
  "Safety": 0.7757
1386
  }
1387
  },
 
1423
  "name": "allenai/Llama-3.1-70B-Instruct-RM-RB2",
1424
  "developer": "allenai",
1425
  "scores": {
1426
+ "Score": 0.7606,
1427
+ "Chat": 0.9665,
1428
+ "Chat Hard": 0.8355,
1429
+ "Safety": 0.8844,
1430
+ "Reasoning": 0.8969,
1431
+ "Prior Sets (0.5 weight)": 0.0,
1432
  "Factuality": 0.8126,
1433
  "Precise IF": 0.4188,
1434
  "Math": 0.6995,
 
1435
  "Focus": 0.8646,
1436
+ "Ties": 0.8835
 
 
 
 
1437
  }
1438
  },
1439
  {
 
1459
  "name": "allenai/Llama-3.1-8B-Instruct-RM-RB2",
1460
  "developer": "allenai",
1461
  "scores": {
1462
+ "Score": 0.7285,
1463
+ "Chat": 0.9581,
1464
+ "Chat Hard": 0.8158,
1465
+ "Safety": 0.8956,
1466
+ "Reasoning": 0.887,
1467
+ "Prior Sets (0.5 weight)": 0.0,
1468
  "Factuality": 0.7432,
1469
  "Precise IF": 0.4437,
1470
  "Math": 0.6175,
 
1471
  "Focus": 0.9071,
1472
+ "Ties": 0.7638
 
 
 
 
1473
  }
1474
  },
1475
  {
 
1477
  "name": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2",
1478
  "developer": "allenai",
1479
  "scores": {
1480
+ "Score": 0.8892,
 
 
 
 
 
1481
  "Factuality": 0.8084,
1482
  "Precise IF": 0.3688,
1483
  "Math": 0.6776,
1484
+ "Safety": 0.9027,
1485
  "Focus": 0.7778,
1486
+ "Ties": 0.8308,
1487
+ "Chat": 0.9693,
1488
+ "Chat Hard": 0.8268,
1489
+ "Reasoning": 0.8583,
1490
+ "Prior Sets (0.5 weight)": 0.0
1491
  }
1492
  },
1493
  {
 
1495
  "name": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2",
1496
  "developer": "allenai",
1497
  "scores": {
1498
+ "Score": 0.8431,
 
 
 
 
 
1499
  "Factuality": 0.7516,
1500
  "Precise IF": 0.3875,
1501
  "Math": 0.6284,
1502
+ "Safety": 0.8662,
1503
  "Focus": 0.8545,
1504
+ "Ties": 0.6397,
1505
+ "Chat": 0.9553,
1506
+ "Chat Hard": 0.761,
1507
+ "Reasoning": 0.7898,
1508
+ "Prior Sets (0.5 weight)": 0.0
1509
  }
1510
  },
1511
  {
 
3784
  "name": "infly/INF-ORM-Llama3.1-70B",
3785
  "developer": "infly",
3786
  "scores": {
3787
+ "Score": 0.9511,
 
 
 
 
3788
  "Factuality": 0.7411,
3789
  "Precise IF": 0.4188,
3790
  "Math": 0.6995,
3791
+ "Safety": 0.9365,
3792
  "Focus": 0.903,
3793
+ "Ties": 0.8622,
3794
+ "Chat": 0.9665,
3795
+ "Chat Hard": 0.9101,
3796
+ "Reasoning": 0.9912
3797
  }
3798
  },
3799
  {
 
3835
  "name": "internlm/internlm2-7b-reward",
3836
  "developer": "internlm",
3837
  "scores": {
3838
+ "Score": 0.5335,
3839
+ "Chat": 0.9916,
3840
+ "Chat Hard": 0.6952,
3841
+ "Safety": 0.5956,
3842
+ "Reasoning": 0.9453,
3843
  "Factuality": 0.4211,
3844
  "Precise IF": 0.4,
3845
  "Math": 0.5628,
 
3846
  "Focus": 0.7051,
3847
+ "Ties": 0.5164
 
 
 
3848
  }
3849
  },
3850
  {
 
4014
  "name": "nicolinho/QRM-Gemma-2-27B",
4015
  "developer": "nicolinho",
4016
  "scores": {
4017
+ "Score": 0.7667,
4018
+ "Chat": 0.9665,
4019
+ "Chat Hard": 0.9013,
4020
+ "Safety": 0.9578,
4021
+ "Reasoning": 0.9826,
4022
  "Factuality": 0.7853,
4023
  "Precise IF": 0.3719,
4024
  "Math": 0.6995,
 
4025
  "Focus": 0.9535,
4026
+ "Ties": 0.8321
 
 
 
4027
  }
4028
  },
4029
  {
 
4055
  "name": "nicolinho/QRM-Llama3.1-8B-v2",
4056
  "developer": "nicolinho",
4057
  "scores": {
4058
+ "Score": 0.7074,
4059
+ "Chat": 0.9637,
4060
+ "Chat Hard": 0.8684,
4061
+ "Safety": 0.9467,
4062
+ "Reasoning": 0.9677,
4063
  "Factuality": 0.6653,
4064
  "Precise IF": 0.4062,
4065
  "Math": 0.612,
 
4066
  "Focus": 0.8909,
4067
+ "Ties": 0.7234
 
 
 
4068
  }
4069
  },
4070
  {
 
4202
  "name": "GPT-4o 2024-08-06",
4203
  "developer": "OpenAI",
4204
  "scores": {
4205
+ "Score": 0.8673,
 
 
 
 
4206
  "Factuality": 0.5684,
4207
  "Precise IF": 0.3312,
4208
  "Math": 0.623,
4209
+ "Safety": 0.8811,
4210
  "Focus": 0.7293,
4211
+ "Ties": 0.7819,
4212
+ "Chat": 0.9609,
4213
+ "Chat Hard": 0.761,
4214
+ "Reasoning": 0.8661
4215
  }
4216
  },
4217
  {
 
4249
  "name": "openbmb/Eurus-RM-7b",
4250
  "developer": "openbmb",
4251
  "scores": {
4252
+ "Score": 0.8159,
 
 
 
 
 
4253
  "Factuality": 0.6,
4254
  "Precise IF": 0.3438,
4255
  "Math": 0.5683,
4256
+ "Safety": 0.8135,
4257
  "Focus": 0.7475,
4258
+ "Ties": 0.5972,
4259
+ "Chat": 0.9804,
4260
+ "Chat Hard": 0.6557,
4261
+ "Reasoning": 0.8633,
4262
+ "Prior Sets (0.5 weight)": 0.7172
4263
  }
4264
  },
4265
  {
 
4370
  "name": "sfairXC/FsfairX-LLaMA3-RM-v0.1",
4371
  "developer": "sfairXC",
4372
  "scores": {
4373
+ "Score": 0.8338,
 
 
 
 
 
4374
  "Factuality": 0.5916,
4375
  "Precise IF": 0.4188,
4376
  "Math": 0.6284,
4377
+ "Safety": 0.8676,
4378
  "Focus": 0.7051,
4379
+ "Ties": 0.6647,
4380
+ "Chat": 0.9944,
4381
+ "Chat Hard": 0.6513,
4382
+ "Reasoning": 0.8644,
4383
+ "Prior Sets (0.5 weight)": 0.7492
4384
  }
4385
  },
4386
  {
 
4492
  "name": "weqweasdas/RM-Gemma-2B",
4493
  "developer": "weqweasdas",
4494
  "scores": {
4495
+ "Score": 0.6549,
 
 
 
 
 
4496
  "Factuality": 0.3705,
4497
  "Precise IF": 0.2812,
4498
  "Math": 0.4317,
4499
+ "Safety": 0.4986,
4500
  "Focus": 0.2343,
4501
+ "Ties": 0.1851,
4502
+ "Chat": 0.9441,
4503
+ "Chat Hard": 0.4079,
4504
+ "Reasoning": 0.7637,
4505
+ "Prior Sets (0.5 weight)": 0.6652
4506
  }
4507
  },
4508
  {
 
4541
  "name": "weqweasdas/RM-Mistral-7B",
4542
  "developer": "weqweasdas",
4543
  "scores": {
4544
+ "Score": 0.7982,
 
 
 
 
 
4545
  "Factuality": 0.5937,
4546
  "Precise IF": 0.3438,
4547
  "Math": 0.5956,
4548
+ "Safety": 0.8703,
4549
  "Focus": 0.7293,
4550
+ "Ties": 0.6226,
4551
+ "Chat": 0.9665,
4552
+ "Chat Hard": 0.6053,
4553
+ "Reasoning": 0.7736,
4554
+ "Prior Sets (0.5 weight)": 0.753
4555
  }
4556
  },
4557
  {
 
4559
  "name": "weqweasdas/hh_rlhf_rm_open_llama_3b",
4560
  "developer": "weqweasdas",
4561
  "scores": {
4562
+ "Score": 0.5027,
 
 
 
 
 
4563
  "Factuality": 0.3642,
4564
  "Precise IF": 0.275,
4565
  "Math": 0.3497,
4566
+ "Safety": 0.4149,
4567
  "Focus": 0.2384,
4568
+ "Ties": 0.0315,
4569
+ "Chat": 0.8184,
4570
+ "Chat Hard": 0.3728,
4571
+ "Reasoning": 0.3281,
4572
+ "Prior Sets (0.5 weight)": 0.6564
4573
  }
4574
  }
4575
  ]
data/benchmarks/swe-bench.json CHANGED
@@ -5,7 +5,7 @@
5
  "name": "claude-opus-4-5",
6
  "developer": "Anthropic",
7
  "scores": {
8
- "swe-bench": 0.65
9
  }
10
  },
11
  {
@@ -13,7 +13,7 @@
13
  "name": "gemini-3-pro-preview",
14
  "developer": "Google",
15
  "scores": {
16
- "swe-bench": 0.7234
17
  }
18
  },
19
  {
 
5
  "name": "claude-opus-4-5",
6
  "developer": "Anthropic",
7
  "scores": {
8
+ "swe-bench": 0.6061
9
  }
10
  },
11
  {
 
13
  "name": "gemini-3-pro-preview",
14
  "developer": "Google",
15
  "scores": {
16
+ "swe-bench": 0.71
17
  }
18
  },
19
  {
data/benchmarks/tau-bench-2_airline.json CHANGED
@@ -5,7 +5,7 @@
5
  "name": "claude-opus-4-5",
6
  "developer": "Anthropic",
7
  "scores": {
8
- "tau-bench-2/airline": 0.72
9
  }
10
  },
11
  {
@@ -13,7 +13,7 @@
13
  "name": "gemini-3-pro-preview",
14
  "developer": "Google",
15
  "scores": {
16
- "tau-bench-2/airline": 0.7
17
  }
18
  },
19
  {
 
5
  "name": "claude-opus-4-5",
6
  "developer": "Anthropic",
7
  "scores": {
8
+ "tau-bench-2/airline": 0.66
9
  }
10
  },
11
  {
 
13
  "name": "gemini-3-pro-preview",
14
  "developer": "Google",
15
  "scores": {
16
+ "tau-bench-2/airline": 0.68
17
  }
18
  },
19
  {
data/benchmarks/tau-bench-2_retail.json CHANGED
@@ -21,7 +21,7 @@
21
  "name": "gpt-5.2-2025-12-11",
22
  "developer": "OpenAI",
23
  "scores": {
24
- "tau-bench-2/retail": 0.68
25
  }
26
  }
27
  ]
 
21
  "name": "gpt-5.2-2025-12-11",
22
  "developer": "OpenAI",
23
  "scores": {
24
+ "tau-bench-2/retail": 0.73
25
  }
26
  }
27
  ]
data/benchmarks/tau-bench-2_telecom.json CHANGED
@@ -5,7 +5,7 @@
5
  "name": "claude-opus-4-5",
6
  "developer": "Anthropic",
7
  "scores": {
8
- "tau-bench-2/telecom": 0.76
9
  }
10
  },
11
  {
@@ -21,7 +21,7 @@
21
  "name": "gpt-5.2-2025-12-11",
22
  "developer": "OpenAI",
23
  "scores": {
24
- "tau-bench-2/telecom": 0.5354
25
  }
26
  }
27
  ]
 
5
  "name": "claude-opus-4-5",
6
  "developer": "Anthropic",
7
  "scores": {
8
+ "tau-bench-2/telecom": 0.84
9
  }
10
  },
11
  {
 
21
  "name": "gpt-5.2-2025-12-11",
22
  "developer": "OpenAI",
23
  "scores": {
24
+ "tau-bench-2/telecom": 0.71
25
  }
26
  }
27
  ]
data/benchmarks/terminal-bench-2.0.json CHANGED
@@ -21,7 +21,7 @@
21
  "name": "Claude Opus 4.1",
22
  "developer": "Anthropic",
23
  "scores": {
24
- "terminal-bench-2.0": 38.0
25
  }
26
  },
27
  {
@@ -29,7 +29,7 @@
29
  "name": "Claude Opus 4.5",
30
  "developer": "Anthropic",
31
  "scores": {
32
- "terminal-bench-2.0": 59.1
33
  }
34
  },
35
  {
@@ -37,7 +37,7 @@
37
  "name": "Claude Opus 4.6",
38
  "developer": "Anthropic",
39
  "scores": {
40
- "terminal-bench-2.0": 58.0
41
  }
42
  },
43
  {
@@ -45,7 +45,7 @@
45
  "name": "Claude Sonnet 4.5",
46
  "developer": "Anthropic",
47
  "scores": {
48
- "terminal-bench-2.0": 43.1
49
  }
50
  },
51
  {
@@ -61,7 +61,7 @@
61
  "name": "Gemini 2.5 Flash",
62
  "developer": "Google",
63
  "scores": {
64
- "terminal-bench-2.0": 17.1
65
  }
66
  },
67
  {
@@ -77,7 +77,7 @@
77
  "name": "Gemini 3 Flash",
78
  "developer": "Google",
79
  "scores": {
80
- "terminal-bench-2.0": 51.0
81
  }
82
  },
83
  {
@@ -109,7 +109,7 @@
109
  "name": "MiniMax M2.1",
110
  "developer": "MiniMax",
111
  "scores": {
112
- "terminal-bench-2.0": 29.2
113
  }
114
  },
115
  {
@@ -125,7 +125,7 @@
125
  "name": "Kimi K2 Instruct",
126
  "developer": "Moonshot AI",
127
  "scores": {
128
- "terminal-bench-2.0": 26.7
129
  }
130
  },
131
  {
@@ -149,7 +149,7 @@
149
  "name": "Multiple",
150
  "developer": "Multiple",
151
  "scores": {
152
- "terminal-bench-2.0": 71.0
153
  }
154
  },
155
  {
@@ -157,7 +157,7 @@
157
  "name": "GPT-5",
158
  "developer": "OpenAI",
159
  "scores": {
160
- "terminal-bench-2.0": 35.2
161
  }
162
  },
163
  {
@@ -165,7 +165,7 @@
165
  "name": "GPT-5-Codex",
166
  "developer": "OpenAI",
167
  "scores": {
168
- "terminal-bench-2.0": 44.3
169
  }
170
  },
171
  {
@@ -173,7 +173,7 @@
173
  "name": "GPT-5-Mini",
174
  "developer": "OpenAI",
175
  "scores": {
176
- "terminal-bench-2.0": 34.8
177
  }
178
  },
179
  {
@@ -181,7 +181,7 @@
181
  "name": "GPT-5-Nano",
182
  "developer": "OpenAI",
183
  "scores": {
184
- "terminal-bench-2.0": 9.9
185
  }
186
  },
187
  {
@@ -197,7 +197,7 @@
197
  "name": "GPT-5.1-Codex",
198
  "developer": "OpenAI",
199
  "scores": {
200
- "terminal-bench-2.0": 53.5
201
  }
202
  },
203
  {
@@ -221,7 +221,7 @@
221
  "name": "GPT-5.2",
222
  "developer": "OpenAI",
223
  "scores": {
224
- "terminal-bench-2.0": 60.7
225
  }
226
  },
227
  {
@@ -237,7 +237,7 @@
237
  "name": "GPT-5.3-Codex",
238
  "developer": "OpenAI",
239
  "scores": {
240
- "terminal-bench-2.0": 64.7
241
  }
242
  },
243
  {
@@ -245,7 +245,7 @@
245
  "name": "GPT-OSS-120B",
246
  "developer": "OpenAI",
247
  "scores": {
248
- "terminal-bench-2.0": 14.2
249
  }
250
  },
251
  {
@@ -253,7 +253,7 @@
253
  "name": "GPT-OSS-20B",
254
  "developer": "OpenAI",
255
  "scores": {
256
- "terminal-bench-2.0": 3.1
257
  }
258
  },
259
  {
@@ -261,7 +261,7 @@
261
  "name": "Grok 4",
262
  "developer": "xAI",
263
  "scores": {
264
- "terminal-bench-2.0": 25.4
265
  }
266
  },
267
  {
@@ -269,7 +269,7 @@
269
  "name": "Grok Code Fast 1",
270
  "developer": "xAI",
271
  "scores": {
272
- "terminal-bench-2.0": 25.8
273
  }
274
  },
275
  {
 
21
  "name": "Claude Opus 4.1",
22
  "developer": "Anthropic",
23
  "scores": {
24
+ "terminal-bench-2.0": 35.1
25
  }
26
  },
27
  {
 
29
  "name": "Claude Opus 4.5",
30
  "developer": "Anthropic",
31
  "scores": {
32
+ "terminal-bench-2.0": 52.1
33
  }
34
  },
35
  {
 
37
  "name": "Claude Opus 4.6",
38
  "developer": "Anthropic",
39
  "scores": {
40
+ "terminal-bench-2.0": 62.9
41
  }
42
  },
43
  {
 
45
  "name": "Claude Sonnet 4.5",
46
  "developer": "Anthropic",
47
  "scores": {
48
+ "terminal-bench-2.0": 42.6
49
  }
50
  },
51
  {
 
61
  "name": "Gemini 2.5 Flash",
62
  "developer": "Google",
63
  "scores": {
64
+ "terminal-bench-2.0": 16.9
65
  }
66
  },
67
  {
 
77
  "name": "Gemini 3 Flash",
78
  "developer": "Google",
79
  "scores": {
80
+ "terminal-bench-2.0": 47.4
81
  }
82
  },
83
  {
 
109
  "name": "MiniMax M2.1",
110
  "developer": "MiniMax",
111
  "scores": {
112
+ "terminal-bench-2.0": 36.6
113
  }
114
  },
115
  {
 
125
  "name": "Kimi K2 Instruct",
126
  "developer": "Moonshot AI",
127
  "scores": {
128
+ "terminal-bench-2.0": 27.8
129
  }
130
  },
131
  {
 
149
  "name": "Multiple",
150
  "developer": "Multiple",
151
  "scores": {
152
+ "terminal-bench-2.0": 72.4
153
  }
154
  },
155
  {
 
157
  "name": "GPT-5",
158
  "developer": "OpenAI",
159
  "scores": {
160
+ "terminal-bench-2.0": 49.6
161
  }
162
  },
163
  {
 
165
  "name": "GPT-5-Codex",
166
  "developer": "OpenAI",
167
  "scores": {
168
+ "terminal-bench-2.0": 43.4
169
  }
170
  },
171
  {
 
173
  "name": "GPT-5-Mini",
174
  "developer": "OpenAI",
175
  "scores": {
176
+ "terminal-bench-2.0": 24.0
177
  }
178
  },
179
  {
 
181
  "name": "GPT-5-Nano",
182
  "developer": "OpenAI",
183
  "scores": {
184
+ "terminal-bench-2.0": 11.5
185
  }
186
  },
187
  {
 
197
  "name": "GPT-5.1-Codex",
198
  "developer": "OpenAI",
199
  "scores": {
200
+ "terminal-bench-2.0": 57.8
201
  }
202
  },
203
  {
 
221
  "name": "GPT-5.2",
222
  "developer": "OpenAI",
223
  "scores": {
224
+ "terminal-bench-2.0": 62.9
225
  }
226
  },
227
  {
 
237
  "name": "GPT-5.3-Codex",
238
  "developer": "OpenAI",
239
  "scores": {
240
+ "terminal-bench-2.0": 77.3
241
  }
242
  },
243
  {
 
245
  "name": "GPT-OSS-120B",
246
  "developer": "OpenAI",
247
  "scores": {
248
+ "terminal-bench-2.0": 18.7
249
  }
250
  },
251
  {
 
253
  "name": "GPT-OSS-20B",
254
  "developer": "OpenAI",
255
  "scores": {
256
+ "terminal-bench-2.0": 3.4
257
  }
258
  },
259
  {
 
261
  "name": "Grok 4",
262
  "developer": "xAI",
263
  "scores": {
264
+ "terminal-bench-2.0": 23.1
265
  }
266
  },
267
  {
 
269
  "name": "Grok Code Fast 1",
270
  "developer": "xAI",
271
  "scores": {
272
+ "terminal-bench-2.0": 14.2
273
  }
274
  },
275
  {
data/benchmarks/theory_of_mind.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "models": [
3
+ {
4
+ "model_id": "Qwen/Qwen2.5-3B-Instruct",
5
+ "name": "Qwen2.5-3B-Instruct",
6
+ "developer": "Qwen",
7
+ "scores": {
8
+ "accuracy on theory_of_mind for scorer model_graded_fact": 0.78
9
+ }
10
+ }
11
+ ]
12
+ }
data/developers.json CHANGED
@@ -1917,7 +1917,7 @@
1917
  },
1918
  {
1919
  "developer": "NousResearch",
1920
- "model_count": 19
1921
  },
1922
  {
1923
  "developer": "Novaciano",
 
1917
  },
1918
  {
1919
  "developer": "NousResearch",
1920
+ "model_count": 18
1921
  },
1922
  {
1923
  "developer": "Novaciano",
data/developers/adriszmar.json CHANGED
@@ -7,12 +7,12 @@
7
  "developer": "adriszmar",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.1685,
11
- "hfopenllm_v2/BBH": 0.3124,
12
- "hfopenllm_v2/MATH Level 5": 0.0015,
13
- "hfopenllm_v2/GPQA": 0.2492,
14
- "hfopenllm_v2/MUSR": 0.3963,
15
- "hfopenllm_v2/MMLU-PRO": 0.1066
16
  }
17
  }
18
  ]
 
7
  "developer": "adriszmar",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.1746,
11
+ "hfopenllm_v2/BBH": 0.3126,
12
+ "hfopenllm_v2/MATH Level 5": 0.0,
13
+ "hfopenllm_v2/GPQA": 0.245,
14
+ "hfopenllm_v2/MUSR": 0.4096,
15
+ "hfopenllm_v2/MMLU-PRO": 0.1087
16
  }
17
  }
18
  ]
data/developers/ai2.json CHANGED
@@ -43,9 +43,9 @@
43
  "developer": "AI2",
44
  "evaluator_relationship": null,
45
  "benchmark_scores": {
46
- "reward-bench/Score": 0.7008,
47
- "reward-bench/Chat": 0.9385,
48
- "reward-bench/Chat Hard": 0.3882,
49
  "reward-bench/Safety": 0.7757
50
  }
51
  },
 
43
  "developer": "AI2",
44
  "evaluator_relationship": null,
45
  "benchmark_scores": {
46
+ "reward-bench/Score": 0.6924,
47
+ "reward-bench/Chat": 0.9441,
48
+ "reward-bench/Chat Hard": 0.3575,
49
  "reward-bench/Safety": 0.7757
50
  }
51
  },
data/developers/akjindal53244.json CHANGED
@@ -7,12 +7,12 @@
7
  "developer": "akjindal53244",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.8051,
11
- "hfopenllm_v2/BBH": 0.5189,
12
- "hfopenllm_v2/MATH Level 5": 0.1722,
13
- "hfopenllm_v2/GPQA": 0.3263,
14
  "hfopenllm_v2/MUSR": 0.4028,
15
- "hfopenllm_v2/MMLU-PRO": 0.3803
16
  }
17
  }
18
  ]
 
7
  "developer": "akjindal53244",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.8033,
11
+ "hfopenllm_v2/BBH": 0.5196,
12
+ "hfopenllm_v2/MATH Level 5": 0.1624,
13
+ "hfopenllm_v2/GPQA": 0.3096,
14
  "hfopenllm_v2/MUSR": 0.4028,
15
+ "hfopenllm_v2/MMLU-PRO": 0.3812
16
  }
17
  }
18
  ]
data/developers/allenai.json CHANGED
@@ -63,17 +63,17 @@
63
  "developer": "allenai",
64
  "evaluator_relationship": null,
65
  "benchmark_scores": {
66
- "reward-bench/Score": 0.9021,
 
 
 
 
 
67
  "reward-bench/Factuality": 0.8126,
68
  "reward-bench/Precise IF": 0.4188,
69
  "reward-bench/Math": 0.6995,
70
- "reward-bench/Safety": 0.9095,
71
  "reward-bench/Focus": 0.8646,
72
- "reward-bench/Ties": 0.8835,
73
- "reward-bench/Chat": 0.9665,
74
- "reward-bench/Chat Hard": 0.8355,
75
- "reward-bench/Reasoning": 0.8969,
76
- "reward-bench/Prior Sets (0.5 weight)": 0.0
77
  }
78
  },
79
  {
@@ -101,17 +101,17 @@
101
  "developer": "allenai",
102
  "evaluator_relationship": null,
103
  "benchmark_scores": {
104
- "reward-bench/Score": 0.8885,
 
 
 
 
 
105
  "reward-bench/Factuality": 0.7432,
106
  "reward-bench/Precise IF": 0.4437,
107
  "reward-bench/Math": 0.6175,
108
- "reward-bench/Safety": 0.8932,
109
  "reward-bench/Focus": 0.9071,
110
- "reward-bench/Ties": 0.7638,
111
- "reward-bench/Chat": 0.9581,
112
- "reward-bench/Chat Hard": 0.8158,
113
- "reward-bench/Reasoning": 0.887,
114
- "reward-bench/Prior Sets (0.5 weight)": 0.0
115
  }
116
  },
117
  {
@@ -120,12 +120,12 @@
120
  "developer": "allenai",
121
  "evaluator_relationship": null,
122
  "benchmark_scores": {
123
- "hfopenllm_v2/IFEval": 0.8379,
124
- "hfopenllm_v2/BBH": 0.6157,
125
- "hfopenllm_v2/MATH Level 5": 0.3829,
126
  "hfopenllm_v2/GPQA": 0.3733,
127
- "hfopenllm_v2/MUSR": 0.4988,
128
- "hfopenllm_v2/MMLU-PRO": 0.4656
129
  }
130
  },
131
  {
@@ -162,17 +162,17 @@
162
  "developer": "allenai",
163
  "evaluator_relationship": null,
164
  "benchmark_scores": {
165
- "reward-bench/Score": 0.722,
166
- "reward-bench/Chat": 0.9693,
167
- "reward-bench/Chat Hard": 0.8268,
168
- "reward-bench/Safety": 0.8689,
169
- "reward-bench/Reasoning": 0.8583,
170
- "reward-bench/Prior Sets (0.5 weight)": 0.0,
171
  "reward-bench/Factuality": 0.8084,
172
  "reward-bench/Precise IF": 0.3688,
173
  "reward-bench/Math": 0.6776,
 
174
  "reward-bench/Focus": 0.7778,
175
- "reward-bench/Ties": 0.8308
 
 
 
 
176
  }
177
  },
178
  {
@@ -209,17 +209,17 @@
209
  "developer": "allenai",
210
  "evaluator_relationship": null,
211
  "benchmark_scores": {
212
- "reward-bench/Score": 0.687,
213
- "reward-bench/Chat": 0.9553,
214
- "reward-bench/Chat Hard": 0.761,
215
- "reward-bench/Safety": 0.86,
216
- "reward-bench/Reasoning": 0.7898,
217
- "reward-bench/Prior Sets (0.5 weight)": 0.0,
218
  "reward-bench/Factuality": 0.7516,
219
  "reward-bench/Precise IF": 0.3875,
220
  "reward-bench/Math": 0.6284,
 
221
  "reward-bench/Focus": 0.8545,
222
- "reward-bench/Ties": 0.6397
 
 
 
 
223
  }
224
  },
225
  {
 
63
  "developer": "allenai",
64
  "evaluator_relationship": null,
65
  "benchmark_scores": {
66
+ "reward-bench/Score": 0.7606,
67
+ "reward-bench/Chat": 0.9665,
68
+ "reward-bench/Chat Hard": 0.8355,
69
+ "reward-bench/Safety": 0.8844,
70
+ "reward-bench/Reasoning": 0.8969,
71
+ "reward-bench/Prior Sets (0.5 weight)": 0.0,
72
  "reward-bench/Factuality": 0.8126,
73
  "reward-bench/Precise IF": 0.4188,
74
  "reward-bench/Math": 0.6995,
 
75
  "reward-bench/Focus": 0.8646,
76
+ "reward-bench/Ties": 0.8835
 
 
 
 
77
  }
78
  },
79
  {
 
101
  "developer": "allenai",
102
  "evaluator_relationship": null,
103
  "benchmark_scores": {
104
+ "reward-bench/Score": 0.7285,
105
+ "reward-bench/Chat": 0.9581,
106
+ "reward-bench/Chat Hard": 0.8158,
107
+ "reward-bench/Safety": 0.8956,
108
+ "reward-bench/Reasoning": 0.887,
109
+ "reward-bench/Prior Sets (0.5 weight)": 0.0,
110
  "reward-bench/Factuality": 0.7432,
111
  "reward-bench/Precise IF": 0.4437,
112
  "reward-bench/Math": 0.6175,
 
113
  "reward-bench/Focus": 0.9071,
114
+ "reward-bench/Ties": 0.7638
 
 
 
 
115
  }
116
  },
117
  {
 
120
  "developer": "allenai",
121
  "evaluator_relationship": null,
122
  "benchmark_scores": {
123
+ "hfopenllm_v2/IFEval": 0.8291,
124
+ "hfopenllm_v2/BBH": 0.6164,
125
+ "hfopenllm_v2/MATH Level 5": 0.4502,
126
  "hfopenllm_v2/GPQA": 0.3733,
127
+ "hfopenllm_v2/MUSR": 0.4948,
128
+ "hfopenllm_v2/MMLU-PRO": 0.4645
129
  }
130
  },
131
  {
 
162
  "developer": "allenai",
163
  "evaluator_relationship": null,
164
  "benchmark_scores": {
165
+ "reward-bench/Score": 0.8892,
 
 
 
 
 
166
  "reward-bench/Factuality": 0.8084,
167
  "reward-bench/Precise IF": 0.3688,
168
  "reward-bench/Math": 0.6776,
169
+ "reward-bench/Safety": 0.9027,
170
  "reward-bench/Focus": 0.7778,
171
+ "reward-bench/Ties": 0.8308,
172
+ "reward-bench/Chat": 0.9693,
173
+ "reward-bench/Chat Hard": 0.8268,
174
+ "reward-bench/Reasoning": 0.8583,
175
+ "reward-bench/Prior Sets (0.5 weight)": 0.0
176
  }
177
  },
178
  {
 
209
  "developer": "allenai",
210
  "evaluator_relationship": null,
211
  "benchmark_scores": {
212
+ "reward-bench/Score": 0.8431,
 
 
 
 
 
213
  "reward-bench/Factuality": 0.7516,
214
  "reward-bench/Precise IF": 0.3875,
215
  "reward-bench/Math": 0.6284,
216
+ "reward-bench/Safety": 0.8662,
217
  "reward-bench/Focus": 0.8545,
218
+ "reward-bench/Ties": 0.6397,
219
+ "reward-bench/Chat": 0.9553,
220
+ "reward-bench/Chat Hard": 0.761,
221
+ "reward-bench/Reasoning": 0.7898,
222
+ "reward-bench/Prior Sets (0.5 weight)": 0.0
223
  }
224
  },
225
  {
data/developers/anthropic.json CHANGED
@@ -650,12 +650,12 @@
650
  "developer": "Anthropic",
651
  "evaluator_relationship": null,
652
  "benchmark_scores": {
653
- "appworld_test_normal/appworld/test_normal": 0.68,
654
  "browsecompplus/browsecompplus": 0.61,
655
- "swe-bench/swe-bench": 0.65,
656
- "tau-bench-2_airline/tau-bench-2/airline": 0.72,
657
  "tau-bench-2_retail/tau-bench-2/retail": 0.78,
658
- "tau-bench-2_telecom/tau-bench-2/telecom": 0.76
659
  }
660
  },
661
  {
@@ -664,7 +664,7 @@
664
  "developer": "Anthropic",
665
  "evaluator_relationship": null,
666
  "benchmark_scores": {
667
- "terminal-bench-2.0/terminal-bench-2.0": 38.0
668
  }
669
  },
670
  {
@@ -673,7 +673,7 @@
673
  "developer": "Anthropic",
674
  "evaluator_relationship": null,
675
  "benchmark_scores": {
676
- "terminal-bench-2.0/terminal-bench-2.0": 59.1
677
  }
678
  },
679
  {
@@ -682,7 +682,7 @@
682
  "developer": "Anthropic",
683
  "evaluator_relationship": null,
684
  "benchmark_scores": {
685
- "terminal-bench-2.0/terminal-bench-2.0": 58.0
686
  }
687
  },
688
  {
@@ -756,7 +756,7 @@
756
  "developer": "Anthropic",
757
  "evaluator_relationship": null,
758
  "benchmark_scores": {
759
- "terminal-bench-2.0/terminal-bench-2.0": 43.1
760
  }
761
  },
762
  {
@@ -800,8 +800,6 @@
800
  "developer": "Anthropic",
801
  "evaluator_relationship": null,
802
  "benchmark_scores": {
803
- "ace/Overall Score": 0.478,
804
- "ace/Gaming Score": 0.391,
805
  "apex-agents/Overall Pass@1": 0.184,
806
  "apex-agents/Overall Pass@8": 0.34,
807
  "apex-agents/Overall Mean Score": 0.348,
@@ -809,6 +807,8 @@
809
  "apex-agents/Management Consulting Pass@1": 0.132,
810
  "apex-agents/Corporate Law Pass@1": 0.202,
811
  "apex-agents/Corporate Lawyer Mean Score": 0.471,
 
 
812
  "apex-v1/Medicine (MD) Score": 0.65
813
  }
814
  },
 
650
  "developer": "Anthropic",
651
  "evaluator_relationship": null,
652
  "benchmark_scores": {
653
+ "appworld_test_normal/appworld/test_normal": 0.7,
654
  "browsecompplus/browsecompplus": 0.61,
655
+ "swe-bench/swe-bench": 0.6061,
656
+ "tau-bench-2_airline/tau-bench-2/airline": 0.66,
657
  "tau-bench-2_retail/tau-bench-2/retail": 0.78,
658
+ "tau-bench-2_telecom/tau-bench-2/telecom": 0.84
659
  }
660
  },
661
  {
 
664
  "developer": "Anthropic",
665
  "evaluator_relationship": null,
666
  "benchmark_scores": {
667
+ "terminal-bench-2.0/terminal-bench-2.0": 35.1
668
  }
669
  },
670
  {
 
673
  "developer": "Anthropic",
674
  "evaluator_relationship": null,
675
  "benchmark_scores": {
676
+ "terminal-bench-2.0/terminal-bench-2.0": 52.1
677
  }
678
  },
679
  {
 
682
  "developer": "Anthropic",
683
  "evaluator_relationship": null,
684
  "benchmark_scores": {
685
+ "terminal-bench-2.0/terminal-bench-2.0": 62.9
686
  }
687
  },
688
  {
 
756
  "developer": "Anthropic",
757
  "evaluator_relationship": null,
758
  "benchmark_scores": {
759
+ "terminal-bench-2.0/terminal-bench-2.0": 42.6
760
  }
761
  },
762
  {
 
800
  "developer": "Anthropic",
801
  "evaluator_relationship": null,
802
  "benchmark_scores": {
 
 
803
  "apex-agents/Overall Pass@1": 0.184,
804
  "apex-agents/Overall Pass@8": 0.34,
805
  "apex-agents/Overall Mean Score": 0.348,
 
807
  "apex-agents/Management Consulting Pass@1": 0.132,
808
  "apex-agents/Corporate Law Pass@1": 0.202,
809
  "apex-agents/Corporate Lawyer Mean Score": 0.471,
810
+ "ace/Overall Score": 0.478,
811
+ "ace/Gaming Score": 0.391,
812
  "apex-v1/Medicine (MD) Score": 0.65
813
  }
814
  },
data/developers/cognitivecomputations.json CHANGED
@@ -77,12 +77,12 @@
77
  "developer": "cognitivecomputations",
78
  "evaluator_relationship": null,
79
  "benchmark_scores": {
80
- "hfopenllm_v2/IFEval": 0.4124,
81
- "hfopenllm_v2/BBH": 0.6383,
82
- "hfopenllm_v2/MATH Level 5": 0.182,
83
- "hfopenllm_v2/GPQA": 0.3289,
84
- "hfopenllm_v2/MUSR": 0.4349,
85
- "hfopenllm_v2/MMLU-PRO": 0.4525
86
  }
87
  },
88
  {
 
77
  "developer": "cognitivecomputations",
78
  "evaluator_relationship": null,
79
  "benchmark_scores": {
80
+ "hfopenllm_v2/IFEval": 0.3613,
81
+ "hfopenllm_v2/BBH": 0.6123,
82
+ "hfopenllm_v2/MATH Level 5": 0.1239,
83
+ "hfopenllm_v2/GPQA": 0.328,
84
+ "hfopenllm_v2/MUSR": 0.4112,
85
+ "hfopenllm_v2/MMLU-PRO": 0.4494
86
  }
87
  },
88
  {
data/developers/columbia-nlp.json CHANGED
@@ -7,12 +7,12 @@
7
  "developer": "Columbia-NLP",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.3278,
11
- "hfopenllm_v2/BBH": 0.392,
12
- "hfopenllm_v2/MATH Level 5": 0.0431,
13
- "hfopenllm_v2/GPQA": 0.2492,
14
- "hfopenllm_v2/MUSR": 0.412,
15
- "hfopenllm_v2/MMLU-PRO": 0.1666
16
  }
17
  },
18
  {
 
7
  "developer": "Columbia-NLP",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.3102,
11
+ "hfopenllm_v2/BBH": 0.3881,
12
+ "hfopenllm_v2/MATH Level 5": 0.0536,
13
+ "hfopenllm_v2/GPQA": 0.2534,
14
+ "hfopenllm_v2/MUSR": 0.4081,
15
+ "hfopenllm_v2/MMLU-PRO": 0.1665
16
  }
17
  },
18
  {
data/developers/cpayne1303.json CHANGED
@@ -35,12 +35,12 @@
35
  "developer": "cpayne1303",
36
  "evaluator_relationship": null,
37
  "benchmark_scores": {
38
- "hfopenllm_v2/IFEval": 0.1916,
39
- "hfopenllm_v2/BBH": 0.2977,
40
- "hfopenllm_v2/MATH Level 5": 0.0,
41
  "hfopenllm_v2/GPQA": 0.2685,
42
- "hfopenllm_v2/MUSR": 0.3872,
43
- "hfopenllm_v2/MMLU-PRO": 0.1132
44
  }
45
  },
46
  {
 
35
  "developer": "cpayne1303",
36
  "evaluator_relationship": null,
37
  "benchmark_scores": {
38
+ "hfopenllm_v2/IFEval": 0.1949,
39
+ "hfopenllm_v2/BBH": 0.2965,
40
+ "hfopenllm_v2/MATH Level 5": 0.0045,
41
  "hfopenllm_v2/GPQA": 0.2685,
42
+ "hfopenllm_v2/MUSR": 0.3885,
43
+ "hfopenllm_v2/MMLU-PRO": 0.1111
44
  }
45
  },
46
  {
data/developers/daemontatox.json CHANGED
@@ -231,12 +231,12 @@
231
  "developer": "Daemontatox",
232
  "evaluator_relationship": null,
233
  "benchmark_scores": {
234
- "hfopenllm_v2/IFEval": 0.3745,
235
- "hfopenllm_v2/BBH": 0.6668,
236
- "hfopenllm_v2/MATH Level 5": 0.4758,
237
- "hfopenllm_v2/GPQA": 0.3943,
238
- "hfopenllm_v2/MUSR": 0.4858,
239
- "hfopenllm_v2/MMLU-PRO": 0.5593
240
  }
241
  },
242
  {
 
231
  "developer": "Daemontatox",
232
  "evaluator_relationship": null,
233
  "benchmark_scores": {
234
+ "hfopenllm_v2/IFEval": 0.4855,
235
+ "hfopenllm_v2/BBH": 0.6627,
236
+ "hfopenllm_v2/MATH Level 5": 0.4841,
237
+ "hfopenllm_v2/GPQA": 0.3096,
238
+ "hfopenllm_v2/MUSR": 0.4256,
239
+ "hfopenllm_v2/MMLU-PRO": 0.5542
240
  }
241
  },
242
  {
data/developers/deepmount00.json CHANGED
@@ -63,12 +63,12 @@
63
  "developer": "DeepMount00",
64
  "evaluator_relationship": null,
65
  "benchmark_scores": {
66
- "hfopenllm_v2/IFEval": 0.7917,
67
- "hfopenllm_v2/BBH": 0.5109,
68
- "hfopenllm_v2/MATH Level 5": 0.1088,
69
- "hfopenllm_v2/GPQA": 0.2878,
70
- "hfopenllm_v2/MUSR": 0.4136,
71
- "hfopenllm_v2/MMLU-PRO": 0.3876
72
  }
73
  },
74
  {
 
63
  "developer": "DeepMount00",
64
  "evaluator_relationship": null,
65
  "benchmark_scores": {
66
+ "hfopenllm_v2/IFEval": 0.5365,
67
+ "hfopenllm_v2/BBH": 0.517,
68
+ "hfopenllm_v2/MATH Level 5": 0.1707,
69
+ "hfopenllm_v2/GPQA": 0.3062,
70
+ "hfopenllm_v2/MUSR": 0.4487,
71
+ "hfopenllm_v2/MMLU-PRO": 0.396
72
  }
73
  },
74
  {
data/developers/dfurman.json CHANGED
@@ -35,12 +35,12 @@
35
  "developer": "dfurman",
36
  "evaluator_relationship": null,
37
  "benchmark_scores": {
38
- "hfopenllm_v2/IFEval": 0.2835,
39
- "hfopenllm_v2/BBH": 0.3842,
40
- "hfopenllm_v2/MATH Level 5": 0.0521,
41
- "hfopenllm_v2/GPQA": 0.2609,
42
- "hfopenllm_v2/MUSR": 0.3566,
43
- "hfopenllm_v2/MMLU-PRO": 0.2298
44
  }
45
  },
46
  {
 
35
  "developer": "dfurman",
36
  "evaluator_relationship": null,
37
  "benchmark_scores": {
38
+ "hfopenllm_v2/IFEval": 0.3,
39
+ "hfopenllm_v2/BBH": 0.3853,
40
+ "hfopenllm_v2/MATH Level 5": 0.0415,
41
+ "hfopenllm_v2/GPQA": 0.2617,
42
+ "hfopenllm_v2/MUSR": 0.3579,
43
+ "hfopenllm_v2/MMLU-PRO": 0.2281
44
  }
45
  },
46
  {
data/developers/doppelreflex.json CHANGED
@@ -175,12 +175,12 @@
175
  "developer": "DoppelReflEx",
176
  "evaluator_relationship": null,
177
  "benchmark_scores": {
178
- "hfopenllm_v2/IFEval": 0.451,
179
- "hfopenllm_v2/BBH": 0.4944,
180
- "hfopenllm_v2/MATH Level 5": 0.1156,
181
- "hfopenllm_v2/GPQA": 0.3196,
182
- "hfopenllm_v2/MUSR": 0.3896,
183
- "hfopenllm_v2/MMLU-PRO": 0.3256
184
  }
185
  },
186
  {
 
175
  "developer": "DoppelReflEx",
176
  "evaluator_relationship": null,
177
  "benchmark_scores": {
178
+ "hfopenllm_v2/IFEval": 0.436,
179
+ "hfopenllm_v2/BBH": 0.4956,
180
+ "hfopenllm_v2/MATH Level 5": 0.0589,
181
+ "hfopenllm_v2/GPQA": 0.3205,
182
+ "hfopenllm_v2/MUSR": 0.3843,
183
+ "hfopenllm_v2/MMLU-PRO": 0.3237
184
  }
185
  },
186
  {
data/developers/google.json CHANGED
@@ -139,6 +139,7 @@
139
  "developer": "Google",
140
  "evaluator_relationship": null,
141
  "benchmark_scores": {
 
142
  "apex-agents/Overall Pass@1": 0.24,
143
  "apex-agents/Overall Pass@8": 0.367,
144
  "apex-agents/Overall Mean Score": 0.395,
@@ -146,7 +147,6 @@
146
  "apex-agents/Management Consulting Pass@1": 0.193,
147
  "apex-agents/Corporate Law Pass@1": 0.259,
148
  "apex-agents/Corporate Lawyer Mean Score": 0.524,
149
- "ace/Gaming Score": 0.415,
150
  "apex-v1/Overall Score": 0.64,
151
  "apex-v1/Consulting Score": 0.64
152
  }
@@ -157,6 +157,8 @@
157
  "developer": "Google",
158
  "evaluator_relationship": null,
159
  "benchmark_scores": {
 
 
160
  "apex-agents/Overall Pass@1": 0.184,
161
  "apex-agents/Overall Pass@8": 0.373,
162
  "apex-agents/Overall Mean Score": 0.341,
@@ -164,8 +166,6 @@
164
  "apex-agents/Management Consulting Pass@1": 0.124,
165
  "apex-agents/Corporate Law Pass@1": 0.239,
166
  "apex-agents/Corporate Lawyer Mean Score": 0.487,
167
- "ace/Overall Score": 0.47,
168
- "ace/Gaming Score": 0.509,
169
  "apex-v1/Overall Score": 0.643,
170
  "apex-v1/Consulting Score": 0.64,
171
  "apex-v1/Investment Banking Score": 0.63
@@ -723,7 +723,7 @@
723
  "reward-bench/Safety": 0.909,
724
  "reward-bench/Focus": 0.841,
725
  "reward-bench/Ties": 0.809,
726
- "terminal-bench-2.0/terminal-bench-2.0": 17.1
727
  }
728
  },
729
  {
@@ -861,7 +861,7 @@
861
  "developer": "Google",
862
  "evaluator_relationship": null,
863
  "benchmark_scores": {
864
- "terminal-bench-2.0/terminal-bench-2.0": 51.0
865
  }
866
  },
867
  {
@@ -879,8 +879,8 @@
879
  "developer": "Google",
880
  "evaluator_relationship": null,
881
  "benchmark_scores": {
882
- "appworld_test_normal/appworld/test_normal": 0.13,
883
- "browsecompplus/browsecompplus": 0.48,
884
  "global-mmlu-lite/Global MMLU Lite": 0.9453,
885
  "global-mmlu-lite/Culturally Sensitive": 0.9397,
886
  "global-mmlu-lite/Culturally Agnostic": 0.9509,
@@ -900,8 +900,8 @@
900
  "global-mmlu-lite/Yoruba": 0.9425,
901
  "global-mmlu-lite/Chinese": 0.9475,
902
  "global-mmlu-lite/Burmese": 0.9425,
903
- "swe-bench/swe-bench": 0.7234,
904
- "tau-bench-2_airline/tau-bench-2/airline": 0.7,
905
  "tau-bench-2_retail/tau-bench-2/retail": 0.73,
906
  "tau-bench-2_telecom/tau-bench-2/telecom": 0.73
907
  }
@@ -1028,12 +1028,12 @@
1028
  "developer": "Google",
1029
  "evaluator_relationship": null,
1030
  "benchmark_scores": {
1031
- "hfopenllm_v2/IFEval": 0.2018,
1032
- "hfopenllm_v2/BBH": 0.3709,
1033
- "hfopenllm_v2/MATH Level 5": 0.0302,
1034
  "hfopenllm_v2/GPQA": 0.2626,
1035
- "hfopenllm_v2/MUSR": 0.4219,
1036
- "hfopenllm_v2/MMLU-PRO": 0.2217
1037
  }
1038
  },
1039
  {
@@ -1056,12 +1056,12 @@
1056
  "developer": "Google",
1057
  "evaluator_relationship": null,
1058
  "benchmark_scores": {
1059
- "hfopenllm_v2/IFEval": 0.5078,
1060
- "hfopenllm_v2/BBH": 0.4226,
1061
- "hfopenllm_v2/MATH Level 5": 0.0347,
1062
- "hfopenllm_v2/GPQA": 0.2852,
1063
- "hfopenllm_v2/MUSR": 0.3964,
1064
- "hfopenllm_v2/MMLU-PRO": 0.2578
1065
  }
1066
  },
1067
  {
 
139
  "developer": "Google",
140
  "evaluator_relationship": null,
141
  "benchmark_scores": {
142
+ "ace/Gaming Score": 0.415,
143
  "apex-agents/Overall Pass@1": 0.24,
144
  "apex-agents/Overall Pass@8": 0.367,
145
  "apex-agents/Overall Mean Score": 0.395,
 
147
  "apex-agents/Management Consulting Pass@1": 0.193,
148
  "apex-agents/Corporate Law Pass@1": 0.259,
149
  "apex-agents/Corporate Lawyer Mean Score": 0.524,
 
150
  "apex-v1/Overall Score": 0.64,
151
  "apex-v1/Consulting Score": 0.64
152
  }
 
157
  "developer": "Google",
158
  "evaluator_relationship": null,
159
  "benchmark_scores": {
160
+ "ace/Overall Score": 0.47,
161
+ "ace/Gaming Score": 0.509,
162
  "apex-agents/Overall Pass@1": 0.184,
163
  "apex-agents/Overall Pass@8": 0.373,
164
  "apex-agents/Overall Mean Score": 0.341,
 
166
  "apex-agents/Management Consulting Pass@1": 0.124,
167
  "apex-agents/Corporate Law Pass@1": 0.239,
168
  "apex-agents/Corporate Lawyer Mean Score": 0.487,
 
 
169
  "apex-v1/Overall Score": 0.643,
170
  "apex-v1/Consulting Score": 0.64,
171
  "apex-v1/Investment Banking Score": 0.63
 
723
  "reward-bench/Safety": 0.909,
724
  "reward-bench/Focus": 0.841,
725
  "reward-bench/Ties": 0.809,
726
+ "terminal-bench-2.0/terminal-bench-2.0": 16.9
727
  }
728
  },
729
  {
 
861
  "developer": "Google",
862
  "evaluator_relationship": null,
863
  "benchmark_scores": {
864
+ "terminal-bench-2.0/terminal-bench-2.0": 47.4
865
  }
866
  },
867
  {
 
879
  "developer": "Google",
880
  "evaluator_relationship": null,
881
  "benchmark_scores": {
882
+ "appworld_test_normal/appworld/test_normal": 0.55,
883
+ "browsecompplus/browsecompplus": 0.3333,
884
  "global-mmlu-lite/Global MMLU Lite": 0.9453,
885
  "global-mmlu-lite/Culturally Sensitive": 0.9397,
886
  "global-mmlu-lite/Culturally Agnostic": 0.9509,
 
900
  "global-mmlu-lite/Yoruba": 0.9425,
901
  "global-mmlu-lite/Chinese": 0.9475,
902
  "global-mmlu-lite/Burmese": 0.9425,
903
+ "swe-bench/swe-bench": 0.71,
904
+ "tau-bench-2_airline/tau-bench-2/airline": 0.68,
905
  "tau-bench-2_retail/tau-bench-2/retail": 0.73,
906
  "tau-bench-2_telecom/tau-bench-2/telecom": 0.73
907
  }
 
1028
  "developer": "Google",
1029
  "evaluator_relationship": null,
1030
  "benchmark_scores": {
1031
+ "hfopenllm_v2/IFEval": 0.1993,
1032
+ "hfopenllm_v2/BBH": 0.3656,
1033
+ "hfopenllm_v2/MATH Level 5": 0.0287,
1034
  "hfopenllm_v2/GPQA": 0.2626,
1035
+ "hfopenllm_v2/MUSR": 0.4232,
1036
+ "hfopenllm_v2/MMLU-PRO": 0.218
1037
  }
1038
  },
1039
  {
 
1056
  "developer": "Google",
1057
  "evaluator_relationship": null,
1058
  "benchmark_scores": {
1059
+ "hfopenllm_v2/IFEval": 0.5288,
1060
+ "hfopenllm_v2/BBH": 0.4178,
1061
+ "hfopenllm_v2/MATH Level 5": 0.0476,
1062
+ "hfopenllm_v2/GPQA": 0.2752,
1063
+ "hfopenllm_v2/MUSR": 0.3728,
1064
+ "hfopenllm_v2/MMLU-PRO": 0.2467
1065
  }
1066
  },
1067
  {
data/developers/huggingfacetb.json CHANGED
@@ -133,12 +133,12 @@
133
  "developer": "HuggingFaceTB",
134
  "evaluator_relationship": null,
135
  "benchmark_scores": {
136
- "hfopenllm_v2/IFEval": 0.0593,
137
- "hfopenllm_v2/BBH": 0.3135,
138
- "hfopenllm_v2/MATH Level 5": 0.0144,
139
- "hfopenllm_v2/GPQA": 0.2341,
140
- "hfopenllm_v2/MUSR": 0.3871,
141
- "hfopenllm_v2/MMLU-PRO": 0.1092
142
  }
143
  },
144
  {
 
133
  "developer": "HuggingFaceTB",
134
  "evaluator_relationship": null,
135
  "benchmark_scores": {
136
+ "hfopenllm_v2/IFEval": 0.2883,
137
+ "hfopenllm_v2/BBH": 0.3124,
138
+ "hfopenllm_v2/MATH Level 5": 0.003,
139
+ "hfopenllm_v2/GPQA": 0.2357,
140
+ "hfopenllm_v2/MUSR": 0.3662,
141
+ "hfopenllm_v2/MMLU-PRO": 0.1115
142
  }
143
  },
144
  {
data/developers/infly.json CHANGED
@@ -7,16 +7,16 @@
7
  "developer": "infly",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
- "reward-bench/Score": 0.7648,
11
- "reward-bench/Chat": 0.9665,
12
- "reward-bench/Chat Hard": 0.9101,
13
- "reward-bench/Safety": 0.9644,
14
- "reward-bench/Reasoning": 0.9912,
15
  "reward-bench/Factuality": 0.7411,
16
  "reward-bench/Precise IF": 0.4188,
17
  "reward-bench/Math": 0.6995,
 
18
  "reward-bench/Focus": 0.903,
19
- "reward-bench/Ties": 0.8622
 
 
 
20
  }
21
  }
22
  ]
 
7
  "developer": "infly",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
+ "reward-bench/Score": 0.9511,
 
 
 
 
11
  "reward-bench/Factuality": 0.7411,
12
  "reward-bench/Precise IF": 0.4188,
13
  "reward-bench/Math": 0.6995,
14
+ "reward-bench/Safety": 0.9365,
15
  "reward-bench/Focus": 0.903,
16
+ "reward-bench/Ties": 0.8622,
17
+ "reward-bench/Chat": 0.9665,
18
+ "reward-bench/Chat Hard": 0.9101,
19
+ "reward-bench/Reasoning": 0.9912
20
  }
21
  }
22
  ]
data/developers/internlm.json CHANGED
@@ -71,16 +71,16 @@
71
  "developer": "internlm",
72
  "evaluator_relationship": null,
73
  "benchmark_scores": {
74
- "reward-bench/Score": 0.8759,
 
 
 
 
75
  "reward-bench/Factuality": 0.4211,
76
  "reward-bench/Precise IF": 0.4,
77
  "reward-bench/Math": 0.5628,
78
- "reward-bench/Safety": 0.8716,
79
  "reward-bench/Focus": 0.7051,
80
- "reward-bench/Ties": 0.5164,
81
- "reward-bench/Chat": 0.9916,
82
- "reward-bench/Chat Hard": 0.6952,
83
- "reward-bench/Reasoning": 0.9453
84
  }
85
  },
86
  {
 
71
  "developer": "internlm",
72
  "evaluator_relationship": null,
73
  "benchmark_scores": {
74
+ "reward-bench/Score": 0.5335,
75
+ "reward-bench/Chat": 0.9916,
76
+ "reward-bench/Chat Hard": 0.6952,
77
+ "reward-bench/Safety": 0.5956,
78
+ "reward-bench/Reasoning": 0.9453,
79
  "reward-bench/Factuality": 0.4211,
80
  "reward-bench/Precise IF": 0.4,
81
  "reward-bench/Math": 0.5628,
 
82
  "reward-bench/Focus": 0.7051,
83
+ "reward-bench/Ties": 0.5164
 
 
 
84
  }
85
  },
86
  {
data/developers/jaspionjader.json CHANGED
@@ -1477,12 +1477,12 @@
1477
  "developer": "jaspionjader",
1478
  "evaluator_relationship": null,
1479
  "benchmark_scores": {
1480
- "hfopenllm_v2/IFEval": 0.4345,
1481
- "hfopenllm_v2/BBH": 0.5419,
1482
- "hfopenllm_v2/MATH Level 5": 0.1292,
1483
- "hfopenllm_v2/GPQA": 0.3087,
1484
  "hfopenllm_v2/MUSR": 0.4277,
1485
- "hfopenllm_v2/MMLU-PRO": 0.3854
1486
  }
1487
  },
1488
  {
 
1477
  "developer": "jaspionjader",
1478
  "evaluator_relationship": null,
1479
  "benchmark_scores": {
1480
+ "hfopenllm_v2/IFEval": 0.4418,
1481
+ "hfopenllm_v2/BBH": 0.5406,
1482
+ "hfopenllm_v2/MATH Level 5": 0.1352,
1483
+ "hfopenllm_v2/GPQA": 0.3062,
1484
  "hfopenllm_v2/MUSR": 0.4277,
1485
+ "hfopenllm_v2/MMLU-PRO": 0.386
1486
  }
1487
  },
1488
  {
data/developers/leroydyer.json CHANGED
@@ -707,12 +707,12 @@
707
  "developer": "LeroyDyer",
708
  "evaluator_relationship": null,
709
  "benchmark_scores": {
710
- "hfopenllm_v2/IFEval": 0.3579,
711
- "hfopenllm_v2/BBH": 0.4477,
712
- "hfopenllm_v2/MATH Level 5": 0.0423,
713
- "hfopenllm_v2/GPQA": 0.3096,
714
- "hfopenllm_v2/MUSR": 0.4134,
715
- "hfopenllm_v2/MMLU-PRO": 0.2376
716
  }
717
  },
718
  {
 
707
  "developer": "LeroyDyer",
708
  "evaluator_relationship": null,
709
  "benchmark_scores": {
710
+ "hfopenllm_v2/IFEval": 0.3798,
711
+ "hfopenllm_v2/BBH": 0.4483,
712
+ "hfopenllm_v2/MATH Level 5": 0.04,
713
+ "hfopenllm_v2/GPQA": 0.3129,
714
+ "hfopenllm_v2/MUSR": 0.4148,
715
+ "hfopenllm_v2/MMLU-PRO": 0.2389
716
  }
717
  },
718
  {
data/developers/llmat.json CHANGED
@@ -7,12 +7,12 @@
7
  "developer": "llmat",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.364,
11
- "hfopenllm_v2/BBH": 0.4005,
12
- "hfopenllm_v2/MATH Level 5": 0.0015,
13
- "hfopenllm_v2/GPQA": 0.2693,
14
- "hfopenllm_v2/MUSR": 0.3529,
15
- "hfopenllm_v2/MMLU-PRO": 0.2301
16
  }
17
  }
18
  ]
 
7
  "developer": "llmat",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.377,
11
+ "hfopenllm_v2/BBH": 0.3978,
12
+ "hfopenllm_v2/MATH Level 5": 0.0242,
13
+ "hfopenllm_v2/GPQA": 0.2668,
14
+ "hfopenllm_v2/MUSR": 0.3555,
15
+ "hfopenllm_v2/MMLU-PRO": 0.2278
16
  }
17
  }
18
  ]
data/developers/lxzgordon.json CHANGED
@@ -20,16 +20,16 @@
20
  "developer": "LxzGordon",
21
  "evaluator_relationship": null,
22
  "benchmark_scores": {
23
- "reward-bench/Score": 0.9294,
 
 
 
 
24
  "reward-bench/Factuality": 0.6884,
25
  "reward-bench/Precise IF": 0.45,
26
  "reward-bench/Math": 0.6393,
27
- "reward-bench/Safety": 0.9108,
28
  "reward-bench/Focus": 0.9758,
29
- "reward-bench/Ties": 0.7653,
30
- "reward-bench/Chat": 0.9553,
31
- "reward-bench/Chat Hard": 0.8816,
32
- "reward-bench/Reasoning": 0.9698
33
  }
34
  }
35
  ]
 
20
  "developer": "LxzGordon",
21
  "evaluator_relationship": null,
22
  "benchmark_scores": {
23
+ "reward-bench/Score": 0.7394,
24
+ "reward-bench/Chat": 0.9553,
25
+ "reward-bench/Chat Hard": 0.8816,
26
+ "reward-bench/Safety": 0.9178,
27
+ "reward-bench/Reasoning": 0.9698,
28
  "reward-bench/Factuality": 0.6884,
29
  "reward-bench/Precise IF": 0.45,
30
  "reward-bench/Math": 0.6393,
 
31
  "reward-bench/Focus": 0.9758,
32
+ "reward-bench/Ties": 0.7653
 
 
 
33
  }
34
  }
35
  ]
data/developers/meta.json CHANGED
@@ -471,6 +471,16 @@
471
  "helm_capabilities/IFEval": 0.743,
472
  "helm_capabilities/WildBench": 0.686,
473
  "helm_capabilities/Omni-MATH": 0.137,
 
 
 
 
 
 
 
 
 
 
474
  "helm_mmlu/MMLU All Subjects": 0.561,
475
  "helm_mmlu/Abstract Algebra": 0.26,
476
  "helm_mmlu/Anatomy": 0.459,
@@ -506,17 +516,7 @@
506
  "helm_mmlu/Sociology": 0.701,
507
  "helm_mmlu/Virology": 0.446,
508
  "helm_mmlu/World Religions": 0.789,
509
- "helm_mmlu/Mean win rate": 0.475,
510
- "helm_lite/Mean win rate": 0.303,
511
- "helm_lite/NarrativeQA": 0.756,
512
- "helm_lite/NaturalQuestions (closed-book)": 0.209,
513
- "helm_lite/OpenbookQA": 0.74,
514
- "helm_lite/MMLU": 0.5,
515
- "helm_lite/MATH": 0.703,
516
- "helm_lite/GSM8K": 0.798,
517
- "helm_lite/LegalBench": 0.342,
518
- "helm_lite/MedQA": 0.245,
519
- "helm_lite/WMT 2014": 0.181
520
  }
521
  },
522
  {
@@ -579,6 +579,16 @@
579
  "developer": "Meta",
580
  "evaluator_relationship": null,
581
  "benchmark_scores": {
 
 
 
 
 
 
 
 
 
 
582
  "helm_mmlu/MMLU All Subjects": 0.803,
583
  "helm_mmlu/Abstract Algebra": 0.52,
584
  "helm_mmlu/Anatomy": 0.8,
@@ -614,17 +624,7 @@
614
  "helm_mmlu/Sociology": 0.92,
615
  "helm_mmlu/Virology": 0.584,
616
  "helm_mmlu/World Religions": 0.901,
617
- "helm_mmlu/Mean win rate": 0.773,
618
- "helm_lite/Mean win rate": 0.819,
619
- "helm_lite/NarrativeQA": 0.777,
620
- "helm_lite/NaturalQuestions (closed-book)": 0.457,
621
- "helm_lite/OpenbookQA": 0.942,
622
- "helm_lite/MMLU": 0.703,
623
- "helm_lite/MATH": 0.791,
624
- "helm_lite/GSM8K": 0.936,
625
- "helm_lite/LegalBench": 0.68,
626
- "helm_lite/MedQA": 0.769,
627
- "helm_lite/WMT 2014": 0.224
628
  }
629
  },
630
  {
 
471
  "helm_capabilities/IFEval": 0.743,
472
  "helm_capabilities/WildBench": 0.686,
473
  "helm_capabilities/Omni-MATH": 0.137,
474
+ "helm_lite/Mean win rate": 0.303,
475
+ "helm_lite/NarrativeQA": 0.756,
476
+ "helm_lite/NaturalQuestions (closed-book)": 0.209,
477
+ "helm_lite/OpenbookQA": 0.74,
478
+ "helm_lite/MMLU": 0.5,
479
+ "helm_lite/MATH": 0.703,
480
+ "helm_lite/GSM8K": 0.798,
481
+ "helm_lite/LegalBench": 0.342,
482
+ "helm_lite/MedQA": 0.245,
483
+ "helm_lite/WMT 2014": 0.181,
484
  "helm_mmlu/MMLU All Subjects": 0.561,
485
  "helm_mmlu/Abstract Algebra": 0.26,
486
  "helm_mmlu/Anatomy": 0.459,
 
516
  "helm_mmlu/Sociology": 0.701,
517
  "helm_mmlu/Virology": 0.446,
518
  "helm_mmlu/World Religions": 0.789,
519
+ "helm_mmlu/Mean win rate": 0.475
 
 
 
 
 
 
 
 
 
 
520
  }
521
  },
522
  {
 
579
  "developer": "Meta",
580
  "evaluator_relationship": null,
581
  "benchmark_scores": {
582
+ "helm_lite/Mean win rate": 0.819,
583
+ "helm_lite/NarrativeQA": 0.777,
584
+ "helm_lite/NaturalQuestions (closed-book)": 0.457,
585
+ "helm_lite/OpenbookQA": 0.942,
586
+ "helm_lite/MMLU": 0.703,
587
+ "helm_lite/MATH": 0.791,
588
+ "helm_lite/GSM8K": 0.936,
589
+ "helm_lite/LegalBench": 0.68,
590
+ "helm_lite/MedQA": 0.769,
591
+ "helm_lite/WMT 2014": 0.224,
592
  "helm_mmlu/MMLU All Subjects": 0.803,
593
  "helm_mmlu/Abstract Algebra": 0.52,
594
  "helm_mmlu/Anatomy": 0.8,
 
624
  "helm_mmlu/Sociology": 0.92,
625
  "helm_mmlu/Virology": 0.584,
626
  "helm_mmlu/World Religions": 0.901,
627
+ "helm_mmlu/Mean win rate": 0.773
 
 
 
 
 
 
 
 
 
 
628
  }
629
  },
630
  {
data/developers/minimax.json CHANGED
@@ -25,7 +25,7 @@
25
  "developer": "MiniMax",
26
  "evaluator_relationship": null,
27
  "benchmark_scores": {
28
- "terminal-bench-2.0/terminal-bench-2.0": 29.2
29
  }
30
  },
31
  {
 
25
  "developer": "MiniMax",
26
  "evaluator_relationship": null,
27
  "benchmark_scores": {
28
+ "terminal-bench-2.0/terminal-bench-2.0": 36.6
29
  }
30
  },
31
  {
data/developers/mistralai.json CHANGED
@@ -69,6 +69,16 @@
69
  "helm_capabilities/IFEval": 0.567,
70
  "helm_capabilities/WildBench": 0.66,
71
  "helm_capabilities/Omni-MATH": 0.072,
 
 
 
 
 
 
 
 
 
 
72
  "helm_mmlu/MMLU All Subjects": 0.599,
73
  "helm_mmlu/Abstract Algebra": 0.27,
74
  "helm_mmlu/Anatomy": 0.585,
@@ -105,16 +115,6 @@
105
  "helm_mmlu/Virology": 0.47,
106
  "helm_mmlu/World Religions": 0.825,
107
  "helm_mmlu/Mean win rate": 0.509,
108
- "helm_lite/Mean win rate": 0.196,
109
- "helm_lite/NarrativeQA": 0.716,
110
- "helm_lite/NaturalQuestions (closed-book)": 0.253,
111
- "helm_lite/OpenbookQA": 0.79,
112
- "helm_lite/MMLU": 0.51,
113
- "helm_lite/MATH": 0.289,
114
- "helm_lite/GSM8K": 0.538,
115
- "helm_lite/LegalBench": 0.331,
116
- "helm_lite/MedQA": 0.517,
117
- "helm_lite/WMT 2014": 0.142,
118
  "hfopenllm_v2/IFEval": 0.5465,
119
  "hfopenllm_v2/BBH": 0.4722,
120
  "hfopenllm_v2/MATH Level 5": 0.0385,
@@ -718,12 +718,12 @@
718
  "developer": "mistralai",
719
  "evaluator_relationship": null,
720
  "benchmark_scores": {
721
- "hfopenllm_v2/IFEval": 0.2326,
722
- "hfopenllm_v2/BBH": 0.5098,
723
- "hfopenllm_v2/MATH Level 5": 0.0937,
724
- "hfopenllm_v2/GPQA": 0.3205,
725
- "hfopenllm_v2/MUSR": 0.4413,
726
- "hfopenllm_v2/MMLU-PRO": 0.3871
727
  }
728
  },
729
  {
 
69
  "helm_capabilities/IFEval": 0.567,
70
  "helm_capabilities/WildBench": 0.66,
71
  "helm_capabilities/Omni-MATH": 0.072,
72
+ "helm_lite/Mean win rate": 0.196,
73
+ "helm_lite/NarrativeQA": 0.716,
74
+ "helm_lite/NaturalQuestions (closed-book)": 0.253,
75
+ "helm_lite/OpenbookQA": 0.79,
76
+ "helm_lite/MMLU": 0.51,
77
+ "helm_lite/MATH": 0.289,
78
+ "helm_lite/GSM8K": 0.538,
79
+ "helm_lite/LegalBench": 0.331,
80
+ "helm_lite/MedQA": 0.517,
81
+ "helm_lite/WMT 2014": 0.142,
82
  "helm_mmlu/MMLU All Subjects": 0.599,
83
  "helm_mmlu/Abstract Algebra": 0.27,
84
  "helm_mmlu/Anatomy": 0.585,
 
115
  "helm_mmlu/Virology": 0.47,
116
  "helm_mmlu/World Religions": 0.825,
117
  "helm_mmlu/Mean win rate": 0.509,
 
 
 
 
 
 
 
 
 
 
118
  "hfopenllm_v2/IFEval": 0.5465,
119
  "hfopenllm_v2/BBH": 0.4722,
120
  "hfopenllm_v2/MATH Level 5": 0.0385,
 
718
  "developer": "mistralai",
719
  "evaluator_relationship": null,
720
  "benchmark_scores": {
721
+ "hfopenllm_v2/IFEval": 0.2415,
722
+ "hfopenllm_v2/BBH": 0.5087,
723
+ "hfopenllm_v2/MATH Level 5": 0.102,
724
+ "hfopenllm_v2/GPQA": 0.3138,
725
+ "hfopenllm_v2/MUSR": 0.4321,
726
+ "hfopenllm_v2/MMLU-PRO": 0.385
727
  }
728
  },
729
  {
data/developers/mlabonne.json CHANGED
@@ -161,12 +161,12 @@
161
  "developer": "mlabonne",
162
  "evaluator_relationship": null,
163
  "benchmark_scores": {
164
- "hfopenllm_v2/IFEval": 0.4162,
165
- "hfopenllm_v2/BBH": 0.5124,
166
- "hfopenllm_v2/MATH Level 5": 0.0853,
167
- "hfopenllm_v2/GPQA": 0.3029,
168
- "hfopenllm_v2/MUSR": 0.415,
169
- "hfopenllm_v2/MMLU-PRO": 0.3802
170
  }
171
  },
172
  {
 
161
  "developer": "mlabonne",
162
  "evaluator_relationship": null,
163
  "benchmark_scores": {
164
+ "hfopenllm_v2/IFEval": 0.7561,
165
+ "hfopenllm_v2/BBH": 0.5111,
166
+ "hfopenllm_v2/MATH Level 5": 0.0906,
167
+ "hfopenllm_v2/GPQA": 0.3062,
168
+ "hfopenllm_v2/MUSR": 0.4019,
169
+ "hfopenllm_v2/MMLU-PRO": 0.3841
170
  }
171
  },
172
  {
data/developers/moonshot_ai.json CHANGED
@@ -7,7 +7,7 @@
7
  "developer": "Moonshot AI",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
- "terminal-bench-2.0/terminal-bench-2.0": 26.7
11
  }
12
  },
13
  {
 
7
  "developer": "Moonshot AI",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
+ "terminal-bench-2.0/terminal-bench-2.0": 27.8
11
  }
12
  },
13
  {
data/developers/multiple.json CHANGED
@@ -7,7 +7,7 @@
7
  "developer": "Multiple",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
- "terminal-bench-2.0/terminal-bench-2.0": 71.0
11
  }
12
  }
13
  ]
 
7
  "developer": "Multiple",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
+ "terminal-bench-2.0/terminal-bench-2.0": 72.4
11
  }
12
  }
13
  ]
data/developers/nazimali.json CHANGED
@@ -21,12 +21,12 @@
21
  "developer": "nazimali",
22
  "evaluator_relationship": null,
23
  "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.4964,
25
- "hfopenllm_v2/BBH": 0.4699,
26
- "hfopenllm_v2/MATH Level 5": 0.0045,
27
- "hfopenllm_v2/GPQA": 0.2827,
28
- "hfopenllm_v2/MUSR": 0.3979,
29
- "hfopenllm_v2/MMLU-PRO": 0.3063
30
  }
31
  }
32
  ]
 
21
  "developer": "nazimali",
22
  "evaluator_relationship": null,
23
  "benchmark_scores": {
24
+ "hfopenllm_v2/IFEval": 0.486,
25
+ "hfopenllm_v2/BBH": 0.4721,
26
+ "hfopenllm_v2/MATH Level 5": 0.0846,
27
+ "hfopenllm_v2/GPQA": 0.2844,
28
+ "hfopenllm_v2/MUSR": 0.4006,
29
+ "hfopenllm_v2/MMLU-PRO": 0.3087
30
  }
31
  }
32
  ]
data/developers/nicolinho.json CHANGED
@@ -7,16 +7,16 @@
7
  "developer": "nicolinho",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
- "reward-bench/Score": 0.9444,
 
 
 
 
11
  "reward-bench/Factuality": 0.7853,
12
  "reward-bench/Precise IF": 0.3719,
13
  "reward-bench/Math": 0.6995,
14
- "reward-bench/Safety": 0.927,
15
  "reward-bench/Focus": 0.9535,
16
- "reward-bench/Ties": 0.8321,
17
- "reward-bench/Chat": 0.9665,
18
- "reward-bench/Chat Hard": 0.9013,
19
- "reward-bench/Reasoning": 0.9826
20
  }
21
  },
22
  {
@@ -51,16 +51,16 @@
51
  "developer": "nicolinho",
52
  "evaluator_relationship": null,
53
  "benchmark_scores": {
54
- "reward-bench/Score": 0.9314,
 
 
 
 
55
  "reward-bench/Factuality": 0.6653,
56
  "reward-bench/Precise IF": 0.4062,
57
  "reward-bench/Math": 0.612,
58
- "reward-bench/Safety": 0.9257,
59
  "reward-bench/Focus": 0.8909,
60
- "reward-bench/Ties": 0.7234,
61
- "reward-bench/Chat": 0.9637,
62
- "reward-bench/Chat Hard": 0.8684,
63
- "reward-bench/Reasoning": 0.9677
64
  }
65
  }
66
  ]
 
7
  "developer": "nicolinho",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
+ "reward-bench/Score": 0.7667,
11
+ "reward-bench/Chat": 0.9665,
12
+ "reward-bench/Chat Hard": 0.9013,
13
+ "reward-bench/Safety": 0.9578,
14
+ "reward-bench/Reasoning": 0.9826,
15
  "reward-bench/Factuality": 0.7853,
16
  "reward-bench/Precise IF": 0.3719,
17
  "reward-bench/Math": 0.6995,
 
18
  "reward-bench/Focus": 0.9535,
19
+ "reward-bench/Ties": 0.8321
 
 
 
20
  }
21
  },
22
  {
 
51
  "developer": "nicolinho",
52
  "evaluator_relationship": null,
53
  "benchmark_scores": {
54
+ "reward-bench/Score": 0.7074,
55
+ "reward-bench/Chat": 0.9637,
56
+ "reward-bench/Chat Hard": 0.8684,
57
+ "reward-bench/Safety": 0.9467,
58
+ "reward-bench/Reasoning": 0.9677,
59
  "reward-bench/Factuality": 0.6653,
60
  "reward-bench/Precise IF": 0.4062,
61
  "reward-bench/Math": 0.612,
 
62
  "reward-bench/Focus": 0.8909,
63
+ "reward-bench/Ties": 0.7234
 
 
 
64
  }
65
  }
66
  ]
data/developers/nisten.json CHANGED
@@ -7,12 +7,12 @@
7
  "developer": "nisten",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.3914,
11
- "hfopenllm_v2/BBH": 0.6591,
12
- "hfopenllm_v2/MATH Level 5": 0.3044,
13
- "hfopenllm_v2/GPQA": 0.3591,
14
- "hfopenllm_v2/MUSR": 0.4681,
15
- "hfopenllm_v2/MMLU-PRO": 0.5611
16
  }
17
  },
18
  {
 
7
  "developer": "nisten",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.3799,
11
+ "hfopenllm_v2/BBH": 0.6647,
12
+ "hfopenllm_v2/MATH Level 5": 0.3406,
13
+ "hfopenllm_v2/GPQA": 0.4035,
14
+ "hfopenllm_v2/MUSR": 0.494,
15
+ "hfopenllm_v2/MMLU-PRO": 0.5731
16
  }
17
  },
18
  {
data/developers/nousresearch.json CHANGED
@@ -200,20 +200,6 @@
200
  "hfopenllm_v2/MMLU-PRO": 0.232
201
  }
202
  },
203
- {
204
- "id": "NousResearch/Yarn-Llama-2-7b-128k",
205
- "name": "Yarn-Llama-2-7b-128k",
206
- "developer": "NousResearch",
207
- "evaluator_relationship": null,
208
- "benchmark_scores": {
209
- "hfopenllm_v2/IFEval": 0.1485,
210
- "hfopenllm_v2/BBH": 0.3248,
211
- "hfopenllm_v2/MATH Level 5": 0.0151,
212
- "hfopenllm_v2/GPQA": 0.2601,
213
- "hfopenllm_v2/MUSR": 0.3967,
214
- "hfopenllm_v2/MMLU-PRO": 0.1791
215
- }
216
- },
217
  {
218
  "id": "NousResearch/Yarn-Llama-2-7b-64k",
219
  "name": "Yarn-Llama-2-7b-64k",
 
200
  "hfopenllm_v2/MMLU-PRO": 0.232
201
  }
202
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  {
204
  "id": "NousResearch/Yarn-Llama-2-7b-64k",
205
  "name": "Yarn-Llama-2-7b-64k",
data/developers/omkar1102.json CHANGED
@@ -7,12 +7,12 @@
7
  "developer": "Omkar1102",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.2254,
11
- "hfopenllm_v2/BBH": 0.275,
12
  "hfopenllm_v2/MATH Level 5": 0.0,
13
- "hfopenllm_v2/GPQA": 0.2576,
14
- "hfopenllm_v2/MUSR": 0.3762,
15
- "hfopenllm_v2/MMLU-PRO": 0.1123
16
  }
17
  }
18
  ]
 
7
  "developer": "Omkar1102",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
+ "hfopenllm_v2/IFEval": 0.2148,
11
+ "hfopenllm_v2/BBH": 0.276,
12
  "hfopenllm_v2/MATH Level 5": 0.0,
13
+ "hfopenllm_v2/GPQA": 0.2508,
14
+ "hfopenllm_v2/MUSR": 0.3802,
15
+ "hfopenllm_v2/MMLU-PRO": 0.1126
16
  }
17
  }
18
  ]
data/developers/openai.json CHANGED
@@ -163,16 +163,16 @@
163
  "developer": "OpenAI",
164
  "evaluator_relationship": null,
165
  "benchmark_scores": {
 
 
 
166
  "apex-agents/Overall Pass@1": 0.23,
167
  "apex-agents/Overall Pass@8": 0.4,
168
  "apex-agents/Overall Mean Score": 0.387,
169
  "apex-agents/Investment Banking Pass@1": 0.273,
170
  "apex-agents/Management Consulting Pass@1": 0.227,
171
  "apex-agents/Corporate Law Pass@1": 0.189,
172
- "apex-agents/Corporate Lawyer Mean Score": 0.443,
173
- "ace/Overall Score": 0.515,
174
- "ace/Food Score": 0.65,
175
- "ace/Gaming Score": 0.578
176
  }
177
  },
178
  {
@@ -300,13 +300,6 @@
300
  "developer": "OpenAI",
301
  "evaluator_relationship": null,
302
  "benchmark_scores": {
303
- "helm_instruct/Mean win rate": 0.689,
304
- "helm_instruct/Anthropic RLHF dataset": 4.964,
305
- "helm_instruct/Best ChatGPT Prompts": 4.986,
306
- "helm_instruct/Koala test dataset": 4.987,
307
- "helm_instruct/Open Assistant": 4.987,
308
- "helm_instruct/Self Instruct": 4.99,
309
- "helm_instruct/Vicuna": 4.992,
310
  "helm_classic/Mean win rate": 0.783,
311
  "helm_classic/MMLU": 0.391,
312
  "helm_classic/BoolQ": 0.87,
@@ -322,6 +315,13 @@
322
  "helm_classic/IMDB": 0.943,
323
  "helm_classic/CivilComments": 0.696,
324
  "helm_classic/RAFT": 0.748,
 
 
 
 
 
 
 
325
  "helm_lite/Mean win rate": 0.358,
326
  "helm_lite/NarrativeQA": 0.655,
327
  "helm_lite/NaturalQuestions (closed-book)": 0.335,
@@ -405,6 +405,16 @@
405
  "developer": "OpenAI",
406
  "evaluator_relationship": null,
407
  "benchmark_scores": {
 
 
 
 
 
 
 
 
 
 
408
  "helm_mmlu/MMLU All Subjects": 0.824,
409
  "helm_mmlu/Abstract Algebra": 0.63,
410
  "helm_mmlu/Anatomy": 0.8,
@@ -440,17 +450,7 @@
440
  "helm_mmlu/Sociology": 0.93,
441
  "helm_mmlu/Virology": 0.596,
442
  "helm_mmlu/World Religions": 0.877,
443
- "helm_mmlu/Mean win rate": 0.517,
444
- "helm_lite/Mean win rate": 0.867,
445
- "helm_lite/NarrativeQA": 0.768,
446
- "helm_lite/NaturalQuestions (closed-book)": 0.457,
447
- "helm_lite/OpenbookQA": 0.96,
448
- "helm_lite/MMLU": 0.735,
449
- "helm_lite/MATH": 0.802,
450
- "helm_lite/GSM8K": 0.932,
451
- "helm_lite/LegalBench": 0.713,
452
- "helm_lite/MedQA": 0.815,
453
- "helm_lite/WMT 2014": 0.211
454
  }
455
  },
456
  {
@@ -513,6 +513,16 @@
513
  "developer": "OpenAI",
514
  "evaluator_relationship": null,
515
  "benchmark_scores": {
 
 
 
 
 
 
 
 
 
 
516
  "helm_mmlu/MMLU All Subjects": 0.813,
517
  "helm_mmlu/Abstract Algebra": 0.56,
518
  "helm_mmlu/Anatomy": 0.822,
@@ -549,16 +559,6 @@
549
  "helm_mmlu/Virology": 0.602,
550
  "helm_mmlu/World Religions": 0.848,
551
  "helm_mmlu/Mean win rate": 0.351,
552
- "helm_lite/Mean win rate": 0.864,
553
- "helm_lite/NarrativeQA": 0.761,
554
- "helm_lite/NaturalQuestions (closed-book)": 0.482,
555
- "helm_lite/OpenbookQA": 0.97,
556
- "helm_lite/MMLU": 0.711,
557
- "helm_lite/MATH": 0.833,
558
- "helm_lite/GSM8K": 0.824,
559
- "helm_lite/LegalBench": 0.727,
560
- "helm_lite/MedQA": 0.783,
561
- "helm_lite/WMT 2014": 0.218,
562
  "reward-bench/Score": 0.8395,
563
  "reward-bench/Chat": 0.9525,
564
  "reward-bench/Chat Hard": 0.7544,
@@ -772,16 +772,16 @@
772
  "helm_mmlu/Virology": 0.578,
773
  "helm_mmlu/World Religions": 0.883,
774
  "helm_mmlu/Mean win rate": 0.52,
775
- "reward-bench/Score": 0.6493,
776
- "reward-bench/Chat": 0.9609,
777
- "reward-bench/Chat Hard": 0.761,
778
- "reward-bench/Safety": 0.8619,
779
- "reward-bench/Reasoning": 0.8661,
780
  "reward-bench/Factuality": 0.5684,
781
  "reward-bench/Precise IF": 0.3312,
782
  "reward-bench/Math": 0.623,
 
783
  "reward-bench/Focus": 0.7293,
784
- "reward-bench/Ties": 0.7819
 
 
 
785
  }
786
  },
787
  {
@@ -877,7 +877,7 @@
877
  "developer": "OpenAI",
878
  "evaluator_relationship": null,
879
  "benchmark_scores": {
880
- "terminal-bench-2.0/terminal-bench-2.0": 35.2
881
  }
882
  },
883
  {
@@ -911,9 +911,9 @@
911
  "helm_capabilities/IFEval": 0.875,
912
  "helm_capabilities/WildBench": 0.857,
913
  "helm_capabilities/Omni-MATH": 0.647,
914
- "livecodebenchpro/Hard Problems": 0.0423,
915
- "livecodebenchpro/Medium Problems": 0.4085,
916
- "livecodebenchpro/Easy Problems": 0.9014
917
  }
918
  },
919
  {
@@ -922,7 +922,7 @@
922
  "developer": "OpenAI",
923
  "evaluator_relationship": null,
924
  "benchmark_scores": {
925
- "terminal-bench-2.0/terminal-bench-2.0": 44.3
926
  }
927
  },
928
  {
@@ -931,7 +931,7 @@
931
  "developer": "OpenAI",
932
  "evaluator_relationship": null,
933
  "benchmark_scores": {
934
- "terminal-bench-2.0/terminal-bench-2.0": 34.8
935
  }
936
  },
937
  {
@@ -954,7 +954,7 @@
954
  "developer": "OpenAI",
955
  "evaluator_relationship": null,
956
  "benchmark_scores": {
957
- "terminal-bench-2.0/terminal-bench-2.0": 9.9
958
  }
959
  },
960
  {
@@ -986,7 +986,7 @@
986
  "developer": "OpenAI",
987
  "evaluator_relationship": null,
988
  "benchmark_scores": {
989
- "terminal-bench-2.0/terminal-bench-2.0": 53.5
990
  }
991
  },
992
  {
@@ -1013,7 +1013,7 @@
1013
  "developer": "OpenAI",
1014
  "evaluator_relationship": null,
1015
  "benchmark_scores": {
1016
- "terminal-bench-2.0/terminal-bench-2.0": 60.7
1017
  }
1018
  },
1019
  {
@@ -1023,14 +1023,14 @@
1023
  "evaluator_relationship": null,
1024
  "benchmark_scores": {
1025
  "appworld_test_normal/appworld/test_normal": 0.0,
1026
- "browsecompplus/browsecompplus": 0.48,
1027
  "livecodebenchpro/Hard Problems": 0.1594,
1028
  "livecodebenchpro/Medium Problems": 0.5211,
1029
  "livecodebenchpro/Easy Problems": 0.9014,
1030
  "swe-bench/swe-bench": 0.5455,
1031
  "tau-bench-2_airline/tau-bench-2/airline": 0.6,
1032
- "tau-bench-2_retail/tau-bench-2/retail": 0.68,
1033
- "tau-bench-2_telecom/tau-bench-2/telecom": 0.5354
1034
  }
1035
  },
1036
  {
@@ -1048,7 +1048,7 @@
1048
  "developer": "OpenAI",
1049
  "evaluator_relationship": null,
1050
  "benchmark_scores": {
1051
- "terminal-bench-2.0/terminal-bench-2.0": 64.7
1052
  }
1053
  },
1054
  {
@@ -1112,7 +1112,7 @@
1112
  "livecodebenchpro/Hard Problems": 0.0,
1113
  "livecodebenchpro/Medium Problems": 0.11267605633802817,
1114
  "livecodebenchpro/Easy Problems": 0.6619718309859155,
1115
- "terminal-bench-2.0/terminal-bench-2.0": 14.2
1116
  }
1117
  },
1118
  {
@@ -1130,7 +1130,7 @@
1130
  "livecodebenchpro/Hard Problems": 0.0,
1131
  "livecodebenchpro/Medium Problems": 0.056338028169014086,
1132
  "livecodebenchpro/Easy Problems": 0.5070422535211268,
1133
- "terminal-bench-2.0/terminal-bench-2.0": 3.1
1134
  }
1135
  },
1136
  {
 
163
  "developer": "OpenAI",
164
  "evaluator_relationship": null,
165
  "benchmark_scores": {
166
+ "ace/Overall Score": 0.515,
167
+ "ace/Food Score": 0.65,
168
+ "ace/Gaming Score": 0.578,
169
  "apex-agents/Overall Pass@1": 0.23,
170
  "apex-agents/Overall Pass@8": 0.4,
171
  "apex-agents/Overall Mean Score": 0.387,
172
  "apex-agents/Investment Banking Pass@1": 0.273,
173
  "apex-agents/Management Consulting Pass@1": 0.227,
174
  "apex-agents/Corporate Law Pass@1": 0.189,
175
+ "apex-agents/Corporate Lawyer Mean Score": 0.443
 
 
 
176
  }
177
  },
178
  {
 
300
  "developer": "OpenAI",
301
  "evaluator_relationship": null,
302
  "benchmark_scores": {
 
 
 
 
 
 
 
303
  "helm_classic/Mean win rate": 0.783,
304
  "helm_classic/MMLU": 0.391,
305
  "helm_classic/BoolQ": 0.87,
 
315
  "helm_classic/IMDB": 0.943,
316
  "helm_classic/CivilComments": 0.696,
317
  "helm_classic/RAFT": 0.748,
318
+ "helm_instruct/Mean win rate": 0.689,
319
+ "helm_instruct/Anthropic RLHF dataset": 4.964,
320
+ "helm_instruct/Best ChatGPT Prompts": 4.986,
321
+ "helm_instruct/Koala test dataset": 4.987,
322
+ "helm_instruct/Open Assistant": 4.987,
323
+ "helm_instruct/Self Instruct": 4.99,
324
+ "helm_instruct/Vicuna": 4.992,
325
  "helm_lite/Mean win rate": 0.358,
326
  "helm_lite/NarrativeQA": 0.655,
327
  "helm_lite/NaturalQuestions (closed-book)": 0.335,
 
405
  "developer": "OpenAI",
406
  "evaluator_relationship": null,
407
  "benchmark_scores": {
408
+ "helm_lite/Mean win rate": 0.867,
409
+ "helm_lite/NarrativeQA": 0.768,
410
+ "helm_lite/NaturalQuestions (closed-book)": 0.457,
411
+ "helm_lite/OpenbookQA": 0.96,
412
+ "helm_lite/MMLU": 0.735,
413
+ "helm_lite/MATH": 0.802,
414
+ "helm_lite/GSM8K": 0.932,
415
+ "helm_lite/LegalBench": 0.713,
416
+ "helm_lite/MedQA": 0.815,
417
+ "helm_lite/WMT 2014": 0.211,
418
  "helm_mmlu/MMLU All Subjects": 0.824,
419
  "helm_mmlu/Abstract Algebra": 0.63,
420
  "helm_mmlu/Anatomy": 0.8,
 
450
  "helm_mmlu/Sociology": 0.93,
451
  "helm_mmlu/Virology": 0.596,
452
  "helm_mmlu/World Religions": 0.877,
453
+ "helm_mmlu/Mean win rate": 0.517
 
 
 
 
 
 
 
 
 
 
454
  }
455
  },
456
  {
 
513
  "developer": "OpenAI",
514
  "evaluator_relationship": null,
515
  "benchmark_scores": {
516
+ "helm_lite/Mean win rate": 0.864,
517
+ "helm_lite/NarrativeQA": 0.761,
518
+ "helm_lite/NaturalQuestions (closed-book)": 0.482,
519
+ "helm_lite/OpenbookQA": 0.97,
520
+ "helm_lite/MMLU": 0.711,
521
+ "helm_lite/MATH": 0.833,
522
+ "helm_lite/GSM8K": 0.824,
523
+ "helm_lite/LegalBench": 0.727,
524
+ "helm_lite/MedQA": 0.783,
525
+ "helm_lite/WMT 2014": 0.218,
526
  "helm_mmlu/MMLU All Subjects": 0.813,
527
  "helm_mmlu/Abstract Algebra": 0.56,
528
  "helm_mmlu/Anatomy": 0.822,
 
559
  "helm_mmlu/Virology": 0.602,
560
  "helm_mmlu/World Religions": 0.848,
561
  "helm_mmlu/Mean win rate": 0.351,
 
 
 
 
 
 
 
 
 
 
562
  "reward-bench/Score": 0.8395,
563
  "reward-bench/Chat": 0.9525,
564
  "reward-bench/Chat Hard": 0.7544,
 
772
  "helm_mmlu/Virology": 0.578,
773
  "helm_mmlu/World Religions": 0.883,
774
  "helm_mmlu/Mean win rate": 0.52,
775
+ "reward-bench/Score": 0.8673,
 
 
 
 
776
  "reward-bench/Factuality": 0.5684,
777
  "reward-bench/Precise IF": 0.3312,
778
  "reward-bench/Math": 0.623,
779
+ "reward-bench/Safety": 0.8811,
780
  "reward-bench/Focus": 0.7293,
781
+ "reward-bench/Ties": 0.7819,
782
+ "reward-bench/Chat": 0.9609,
783
+ "reward-bench/Chat Hard": 0.761,
784
+ "reward-bench/Reasoning": 0.8661
785
  }
786
  },
787
  {
 
877
  "developer": "OpenAI",
878
  "evaluator_relationship": null,
879
  "benchmark_scores": {
880
+ "terminal-bench-2.0/terminal-bench-2.0": 49.6
881
  }
882
  },
883
  {
 
911
  "helm_capabilities/IFEval": 0.875,
912
  "helm_capabilities/WildBench": 0.857,
913
  "helm_capabilities/Omni-MATH": 0.647,
914
+ "livecodebenchpro/Hard Problems": 0.04225352112676056,
915
+ "livecodebenchpro/Medium Problems": 0.4084507042253521,
916
+ "livecodebenchpro/Easy Problems": 0.8873239436619719
917
  }
918
  },
919
  {
 
922
  "developer": "OpenAI",
923
  "evaluator_relationship": null,
924
  "benchmark_scores": {
925
+ "terminal-bench-2.0/terminal-bench-2.0": 43.4
926
  }
927
  },
928
  {
 
931
  "developer": "OpenAI",
932
  "evaluator_relationship": null,
933
  "benchmark_scores": {
934
+ "terminal-bench-2.0/terminal-bench-2.0": 24.0
935
  }
936
  },
937
  {
 
954
  "developer": "OpenAI",
955
  "evaluator_relationship": null,
956
  "benchmark_scores": {
957
+ "terminal-bench-2.0/terminal-bench-2.0": 11.5
958
  }
959
  },
960
  {
 
986
  "developer": "OpenAI",
987
  "evaluator_relationship": null,
988
  "benchmark_scores": {
989
+ "terminal-bench-2.0/terminal-bench-2.0": 57.8
990
  }
991
  },
992
  {
 
1013
  "developer": "OpenAI",
1014
  "evaluator_relationship": null,
1015
  "benchmark_scores": {
1016
+ "terminal-bench-2.0/terminal-bench-2.0": 62.9
1017
  }
1018
  },
1019
  {
 
1023
  "evaluator_relationship": null,
1024
  "benchmark_scores": {
1025
  "appworld_test_normal/appworld/test_normal": 0.0,
1026
+ "browsecompplus/browsecompplus": 0.43,
1027
  "livecodebenchpro/Hard Problems": 0.1594,
1028
  "livecodebenchpro/Medium Problems": 0.5211,
1029
  "livecodebenchpro/Easy Problems": 0.9014,
1030
  "swe-bench/swe-bench": 0.5455,
1031
  "tau-bench-2_airline/tau-bench-2/airline": 0.6,
1032
+ "tau-bench-2_retail/tau-bench-2/retail": 0.73,
1033
+ "tau-bench-2_telecom/tau-bench-2/telecom": 0.71
1034
  }
1035
  },
1036
  {
 
1048
  "developer": "OpenAI",
1049
  "evaluator_relationship": null,
1050
  "benchmark_scores": {
1051
+ "terminal-bench-2.0/terminal-bench-2.0": 77.3
1052
  }
1053
  },
1054
  {
 
1112
  "livecodebenchpro/Hard Problems": 0.0,
1113
  "livecodebenchpro/Medium Problems": 0.11267605633802817,
1114
  "livecodebenchpro/Easy Problems": 0.6619718309859155,
1115
+ "terminal-bench-2.0/terminal-bench-2.0": 18.7
1116
  }
1117
  },
1118
  {
 
1130
  "livecodebenchpro/Hard Problems": 0.0,
1131
  "livecodebenchpro/Medium Problems": 0.056338028169014086,
1132
  "livecodebenchpro/Easy Problems": 0.5070422535211268,
1133
+ "terminal-bench-2.0/terminal-bench-2.0": 3.4
1134
  }
1135
  },
1136
  {
data/developers/openassistant.json CHANGED
@@ -7,17 +7,17 @@
7
  "developer": "OpenAssistant",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
- "reward-bench/Score": 0.615,
 
 
 
 
 
11
  "reward-bench/Factuality": 0.3979,
12
  "reward-bench/Precise IF": 0.2875,
13
  "reward-bench/Math": 0.377,
14
- "reward-bench/Safety": 0.5446,
15
  "reward-bench/Focus": 0.1535,
16
- "reward-bench/Ties": 0.047,
17
- "reward-bench/Chat": 0.9246,
18
- "reward-bench/Chat Hard": 0.3728,
19
- "reward-bench/Reasoning": 0.5855,
20
- "reward-bench/Prior Sets (0.5 weight)": 0.6801
21
  }
22
  },
23
  {
@@ -26,17 +26,17 @@
26
  "developer": "OpenAssistant",
27
  "evaluator_relationship": null,
28
  "benchmark_scores": {
29
- "reward-bench/Score": 0.2648,
30
- "reward-bench/Chat": 0.8855,
31
- "reward-bench/Chat Hard": 0.4868,
32
- "reward-bench/Safety": 0.3244,
33
- "reward-bench/Reasoning": 0.7752,
34
- "reward-bench/Prior Sets (0.5 weight)": 0.6533,
35
  "reward-bench/Factuality": 0.3179,
36
  "reward-bench/Precise IF": 0.2625,
37
  "reward-bench/Math": 0.3934,
 
38
  "reward-bench/Focus": 0.2707,
39
- "reward-bench/Ties": 0.0198
 
 
 
 
40
  }
41
  },
42
  {
 
7
  "developer": "OpenAssistant",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
+ "reward-bench/Score": 0.2653,
11
+ "reward-bench/Chat": 0.9246,
12
+ "reward-bench/Chat Hard": 0.3728,
13
+ "reward-bench/Safety": 0.3289,
14
+ "reward-bench/Reasoning": 0.5855,
15
+ "reward-bench/Prior Sets (0.5 weight)": 0.6801,
16
  "reward-bench/Factuality": 0.3979,
17
  "reward-bench/Precise IF": 0.2875,
18
  "reward-bench/Math": 0.377,
 
19
  "reward-bench/Focus": 0.1535,
20
+ "reward-bench/Ties": 0.047
 
 
 
 
21
  }
22
  },
23
  {
 
26
  "developer": "OpenAssistant",
27
  "evaluator_relationship": null,
28
  "benchmark_scores": {
29
+ "reward-bench/Score": 0.6901,
 
 
 
 
 
30
  "reward-bench/Factuality": 0.3179,
31
  "reward-bench/Precise IF": 0.2625,
32
  "reward-bench/Math": 0.3934,
33
+ "reward-bench/Safety": 0.6311,
34
  "reward-bench/Focus": 0.2707,
35
+ "reward-bench/Ties": 0.0198,
36
+ "reward-bench/Chat": 0.8855,
37
+ "reward-bench/Chat Hard": 0.4868,
38
+ "reward-bench/Reasoning": 0.7752,
39
+ "reward-bench/Prior Sets (0.5 weight)": 0.6533
40
  }
41
  },
42
  {
data/developers/openbmb.json CHANGED
@@ -21,17 +21,17 @@
21
  "developer": "openbmb",
22
  "evaluator_relationship": null,
23
  "benchmark_scores": {
24
- "reward-bench/Score": 0.5806,
25
- "reward-bench/Chat": 0.9804,
26
- "reward-bench/Chat Hard": 0.6557,
27
- "reward-bench/Safety": 0.6267,
28
- "reward-bench/Reasoning": 0.8633,
29
- "reward-bench/Prior Sets (0.5 weight)": 0.7172,
30
  "reward-bench/Factuality": 0.6,
31
  "reward-bench/Precise IF": 0.3438,
32
  "reward-bench/Math": 0.5683,
 
33
  "reward-bench/Focus": 0.7475,
34
- "reward-bench/Ties": 0.5972
 
 
 
 
35
  }
36
  },
37
  {
 
21
  "developer": "openbmb",
22
  "evaluator_relationship": null,
23
  "benchmark_scores": {
24
+ "reward-bench/Score": 0.8159,
 
 
 
 
 
25
  "reward-bench/Factuality": 0.6,
26
  "reward-bench/Precise IF": 0.3438,
27
  "reward-bench/Math": 0.5683,
28
+ "reward-bench/Safety": 0.8135,
29
  "reward-bench/Focus": 0.7475,
30
+ "reward-bench/Ties": 0.5972,
31
+ "reward-bench/Chat": 0.9804,
32
+ "reward-bench/Chat Hard": 0.6557,
33
+ "reward-bench/Reasoning": 0.8633,
34
+ "reward-bench/Prior Sets (0.5 weight)": 0.7172
35
  }
36
  },
37
  {
data/developers/pku-alignment.json CHANGED
@@ -7,17 +7,17 @@
7
  "developer": "PKU-Alignment",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
- "reward-bench/Score": 0.3332,
11
- "reward-bench/Chat": 0.6173,
12
- "reward-bench/Chat Hard": 0.4232,
13
- "reward-bench/Safety": 0.7589,
14
- "reward-bench/Reasoning": 0.5482,
15
- "reward-bench/Prior Sets (0.5 weight)": 0.57,
16
  "reward-bench/Factuality": 0.3263,
17
  "reward-bench/Precise IF": 0.2313,
18
  "reward-bench/Math": 0.3989,
 
19
  "reward-bench/Focus": 0.2939,
20
- "reward-bench/Ties": -0.01
 
 
 
 
21
  }
22
  },
23
  {
@@ -26,17 +26,17 @@
26
  "developer": "PKU-Alignment",
27
  "evaluator_relationship": null,
28
  "benchmark_scores": {
29
- "reward-bench/Score": 0.4727,
 
 
 
 
 
30
  "reward-bench/Factuality": 0.2105,
31
  "reward-bench/Precise IF": 0.2938,
32
  "reward-bench/Math": 0.2623,
33
- "reward-bench/Safety": 0.3757,
34
  "reward-bench/Focus": 0.0646,
35
- "reward-bench/Ties": -0.01,
36
- "reward-bench/Chat": 0.8184,
37
- "reward-bench/Chat Hard": 0.2873,
38
- "reward-bench/Reasoning": 0.346,
39
- "reward-bench/Prior Sets (0.5 weight)": 0.5993
40
  }
41
  },
42
  {
@@ -64,17 +64,17 @@
64
  "developer": "PKU-Alignment",
65
  "evaluator_relationship": null,
66
  "benchmark_scores": {
67
- "reward-bench/Score": 0.6366,
 
 
 
 
 
68
  "reward-bench/Factuality": 0.2168,
69
  "reward-bench/Precise IF": 0.2562,
70
  "reward-bench/Math": 0.3825,
71
- "reward-bench/Safety": 0.6041,
72
  "reward-bench/Focus": 0.2606,
73
- "reward-bench/Ties": 0.0944,
74
- "reward-bench/Chat": 0.8994,
75
- "reward-bench/Chat Hard": 0.364,
76
- "reward-bench/Reasoning": 0.6887,
77
- "reward-bench/Prior Sets (0.5 weight)": 0.6171
78
  }
79
  }
80
  ]
 
7
  "developer": "PKU-Alignment",
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
+ "reward-bench/Score": 0.5798,
 
 
 
 
 
11
  "reward-bench/Factuality": 0.3263,
12
  "reward-bench/Precise IF": 0.2313,
13
  "reward-bench/Math": 0.3989,
14
+ "reward-bench/Safety": 0.7351,
15
  "reward-bench/Focus": 0.2939,
16
+ "reward-bench/Ties": -0.01,
17
+ "reward-bench/Chat": 0.6173,
18
+ "reward-bench/Chat Hard": 0.4232,
19
+ "reward-bench/Reasoning": 0.5482,
20
+ "reward-bench/Prior Sets (0.5 weight)": 0.57
21
  }
22
  },
23
  {
 
26
  "developer": "PKU-Alignment",
27
  "evaluator_relationship": null,
28
  "benchmark_scores": {
29
+ "reward-bench/Score": 0.1606,
30
+ "reward-bench/Chat": 0.8184,
31
+ "reward-bench/Chat Hard": 0.2873,
32
+ "reward-bench/Safety": 0.1422,
33
+ "reward-bench/Reasoning": 0.346,
34
+ "reward-bench/Prior Sets (0.5 weight)": 0.5993,
35
  "reward-bench/Factuality": 0.2105,
36
  "reward-bench/Precise IF": 0.2938,
37
  "reward-bench/Math": 0.2623,
 
38
  "reward-bench/Focus": 0.0646,
39
+ "reward-bench/Ties": -0.01
 
 
 
 
40
  }
41
  },
42
  {
 
64
  "developer": "PKU-Alignment",
65
  "evaluator_relationship": null,
66
  "benchmark_scores": {
67
+ "reward-bench/Score": 0.2544,
68
+ "reward-bench/Chat": 0.8994,
69
+ "reward-bench/Chat Hard": 0.364,
70
+ "reward-bench/Safety": 0.3156,
71
+ "reward-bench/Reasoning": 0.6887,
72
+ "reward-bench/Prior Sets (0.5 weight)": 0.6171,
73
  "reward-bench/Factuality": 0.2168,
74
  "reward-bench/Precise IF": 0.2562,
75
  "reward-bench/Math": 0.3825,
 
76
  "reward-bench/Focus": 0.2606,
77
+ "reward-bench/Ties": 0.0944
 
 
 
 
78
  }
79
  }
80
  ]
data/developers/primeintellect.json CHANGED
@@ -8,11 +8,11 @@
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
  "hfopenllm_v2/IFEval": 0.1757,
11
- "hfopenllm_v2/BBH": 0.276,
12
  "hfopenllm_v2/MATH Level 5": 0.0,
13
- "hfopenllm_v2/GPQA": 0.2534,
14
- "hfopenllm_v2/MUSR": 0.3339,
15
- "hfopenllm_v2/MMLU-PRO": 0.1123
16
  }
17
  },
18
  {
 
8
  "evaluator_relationship": null,
9
  "benchmark_scores": {
10
  "hfopenllm_v2/IFEval": 0.1757,
11
+ "hfopenllm_v2/BBH": 0.274,
12
  "hfopenllm_v2/MATH Level 5": 0.0,
13
+ "hfopenllm_v2/GPQA": 0.25,
14
+ "hfopenllm_v2/MUSR": 0.3753,
15
+ "hfopenllm_v2/MMLU-PRO": 0.112
16
  }
17
  },
18
  {
data/developers/princeton-nlp.json CHANGED
@@ -49,12 +49,12 @@
49
  "developer": "princeton-nlp",
50
  "evaluator_relationship": null,
51
  "benchmark_scores": {
52
- "hfopenllm_v2/IFEval": 0.5508,
53
- "hfopenllm_v2/BBH": 0.5028,
54
- "hfopenllm_v2/MATH Level 5": 0.0529,
55
- "hfopenllm_v2/GPQA": 0.2861,
56
- "hfopenllm_v2/MUSR": 0.4266,
57
- "hfopenllm_v2/MMLU-PRO": 0.3231
58
  }
59
  },
60
  {
 
49
  "developer": "princeton-nlp",
50
  "evaluator_relationship": null,
51
  "benchmark_scores": {
52
+ "hfopenllm_v2/IFEval": 0.3978,
53
+ "hfopenllm_v2/BBH": 0.4983,
54
+ "hfopenllm_v2/MATH Level 5": 0.0582,
55
+ "hfopenllm_v2/GPQA": 0.281,
56
+ "hfopenllm_v2/MUSR": 0.425,
57
+ "hfopenllm_v2/MMLU-PRO": 0.3246
58
  }
59
  },
60
  {