benchbench / assets /combined_20240704.csv
Yotam Perlitz
build app
0f8e886
raw
history blame
360 kB
,model,score,scenario,source,aggragated_from,tag
0,gpt_4_turbo_2024_04_09,82.6,arena_hard,arena_hard_2404,[],holistic
1,gpt_4_0125_preview,78.0,arena_hard,arena_hard_2404,[],holistic
2,gemini_1.5_pro_api_preview,72.0,arena_hard,arena_hard_2404,[],holistic
3,yi_large,63.7,arena_hard,arena_hard_2404,[],holistic
4,claude_3_opus_20240229,60.4,arena_hard,arena_hard_2404,[],holistic
5,glm_4,55.7,arena_hard,arena_hard_2404,[],holistic
6,gpt_4_0314,50.0,arena_hard,arena_hard_2404,[],holistic
7,gemini_1.5_flash_api_preview,49.6,arena_hard,arena_hard_2404,[],holistic
8,claude_3_sonnet_20240229,46.8,arena_hard,arena_hard_2404,[],holistic
9,claude_3_haiku_20240307,41.5,arena_hard,arena_hard_2404,[],holistic
10,llama_3_70b_chat,41.1,arena_hard,arena_hard_2404,[],holistic
11,gpt_4_0613,37.9,arena_hard,arena_hard_2404,[],holistic
12,mistral_large_2402,37.7,arena_hard,arena_hard_2404,[],holistic
13,mixtral_8x22b_instruct_v0.1,36.4,arena_hard,arena_hard_2404,[],holistic
14,qwen1.5_72b_chat,36.1,arena_hard,arena_hard_2404,[],holistic
15,command_r_plus,33.1,arena_hard,arena_hard_2404,[],holistic
16,mistral_medium,31.9,arena_hard,arena_hard_2404,[],holistic
17,mistral_next,27.4,arena_hard,arena_hard_2404,[],holistic
18,gpt_3.5_turbo_0613,24.8,arena_hard,arena_hard_2404,[],holistic
19,claude_2.0,24.0,arena_hard,arena_hard_2404,[],holistic
20,dbrx_instructruct,23.9,arena_hard,arena_hard_2404,[],holistic
21,mixtral_8x7b_instruct_v0.1,23.4,arena_hard,arena_hard_2404,[],holistic
22,gpt_3.5_turbo_0125,23.3,arena_hard,arena_hard_2404,[],holistic
23,yi_34b_chat,23.1,arena_hard,arena_hard_2404,[],holistic
24,starling_lm_7b_beta,23.0,arena_hard,arena_hard_2404,[],holistic
25,claude_2.1,22.8,arena_hard,arena_hard_2404,[],holistic
26,snorkel_mistral_pairrm_dpo,20.7,arena_hard,arena_hard_2404,[],holistic
27,llama_3_8b_chat,20.6,arena_hard,arena_hard_2404,[],holistic
28,gpt_3.5_turbo_1106,18.9,arena_hard,arena_hard_2404,[],holistic
29,gpt_3.5_turbo_0301,18.1,arena_hard,arena_hard_2404,[],holistic
30,gemini_1.0_pro,17.8,arena_hard,arena_hard_2404,[],holistic
31,snowflake_arctic_instruct,17.6,arena_hard,arena_hard_2404,[],holistic
32,command_r,17.0,arena_hard,arena_hard_2404,[],holistic
33,phi_3_mini_128k_instruct,15.4,arena_hard,arena_hard_2404,[],holistic
34,tulu_2_dpo_70b,15.0,arena_hard,arena_hard_2404,[],holistic
35,starling_lm_7b_alpha,12.8,arena_hard,arena_hard_2404,[],holistic
36,mistral_7b_instruct,12.6,arena_hard,arena_hard_2404,[],holistic
37,gemma_1.1_7b_it,12.1,arena_hard,arena_hard_2404,[],holistic
38,llama_2_70b_chat,11.6,arena_hard,arena_hard_2404,[],holistic
39,vicuna_33b_v1.3,8.6,arena_hard,arena_hard_2404,[],holistic
40,gemma_7b_it,7.5,arena_hard,arena_hard_2404,[],holistic
41,llama_2_7b_chat,4.6,arena_hard,arena_hard_2404,[],holistic
42,gemma_1.1_2b_it,3.4,arena_hard,arena_hard_2404,[],holistic
43,gemma_2b_it,3.0,arena_hard,arena_hard_2404,[],holistic
0,gpt_4o_2024_05_13,64.7,mixeval_hard-mixed,mixeval_240601,[],holistic
1,claude_3_opus,63.5,mixeval_hard-mixed,mixeval_240601,[],holistic
2,gpt_4_turbo_2024_04_09,62.6,mixeval_hard-mixed,mixeval_240601,[],holistic
3,gemini_1.5_pro_api_0409,58.7,mixeval_hard-mixed,mixeval_240601,[],holistic
4,yi_large_preview,56.8,mixeval_hard-mixed,mixeval_240601,[],holistic
5,llama_3_70b_instruct,55.9,mixeval_hard-mixed,mixeval_240601,[],holistic
6,qwen_max_0428,55.8,mixeval_hard-mixed,mixeval_240601,[],holistic
7,claude_3_sonnet,54.0,mixeval_hard-mixed,mixeval_240601,[],holistic
8,reka_core_20240415,52.9,mixeval_hard-mixed,mixeval_240601,[],holistic
9,mammoth2_8x7b_plus,51.8,mixeval_hard-mixed,mixeval_240601,[],holistic
10,deepseek_v2,51.7,mixeval_hard-mixed,mixeval_240601,[],holistic
11,command_r_plus,51.4,mixeval_hard-mixed,mixeval_240601,[],holistic
12,yi_1.5_34b_chat,51.2,mixeval_hard-mixed,mixeval_240601,[],holistic
13,mistral_large,50.3,mixeval_hard-mixed,mixeval_240601,[],holistic
14,qwen1.5_72b_chat,48.3,mixeval_hard-mixed,mixeval_240601,[],holistic
15,mistral_medium,47.8,mixeval_hard-mixed,mixeval_240601,[],holistic
16,gemini_1.0_pro,46.4,mixeval_hard-mixed,mixeval_240601,[],holistic
17,reka_flash_20240226,46.2,mixeval_hard-mixed,mixeval_240601,[],holistic
18,mistral_small,46.2,mixeval_hard-mixed,mixeval_240601,[],holistic
19,llama_3_8b_instruct,45.6,mixeval_hard-mixed,mixeval_240601,[],holistic
20,command_r,45.2,mixeval_hard-mixed,mixeval_240601,[],holistic
21,qwen1.5_32b_chat,43.3,mixeval_hard-mixed,mixeval_240601,[],holistic
22,gpt_3.5_turbo_0125,43.0,mixeval_hard-mixed,mixeval_240601,[],holistic
23,claude_3_haiku,42.8,mixeval_hard-mixed,mixeval_240601,[],holistic
24,yi_34b_chat,42.6,mixeval_hard-mixed,mixeval_240601,[],holistic
25,mixtral_8x7b_instruct_v0.1,42.5,mixeval_hard-mixed,mixeval_240601,[],holistic
26,starling_lm_7b_beta,41.8,mixeval_hard-mixed,mixeval_240601,[],holistic
27,yi_1.5_9b_chat,40.9,mixeval_hard-mixed,mixeval_240601,[],holistic
28,gemma_1.1_7b_it,39.1,mixeval_hard-mixed,mixeval_240601,[],holistic
29,vicuna_33b_v1.3,38.7,mixeval_hard-mixed,mixeval_240601,[],holistic
30,llama_2_70b_chat,38.0,mixeval_hard-mixed,mixeval_240601,[],holistic
31,map_neo_instruct_v0.1,37.8,mixeval_hard-mixed,mixeval_240601,[],holistic
32,mistral_7b_instruct_v0.2,36.2,mixeval_hard-mixed,mixeval_240601,[],holistic
33,qwen1.5_7b_chat,35.5,mixeval_hard-mixed,mixeval_240601,[],holistic
34,reka_edge_20240208,32.2,mixeval_hard-mixed,mixeval_240601,[],holistic
35,zephyr_7b_beta,31.6,mixeval_hard-mixed,mixeval_240601,[],holistic
36,llama_2_7b_chat,30.8,mixeval_hard-mixed,mixeval_240601,[],holistic
37,yi_6b_chat,30.1,mixeval_hard-mixed,mixeval_240601,[],holistic
38,qwen1.5_moe_a2.7b_chat,29.1,mixeval_hard-mixed,mixeval_240601,[],holistic
39,gemma_1.1_2b_it,28.4,mixeval_hard-mixed,mixeval_240601,[],holistic
40,vicuna_7b_v1.5,27.8,mixeval_hard-mixed,mixeval_240601,[],holistic
41,olmo_7b_instruct,26.7,mixeval_hard-mixed,mixeval_240601,[],holistic
42,qwen1.5_4b_chat,24.6,mixeval_hard-mixed,mixeval_240601,[],holistic
43,jetmoe_8b_chat,24.3,mixeval_hard-mixed,mixeval_240601,[],holistic
44,mpt_7b_chat,23.8,mixeval_hard-mixed,mixeval_240601,[],holistic
45,llama_3_70b,54.0,mixeval_hard-mixed,mixeval_240601,[],holistic
46,qwen1.5_72b,41.9,mixeval_hard-mixed,mixeval_240601,[],holistic
47,yi_34b,47.2,mixeval_hard-mixed,mixeval_240601,[],holistic
48,qwen1.5_32b,41.0,mixeval_hard-mixed,mixeval_240601,[],holistic
49,mixtral_8x7b,40.7,mixeval_hard-mixed,mixeval_240601,[],holistic
50,llama_2_70b,41.6,mixeval_hard-mixed,mixeval_240601,[],holistic
51,qwen1.5_moe_a2.7b,33.5,mixeval_hard-mixed,mixeval_240601,[],holistic
52,qwen1.5_7b,33.7,mixeval_hard-mixed,mixeval_240601,[],holistic
53,llama_3_8b,31.7,mixeval_hard-mixed,mixeval_240601,[],holistic
54,mistral_7b,27.1,mixeval_hard-mixed,mixeval_240601,[],holistic
55,gemma_7b,32.7,mixeval_hard-mixed,mixeval_240601,[],holistic
56,yi_6b,30.4,mixeval_hard-mixed,mixeval_240601,[],holistic
57,qwen1.5_4b,23.5,mixeval_hard-mixed,mixeval_240601,[],holistic
58,jetmoe_8b,27.0,mixeval_hard-mixed,mixeval_240601,[],holistic
59,deepseek_7b,21.7,mixeval_hard-mixed,mixeval_240601,[],holistic
60,phi_2,21.9,mixeval_hard-mixed,mixeval_240601,[],holistic
61,deepseekmoe_16b,24.2,mixeval_hard-mixed,mixeval_240601,[],holistic
62,llama_2_7b,22.1,mixeval_hard-mixed,mixeval_240601,[],holistic
63,gemma_2b,22.6,mixeval_hard-mixed,mixeval_240601,[],holistic
64,olmo_7b,21.2,mixeval_hard-mixed,mixeval_240601,[],holistic
65,mpt_7b,17.4,mixeval_hard-mixed,mixeval_240601,[],holistic
66,gpt_4o_2024_05_13,87.9,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
67,claude_3_opus,88.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
68,gpt_4_turbo_2024_04_09,88.8,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
69,gemini_1.5_pro_api_0409,84.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
70,yi_large_preview,84.4,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
71,llama_3_70b_instruct,84.0,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
72,qwen_max_0428,86.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
73,claude_3_sonnet,81.7,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
74,reka_core_20240415,83.3,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
75,mammoth2_8x7b_plus,81.5,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
76,deepseek_v2,83.7,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
77,command_r_plus,81.5,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
78,yi_1.5_34b_chat,81.7,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
79,mistral_large,84.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
80,qwen1.5_72b_chat,84.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
81,mistral_medium,81.9,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
82,gemini_1.0_pro,78.9,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
83,reka_flash_20240226,79.8,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
84,mistral_small,81.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
85,llama_3_8b_instruct,75.0,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
86,command_r,77.0,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
87,qwen1.5_32b_chat,81.0,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
88,gpt_3.5_turbo_0125,79.7,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
89,claude_3_haiku,79.7,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
90,yi_34b_chat,80.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
91,mixtral_8x7b_instruct_v0.1,76.4,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
92,starling_lm_7b_beta,74.8,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
93,yi_1.5_9b_chat,74.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
94,gemma_1.1_7b_it,69.6,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
95,vicuna_33b_v1.3,66.3,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
96,llama_2_70b_chat,74.6,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
97,map_neo_instruct_v0.1,70.0,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
98,mistral_7b_instruct_v0.2,70.0,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
99,qwen1.5_7b_chat,71.4,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
100,reka_edge_20240208,68.5,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
101,zephyr_7b_beta,69.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
102,llama_2_7b_chat,61.7,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
103,yi_6b_chat,65.6,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
104,qwen1.5_moe_a2.7b_chat,69.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
105,gemma_1.1_2b_it,51.9,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
106,vicuna_7b_v1.5,60.3,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
107,olmo_7b_instruct,55.0,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
108,qwen1.5_4b_chat,57.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
109,jetmoe_8b_chat,51.6,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
110,mpt_7b_chat,43.8,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
111,llama_3_70b,82.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
112,qwen1.5_72b,79.5,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
113,yi_34b,78.3,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
114,qwen1.5_32b,77.6,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
115,mixtral_8x7b,74.0,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
116,llama_2_70b,73.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
117,qwen1.5_moe_a2.7b,70.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
118,qwen1.5_7b,68.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
119,llama_3_8b,65.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
120,mistral_7b,64.8,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
121,gemma_7b,64.7,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
122,yi_6b,63.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
123,qwen1.5_4b,58.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
124,jetmoe_8b,57.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
125,deepseek_7b,52.2,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
126,phi_2,51.9,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
127,deepseekmoe_16b,51.4,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
128,llama_2_7b,43.1,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
129,gemma_2b,38.9,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
130,olmo_7b,31.8,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
131,mpt_7b,30.8,mixeval,mixeval_240601,"['mixeval_hard-mixed', 'arena_elo-mixed', 'triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa_hard-mixed', 'mmlu_hard-mixed', 'drop_hard-mixed', 'boolq-mixed']",holistic
132,gpt_4o_2024_05_13,1287.0,arena_elo-mixed,mixeval_240601,[],holistic
133,claude_3_opus,1248.0,arena_elo-mixed,mixeval_240601,[],holistic
134,gpt_4_turbo_2024_04_09,1256.0,arena_elo-mixed,mixeval_240601,[],holistic
135,gemini_1.5_pro_api_0409,1258.0,arena_elo-mixed,mixeval_240601,[],holistic
136,yi_large_preview,1239.0,arena_elo-mixed,mixeval_240601,[],holistic
137,llama_3_70b_instruct,1208.0,arena_elo-mixed,mixeval_240601,[],holistic
138,qwen_max_0428,1184.0,arena_elo-mixed,mixeval_240601,[],holistic
139,claude_3_sonnet,1201.0,arena_elo-mixed,mixeval_240601,[],holistic
143,command_r_plus,1189.0,arena_elo-mixed,mixeval_240601,[],holistic
145,mistral_large,1156.0,arena_elo-mixed,mixeval_240601,[],holistic
146,qwen1.5_72b_chat,1147.0,arena_elo-mixed,mixeval_240601,[],holistic
147,mistral_medium,1148.0,arena_elo-mixed,mixeval_240601,[],holistic
148,gemini_1.0_pro,1131.0,arena_elo-mixed,mixeval_240601,[],holistic
149,reka_flash_20240226,1148.0,arena_elo-mixed,mixeval_240601,[],holistic
151,llama_3_8b_instruct,1153.0,arena_elo-mixed,mixeval_240601,[],holistic
152,command_r,1147.0,arena_elo-mixed,mixeval_240601,[],holistic
153,qwen1.5_32b_chat,1126.0,arena_elo-mixed,mixeval_240601,[],holistic
154,gpt_3.5_turbo_0125,1102.0,arena_elo-mixed,mixeval_240601,[],holistic
155,claude_3_haiku,1178.0,arena_elo-mixed,mixeval_240601,[],holistic
156,yi_34b_chat,1111.0,arena_elo-mixed,mixeval_240601,[],holistic
157,mixtral_8x7b_instruct_v0.1,1114.0,arena_elo-mixed,mixeval_240601,[],holistic
158,starling_lm_7b_beta,1119.0,arena_elo-mixed,mixeval_240601,[],holistic
160,gemma_1.1_7b_it,1084.0,arena_elo-mixed,mixeval_240601,[],holistic
161,vicuna_33b_v1.3,1090.0,arena_elo-mixed,mixeval_240601,[],holistic
162,llama_2_70b_chat,1093.0,arena_elo-mixed,mixeval_240601,[],holistic
164,mistral_7b_instruct_v0.2,1072.0,arena_elo-mixed,mixeval_240601,[],holistic
165,qwen1.5_7b_chat,1069.0,arena_elo-mixed,mixeval_240601,[],holistic
168,llama_2_7b_chat,1037.0,arena_elo-mixed,mixeval_240601,[],holistic
171,gemma_1.1_2b_it,1019.0,arena_elo-mixed,mixeval_240601,[],holistic
172,vicuna_7b_v1.5,1004.0,arena_elo-mixed,mixeval_240601,[],holistic
173,olmo_7b_instruct,1015.0,arena_elo-mixed,mixeval_240601,[],holistic
174,qwen1.5_4b_chat,988.0,arena_elo-mixed,mixeval_240601,[],holistic
176,mpt_7b_chat,927.0,arena_elo-mixed,mixeval_240601,[],holistic
198,gpt_4o_2024_05_13,88.0,triviaqa-mixed,mixeval_240601,[],knowledge
199,claude_3_opus,90.4,triviaqa-mixed,mixeval_240601,[],knowledge
200,gpt_4_turbo_2024_04_09,91.2,triviaqa-mixed,mixeval_240601,[],knowledge
201,gemini_1.5_pro_api_0409,85.3,triviaqa-mixed,mixeval_240601,[],knowledge
202,yi_large_preview,81.7,triviaqa-mixed,mixeval_240601,[],knowledge
203,llama_3_70b_instruct,83.1,triviaqa-mixed,mixeval_240601,[],knowledge
204,qwen_max_0428,86.7,triviaqa-mixed,mixeval_240601,[],knowledge
205,claude_3_sonnet,84.2,triviaqa-mixed,mixeval_240601,[],knowledge
206,reka_core_20240415,82.8,triviaqa-mixed,mixeval_240601,[],knowledge
207,mammoth2_8x7b_plus,83.0,triviaqa-mixed,mixeval_240601,[],knowledge
208,deepseek_v2,84.4,triviaqa-mixed,mixeval_240601,[],knowledge
209,command_r_plus,83.3,triviaqa-mixed,mixeval_240601,[],knowledge
210,yi_1.5_34b_chat,78.4,triviaqa-mixed,mixeval_240601,[],knowledge
211,mistral_large,88.3,triviaqa-mixed,mixeval_240601,[],knowledge
212,qwen1.5_72b_chat,83.9,triviaqa-mixed,mixeval_240601,[],knowledge
213,mistral_medium,86.8,triviaqa-mixed,mixeval_240601,[],knowledge
214,gemini_1.0_pro,81.0,triviaqa-mixed,mixeval_240601,[],knowledge
215,reka_flash_20240226,76.4,triviaqa-mixed,mixeval_240601,[],knowledge
216,mistral_small,85.1,triviaqa-mixed,mixeval_240601,[],knowledge
217,llama_3_8b_instruct,71.7,triviaqa-mixed,mixeval_240601,[],knowledge
218,command_r,80.9,triviaqa-mixed,mixeval_240601,[],knowledge
219,qwen1.5_32b_chat,75.7,triviaqa-mixed,mixeval_240601,[],knowledge
220,gpt_3.5_turbo_0125,85.2,triviaqa-mixed,mixeval_240601,[],knowledge
221,claude_3_haiku,79.9,triviaqa-mixed,mixeval_240601,[],knowledge
222,yi_34b_chat,82.7,triviaqa-mixed,mixeval_240601,[],knowledge
223,mixtral_8x7b_instruct_v0.1,82.5,triviaqa-mixed,mixeval_240601,[],knowledge
224,starling_lm_7b_beta,75.1,triviaqa-mixed,mixeval_240601,[],knowledge
225,yi_1.5_9b_chat,61.3,triviaqa-mixed,mixeval_240601,[],knowledge
226,gemma_1.1_7b_it,64.3,triviaqa-mixed,mixeval_240601,[],knowledge
227,vicuna_33b_v1.3,79.2,triviaqa-mixed,mixeval_240601,[],knowledge
228,llama_2_70b_chat,80.0,triviaqa-mixed,mixeval_240601,[],knowledge
229,map_neo_instruct_v0.1,62.1,triviaqa-mixed,mixeval_240601,[],knowledge
230,mistral_7b_instruct_v0.2,73.7,triviaqa-mixed,mixeval_240601,[],knowledge
231,qwen1.5_7b_chat,64.1,triviaqa-mixed,mixeval_240601,[],knowledge
232,reka_edge_20240208,60.0,triviaqa-mixed,mixeval_240601,[],knowledge
233,zephyr_7b_beta,74.7,triviaqa-mixed,mixeval_240601,[],knowledge
234,llama_2_7b_chat,68.8,triviaqa-mixed,mixeval_240601,[],knowledge
235,yi_6b_chat,66.1,triviaqa-mixed,mixeval_240601,[],knowledge
236,qwen1.5_moe_a2.7b_chat,65.9,triviaqa-mixed,mixeval_240601,[],knowledge
237,gemma_1.1_2b_it,53.7,triviaqa-mixed,mixeval_240601,[],knowledge
238,vicuna_7b_v1.5,66.4,triviaqa-mixed,mixeval_240601,[],knowledge
239,olmo_7b_instruct,51.7,triviaqa-mixed,mixeval_240601,[],knowledge
240,qwen1.5_4b_chat,46.0,triviaqa-mixed,mixeval_240601,[],knowledge
241,jetmoe_8b_chat,46.8,triviaqa-mixed,mixeval_240601,[],knowledge
242,mpt_7b_chat,50.2,triviaqa-mixed,mixeval_240601,[],knowledge
243,llama_3_70b,83.1,triviaqa-mixed,mixeval_240601,[],knowledge
244,qwen1.5_72b,78.4,triviaqa-mixed,mixeval_240601,[],knowledge
245,yi_34b,72.1,triviaqa-mixed,mixeval_240601,[],knowledge
246,qwen1.5_32b,71.9,triviaqa-mixed,mixeval_240601,[],knowledge
247,mixtral_8x7b,77.3,triviaqa-mixed,mixeval_240601,[],knowledge
248,llama_2_70b,78.7,triviaqa-mixed,mixeval_240601,[],knowledge
249,qwen1.5_moe_a2.7b,71.3,triviaqa-mixed,mixeval_240601,[],knowledge
250,qwen1.5_7b,61.4,triviaqa-mixed,mixeval_240601,[],knowledge
251,llama_3_8b,65.2,triviaqa-mixed,mixeval_240601,[],knowledge
252,mistral_7b,67.2,triviaqa-mixed,mixeval_240601,[],knowledge
253,gemma_7b,66.0,triviaqa-mixed,mixeval_240601,[],knowledge
254,yi_6b,54.7,triviaqa-mixed,mixeval_240601,[],knowledge
255,qwen1.5_4b,47.8,triviaqa-mixed,mixeval_240601,[],knowledge
256,jetmoe_8b,53.4,triviaqa-mixed,mixeval_240601,[],knowledge
257,deepseek_7b,58.7,triviaqa-mixed,mixeval_240601,[],knowledge
258,phi_2,37.0,triviaqa-mixed,mixeval_240601,[],knowledge
259,deepseekmoe_16b,64.2,triviaqa-mixed,mixeval_240601,[],knowledge
260,llama_2_7b,55.5,triviaqa-mixed,mixeval_240601,[],knowledge
261,gemma_2b,41.5,triviaqa-mixed,mixeval_240601,[],knowledge
262,olmo_7b,38.4,triviaqa-mixed,mixeval_240601,[],knowledge
263,mpt_7b,33.5,triviaqa-mixed,mixeval_240601,[],knowledge
264,gpt_4o_2024_05_13,85.4,mmlu-mixed,mixeval_240601,[],knowledge
265,claude_3_opus,83.2,mmlu-mixed,mixeval_240601,[],knowledge
266,gpt_4_turbo_2024_04_09,82.8,mmlu-mixed,mixeval_240601,[],knowledge
267,gemini_1.5_pro_api_0409,79.2,mmlu-mixed,mixeval_240601,[],knowledge
268,yi_large_preview,80.9,mmlu-mixed,mixeval_240601,[],knowledge
269,llama_3_70b_instruct,80.5,mmlu-mixed,mixeval_240601,[],knowledge
270,qwen_max_0428,80.6,mmlu-mixed,mixeval_240601,[],knowledge
271,claude_3_sonnet,74.7,mmlu-mixed,mixeval_240601,[],knowledge
272,reka_core_20240415,79.3,mmlu-mixed,mixeval_240601,[],knowledge
273,mammoth2_8x7b_plus,74.5,mmlu-mixed,mixeval_240601,[],knowledge
274,deepseek_v2,77.3,mmlu-mixed,mixeval_240601,[],knowledge
275,command_r_plus,78.9,mmlu-mixed,mixeval_240601,[],knowledge
276,yi_1.5_34b_chat,76.4,mmlu-mixed,mixeval_240601,[],knowledge
277,mistral_large,80.2,mmlu-mixed,mixeval_240601,[],knowledge
278,qwen1.5_72b_chat,80.1,mmlu-mixed,mixeval_240601,[],knowledge
279,mistral_medium,76.3,mmlu-mixed,mixeval_240601,[],knowledge
280,gemini_1.0_pro,74.9,mmlu-mixed,mixeval_240601,[],knowledge
281,reka_flash_20240226,75.4,mmlu-mixed,mixeval_240601,[],knowledge
282,mistral_small,75.2,mmlu-mixed,mixeval_240601,[],knowledge
283,llama_3_8b_instruct,71.9,mmlu-mixed,mixeval_240601,[],knowledge
284,command_r,75.0,mmlu-mixed,mixeval_240601,[],knowledge
285,qwen1.5_32b_chat,78.0,mmlu-mixed,mixeval_240601,[],knowledge
286,gpt_3.5_turbo_0125,74.5,mmlu-mixed,mixeval_240601,[],knowledge
287,claude_3_haiku,76.1,mmlu-mixed,mixeval_240601,[],knowledge
288,yi_34b_chat,73.6,mmlu-mixed,mixeval_240601,[],knowledge
289,mixtral_8x7b_instruct_v0.1,72.0,mmlu-mixed,mixeval_240601,[],knowledge
290,starling_lm_7b_beta,69.0,mmlu-mixed,mixeval_240601,[],knowledge
291,yi_1.5_9b_chat,72.6,mmlu-mixed,mixeval_240601,[],knowledge
292,gemma_1.1_7b_it,66.9,mmlu-mixed,mixeval_240601,[],knowledge
293,vicuna_33b_v1.3,59.2,mmlu-mixed,mixeval_240601,[],knowledge
294,llama_2_70b_chat,69.8,mmlu-mixed,mixeval_240601,[],knowledge
295,map_neo_instruct_v0.1,66.7,mmlu-mixed,mixeval_240601,[],knowledge
296,mistral_7b_instruct_v0.2,67.3,mmlu-mixed,mixeval_240601,[],knowledge
297,qwen1.5_7b_chat,68.7,mmlu-mixed,mixeval_240601,[],knowledge
298,reka_edge_20240208,63.6,mmlu-mixed,mixeval_240601,[],knowledge
299,zephyr_7b_beta,64.9,mmlu-mixed,mixeval_240601,[],knowledge
300,llama_2_7b_chat,59.4,mmlu-mixed,mixeval_240601,[],knowledge
301,yi_6b_chat,65.4,mmlu-mixed,mixeval_240601,[],knowledge
302,qwen1.5_moe_a2.7b_chat,69.5,mmlu-mixed,mixeval_240601,[],knowledge
303,gemma_1.1_2b_it,51.5,mmlu-mixed,mixeval_240601,[],knowledge
304,vicuna_7b_v1.5,58.7,mmlu-mixed,mixeval_240601,[],knowledge
305,olmo_7b_instruct,57.1,mmlu-mixed,mixeval_240601,[],knowledge
306,qwen1.5_4b_chat,61.4,mmlu-mixed,mixeval_240601,[],knowledge
307,jetmoe_8b_chat,58.5,mmlu-mixed,mixeval_240601,[],knowledge
308,mpt_7b_chat,37.8,mmlu-mixed,mixeval_240601,[],knowledge
309,llama_3_70b,79.8,mmlu-mixed,mixeval_240601,[],knowledge
310,qwen1.5_72b,78.8,mmlu-mixed,mixeval_240601,[],knowledge
311,yi_34b,79.3,mmlu-mixed,mixeval_240601,[],knowledge
312,qwen1.5_32b,77.2,mmlu-mixed,mixeval_240601,[],knowledge
313,mixtral_8x7b,71.6,mmlu-mixed,mixeval_240601,[],knowledge
314,llama_2_70b,70.8,mmlu-mixed,mixeval_240601,[],knowledge
315,qwen1.5_moe_a2.7b,69.4,mmlu-mixed,mixeval_240601,[],knowledge
316,qwen1.5_7b,67.0,mmlu-mixed,mixeval_240601,[],knowledge
317,llama_3_8b,69.5,mmlu-mixed,mixeval_240601,[],knowledge
318,mistral_7b,68.5,mmlu-mixed,mixeval_240601,[],knowledge
319,gemma_7b,67.4,mmlu-mixed,mixeval_240601,[],knowledge
320,yi_6b,71.2,mmlu-mixed,mixeval_240601,[],knowledge
321,qwen1.5_4b,59.6,mmlu-mixed,mixeval_240601,[],knowledge
322,jetmoe_8b,55.3,mmlu-mixed,mixeval_240601,[],knowledge
323,deepseek_7b,53.3,mmlu-mixed,mixeval_240601,[],knowledge
324,phi_2,62.5,mmlu-mixed,mixeval_240601,[],knowledge
325,deepseekmoe_16b,49.9,mmlu-mixed,mixeval_240601,[],knowledge
326,llama_2_7b,40.8,mmlu-mixed,mixeval_240601,[],knowledge
327,gemma_2b,37.4,mmlu-mixed,mixeval_240601,[],knowledge
328,olmo_7b,29.7,mmlu-mixed,mixeval_240601,[],knowledge
329,mpt_7b,30.9,mmlu-mixed,mixeval_240601,[],knowledge
330,gpt_4o_2024_05_13,87.9,drop-mixed,mixeval_240601,[],reasoning
331,claude_3_opus,91.5,drop-mixed,mixeval_240601,[],reasoning
332,gpt_4_turbo_2024_04_09,91.0,drop-mixed,mixeval_240601,[],reasoning
333,gemini_1.5_pro_api_0409,84.2,drop-mixed,mixeval_240601,[],reasoning
334,yi_large_preview,87.0,drop-mixed,mixeval_240601,[],reasoning
335,llama_3_70b_instruct,90.1,drop-mixed,mixeval_240601,[],reasoning
336,qwen_max_0428,85.4,drop-mixed,mixeval_240601,[],reasoning
337,claude_3_sonnet,87.7,drop-mixed,mixeval_240601,[],reasoning
338,reka_core_20240415,88.1,drop-mixed,mixeval_240601,[],reasoning
339,mammoth2_8x7b_plus,85.7,drop-mixed,mixeval_240601,[],reasoning
340,deepseek_v2,85.3,drop-mixed,mixeval_240601,[],reasoning
341,command_r_plus,80.4,drop-mixed,mixeval_240601,[],reasoning
342,yi_1.5_34b_chat,87.0,drop-mixed,mixeval_240601,[],reasoning
343,mistral_large,88.6,drop-mixed,mixeval_240601,[],reasoning
344,qwen1.5_72b_chat,85.1,drop-mixed,mixeval_240601,[],reasoning
345,mistral_medium,83.2,drop-mixed,mixeval_240601,[],reasoning
346,gemini_1.0_pro,82.6,drop-mixed,mixeval_240601,[],reasoning
347,reka_flash_20240226,86.7,drop-mixed,mixeval_240601,[],reasoning
348,mistral_small,86.1,drop-mixed,mixeval_240601,[],reasoning
349,llama_3_8b_instruct,86.4,drop-mixed,mixeval_240601,[],reasoning
350,command_r,72.0,drop-mixed,mixeval_240601,[],reasoning
351,qwen1.5_32b_chat,82.9,drop-mixed,mixeval_240601,[],reasoning
352,gpt_3.5_turbo_0125,84.8,drop-mixed,mixeval_240601,[],reasoning
353,claude_3_haiku,85.0,drop-mixed,mixeval_240601,[],reasoning
354,yi_34b_chat,86.1,drop-mixed,mixeval_240601,[],reasoning
355,mixtral_8x7b_instruct_v0.1,79.5,drop-mixed,mixeval_240601,[],reasoning
356,starling_lm_7b_beta,86.4,drop-mixed,mixeval_240601,[],reasoning
357,yi_1.5_9b_chat,83.9,drop-mixed,mixeval_240601,[],reasoning
358,gemma_1.1_7b_it,80.6,drop-mixed,mixeval_240601,[],reasoning
359,vicuna_33b_v1.3,71.4,drop-mixed,mixeval_240601,[],reasoning
360,llama_2_70b_chat,79.8,drop-mixed,mixeval_240601,[],reasoning
361,map_neo_instruct_v0.1,75.5,drop-mixed,mixeval_240601,[],reasoning
362,mistral_7b_instruct_v0.2,72.8,drop-mixed,mixeval_240601,[],reasoning
363,qwen1.5_7b_chat,76.4,drop-mixed,mixeval_240601,[],reasoning
364,reka_edge_20240208,80.0,drop-mixed,mixeval_240601,[],reasoning
365,zephyr_7b_beta,77.3,drop-mixed,mixeval_240601,[],reasoning
366,llama_2_7b_chat,69.3,drop-mixed,mixeval_240601,[],reasoning
367,yi_6b_chat,70.5,drop-mixed,mixeval_240601,[],reasoning
368,qwen1.5_moe_a2.7b_chat,64.6,drop-mixed,mixeval_240601,[],reasoning
369,gemma_1.1_2b_it,59.8,drop-mixed,mixeval_240601,[],reasoning
370,vicuna_7b_v1.5,68.3,drop-mixed,mixeval_240601,[],reasoning
371,olmo_7b_instruct,53.1,drop-mixed,mixeval_240601,[],reasoning
372,qwen1.5_4b_chat,57.2,drop-mixed,mixeval_240601,[],reasoning
373,jetmoe_8b_chat,27.0,drop-mixed,mixeval_240601,[],reasoning
374,mpt_7b_chat,50.0,drop-mixed,mixeval_240601,[],reasoning
375,llama_3_70b,81.5,drop-mixed,mixeval_240601,[],reasoning
376,qwen1.5_72b,64.5,drop-mixed,mixeval_240601,[],reasoning
377,yi_34b,78.2,drop-mixed,mixeval_240601,[],reasoning
378,qwen1.5_32b,68.7,drop-mixed,mixeval_240601,[],reasoning
379,mixtral_8x7b,69.8,drop-mixed,mixeval_240601,[],reasoning
380,llama_2_70b,73.2,drop-mixed,mixeval_240601,[],reasoning
381,qwen1.5_moe_a2.7b,59.9,drop-mixed,mixeval_240601,[],reasoning
382,qwen1.5_7b,63.6,drop-mixed,mixeval_240601,[],reasoning
383,llama_3_8b,63.8,drop-mixed,mixeval_240601,[],reasoning
384,mistral_7b,61.3,drop-mixed,mixeval_240601,[],reasoning
385,gemma_7b,63.8,drop-mixed,mixeval_240601,[],reasoning
386,yi_6b,51.4,drop-mixed,mixeval_240601,[],reasoning
387,qwen1.5_4b,51.0,drop-mixed,mixeval_240601,[],reasoning
388,jetmoe_8b,44.1,drop-mixed,mixeval_240601,[],reasoning
389,deepseek_7b,43.5,drop-mixed,mixeval_240601,[],reasoning
390,phi_2,50.4,drop-mixed,mixeval_240601,[],reasoning
391,deepseekmoe_16b,41.1,drop-mixed,mixeval_240601,[],reasoning
392,llama_2_7b,37.6,drop-mixed,mixeval_240601,[],reasoning
393,gemma_2b,32.6,drop-mixed,mixeval_240601,[],reasoning
394,olmo_7b,24.0,drop-mixed,mixeval_240601,[],reasoning
395,mpt_7b,26.8,drop-mixed,mixeval_240601,[],reasoning
396,gpt_4o_2024_05_13,94.3,hellaswag-mixed,mixeval_240601,[],reasoning
397,claude_3_opus,93.3,hellaswag-mixed,mixeval_240601,[],reasoning
398,gpt_4_turbo_2024_04_09,92.6,hellaswag-mixed,mixeval_240601,[],reasoning
399,gemini_1.5_pro_api_0409,89.2,hellaswag-mixed,mixeval_240601,[],reasoning
400,yi_large_preview,92.6,hellaswag-mixed,mixeval_240601,[],reasoning
401,llama_3_70b_instruct,81.8,hellaswag-mixed,mixeval_240601,[],reasoning
402,qwen_max_0428,93.6,hellaswag-mixed,mixeval_240601,[],reasoning
403,claude_3_sonnet,85.9,hellaswag-mixed,mixeval_240601,[],reasoning
404,reka_core_20240415,88.6,hellaswag-mixed,mixeval_240601,[],reasoning
405,mammoth2_8x7b_plus,82.2,hellaswag-mixed,mixeval_240601,[],reasoning
406,deepseek_v2,88.2,hellaswag-mixed,mixeval_240601,[],reasoning
407,command_r_plus,83.5,hellaswag-mixed,mixeval_240601,[],reasoning
408,yi_1.5_34b_chat,90.2,hellaswag-mixed,mixeval_240601,[],reasoning
409,mistral_large,65.0,hellaswag-mixed,mixeval_240601,[],reasoning
410,qwen1.5_72b_chat,87.9,hellaswag-mixed,mixeval_240601,[],reasoning
411,mistral_medium,72.4,hellaswag-mixed,mixeval_240601,[],reasoning
412,gemini_1.0_pro,74.7,hellaswag-mixed,mixeval_240601,[],reasoning
413,reka_flash_20240226,90.6,hellaswag-mixed,mixeval_240601,[],reasoning
414,mistral_small,73.4,hellaswag-mixed,mixeval_240601,[],reasoning
415,llama_3_8b_instruct,65.7,hellaswag-mixed,mixeval_240601,[],reasoning
416,command_r,75.8,hellaswag-mixed,mixeval_240601,[],reasoning
417,qwen1.5_32b_chat,85.9,hellaswag-mixed,mixeval_240601,[],reasoning
418,gpt_3.5_turbo_0125,63.0,hellaswag-mixed,mixeval_240601,[],reasoning
419,claude_3_haiku,75.8,hellaswag-mixed,mixeval_240601,[],reasoning
420,yi_34b_chat,86.9,hellaswag-mixed,mixeval_240601,[],reasoning
421,mixtral_8x7b_instruct_v0.1,54.2,hellaswag-mixed,mixeval_240601,[],reasoning
422,starling_lm_7b_beta,48.5,hellaswag-mixed,mixeval_240601,[],reasoning
423,yi_1.5_9b_chat,86.5,hellaswag-mixed,mixeval_240601,[],reasoning
424,gemma_1.1_7b_it,66.3,hellaswag-mixed,mixeval_240601,[],reasoning
425,vicuna_33b_v1.3,30.3,hellaswag-mixed,mixeval_240601,[],reasoning
426,llama_2_70b_chat,67.3,hellaswag-mixed,mixeval_240601,[],reasoning
427,map_neo_instruct_v0.1,74.4,hellaswag-mixed,mixeval_240601,[],reasoning
428,mistral_7b_instruct_v0.2,54.2,hellaswag-mixed,mixeval_240601,[],reasoning
429,qwen1.5_7b_chat,76.1,hellaswag-mixed,mixeval_240601,[],reasoning
430,reka_edge_20240208,74.7,hellaswag-mixed,mixeval_240601,[],reasoning
431,zephyr_7b_beta,39.1,hellaswag-mixed,mixeval_240601,[],reasoning
432,llama_2_7b_chat,35.7,hellaswag-mixed,mixeval_240601,[],reasoning
433,yi_6b_chat,52.5,hellaswag-mixed,mixeval_240601,[],reasoning
434,qwen1.5_moe_a2.7b_chat,72.7,hellaswag-mixed,mixeval_240601,[],reasoning
435,gemma_1.1_2b_it,26.6,hellaswag-mixed,mixeval_240601,[],reasoning
436,vicuna_7b_v1.5,24.9,hellaswag-mixed,mixeval_240601,[],reasoning
437,olmo_7b_instruct,55.9,hellaswag-mixed,mixeval_240601,[],reasoning
438,qwen1.5_4b_chat,54.9,hellaswag-mixed,mixeval_240601,[],reasoning
439,jetmoe_8b_chat,86.2,hellaswag-mixed,mixeval_240601,[],reasoning
440,mpt_7b_chat,25.6,hellaswag-mixed,mixeval_240601,[],reasoning
441,llama_3_70b,90.9,hellaswag-mixed,mixeval_240601,[],reasoning
442,qwen1.5_72b,91.9,hellaswag-mixed,mixeval_240601,[],reasoning
443,yi_34b,98.0,hellaswag-mixed,mixeval_240601,[],reasoning
444,qwen1.5_32b,93.3,hellaswag-mixed,mixeval_240601,[],reasoning
445,mixtral_8x7b,73.7,hellaswag-mixed,mixeval_240601,[],reasoning
446,llama_2_70b,63.0,hellaswag-mixed,mixeval_240601,[],reasoning
447,qwen1.5_moe_a2.7b,80.1,hellaswag-mixed,mixeval_240601,[],reasoning
448,qwen1.5_7b,83.8,hellaswag-mixed,mixeval_240601,[],reasoning
449,llama_3_8b,51.5,hellaswag-mixed,mixeval_240601,[],reasoning
450,mistral_7b,54.5,hellaswag-mixed,mixeval_240601,[],reasoning
451,gemma_7b,36.0,hellaswag-mixed,mixeval_240601,[],reasoning
452,yi_6b,77.4,hellaswag-mixed,mixeval_240601,[],reasoning
453,qwen1.5_4b,65.7,hellaswag-mixed,mixeval_240601,[],reasoning
454,jetmoe_8b,89.2,hellaswag-mixed,mixeval_240601,[],reasoning
455,deepseek_7b,35.0,hellaswag-mixed,mixeval_240601,[],reasoning
456,phi_2,20.2,hellaswag-mixed,mixeval_240601,[],reasoning
457,deepseekmoe_16b,28.6,hellaswag-mixed,mixeval_240601,[],reasoning
458,llama_2_7b,24.9,hellaswag-mixed,mixeval_240601,[],reasoning
459,gemma_2b,33.3,hellaswag-mixed,mixeval_240601,[],reasoning
460,olmo_7b,26.9,hellaswag-mixed,mixeval_240601,[],reasoning
461,mpt_7b,19.2,hellaswag-mixed,mixeval_240601,[],reasoning
462,gpt_4o_2024_05_13,86.8,commonsenseqa-mixed,mixeval_240601,[],reasoning
463,claude_3_opus,87.7,commonsenseqa-mixed,mixeval_240601,[],reasoning
464,gpt_4_turbo_2024_04_09,85.4,commonsenseqa-mixed,mixeval_240601,[],reasoning
465,gemini_1.5_pro_api_0409,84.4,commonsenseqa-mixed,mixeval_240601,[],reasoning
466,yi_large_preview,90.1,commonsenseqa-mixed,mixeval_240601,[],reasoning
467,llama_3_70b_instruct,83.0,commonsenseqa-mixed,mixeval_240601,[],reasoning
468,qwen_max_0428,88.2,commonsenseqa-mixed,mixeval_240601,[],reasoning
469,claude_3_sonnet,82.5,commonsenseqa-mixed,mixeval_240601,[],reasoning
470,reka_core_20240415,81.6,commonsenseqa-mixed,mixeval_240601,[],reasoning
471,mammoth2_8x7b_plus,82.5,commonsenseqa-mixed,mixeval_240601,[],reasoning
472,deepseek_v2,84.0,commonsenseqa-mixed,mixeval_240601,[],reasoning
473,command_r_plus,82.1,commonsenseqa-mixed,mixeval_240601,[],reasoning
474,yi_1.5_34b_chat,86.8,commonsenseqa-mixed,mixeval_240601,[],reasoning
475,mistral_large,83.5,commonsenseqa-mixed,mixeval_240601,[],reasoning
476,qwen1.5_72b_chat,86.3,commonsenseqa-mixed,mixeval_240601,[],reasoning
477,mistral_medium,82.5,commonsenseqa-mixed,mixeval_240601,[],reasoning
478,gemini_1.0_pro,80.2,commonsenseqa-mixed,mixeval_240601,[],reasoning
479,reka_flash_20240226,80.7,commonsenseqa-mixed,mixeval_240601,[],reasoning
480,mistral_small,77.8,commonsenseqa-mixed,mixeval_240601,[],reasoning
481,llama_3_8b_instruct,78.3,commonsenseqa-mixed,mixeval_240601,[],reasoning
482,command_r,77.4,commonsenseqa-mixed,mixeval_240601,[],reasoning
483,qwen1.5_32b_chat,88.2,commonsenseqa-mixed,mixeval_240601,[],reasoning
484,gpt_3.5_turbo_0125,81.6,commonsenseqa-mixed,mixeval_240601,[],reasoning
485,claude_3_haiku,78.8,commonsenseqa-mixed,mixeval_240601,[],reasoning
486,yi_34b_chat,78.8,commonsenseqa-mixed,mixeval_240601,[],reasoning
487,mixtral_8x7b_instruct_v0.1,77.4,commonsenseqa-mixed,mixeval_240601,[],reasoning
488,starling_lm_7b_beta,84.9,commonsenseqa-mixed,mixeval_240601,[],reasoning
489,yi_1.5_9b_chat,82.5,commonsenseqa-mixed,mixeval_240601,[],reasoning
490,gemma_1.1_7b_it,73.6,commonsenseqa-mixed,mixeval_240601,[],reasoning
491,vicuna_33b_v1.3,61.8,commonsenseqa-mixed,mixeval_240601,[],reasoning
492,llama_2_70b_chat,74.1,commonsenseqa-mixed,mixeval_240601,[],reasoning
493,map_neo_instruct_v0.1,82.1,commonsenseqa-mixed,mixeval_240601,[],reasoning
494,mistral_7b_instruct_v0.2,66.0,commonsenseqa-mixed,mixeval_240601,[],reasoning
495,qwen1.5_7b_chat,82.1,commonsenseqa-mixed,mixeval_240601,[],reasoning
496,reka_edge_20240208,80.7,commonsenseqa-mixed,mixeval_240601,[],reasoning
497,zephyr_7b_beta,69.3,commonsenseqa-mixed,mixeval_240601,[],reasoning
498,llama_2_7b_chat,61.3,commonsenseqa-mixed,mixeval_240601,[],reasoning
499,yi_6b_chat,69.8,commonsenseqa-mixed,mixeval_240601,[],reasoning
500,qwen1.5_moe_a2.7b_chat,81.1,commonsenseqa-mixed,mixeval_240601,[],reasoning
501,gemma_1.1_2b_it,57.1,commonsenseqa-mixed,mixeval_240601,[],reasoning
502,vicuna_7b_v1.5,62.7,commonsenseqa-mixed,mixeval_240601,[],reasoning
503,olmo_7b_instruct,64.6,commonsenseqa-mixed,mixeval_240601,[],reasoning
504,qwen1.5_4b_chat,74.1,commonsenseqa-mixed,mixeval_240601,[],reasoning
505,jetmoe_8b_chat,68.4,commonsenseqa-mixed,mixeval_240601,[],reasoning
506,mpt_7b_chat,36.3,commonsenseqa-mixed,mixeval_240601,[],reasoning
507,llama_3_70b,85.4,commonsenseqa-mixed,mixeval_240601,[],reasoning
508,qwen1.5_72b,87.3,commonsenseqa-mixed,mixeval_240601,[],reasoning
509,yi_34b,81.1,commonsenseqa-mixed,mixeval_240601,[],reasoning
510,qwen1.5_32b,89.2,commonsenseqa-mixed,mixeval_240601,[],reasoning
511,mixtral_8x7b,77.4,commonsenseqa-mixed,mixeval_240601,[],reasoning
512,llama_2_70b,77.4,commonsenseqa-mixed,mixeval_240601,[],reasoning
513,qwen1.5_moe_a2.7b,80.2,commonsenseqa-mixed,mixeval_240601,[],reasoning
514,qwen1.5_7b,84.4,commonsenseqa-mixed,mixeval_240601,[],reasoning
515,llama_3_8b,69.8,commonsenseqa-mixed,mixeval_240601,[],reasoning
516,mistral_7b,67.9,commonsenseqa-mixed,mixeval_240601,[],reasoning
517,gemma_7b,68.4,commonsenseqa-mixed,mixeval_240601,[],reasoning
518,yi_6b,76.4,commonsenseqa-mixed,mixeval_240601,[],reasoning
519,qwen1.5_4b,79.2,commonsenseqa-mixed,mixeval_240601,[],reasoning
520,jetmoe_8b,60.4,commonsenseqa-mixed,mixeval_240601,[],reasoning
521,deepseek_7b,51.4,commonsenseqa-mixed,mixeval_240601,[],reasoning
522,phi_2,68.9,commonsenseqa-mixed,mixeval_240601,[],reasoning
523,deepseekmoe_16b,48.6,commonsenseqa-mixed,mixeval_240601,[],reasoning
524,llama_2_7b,30.7,commonsenseqa-mixed,mixeval_240601,[],reasoning
525,gemma_2b,31.6,commonsenseqa-mixed,mixeval_240601,[],reasoning
526,olmo_7b,25.5,commonsenseqa-mixed,mixeval_240601,[],reasoning
527,mpt_7b,28.8,commonsenseqa-mixed,mixeval_240601,[],reasoning
528,gpt_4o_2024_05_13,70.3,triviaqa_hard-mixed,mixeval_240601,[],knowledge
529,claude_3_opus,71.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge
530,gpt_4_turbo_2024_04_09,73.1,triviaqa_hard-mixed,mixeval_240601,[],knowledge
531,gemini_1.5_pro_api_0409,67.8,triviaqa_hard-mixed,mixeval_240601,[],knowledge
532,yi_large_preview,55.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge
533,llama_3_70b_instruct,60.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge
534,qwen_max_0428,61.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge
535,claude_3_sonnet,59.1,triviaqa_hard-mixed,mixeval_240601,[],knowledge
536,reka_core_20240415,51.6,triviaqa_hard-mixed,mixeval_240601,[],knowledge
537,mammoth2_8x7b_plus,52.9,triviaqa_hard-mixed,mixeval_240601,[],knowledge
538,deepseek_v2,51.7,triviaqa_hard-mixed,mixeval_240601,[],knowledge
539,command_r_plus,57.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge
540,yi_1.5_34b_chat,44.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge
541,mistral_large,55.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge
542,qwen1.5_72b_chat,49.9,triviaqa_hard-mixed,mixeval_240601,[],knowledge
543,mistral_medium,59.8,triviaqa_hard-mixed,mixeval_240601,[],knowledge
544,gemini_1.0_pro,58.2,triviaqa_hard-mixed,mixeval_240601,[],knowledge
545,reka_flash_20240226,42.9,triviaqa_hard-mixed,mixeval_240601,[],knowledge
546,mistral_small,56.0,triviaqa_hard-mixed,mixeval_240601,[],knowledge
547,llama_3_8b_instruct,40.2,triviaqa_hard-mixed,mixeval_240601,[],knowledge
548,command_r,57.0,triviaqa_hard-mixed,mixeval_240601,[],knowledge
549,qwen1.5_32b_chat,39.1,triviaqa_hard-mixed,mixeval_240601,[],knowledge
550,gpt_3.5_turbo_0125,46.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge
551,claude_3_haiku,42.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge
552,yi_34b_chat,41.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge
553,mixtral_8x7b_instruct_v0.1,48.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge
554,starling_lm_7b_beta,33.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge
555,yi_1.5_9b_chat,23.3,triviaqa_hard-mixed,mixeval_240601,[],knowledge
556,gemma_1.1_7b_it,30.3,triviaqa_hard-mixed,mixeval_240601,[],knowledge
557,vicuna_33b_v1.3,42.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge
558,llama_2_70b_chat,42.2,triviaqa_hard-mixed,mixeval_240601,[],knowledge
559,map_neo_instruct_v0.1,26.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge
560,mistral_7b_instruct_v0.2,33.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge
561,qwen1.5_7b_chat,29.0,triviaqa_hard-mixed,mixeval_240601,[],knowledge
562,reka_edge_20240208,18.6,triviaqa_hard-mixed,mixeval_240601,[],knowledge
563,zephyr_7b_beta,30.2,triviaqa_hard-mixed,mixeval_240601,[],knowledge
564,llama_2_7b_chat,24.8,triviaqa_hard-mixed,mixeval_240601,[],knowledge
565,yi_6b_chat,18.9,triviaqa_hard-mixed,mixeval_240601,[],knowledge
566,qwen1.5_moe_a2.7b_chat,21.9,triviaqa_hard-mixed,mixeval_240601,[],knowledge
567,gemma_1.1_2b_it,31.9,triviaqa_hard-mixed,mixeval_240601,[],knowledge
568,vicuna_7b_v1.5,25.9,triviaqa_hard-mixed,mixeval_240601,[],knowledge
569,olmo_7b_instruct,24.7,triviaqa_hard-mixed,mixeval_240601,[],knowledge
570,qwen1.5_4b_chat,16.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge
571,jetmoe_8b_chat,19.2,triviaqa_hard-mixed,mixeval_240601,[],knowledge
572,mpt_7b_chat,17.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge
573,llama_3_70b,59.1,triviaqa_hard-mixed,mixeval_240601,[],knowledge
574,qwen1.5_72b,41.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge
575,yi_34b,39.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge
576,qwen1.5_32b,28.0,triviaqa_hard-mixed,mixeval_240601,[],knowledge
577,mixtral_8x7b,44.1,triviaqa_hard-mixed,mixeval_240601,[],knowledge
578,llama_2_70b,53.8,triviaqa_hard-mixed,mixeval_240601,[],knowledge
579,qwen1.5_moe_a2.7b,36.0,triviaqa_hard-mixed,mixeval_240601,[],knowledge
580,qwen1.5_7b,31.6,triviaqa_hard-mixed,mixeval_240601,[],knowledge
581,llama_3_8b,22.6,triviaqa_hard-mixed,mixeval_240601,[],knowledge
582,mistral_7b,24.2,triviaqa_hard-mixed,mixeval_240601,[],knowledge
583,gemma_7b,31.1,triviaqa_hard-mixed,mixeval_240601,[],knowledge
584,yi_6b,17.0,triviaqa_hard-mixed,mixeval_240601,[],knowledge
585,qwen1.5_4b,14.0,triviaqa_hard-mixed,mixeval_240601,[],knowledge
586,jetmoe_8b,22.8,triviaqa_hard-mixed,mixeval_240601,[],knowledge
587,deepseek_7b,21.4,triviaqa_hard-mixed,mixeval_240601,[],knowledge
588,phi_2,7.3,triviaqa_hard-mixed,mixeval_240601,[],knowledge
589,deepseekmoe_16b,24.9,triviaqa_hard-mixed,mixeval_240601,[],knowledge
590,llama_2_7b,19.5,triviaqa_hard-mixed,mixeval_240601,[],knowledge
591,gemma_2b,12.1,triviaqa_hard-mixed,mixeval_240601,[],knowledge
592,olmo_7b,16.0,triviaqa_hard-mixed,mixeval_240601,[],knowledge
593,mpt_7b,6.6,triviaqa_hard-mixed,mixeval_240601,[],knowledge
594,gpt_4o_2024_05_13,57.1,mmlu_hard-mixed,mixeval_240601,[],knowledge
595,claude_3_opus,55.0,mmlu_hard-mixed,mixeval_240601,[],knowledge
596,gpt_4_turbo_2024_04_09,45.5,mmlu_hard-mixed,mixeval_240601,[],knowledge
597,gemini_1.5_pro_api_0409,44.6,mmlu_hard-mixed,mixeval_240601,[],knowledge
598,yi_large_preview,48.5,mmlu_hard-mixed,mixeval_240601,[],knowledge
599,llama_3_70b_instruct,46.3,mmlu_hard-mixed,mixeval_240601,[],knowledge
600,qwen_max_0428,41.6,mmlu_hard-mixed,mixeval_240601,[],knowledge
601,claude_3_sonnet,40.7,mmlu_hard-mixed,mixeval_240601,[],knowledge
602,reka_core_20240415,46.3,mmlu_hard-mixed,mixeval_240601,[],knowledge
603,mammoth2_8x7b_plus,41.1,mmlu_hard-mixed,mixeval_240601,[],knowledge
604,deepseek_v2,42.0,mmlu_hard-mixed,mixeval_240601,[],knowledge
605,command_r_plus,42.0,mmlu_hard-mixed,mixeval_240601,[],knowledge
606,yi_1.5_34b_chat,38.1,mmlu_hard-mixed,mixeval_240601,[],knowledge
607,mistral_large,42.4,mmlu_hard-mixed,mixeval_240601,[],knowledge
608,qwen1.5_72b_chat,37.7,mmlu_hard-mixed,mixeval_240601,[],knowledge
609,mistral_medium,38.5,mmlu_hard-mixed,mixeval_240601,[],knowledge
610,gemini_1.0_pro,35.5,mmlu_hard-mixed,mixeval_240601,[],knowledge
611,reka_flash_20240226,34.6,mmlu_hard-mixed,mixeval_240601,[],knowledge
612,mistral_small,33.8,mmlu_hard-mixed,mixeval_240601,[],knowledge
613,llama_3_8b_instruct,40.7,mmlu_hard-mixed,mixeval_240601,[],knowledge
614,command_r,39.0,mmlu_hard-mixed,mixeval_240601,[],knowledge
615,qwen1.5_32b_chat,29.9,mmlu_hard-mixed,mixeval_240601,[],knowledge
616,gpt_3.5_turbo_0125,35.1,mmlu_hard-mixed,mixeval_240601,[],knowledge
617,claude_3_haiku,30.7,mmlu_hard-mixed,mixeval_240601,[],knowledge
618,yi_34b_chat,29.9,mmlu_hard-mixed,mixeval_240601,[],knowledge
619,mixtral_8x7b_instruct_v0.1,37.2,mmlu_hard-mixed,mixeval_240601,[],knowledge
620,starling_lm_7b_beta,34.2,mmlu_hard-mixed,mixeval_240601,[],knowledge
621,yi_1.5_9b_chat,36.8,mmlu_hard-mixed,mixeval_240601,[],knowledge
622,gemma_1.1_7b_it,39.0,mmlu_hard-mixed,mixeval_240601,[],knowledge
623,vicuna_33b_v1.3,39.4,mmlu_hard-mixed,mixeval_240601,[],knowledge
624,llama_2_70b_chat,27.7,mmlu_hard-mixed,mixeval_240601,[],knowledge
625,map_neo_instruct_v0.1,32.5,mmlu_hard-mixed,mixeval_240601,[],knowledge
626,mistral_7b_instruct_v0.2,29.4,mmlu_hard-mixed,mixeval_240601,[],knowledge
627,qwen1.5_7b_chat,29.0,mmlu_hard-mixed,mixeval_240601,[],knowledge
628,reka_edge_20240208,26.4,mmlu_hard-mixed,mixeval_240601,[],knowledge
629,zephyr_7b_beta,24.2,mmlu_hard-mixed,mixeval_240601,[],knowledge
630,llama_2_7b_chat,30.3,mmlu_hard-mixed,mixeval_240601,[],knowledge
631,yi_6b_chat,26.8,mmlu_hard-mixed,mixeval_240601,[],knowledge
632,qwen1.5_moe_a2.7b_chat,26.8,mmlu_hard-mixed,mixeval_240601,[],knowledge
633,gemma_1.1_2b_it,30.3,mmlu_hard-mixed,mixeval_240601,[],knowledge
634,vicuna_7b_v1.5,23.4,mmlu_hard-mixed,mixeval_240601,[],knowledge
635,olmo_7b_instruct,27.3,mmlu_hard-mixed,mixeval_240601,[],knowledge
636,qwen1.5_4b_chat,17.3,mmlu_hard-mixed,mixeval_240601,[],knowledge
637,jetmoe_8b_chat,25.5,mmlu_hard-mixed,mixeval_240601,[],knowledge
638,mpt_7b_chat,24.7,mmlu_hard-mixed,mixeval_240601,[],knowledge
639,llama_3_70b,39.8,mmlu_hard-mixed,mixeval_240601,[],knowledge
640,qwen1.5_72b,42.4,mmlu_hard-mixed,mixeval_240601,[],knowledge
641,yi_34b,42.4,mmlu_hard-mixed,mixeval_240601,[],knowledge
642,qwen1.5_32b,37.2,mmlu_hard-mixed,mixeval_240601,[],knowledge
643,mixtral_8x7b,34.6,mmlu_hard-mixed,mixeval_240601,[],knowledge
644,llama_2_70b,29.0,mmlu_hard-mixed,mixeval_240601,[],knowledge
645,qwen1.5_moe_a2.7b,30.7,mmlu_hard-mixed,mixeval_240601,[],knowledge
646,qwen1.5_7b,28.6,mmlu_hard-mixed,mixeval_240601,[],knowledge
647,llama_3_8b,38.5,mmlu_hard-mixed,mixeval_240601,[],knowledge
648,mistral_7b,27.7,mmlu_hard-mixed,mixeval_240601,[],knowledge
649,gemma_7b,28.1,mmlu_hard-mixed,mixeval_240601,[],knowledge
650,yi_6b,37.2,mmlu_hard-mixed,mixeval_240601,[],knowledge
651,qwen1.5_4b,22.9,mmlu_hard-mixed,mixeval_240601,[],knowledge
652,jetmoe_8b,27.3,mmlu_hard-mixed,mixeval_240601,[],knowledge
653,deepseek_7b,26.4,mmlu_hard-mixed,mixeval_240601,[],knowledge
654,phi_2,29.0,mmlu_hard-mixed,mixeval_240601,[],knowledge
655,deepseekmoe_16b,30.7,mmlu_hard-mixed,mixeval_240601,[],knowledge
656,llama_2_7b,24.7,mmlu_hard-mixed,mixeval_240601,[],knowledge
657,gemma_2b,27.3,mmlu_hard-mixed,mixeval_240601,[],knowledge
658,olmo_7b,25.1,mmlu_hard-mixed,mixeval_240601,[],knowledge
659,mpt_7b,24.2,mmlu_hard-mixed,mixeval_240601,[],knowledge
660,gpt_4o_2024_05_13,67.5,drop_hard-mixed,mixeval_240601,[],reasoning
661,claude_3_opus,75.2,drop_hard-mixed,mixeval_240601,[],reasoning
662,gpt_4_turbo_2024_04_09,71.0,drop_hard-mixed,mixeval_240601,[],reasoning
663,gemini_1.5_pro_api_0409,64.8,drop_hard-mixed,mixeval_240601,[],reasoning
664,yi_large_preview,63.1,drop_hard-mixed,mixeval_240601,[],reasoning
665,llama_3_70b_instruct,74.5,drop_hard-mixed,mixeval_240601,[],reasoning
666,qwen_max_0428,53.5,drop_hard-mixed,mixeval_240601,[],reasoning
667,claude_3_sonnet,66.9,drop_hard-mixed,mixeval_240601,[],reasoning
668,reka_core_20240415,66.6,drop_hard-mixed,mixeval_240601,[],reasoning
669,mammoth2_8x7b_plus,65.1,drop_hard-mixed,mixeval_240601,[],reasoning
670,deepseek_v2,62.8,drop_hard-mixed,mixeval_240601,[],reasoning
671,command_r_plus,65.0,drop_hard-mixed,mixeval_240601,[],reasoning
672,yi_1.5_34b_chat,67.4,drop_hard-mixed,mixeval_240601,[],reasoning
673,mistral_large,61.6,drop_hard-mixed,mixeval_240601,[],reasoning
674,qwen1.5_72b_chat,56.5,drop_hard-mixed,mixeval_240601,[],reasoning
675,mistral_medium,47.1,drop_hard-mixed,mixeval_240601,[],reasoning
676,gemini_1.0_pro,54.1,drop_hard-mixed,mixeval_240601,[],reasoning
677,reka_flash_20240226,65.0,drop_hard-mixed,mixeval_240601,[],reasoning
678,mistral_small,52.6,drop_hard-mixed,mixeval_240601,[],reasoning
679,llama_3_8b_instruct,67.6,drop_hard-mixed,mixeval_240601,[],reasoning
680,command_r,42.0,drop_hard-mixed,mixeval_240601,[],reasoning
681,qwen1.5_32b_chat,54.4,drop_hard-mixed,mixeval_240601,[],reasoning
682,gpt_3.5_turbo_0125,55.4,drop_hard-mixed,mixeval_240601,[],reasoning
683,claude_3_haiku,51.5,drop_hard-mixed,mixeval_240601,[],reasoning
684,yi_34b_chat,57.1,drop_hard-mixed,mixeval_240601,[],reasoning
685,mixtral_8x7b_instruct_v0.1,47.7,drop_hard-mixed,mixeval_240601,[],reasoning
686,starling_lm_7b_beta,62.9,drop_hard-mixed,mixeval_240601,[],reasoning
687,yi_1.5_9b_chat,61.3,drop_hard-mixed,mixeval_240601,[],reasoning
688,gemma_1.1_7b_it,55.1,drop_hard-mixed,mixeval_240601,[],reasoning
689,vicuna_33b_v1.3,36.6,drop_hard-mixed,mixeval_240601,[],reasoning
690,llama_2_70b_chat,42.2,drop_hard-mixed,mixeval_240601,[],reasoning
691,map_neo_instruct_v0.1,42.4,drop_hard-mixed,mixeval_240601,[],reasoning
692,mistral_7b_instruct_v0.2,44.3,drop_hard-mixed,mixeval_240601,[],reasoning
693,qwen1.5_7b_chat,50.0,drop_hard-mixed,mixeval_240601,[],reasoning
694,reka_edge_20240208,56.9,drop_hard-mixed,mixeval_240601,[],reasoning
695,zephyr_7b_beta,45.3,drop_hard-mixed,mixeval_240601,[],reasoning
696,llama_2_7b_chat,44.3,drop_hard-mixed,mixeval_240601,[],reasoning
697,yi_6b_chat,43.7,drop_hard-mixed,mixeval_240601,[],reasoning
698,qwen1.5_moe_a2.7b_chat,39.5,drop_hard-mixed,mixeval_240601,[],reasoning
699,gemma_1.1_2b_it,27.8,drop_hard-mixed,mixeval_240601,[],reasoning
700,vicuna_7b_v1.5,33.2,drop_hard-mixed,mixeval_240601,[],reasoning
701,olmo_7b_instruct,22.9,drop_hard-mixed,mixeval_240601,[],reasoning
702,qwen1.5_4b_chat,28.6,drop_hard-mixed,mixeval_240601,[],reasoning
703,jetmoe_8b_chat,11.5,drop_hard-mixed,mixeval_240601,[],reasoning
704,mpt_7b_chat,31.0,drop_hard-mixed,mixeval_240601,[],reasoning
705,llama_3_70b,59.5,drop_hard-mixed,mixeval_240601,[],reasoning
706,qwen1.5_72b,26.2,drop_hard-mixed,mixeval_240601,[],reasoning
707,yi_34b,56.5,drop_hard-mixed,mixeval_240601,[],reasoning
708,qwen1.5_32b,36.9,drop_hard-mixed,mixeval_240601,[],reasoning
709,mixtral_8x7b,42.0,drop_hard-mixed,mixeval_240601,[],reasoning
710,llama_2_70b,46.1,drop_hard-mixed,mixeval_240601,[],reasoning
711,qwen1.5_moe_a2.7b,31.0,drop_hard-mixed,mixeval_240601,[],reasoning
712,qwen1.5_7b,29.8,drop_hard-mixed,mixeval_240601,[],reasoning
713,llama_3_8b,37.1,drop_hard-mixed,mixeval_240601,[],reasoning
714,mistral_7b,34.5,drop_hard-mixed,mixeval_240601,[],reasoning
715,gemma_7b,31.4,drop_hard-mixed,mixeval_240601,[],reasoning
716,yi_6b,19.4,drop_hard-mixed,mixeval_240601,[],reasoning
717,qwen1.5_4b,24.7,drop_hard-mixed,mixeval_240601,[],reasoning
718,jetmoe_8b,19.2,drop_hard-mixed,mixeval_240601,[],reasoning
719,deepseek_7b,21.4,drop_hard-mixed,mixeval_240601,[],reasoning
720,phi_2,27.1,drop_hard-mixed,mixeval_240601,[],reasoning
721,deepseekmoe_16b,12.2,drop_hard-mixed,mixeval_240601,[],reasoning
722,llama_2_7b,14.9,drop_hard-mixed,mixeval_240601,[],reasoning
723,gemma_2b,13.2,drop_hard-mixed,mixeval_240601,[],reasoning
724,olmo_7b,11.1,drop_hard-mixed,mixeval_240601,[],reasoning
725,mpt_7b,9.2,drop_hard-mixed,mixeval_240601,[],reasoning
771,llama_3_70b,81.7,boolq-mixed,mixeval_240601,[],knowledge
772,qwen1.5_72b,86.9,boolq-mixed,mixeval_240601,[],knowledge
773,yi_34b,79.4,boolq-mixed,mixeval_240601,[],knowledge
774,qwen1.5_32b,83.4,boolq-mixed,mixeval_240601,[],knowledge
775,mixtral_8x7b,77.7,boolq-mixed,mixeval_240601,[],knowledge
776,llama_2_70b,74.3,boolq-mixed,mixeval_240601,[],knowledge
777,qwen1.5_moe_a2.7b,70.9,boolq-mixed,mixeval_240601,[],knowledge
778,qwen1.5_7b,77.7,boolq-mixed,mixeval_240601,[],knowledge
779,llama_3_8b,64.0,boolq-mixed,mixeval_240601,[],knowledge
780,mistral_7b,68.0,boolq-mixed,mixeval_240601,[],knowledge
781,gemma_7b,74.3,boolq-mixed,mixeval_240601,[],knowledge
782,yi_6b,65.1,boolq-mixed,mixeval_240601,[],knowledge
783,qwen1.5_4b,72.0,boolq-mixed,mixeval_240601,[],knowledge
784,jetmoe_8b,64.6,boolq-mixed,mixeval_240601,[],knowledge
785,deepseek_7b,62.9,boolq-mixed,mixeval_240601,[],knowledge
786,phi_2,73.1,boolq-mixed,mixeval_240601,[],knowledge
787,deepseekmoe_16b,62.9,boolq-mixed,mixeval_240601,[],knowledge
788,llama_2_7b,61.7,boolq-mixed,mixeval_240601,[],knowledge
789,gemma_2b,58.9,boolq-mixed,mixeval_240601,[],knowledge
790,olmo_7b,49.1,boolq-mixed,mixeval_240601,[],knowledge
791,mpt_7b,44.0,boolq-mixed,mixeval_240601,[],knowledge
593,gpt_4_0314,0.57,agieval,BLZ_240312,[],holistic
594,gpt_4_0613,0.57,agieval,BLZ_240312,[],holistic
596,claude_1,0.49700000000000005,agieval,BLZ_240312,[],holistic
601,mixtral_8x7b_instruct_v0.1,0.45299999999999996,agieval,BLZ_240312,[],holistic
602,yi_34b_chat,0.508,agieval,BLZ_240312,[],holistic
605,gpt_3.5_turbo_0314,0.43200000000000005,agieval,BLZ_240312,[],holistic
608,vicuna_33b,0.373,agieval,BLZ_240312,[],holistic
609,starling_lm_7b_alpha,0.401,agieval,BLZ_240312,[],holistic
611,llama_2_70b_chat,0.45,agieval,BLZ_240312,[],holistic
613,openhermes_2.5_mistral_7b,0.43,agieval,BLZ_240312,[],holistic
614,openchat_3.5,0.42700000000000005,agieval,BLZ_240312,[],holistic
617,solar_10.7b_instruct_v1.0,0.47600000000000003,agieval,BLZ_240312,[],holistic
618,dolphin_2.2.1_mistral_7b,0.392,agieval,BLZ_240312,[],holistic
620,zephyr_7b_beta,0.406,agieval,BLZ_240312,[],holistic
623,llama_2_13b_chat,0.336,agieval,BLZ_240312,[],holistic
624,vicuna_13b,0.368,agieval,BLZ_240312,[],holistic
626,zephyr_7b_alpha,0.38,agieval,BLZ_240312,[],holistic
627,qwen_14b_chat,0.396,agieval,BLZ_240312,[],holistic
630,llama_2_7b_chat,0.29600000000000004,agieval,BLZ_240312,[],holistic
632,mistral_7b_instruct_v0.1,0.335,agieval,BLZ_240312,[],holistic
634,vicuna_7b,0.314,agieval,BLZ_240312,[],holistic
636,chatglm3_6b,0.414,agieval,BLZ_240312,[],holistic
643,chatglm_6b,0.325,agieval,BLZ_240312,[],holistic
647,llama_13b,0.205,agieval,BLZ_240312,[],holistic
180,gpt_4_0314,0.963,arc_c,BLZ_240312,[],reasoning
182,mistral_medium,0.899,arc_c,BLZ_240312,[],reasoning
188,mixtral_8x7b_instruct_v0.1,0.7021999999999999,arc_c,BLZ_240312,[],reasoning
189,yi_34b_chat,0.6544,arc_c,BLZ_240312,[],reasoning
192,gpt_3.5_turbo_0314,0.855,arc_c,BLZ_240312,[],reasoning
193,wizardlm_70b_v1.0,0.6544,arc_c,BLZ_240312,[],reasoning
194,tulu_2_dpo_70b,0.721,arc_c,BLZ_240312,[],reasoning
195,vicuna_33b,0.6212,arc_c,BLZ_240312,[],reasoning
196,starling_lm_7b_alpha,0.6382,arc_c,BLZ_240312,[],reasoning
198,llama_2_70b_chat,0.6459,arc_c,BLZ_240312,[],reasoning
200,openhermes_2.5_mistral_7b,0.6493000000000001,arc_c,BLZ_240312,[],reasoning
201,openchat_3.5,0.6391,arc_c,BLZ_240312,[],reasoning
204,solar_10.7b_instruct_v1.0,0.7108,arc_c,BLZ_240312,[],reasoning
205,dolphin_2.2.1_mistral_7b,0.6331,arc_c,BLZ_240312,[],reasoning
206,wizardlm_13b_v1.2,0.5904,arc_c,BLZ_240312,[],reasoning
207,zephyr_7b_beta,0.6203,arc_c,BLZ_240312,[],reasoning
208,mpt_30b_chat,0.5870000000000001,arc_c,BLZ_240312,[],reasoning
209,codellama_34b_instruct,0.5427000000000001,arc_c,BLZ_240312,[],reasoning
210,llama_2_13b_chat,0.5904,arc_c,BLZ_240312,[],reasoning
211,vicuna_13b,0.5708,arc_c,BLZ_240312,[],reasoning
213,zephyr_7b_alpha,0.6101,arc_c,BLZ_240312,[],reasoning
215,falcon_180b_chat,0.6945,arc_c,BLZ_240312,[],reasoning
217,llama_2_7b_chat,0.529,arc_c,BLZ_240312,[],reasoning
219,mistral_7b_instruct_v0.1,0.5452,arc_c,BLZ_240312,[],reasoning
221,vicuna_7b,0.5324,arc_c,BLZ_240312,[],reasoning
235,yi_34bx2_moe_60b,0.7108,arc_c,BLZ_240312,[],reasoning
886,gpt_4_1106_preview,0.977,alpacav1,BLZ_240312,[],holistic
888,gpt_4_0314,0.9528,alpacav1,BLZ_240312,[],holistic
889,gpt_4_0613,0.9528,alpacav1,BLZ_240312,[],holistic
890,mistral_medium,0.9682999999999999,alpacav1,BLZ_240312,[],holistic
891,claude_1,0.8839,alpacav1,BLZ_240312,[],holistic
892,claude_2.0,0.9136,alpacav1,BLZ_240312,[],holistic
893,gemini_pro_dev_api,0.7966,alpacav1,BLZ_240312,[],holistic
894,claude_2.1,0.8708,alpacav1,BLZ_240312,[],holistic
895,gpt_3.5_turbo_0613,0.8937,alpacav1,BLZ_240312,[],holistic
896,mixtral_8x7b_instruct_v0.1,0.9478,alpacav1,BLZ_240312,[],holistic
897,yi_34b_chat,0.9408,alpacav1,BLZ_240312,[],holistic
898,gemini_pro,0.7966,alpacav1,BLZ_240312,[],holistic
900,gpt_3.5_turbo_0314,0.8937,alpacav1,BLZ_240312,[],holistic
902,tulu_2_dpo_70b,0.9503,alpacav1,BLZ_240312,[],holistic
903,vicuna_33b,0.8898999999999999,alpacav1,BLZ_240312,[],holistic
904,starling_lm_7b_alpha,0.9198999999999999,alpacav1,BLZ_240312,[],holistic
906,llama_2_70b_chat,0.9266,alpacav1,BLZ_240312,[],holistic
909,openchat_3.5,0.8851,alpacav1,BLZ_240312,[],holistic
911,gpt_3.5_turbo_1106,0.8626,alpacav1,BLZ_240312,[],holistic
914,wizardlm_13b_v1.2,0.8917,alpacav1,BLZ_240312,[],holistic
915,zephyr_7b_beta,0.9059999999999999,alpacav1,BLZ_240312,[],holistic
918,llama_2_13b_chat,0.8109000000000001,alpacav1,BLZ_240312,[],holistic
921,zephyr_7b_alpha,0.8576,alpacav1,BLZ_240312,[],holistic
924,guanaco_33b,0.6596,alpacav1,BLZ_240312,[],holistic
925,llama_2_7b_chat,0.7137,alpacav1,BLZ_240312,[],holistic
934,chatglm2_6b,0.47130000000000005,alpacav1,BLZ_240312,[],holistic
937,openassistant_pythia_12b,0.2596,alpacav1,BLZ_240312,[],holistic
827,gpt_4_1106_preview,0.5,alpacav2,BLZ_240312,[],holistic
829,gpt_4_0314,0.221,alpacav2,BLZ_240312,[],holistic
830,gpt_4_0613,0.158,alpacav2,BLZ_240312,[],holistic
831,mistral_medium,0.21899999999999997,alpacav2,BLZ_240312,[],holistic
832,claude_1,0.17,alpacav2,BLZ_240312,[],holistic
833,claude_2.0,0.172,alpacav2,BLZ_240312,[],holistic
834,gemini_pro_dev_api,0.16899999999999998,alpacav2,BLZ_240312,[],holistic
835,claude_2.1,0.157,alpacav2,BLZ_240312,[],holistic
836,gpt_3.5_turbo_0613,0.141,alpacav2,BLZ_240312,[],holistic
837,mixtral_8x7b_instruct_v0.1,0.183,alpacav2,BLZ_240312,[],holistic
838,yi_34b_chat,0.297,alpacav2,BLZ_240312,[],holistic
839,gemini_pro,0.16899999999999998,alpacav2,BLZ_240312,[],holistic
840,claude_instant_1,0.161,alpacav2,BLZ_240312,[],holistic
841,gpt_3.5_turbo_0314,0.096,alpacav2,BLZ_240312,[],holistic
842,wizardlm_70b_v1.0,0.14400000000000002,alpacav2,BLZ_240312,[],holistic
843,tulu_2_dpo_70b,0.16,alpacav2,BLZ_240312,[],holistic
844,vicuna_33b,0.127,alpacav2,BLZ_240312,[],holistic
845,starling_lm_7b_alpha,0.142,alpacav2,BLZ_240312,[],holistic
846,deepseek_llm_67b_chat,0.121,alpacav2,BLZ_240312,[],holistic
847,llama_2_70b_chat,0.139,alpacav2,BLZ_240312,[],holistic
849,openhermes_2.5_mistral_7b,0.10300000000000001,alpacav2,BLZ_240312,[],holistic
852,gpt_3.5_turbo_1106,0.092,alpacav2,BLZ_240312,[],holistic
854,dolphin_2.2.1_mistral_7b,0.09,alpacav2,BLZ_240312,[],holistic
855,wizardlm_13b_v1.2,0.12,alpacav2,BLZ_240312,[],holistic
856,zephyr_7b_beta,0.11,alpacav2,BLZ_240312,[],holistic
859,llama_2_13b_chat,0.077,alpacav2,BLZ_240312,[],holistic
860,vicuna_13b,0.067,alpacav2,BLZ_240312,[],holistic
862,zephyr_7b_alpha,0.084,alpacav2,BLZ_240312,[],holistic
863,qwen_14b_chat,0.075,alpacav2,BLZ_240312,[],holistic
865,guanaco_33b,0.05,alpacav2,BLZ_240312,[],holistic
866,llama_2_7b_chat,0.0496,alpacav2,BLZ_240312,[],holistic
870,vicuna_7b,0.048,alpacav2,BLZ_240312,[],holistic
875,chatglm2_6b,0.027999999999999997,alpacav2,BLZ_240312,[],holistic
878,openassistant_pythia_12b,0.018000000000000002,alpacav2,BLZ_240312,[],holistic
1299,gpt_4_1106_preview,0.32799999999999996,alpacaeval2_lc,BLZ_240312,[],holistic
1301,gpt_4_0314,0.21600000000000003,alpacaeval2_lc,BLZ_240312,[],holistic
1302,gpt_4_0613,0.18600000000000003,alpacaeval2_lc,BLZ_240312,[],holistic
1303,mistral_medium,0.196,alpacaeval2_lc,BLZ_240312,[],holistic
1304,claude_1,0.21100000000000002,alpacaeval2_lc,BLZ_240312,[],holistic
1305,claude_2.0,0.21600000000000003,alpacaeval2_lc,BLZ_240312,[],holistic
1306,gemini_pro_dev_api,0.172,alpacaeval2_lc,BLZ_240312,[],holistic
1307,claude_2.1,0.193,alpacaeval2_lc,BLZ_240312,[],holistic
1308,gpt_3.5_turbo_0613,0.14300000000000002,alpacaeval2_lc,BLZ_240312,[],holistic
1309,mixtral_8x7b_instruct_v0.1,0.168,alpacaeval2_lc,BLZ_240312,[],holistic
1310,yi_34b_chat,0.188,alpacaeval2_lc,BLZ_240312,[],holistic
1312,claude_instant_1,0.195,alpacaeval2_lc,BLZ_240312,[],holistic
1313,gpt_3.5_turbo_0314,0.156,alpacaeval2_lc,BLZ_240312,[],holistic
1314,wizardlm_70b_v1.0,0.125,alpacaeval2_lc,BLZ_240312,[],holistic
1315,tulu_2_dpo_70b,0.151,alpacaeval2_lc,BLZ_240312,[],holistic
1316,vicuna_33b,0.115,alpacaeval2_lc,BLZ_240312,[],holistic
1317,starling_lm_7b_alpha,0.10099999999999999,alpacaeval2_lc,BLZ_240312,[],holistic
1318,deepseek_llm_67b_chat,0.141,alpacaeval2_lc,BLZ_240312,[],holistic
1319,llama_2_70b_chat,0.10400000000000001,alpacaeval2_lc,BLZ_240312,[],holistic
1321,openhermes_2.5_mistral_7b,0.126,alpacaeval2_lc,BLZ_240312,[],holistic
1324,gpt_3.5_turbo_1106,0.155,alpacaeval2_lc,BLZ_240312,[],holistic
1326,dolphin_2.2.1_mistral_7b,0.10800000000000001,alpacaeval2_lc,BLZ_240312,[],holistic
1327,wizardlm_13b_v1.2,0.099,alpacaeval2_lc,BLZ_240312,[],holistic
1328,zephyr_7b_beta,0.102,alpacaeval2_lc,BLZ_240312,[],holistic
1331,llama_2_13b_chat,0.068,alpacaeval2_lc,BLZ_240312,[],holistic
1332,vicuna_13b,0.085,alpacaeval2_lc,BLZ_240312,[],holistic
1334,zephyr_7b_alpha,0.086,alpacaeval2_lc,BLZ_240312,[],holistic
1335,qwen_14b_chat,0.1,alpacaeval2_lc,BLZ_240312,[],holistic
1338,llama_2_7b_chat,0.045,alpacaeval2_lc,BLZ_240312,[],holistic
1342,vicuna_7b,0.06,alpacaeval2_lc,BLZ_240312,[],holistic
0,gpt_4_0125_preview,1.0,arena_elo,BLZ_240312,[],holistic
1,gpt_4_1106_preview,0.9992019154030327,arena_elo,BLZ_240312,[],holistic
2,bard_gemini_pro,0.9768555466879489,arena_elo,BLZ_240312,[],holistic
3,gpt_4_0314,0.9497206703910615,arena_elo,BLZ_240312,[],holistic
4,gpt_4_0613,0.9273743016759777,arena_elo,BLZ_240312,[],holistic
5,mistral_medium,0.9177972865123704,arena_elo,BLZ_240312,[],holistic
6,claude_1,0.9169992019154031,arena_elo,BLZ_240312,[],holistic
7,claude_2.0,0.9034317637669593,arena_elo,BLZ_240312,[],holistic
8,gemini_pro_dev_api,0.8938547486033519,arena_elo,BLZ_240312,[],holistic
9,claude_2.1,0.8930566640063847,arena_elo,BLZ_240312,[],holistic
10,gpt_3.5_turbo_0613,0.8922585794094174,arena_elo,BLZ_240312,[],holistic
11,mixtral_8x7b_instruct_v0.1,0.8922585794094174,arena_elo,BLZ_240312,[],holistic
12,yi_34b_chat,0.8898643256185156,arena_elo,BLZ_240312,[],holistic
13,gemini_pro,0.8890662410215483,arena_elo,BLZ_240312,[],holistic
14,claude_instant_1,0.8850758180367119,arena_elo,BLZ_240312,[],holistic
15,gpt_3.5_turbo_0314,0.8818834796488427,arena_elo,BLZ_240312,[],holistic
16,wizardlm_70b_v1.0,0.8818834796488427,arena_elo,BLZ_240312,[],holistic
17,tulu_2_dpo_70b,0.8810853950518756,arena_elo,BLZ_240312,[],holistic
18,vicuna_33b,0.8723064644852354,arena_elo,BLZ_240312,[],holistic
19,starling_lm_7b_alpha,0.8699122106943336,arena_elo,BLZ_240312,[],holistic
20,deepseek_llm_67b_chat,0.8635275339185954,arena_elo,BLZ_240312,[],holistic
21,llama_2_70b_chat,0.8635275339185954,arena_elo,BLZ_240312,[],holistic
22,nv_llama2_70b_steerlm_chat,0.8603351955307262,arena_elo,BLZ_240312,[],holistic
23,openhermes_2.5_mistral_7b,0.8603351955307262,arena_elo,BLZ_240312,[],holistic
24,openchat_3.5,0.8587390263367917,arena_elo,BLZ_240312,[],holistic
25,pplx_70b_online,0.8587390263367917,arena_elo,BLZ_240312,[],holistic
26,gpt_3.5_turbo_1106,0.8547486033519553,arena_elo,BLZ_240312,[],holistic
27,solar_10.7b_instruct_v1.0,0.8499600957701516,arena_elo,BLZ_240312,[],holistic
28,dolphin_2.2.1_mistral_7b,0.8499600957701516,arena_elo,BLZ_240312,[],holistic
29,wizardlm_13b_v1.2,0.8443735035913806,arena_elo,BLZ_240312,[],holistic
30,zephyr_7b_beta,0.8387869114126097,arena_elo,BLZ_240312,[],holistic
31,mpt_30b_chat,0.8332003192338387,arena_elo,BLZ_240312,[],holistic
32,codellama_34b_instruct,0.8324022346368715,arena_elo,BLZ_240312,[],holistic
33,llama_2_13b_chat,0.8316041500399042,arena_elo,BLZ_240312,[],holistic
34,vicuna_13b,0.8300079808459697,arena_elo,BLZ_240312,[],holistic
35,pplx_7b_online,0.8284118116520351,arena_elo,BLZ_240312,[],holistic
36,zephyr_7b_alpha,0.8276137270550679,arena_elo,BLZ_240312,[],holistic
37,qwen_14b_chat,0.825219473264166,arena_elo,BLZ_240312,[],holistic
38,falcon_180b_chat,0.8236233040702314,arena_elo,BLZ_240312,[],holistic
39,guanaco_33b,0.8236233040702314,arena_elo,BLZ_240312,[],holistic
40,llama_2_7b_chat,0.8172386272944933,arena_elo,BLZ_240312,[],holistic
41,stripedhyena_nous_7b,0.8140462889066241,arena_elo,BLZ_240312,[],holistic
42,mistral_7b_instruct_v0.1,0.8028731045490822,arena_elo,BLZ_240312,[],holistic
43,palm_chat_bison_001,0.8028731045490822,arena_elo,BLZ_240312,[],holistic
44,vicuna_7b,0.8020750199521149,arena_elo,BLZ_240312,[],holistic
45,koala_13b,0.770949720670391,arena_elo,BLZ_240312,[],holistic
46,chatglm3_6b,0.7661612130885874,arena_elo,BLZ_240312,[],holistic
47,gpt4all_13b_snoozy,0.74780526735834,arena_elo,BLZ_240312,[],holistic
48,mpt_7b_chat,0.7430167597765364,arena_elo,BLZ_240312,[],holistic
49,chatglm2_6b,0.7422186751795691,arena_elo,BLZ_240312,[],holistic
50,rwkv_4_raven_14b,0.7382282521947326,arena_elo,BLZ_240312,[],holistic
51,alpaca_13b,0.7214684756584198,arena_elo,BLZ_240312,[],holistic
52,openassistant_pythia_12b,0.7158818834796489,arena_elo,BLZ_240312,[],holistic
53,chatglm_6b,0.704708699122107,arena_elo,BLZ_240312,[],holistic
54,fastchat_t5_3b,0.6975259377494014,arena_elo,BLZ_240312,[],holistic
55,stablelm_tuned_alpha_7b,0.6743814844373504,arena_elo,BLZ_240312,[],holistic
56,dolly_v2_12b,0.6568236233040702,arena_elo,BLZ_240312,[],holistic
57,llama_13b,0.6384676775738228,arena_elo,BLZ_240312,[],holistic
709,gpt_4_1106_preview,0.8390000000000001,bbh,BLZ_240312,[],holistic
711,gpt_4_0314,0.867,bbh,BLZ_240312,[],holistic
712,gpt_4_0613,0.867,bbh,BLZ_240312,[],holistic
714,claude_1,0.6729999999999999,bbh,BLZ_240312,[],holistic
716,gemini_pro_dev_api,0.6559999999999999,bbh,BLZ_240312,[],holistic
718,gpt_3.5_turbo_0613,0.71,bbh,BLZ_240312,[],holistic
719,mixtral_8x7b_instruct_v0.1,0.67,bbh,BLZ_240312,[],holistic
720,yi_34b_chat,0.7170000000000001,bbh,BLZ_240312,[],holistic
721,gemini_pro,0.6559999999999999,bbh,BLZ_240312,[],holistic
725,tulu_2_dpo_70b,0.66,bbh,BLZ_240312,[],holistic
726,vicuna_33b,0.52,bbh,BLZ_240312,[],holistic
729,llama_2_70b_chat,0.608,bbh,BLZ_240312,[],holistic
734,gpt_3.5_turbo_1106,0.71,bbh,BLZ_240312,[],holistic
736,dolphin_2.2.1_mistral_7b,0.598,bbh,BLZ_240312,[],holistic
741,llama_2_13b_chat,0.5820000000000001,bbh,BLZ_240312,[],holistic
742,vicuna_13b,0.515,bbh,BLZ_240312,[],holistic
745,qwen_14b_chat,0.537,bbh,BLZ_240312,[],holistic
748,llama_2_7b_chat,0.35600000000000004,bbh,BLZ_240312,[],holistic
750,mistral_7b_instruct_v0.1,0.5670000000000001,bbh,BLZ_240312,[],holistic
752,vicuna_7b,0.434,bbh,BLZ_240312,[],holistic
765,llama_13b,0.379,bbh,BLZ_240312,[],holistic
1122,gpt_4_1106_preview,0.8604999999999999,eq_benchv2,BLZ_240312,[],holistic
1124,gpt_4_0314,0.8573000000000001,eq_benchv2,BLZ_240312,[],holistic
1125,gpt_4_0613,0.8479000000000001,eq_benchv2,BLZ_240312,[],holistic
1126,mistral_medium,0.8256999999999999,eq_benchv2,BLZ_240312,[],holistic
1127,claude_1,0.7683,eq_benchv2,BLZ_240312,[],holistic
1128,claude_2.0,0.7289,eq_benchv2,BLZ_240312,[],holistic
1129,gemini_pro_dev_api,0.7508,eq_benchv2,BLZ_240312,[],holistic
1130,claude_2.1,0.7395999999999999,eq_benchv2,BLZ_240312,[],holistic
1131,gpt_3.5_turbo_0613,0.6934999999999999,eq_benchv2,BLZ_240312,[],holistic
1132,mixtral_8x7b_instruct_v0.1,0.7237,eq_benchv2,BLZ_240312,[],holistic
1133,yi_34b_chat,0.7162000000000001,eq_benchv2,BLZ_240312,[],holistic
1135,claude_instant_1,0.6904,eq_benchv2,BLZ_240312,[],holistic
1136,gpt_3.5_turbo_0314,0.7067,eq_benchv2,BLZ_240312,[],holistic
1137,wizardlm_70b_v1.0,0.7128,eq_benchv2,BLZ_240312,[],holistic
1138,tulu_2_dpo_70b,0.7663,eq_benchv2,BLZ_240312,[],holistic
1139,vicuna_33b,0.6707,eq_benchv2,BLZ_240312,[],holistic
1140,starling_lm_7b_alpha,0.7390000000000001,eq_benchv2,BLZ_240312,[],holistic
1141,deepseek_llm_67b_chat,0.7753,eq_benchv2,BLZ_240312,[],holistic
1142,llama_2_70b_chat,0.7359,eq_benchv2,BLZ_240312,[],holistic
1144,openhermes_2.5_mistral_7b,0.6689,eq_benchv2,BLZ_240312,[],holistic
1145,openchat_3.5,0.7218000000000001,eq_benchv2,BLZ_240312,[],holistic
1146,pplx_70b_online,0.6279,eq_benchv2,BLZ_240312,[],holistic
1147,gpt_3.5_turbo_1106,0.7173999999999999,eq_benchv2,BLZ_240312,[],holistic
1148,solar_10.7b_instruct_v1.0,0.7353000000000001,eq_benchv2,BLZ_240312,[],holistic
1149,dolphin_2.2.1_mistral_7b,0.6992,eq_benchv2,BLZ_240312,[],holistic
1150,wizardlm_13b_v1.2,0.6371,eq_benchv2,BLZ_240312,[],holistic
1151,zephyr_7b_beta,0.5832999999999999,eq_benchv2,BLZ_240312,[],holistic
1153,codellama_34b_instruct,0.4915,eq_benchv2,BLZ_240312,[],holistic
1154,llama_2_13b_chat,0.49119999999999997,eq_benchv2,BLZ_240312,[],holistic
1155,vicuna_13b,0.6739,eq_benchv2,BLZ_240312,[],holistic
1156,pplx_7b_online,0.4891,eq_benchv2,BLZ_240312,[],holistic
1157,zephyr_7b_alpha,0.5682,eq_benchv2,BLZ_240312,[],holistic
1158,qwen_14b_chat,0.6347,eq_benchv2,BLZ_240312,[],holistic
1159,falcon_180b_chat,0.5682,eq_benchv2,BLZ_240312,[],holistic
1160,guanaco_33b,0.3611,eq_benchv2,BLZ_240312,[],holistic
1161,llama_2_7b_chat,0.3632,eq_benchv2,BLZ_240312,[],holistic
1162,stripedhyena_nous_7b,0.5458,eq_benchv2,BLZ_240312,[],holistic
1163,mistral_7b_instruct_v0.1,0.5215,eq_benchv2,BLZ_240312,[],holistic
1179,yi_34bx2_moe_60b,0.7269,eq_benchv2,BLZ_240312,[],holistic
542,mixtral_8x7b_instruct_v0.1,0.7641,gpt4all,BLZ_240312,[],holistic
543,yi_34b_chat,0.7212999999999999,gpt4all,BLZ_240312,[],holistic
550,starling_lm_7b_alpha,0.7272,gpt4all,BLZ_240312,[],holistic
554,openhermes_2.5_mistral_7b,0.7312000000000001,gpt4all,BLZ_240312,[],holistic
555,openchat_3.5,0.7292000000000001,gpt4all,BLZ_240312,[],holistic
558,solar_10.7b_instruct_v1.0,0.7511,gpt4all,BLZ_240312,[],holistic
559,dolphin_2.2.1_mistral_7b,0.7223999999999999,gpt4all,BLZ_240312,[],holistic
561,zephyr_7b_beta,0.7182999999999999,gpt4all,BLZ_240312,[],holistic
565,vicuna_13b,0.631,gpt4all,BLZ_240312,[],holistic
567,zephyr_7b_alpha,0.7223999999999999,gpt4all,BLZ_240312,[],holistic
573,mistral_7b_instruct_v0.1,0.6795,gpt4all,BLZ_240312,[],holistic
575,vicuna_7b,0.61,gpt4all,BLZ_240312,[],holistic
576,koala_13b,0.62,gpt4all,BLZ_240312,[],holistic
578,gpt4all_13b_snoozy,0.653,gpt4all,BLZ_240312,[],holistic
579,mpt_7b_chat,0.648,gpt4all,BLZ_240312,[],holistic
583,openassistant_pythia_12b,0.61,gpt4all,BLZ_240312,[],holistic
585,fastchat_t5_3b,0.537,gpt4all,BLZ_240312,[],holistic
586,stablelm_tuned_alpha_7b,0.513,gpt4all,BLZ_240312,[],holistic
588,llama_13b,0.63,gpt4all,BLZ_240312,[],holistic
477,mistral_medium,0.667,gsm8k,BLZ_240312,[],math
483,mixtral_8x7b_instruct_v0.1,0.6073,gsm8k,BLZ_240312,[],math
484,yi_34b_chat,0.31920000000000004,gsm8k,BLZ_240312,[],math
487,gpt_3.5_turbo_0314,0.5710000000000001,gsm8k,BLZ_240312,[],math
488,wizardlm_70b_v1.0,0.1797,gsm8k,BLZ_240312,[],math
489,tulu_2_dpo_70b,0.6262,gsm8k,BLZ_240312,[],math
490,vicuna_33b,0.13720000000000002,gsm8k,BLZ_240312,[],math
491,starling_lm_7b_alpha,0.624,gsm8k,BLZ_240312,[],math
493,llama_2_70b_chat,0.2669,gsm8k,BLZ_240312,[],math
495,openhermes_2.5_mistral_7b,0.2608,gsm8k,BLZ_240312,[],math
496,openchat_3.5,0.26839999999999997,gsm8k,BLZ_240312,[],math
499,solar_10.7b_instruct_v1.0,0.6475,gsm8k,BLZ_240312,[],math
500,dolphin_2.2.1_mistral_7b,0.4807,gsm8k,BLZ_240312,[],math
501,wizardlm_13b_v1.2,0.135,gsm8k,BLZ_240312,[],math
502,zephyr_7b_beta,0.2904,gsm8k,BLZ_240312,[],math
503,mpt_30b_chat,0.1213,gsm8k,BLZ_240312,[],math
504,codellama_34b_instruct,0.37979999999999997,gsm8k,BLZ_240312,[],math
505,llama_2_13b_chat,0.1524,gsm8k,BLZ_240312,[],math
506,vicuna_13b,0.113,gsm8k,BLZ_240312,[],math
508,zephyr_7b_alpha,0.14029999999999998,gsm8k,BLZ_240312,[],math
509,qwen_14b_chat,0.597,gsm8k,BLZ_240312,[],math
510,falcon_180b_chat,0.4594,gsm8k,BLZ_240312,[],math
512,llama_2_7b_chat,0.0735,gsm8k,BLZ_240312,[],math
514,mistral_7b_instruct_v0.1,0.1425,gsm8k,BLZ_240312,[],math
516,vicuna_7b,0.0819,gsm8k,BLZ_240312,[],math
530,yi_34bx2_moe_60b,0.7551000000000001,gsm8k,BLZ_240312,[],math
239,gpt_4_0314,0.953,hellaswag,BLZ_240312,[],reasoning
241,mistral_medium,0.88,hellaswag,BLZ_240312,[],reasoning
247,mixtral_8x7b_instruct_v0.1,0.8763,hellaswag,BLZ_240312,[],reasoning
248,yi_34b_chat,0.8416,hellaswag,BLZ_240312,[],reasoning
251,gpt_3.5_turbo_0314,0.706,hellaswag,BLZ_240312,[],reasoning
252,wizardlm_70b_v1.0,0.8441,hellaswag,BLZ_240312,[],reasoning
253,tulu_2_dpo_70b,0.8898999999999999,hellaswag,BLZ_240312,[],reasoning
254,vicuna_33b,0.83,hellaswag,BLZ_240312,[],reasoning
255,starling_lm_7b_alpha,0.8490000000000001,hellaswag,BLZ_240312,[],reasoning
257,llama_2_70b_chat,0.8588,hellaswag,BLZ_240312,[],reasoning
259,openhermes_2.5_mistral_7b,0.8418000000000001,hellaswag,BLZ_240312,[],reasoning
260,openchat_3.5,0.8479000000000001,hellaswag,BLZ_240312,[],reasoning
263,solar_10.7b_instruct_v1.0,0.8815999999999999,hellaswag,BLZ_240312,[],reasoning
264,dolphin_2.2.1_mistral_7b,0.8376,hellaswag,BLZ_240312,[],reasoning
265,wizardlm_13b_v1.2,0.8220999999999999,hellaswag,BLZ_240312,[],reasoning
266,zephyr_7b_beta,0.8436,hellaswag,BLZ_240312,[],reasoning
267,mpt_30b_chat,0.8254,hellaswag,BLZ_240312,[],reasoning
268,codellama_34b_instruct,0.7692,hellaswag,BLZ_240312,[],reasoning
269,llama_2_13b_chat,0.8194,hellaswag,BLZ_240312,[],reasoning
270,vicuna_13b,0.8123999999999999,hellaswag,BLZ_240312,[],reasoning
272,zephyr_7b_alpha,0.8404,hellaswag,BLZ_240312,[],reasoning
274,falcon_180b_chat,0.8886,hellaswag,BLZ_240312,[],reasoning
276,llama_2_7b_chat,0.7855,hellaswag,BLZ_240312,[],reasoning
278,mistral_7b_instruct_v0.1,0.7563,hellaswag,BLZ_240312,[],reasoning
280,vicuna_7b,0.7739,hellaswag,BLZ_240312,[],reasoning
294,yi_34bx2_moe_60b,0.8523000000000001,hellaswag,BLZ_240312,[],reasoning
129,mixtral_8x7b_instruct_v0.1,0.7262000000000001,hugging_6,BLZ_240312,[],holistic
130,yi_34b_chat,0.6531999999999999,hugging_6,BLZ_240312,[],holistic
134,wizardlm_70b_v1.0,0.6125,hugging_6,BLZ_240312,[],holistic
135,tulu_2_dpo_70b,0.7376999999999999,hugging_6,BLZ_240312,[],holistic
136,vicuna_33b,0.585,hugging_6,BLZ_240312,[],holistic
137,starling_lm_7b_alpha,0.6713,hugging_6,BLZ_240312,[],holistic
139,llama_2_70b_chat,0.624,hugging_6,BLZ_240312,[],holistic
141,openhermes_2.5_mistral_7b,0.6152000000000001,hugging_6,BLZ_240312,[],holistic
142,openchat_3.5,0.6124,hugging_6,BLZ_240312,[],holistic
145,solar_10.7b_instruct_v1.0,0.742,hugging_6,BLZ_240312,[],holistic
146,dolphin_2.2.1_mistral_7b,0.6493000000000001,hugging_6,BLZ_240312,[],holistic
147,wizardlm_13b_v1.2,0.5476,hugging_6,BLZ_240312,[],holistic
148,zephyr_7b_beta,0.6195,hugging_6,BLZ_240312,[],holistic
149,mpt_30b_chat,0.5538000000000001,hugging_6,BLZ_240312,[],holistic
150,codellama_34b_instruct,0.5729,hugging_6,BLZ_240312,[],holistic
151,llama_2_13b_chat,0.5490999999999999,hugging_6,BLZ_240312,[],holistic
152,vicuna_13b,0.5539999999999999,hugging_6,BLZ_240312,[],holistic
154,zephyr_7b_alpha,0.595,hugging_6,BLZ_240312,[],holistic
156,falcon_180b_chat,0.6785,hugging_6,BLZ_240312,[],holistic
158,llama_2_7b_chat,0.5074000000000001,hugging_6,BLZ_240312,[],holistic
160,mistral_7b_instruct_v0.1,0.5496,hugging_6,BLZ_240312,[],holistic
162,vicuna_7b,0.521,hugging_6,BLZ_240312,[],holistic
176,yi_34bx2_moe_60b,0.7672,hugging_6,BLZ_240312,[],holistic
768,gpt_4_1106_preview,0.8540000000000001,humaneval,BLZ_240312,[],code
770,gpt_4_0314,0.884,humaneval,BLZ_240312,[],code
771,gpt_4_0613,0.884,humaneval,BLZ_240312,[],code
773,claude_1,0.56,humaneval,BLZ_240312,[],code
774,claude_2.0,0.7120000000000001,humaneval,BLZ_240312,[],code
775,gemini_pro_dev_api,0.634,humaneval,BLZ_240312,[],code
777,gpt_3.5_turbo_0613,0.726,humaneval,BLZ_240312,[],code
778,mixtral_8x7b_instruct_v0.1,0.5489999999999999,humaneval,BLZ_240312,[],code
780,gemini_pro,0.634,humaneval,BLZ_240312,[],code
781,claude_instant_1,0.528,humaneval,BLZ_240312,[],code
782,gpt_3.5_turbo_0314,0.732,humaneval,BLZ_240312,[],code
790,openhermes_2.5_mistral_7b,0.48200000000000004,humaneval,BLZ_240312,[],code
791,openchat_3.5,0.555,humaneval,BLZ_240312,[],code
793,gpt_3.5_turbo_1106,0.726,humaneval,BLZ_240312,[],code
797,zephyr_7b_beta,0.3,humaneval,BLZ_240312,[],code
799,codellama_34b_instruct,0.518,humaneval,BLZ_240312,[],code
801,vicuna_13b,0.171,humaneval,BLZ_240312,[],code
804,qwen_14b_chat,0.439,humaneval,BLZ_240312,[],code
809,mistral_7b_instruct_v0.1,0.287,humaneval,BLZ_240312,[],code
811,vicuna_7b,0.11599999999999999,humaneval,BLZ_240312,[],code
947,gpt_4_0314,0.93,llmonitor,BLZ_240312,[],holistic
948,gpt_4_0613,0.89,llmonitor,BLZ_240312,[],holistic
950,claude_1,0.66,llmonitor,BLZ_240312,[],holistic
951,claude_2.0,0.68,llmonitor,BLZ_240312,[],holistic
954,gpt_3.5_turbo_0613,0.81,llmonitor,BLZ_240312,[],holistic
958,claude_instant_1,0.6,llmonitor,BLZ_240312,[],holistic
959,gpt_3.5_turbo_0314,0.79,llmonitor,BLZ_240312,[],holistic
965,llama_2_70b_chat,0.6,llmonitor,BLZ_240312,[],holistic
975,mpt_30b_chat,0.4,llmonitor,BLZ_240312,[],holistic
976,codellama_34b_instruct,0.34,llmonitor,BLZ_240312,[],holistic
977,llama_2_13b_chat,0.5,llmonitor,BLZ_240312,[],holistic
978,vicuna_13b,0.5,llmonitor,BLZ_240312,[],holistic
982,falcon_180b_chat,0.67,llmonitor,BLZ_240312,[],holistic
983,guanaco_33b,0.43,llmonitor,BLZ_240312,[],holistic
984,llama_2_7b_chat,0.5,llmonitor,BLZ_240312,[],holistic
986,mistral_7b_instruct_v0.1,0.57,llmonitor,BLZ_240312,[],holistic
987,palm_chat_bison_001,0.57,llmonitor,BLZ_240312,[],holistic
988,vicuna_7b,0.41,llmonitor,BLZ_240312,[],holistic
989,koala_13b,0.31,llmonitor,BLZ_240312,[],holistic
992,mpt_7b_chat,0.43,llmonitor,BLZ_240312,[],holistic
1000,dolly_v2_12b,0.23,llmonitor,BLZ_240312,[],holistic
1185,mistral_medium,0.654,magi,BLZ_240312,[],holistic
1188,gemini_pro_dev_api,0.528,magi,BLZ_240312,[],holistic
1190,gpt_3.5_turbo_0613,0.455,magi,BLZ_240312,[],holistic
1191,mixtral_8x7b_instruct_v0.1,0.49560000000000004,magi,BLZ_240312,[],holistic
1192,yi_34b_chat,0.5821999999999999,magi,BLZ_240312,[],holistic
1195,gpt_3.5_turbo_0314,0.512,magi,BLZ_240312,[],holistic
1196,wizardlm_70b_v1.0,0.4476,magi,BLZ_240312,[],holistic
1197,tulu_2_dpo_70b,0.5212,magi,BLZ_240312,[],holistic
1198,vicuna_33b,0.3837,magi,BLZ_240312,[],holistic
1199,starling_lm_7b_alpha,0.4304,magi,BLZ_240312,[],holistic
1200,deepseek_llm_67b_chat,0.5946,magi,BLZ_240312,[],holistic
1201,llama_2_70b_chat,0.39899999999999997,magi,BLZ_240312,[],holistic
1203,openhermes_2.5_mistral_7b,0.4236,magi,BLZ_240312,[],holistic
1204,openchat_3.5,0.42200000000000004,magi,BLZ_240312,[],holistic
1206,gpt_3.5_turbo_1106,0.462,magi,BLZ_240312,[],holistic
1207,solar_10.7b_instruct_v1.0,0.4693,magi,BLZ_240312,[],holistic
1208,dolphin_2.2.1_mistral_7b,0.3782,magi,BLZ_240312,[],holistic
1209,wizardlm_13b_v1.2,0.3678,magi,BLZ_240312,[],holistic
1210,zephyr_7b_beta,0.4042,magi,BLZ_240312,[],holistic
1213,llama_2_13b_chat,0.37170000000000003,magi,BLZ_240312,[],holistic
1214,vicuna_13b,0.36560000000000004,magi,BLZ_240312,[],holistic
1216,zephyr_7b_alpha,0.39899999999999997,magi,BLZ_240312,[],holistic
1217,qwen_14b_chat,0.4535,magi,BLZ_240312,[],holistic
1219,guanaco_33b,0.38659999999999994,magi,BLZ_240312,[],holistic
1220,llama_2_7b_chat,0.35969999999999996,magi,BLZ_240312,[],holistic
1222,mistral_7b_instruct_v0.1,0.3704,magi,BLZ_240312,[],holistic
1063,gpt_4_1106_preview,0.83,mbpp,BLZ_240312,[],code
1067,mistral_medium,0.623,mbpp,BLZ_240312,[],code
1070,gemini_pro_dev_api,0.7290000000000001,mbpp,BLZ_240312,[],code
1073,mixtral_8x7b_instruct_v0.1,0.607,mbpp,BLZ_240312,[],code
1075,gemini_pro,0.7290000000000001,mbpp,BLZ_240312,[],code
1077,gpt_3.5_turbo_0314,0.816,mbpp,BLZ_240312,[],code
1089,solar_10.7b_instruct_v1.0,0.429,mbpp,BLZ_240312,[],code
1092,zephyr_7b_beta,0.41100000000000003,mbpp,BLZ_240312,[],code
296,gpt_4_1106_preview,0.805,mmlu,BLZ_240312,[],knowledge
298,gpt_4_0314,0.8640000000000001,mmlu,BLZ_240312,[],knowledge
300,mistral_medium,0.753,mmlu,BLZ_240312,[],knowledge
301,claude_1,0.77,mmlu,BLZ_240312,[],knowledge
302,claude_2.0,0.785,mmlu,BLZ_240312,[],knowledge
303,gemini_pro_dev_api,0.718,mmlu,BLZ_240312,[],knowledge
306,mixtral_8x7b_instruct_v0.1,0.706,mmlu,BLZ_240312,[],knowledge
307,yi_34b_chat,0.735,mmlu,BLZ_240312,[],knowledge
308,gemini_pro,0.718,mmlu,BLZ_240312,[],knowledge
309,claude_instant_1,0.7340000000000001,mmlu,BLZ_240312,[],knowledge
310,gpt_3.5_turbo_0314,0.7,mmlu,BLZ_240312,[],knowledge
311,wizardlm_70b_v1.0,0.637,mmlu,BLZ_240312,[],knowledge
312,tulu_2_dpo_70b,0.698,mmlu,BLZ_240312,[],knowledge
313,vicuna_33b,0.5920000000000001,mmlu,BLZ_240312,[],knowledge
314,starling_lm_7b_alpha,0.639,mmlu,BLZ_240312,[],knowledge
315,deepseek_llm_67b_chat,0.713,mmlu,BLZ_240312,[],knowledge
316,llama_2_70b_chat,0.63,mmlu,BLZ_240312,[],knowledge
317,nv_llama2_70b_steerlm_chat,0.685,mmlu,BLZ_240312,[],knowledge
318,openhermes_2.5_mistral_7b,0.638,mmlu,BLZ_240312,[],knowledge
319,openchat_3.5,0.643,mmlu,BLZ_240312,[],knowledge
321,gpt_3.5_turbo_1106,0.6779999999999999,mmlu,BLZ_240312,[],knowledge
322,solar_10.7b_instruct_v1.0,0.662,mmlu,BLZ_240312,[],knowledge
323,dolphin_2.2.1_mistral_7b,0.632,mmlu,BLZ_240312,[],knowledge
324,wizardlm_13b_v1.2,0.527,mmlu,BLZ_240312,[],knowledge
325,zephyr_7b_beta,0.614,mmlu,BLZ_240312,[],knowledge
326,mpt_30b_chat,0.504,mmlu,BLZ_240312,[],knowledge
327,codellama_34b_instruct,0.537,mmlu,BLZ_240312,[],knowledge
328,llama_2_13b_chat,0.536,mmlu,BLZ_240312,[],knowledge
329,vicuna_13b,0.5579999999999999,mmlu,BLZ_240312,[],knowledge
331,zephyr_7b_alpha,0.614,mmlu,BLZ_240312,[],knowledge
332,qwen_14b_chat,0.665,mmlu,BLZ_240312,[],knowledge
333,falcon_180b_chat,0.68,mmlu,BLZ_240312,[],knowledge
334,guanaco_33b,0.5760000000000001,mmlu,BLZ_240312,[],knowledge
335,llama_2_7b_chat,0.45799999999999996,mmlu,BLZ_240312,[],knowledge
337,mistral_7b_instruct_v0.1,0.5539999999999999,mmlu,BLZ_240312,[],knowledge
339,vicuna_7b,0.51,mmlu,BLZ_240312,[],knowledge
340,koala_13b,0.447,mmlu,BLZ_240312,[],knowledge
342,gpt4all_13b_snoozy,0.43,mmlu,BLZ_240312,[],knowledge
343,mpt_7b_chat,0.32,mmlu,BLZ_240312,[],knowledge
344,chatglm2_6b,0.455,mmlu,BLZ_240312,[],knowledge
345,rwkv_4_raven_14b,0.256,mmlu,BLZ_240312,[],knowledge
346,alpaca_13b,0.48100000000000004,mmlu,BLZ_240312,[],knowledge
347,openassistant_pythia_12b,0.27,mmlu,BLZ_240312,[],knowledge
348,chatglm_6b,0.361,mmlu,BLZ_240312,[],knowledge
349,fastchat_t5_3b,0.47700000000000004,mmlu,BLZ_240312,[],knowledge
350,stablelm_tuned_alpha_7b,0.244,mmlu,BLZ_240312,[],knowledge
351,dolly_v2_12b,0.257,mmlu,BLZ_240312,[],knowledge
352,llama_13b,0.47,mmlu,BLZ_240312,[],knowledge
353,yi_34bx2_moe_60b,0.775,mmlu,BLZ_240312,[],knowledge
59,gpt_4_0125_preview,0.0929,mt_bench,BLZ_240312,[],holistic
60,gpt_4_1106_preview,0.0932,mt_bench,BLZ_240312,[],holistic
62,gpt_4_0314,0.08960000000000001,mt_bench,BLZ_240312,[],holistic
63,gpt_4_0613,0.09179999999999999,mt_bench,BLZ_240312,[],holistic
64,mistral_medium,0.0861,mt_bench,BLZ_240312,[],holistic
65,claude_1,0.079,mt_bench,BLZ_240312,[],holistic
66,claude_2.0,0.0806,mt_bench,BLZ_240312,[],holistic
67,gemini_pro_dev_api,0.08039999999999999,mt_bench,BLZ_240312,[],holistic
68,claude_2.1,0.0818,mt_bench,BLZ_240312,[],holistic
69,gpt_3.5_turbo_0613,0.0839,mt_bench,BLZ_240312,[],holistic
70,mixtral_8x7b_instruct_v0.1,0.083,mt_bench,BLZ_240312,[],holistic
71,yi_34b_chat,0.07769999999999999,mt_bench,BLZ_240312,[],holistic
72,gemini_pro,0.08039999999999999,mt_bench,BLZ_240312,[],holistic
73,claude_instant_1,0.0785,mt_bench,BLZ_240312,[],holistic
74,gpt_3.5_turbo_0314,0.0794,mt_bench,BLZ_240312,[],holistic
75,wizardlm_70b_v1.0,0.0771,mt_bench,BLZ_240312,[],holistic
76,tulu_2_dpo_70b,0.0789,mt_bench,BLZ_240312,[],holistic
77,vicuna_33b,0.0712,mt_bench,BLZ_240312,[],holistic
78,starling_lm_7b_alpha,0.0809,mt_bench,BLZ_240312,[],holistic
79,deepseek_llm_67b_chat,0.08529999999999999,mt_bench,BLZ_240312,[],holistic
80,llama_2_70b_chat,0.06860000000000001,mt_bench,BLZ_240312,[],holistic
81,nv_llama2_70b_steerlm_chat,0.0754,mt_bench,BLZ_240312,[],holistic
82,openhermes_2.5_mistral_7b,0.07690000000000001,mt_bench,BLZ_240312,[],holistic
83,openchat_3.5,0.0781,mt_bench,BLZ_240312,[],holistic
84,pplx_70b_online,0.0588,mt_bench,BLZ_240312,[],holistic
85,gpt_3.5_turbo_1106,0.0832,mt_bench,BLZ_240312,[],holistic
86,solar_10.7b_instruct_v1.0,0.0758,mt_bench,BLZ_240312,[],holistic
88,wizardlm_13b_v1.2,0.07200000000000001,mt_bench,BLZ_240312,[],holistic
89,zephyr_7b_beta,0.07339999999999999,mt_bench,BLZ_240312,[],holistic
90,mpt_30b_chat,0.0639,mt_bench,BLZ_240312,[],holistic
92,llama_2_13b_chat,0.0665,mt_bench,BLZ_240312,[],holistic
93,vicuna_13b,0.06570000000000001,mt_bench,BLZ_240312,[],holistic
95,zephyr_7b_alpha,0.0688,mt_bench,BLZ_240312,[],holistic
96,qwen_14b_chat,0.0696,mt_bench,BLZ_240312,[],holistic
98,guanaco_33b,0.0653,mt_bench,BLZ_240312,[],holistic
99,llama_2_7b_chat,0.06269999999999999,mt_bench,BLZ_240312,[],holistic
101,mistral_7b_instruct_v0.1,0.0684,mt_bench,BLZ_240312,[],holistic
102,palm_chat_bison_001,0.064,mt_bench,BLZ_240312,[],holistic
103,vicuna_7b,0.0617,mt_bench,BLZ_240312,[],holistic
104,koala_13b,0.0535,mt_bench,BLZ_240312,[],holistic
106,gpt4all_13b_snoozy,0.0541,mt_bench,BLZ_240312,[],holistic
107,mpt_7b_chat,0.0542,mt_bench,BLZ_240312,[],holistic
108,chatglm2_6b,0.0496,mt_bench,BLZ_240312,[],holistic
109,rwkv_4_raven_14b,0.0398,mt_bench,BLZ_240312,[],holistic
110,alpaca_13b,0.0453,mt_bench,BLZ_240312,[],holistic
111,openassistant_pythia_12b,0.0432,mt_bench,BLZ_240312,[],holistic
112,chatglm_6b,0.045,mt_bench,BLZ_240312,[],holistic
113,fastchat_t5_3b,0.0304,mt_bench,BLZ_240312,[],holistic
114,stablelm_tuned_alpha_7b,0.0275,mt_bench,BLZ_240312,[],holistic
115,dolly_v2_12b,0.032799999999999996,mt_bench,BLZ_240312,[],holistic
116,llama_13b,0.026099999999999998,mt_bench,BLZ_240312,[],holistic
357,gpt_4_0314,0.59,truthfulqa,BLZ_240312,[],knowledge
365,mixtral_8x7b_instruct_v0.1,0.6457999999999999,truthfulqa,BLZ_240312,[],knowledge
366,yi_34b_chat,0.5537,truthfulqa,BLZ_240312,[],knowledge
370,wizardlm_70b_v1.0,0.5481,truthfulqa,BLZ_240312,[],knowledge
371,tulu_2_dpo_70b,0.6578,truthfulqa,BLZ_240312,[],knowledge
372,vicuna_33b,0.5616,truthfulqa,BLZ_240312,[],knowledge
373,starling_lm_7b_alpha,0.4639,truthfulqa,BLZ_240312,[],knowledge
375,llama_2_70b_chat,0.528,truthfulqa,BLZ_240312,[],knowledge
377,openhermes_2.5_mistral_7b,0.5224,truthfulqa,BLZ_240312,[],knowledge
378,openchat_3.5,0.46380000000000005,truthfulqa,BLZ_240312,[],knowledge
381,solar_10.7b_instruct_v1.0,0.7143,truthfulqa,BLZ_240312,[],knowledge
382,dolphin_2.2.1_mistral_7b,0.5311,truthfulqa,BLZ_240312,[],knowledge
383,wizardlm_13b_v1.2,0.4727,truthfulqa,BLZ_240312,[],knowledge
384,zephyr_7b_beta,0.5745,truthfulqa,BLZ_240312,[],knowledge
385,mpt_30b_chat,0.5242,truthfulqa,BLZ_240312,[],knowledge
386,codellama_34b_instruct,0.44439999999999996,truthfulqa,BLZ_240312,[],knowledge
387,llama_2_13b_chat,0.4412,truthfulqa,BLZ_240312,[],knowledge
388,vicuna_13b,0.5151,truthfulqa,BLZ_240312,[],knowledge
390,zephyr_7b_alpha,0.579,truthfulqa,BLZ_240312,[],knowledge
392,falcon_180b_chat,0.4547,truthfulqa,BLZ_240312,[],knowledge
394,llama_2_7b_chat,0.4557,truthfulqa,BLZ_240312,[],knowledge
396,mistral_7b_instruct_v0.1,0.5628,truthfulqa,BLZ_240312,[],knowledge
398,vicuna_7b,0.5034000000000001,truthfulqa,BLZ_240312,[],knowledge
412,yi_34bx2_moe_60b,0.6618999999999999,truthfulqa,BLZ_240312,[],knowledge
418,mistral_medium,0.88,winogrande,BLZ_240312,[],reasoning
424,mixtral_8x7b_instruct_v0.1,0.8137000000000001,winogrande,BLZ_240312,[],reasoning
425,yi_34b_chat,0.8011,winogrande,BLZ_240312,[],reasoning
428,gpt_3.5_turbo_0314,0.852,winogrande,BLZ_240312,[],reasoning
429,wizardlm_70b_v1.0,0.8081999999999999,winogrande,BLZ_240312,[],reasoning
430,tulu_2_dpo_70b,0.8327,winogrande,BLZ_240312,[],reasoning
431,vicuna_33b,0.7703,winogrande,BLZ_240312,[],reasoning
432,starling_lm_7b_alpha,0.8058,winogrande,BLZ_240312,[],reasoning
434,llama_2_70b_chat,0.8051,winogrande,BLZ_240312,[],reasoning
436,openhermes_2.5_mistral_7b,0.7806000000000001,winogrande,BLZ_240312,[],reasoning
437,openchat_3.5,0.8058,winogrande,BLZ_240312,[],reasoning
440,solar_10.7b_instruct_v1.0,0.8358,winogrande,BLZ_240312,[],reasoning
441,dolphin_2.2.1_mistral_7b,0.7814,winogrande,BLZ_240312,[],reasoning
442,wizardlm_13b_v1.2,0.7190000000000001,winogrande,BLZ_240312,[],reasoning
443,zephyr_7b_beta,0.7774,winogrande,BLZ_240312,[],reasoning
444,mpt_30b_chat,0.753,winogrande,BLZ_240312,[],reasoning
445,codellama_34b_instruct,0.7459,winogrande,BLZ_240312,[],reasoning
446,llama_2_13b_chat,0.7451000000000001,winogrande,BLZ_240312,[],reasoning
447,vicuna_13b,0.7465999999999999,winogrande,BLZ_240312,[],reasoning
449,zephyr_7b_alpha,0.7861,winogrande,BLZ_240312,[],reasoning
451,falcon_180b_chat,0.8690000000000001,winogrande,BLZ_240312,[],reasoning
453,llama_2_7b_chat,0.7173999999999999,winogrande,BLZ_240312,[],reasoning
455,mistral_7b_instruct_v0.1,0.7372,winogrande,BLZ_240312,[],reasoning
457,vicuna_7b,0.7214,winogrande,BLZ_240312,[],reasoning
471,yi_34bx2_moe_60b,0.8484999999999999,winogrande,BLZ_240312,[],reasoning
0,gpt_4_0613,0.957,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
1,llama_3_70b,0.902,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
2,mixtral_8x22b,0.855,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
3,palmyra_x_v3_72b,0.826,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
4,gpt_4_turbo_1106_preview,0.821,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
5,palm_2_unicorn,0.781,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
6,claude_3_opus_20240229,0.762,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
7,qwen1.5_72b,0.757,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
8,palmyra_x_v2_33b,0.736,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
9,yi_34b,0.723,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
10,qwen1.5_32b,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
11,claude_v1.3,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
12,mixtral_8x7b_32k_seqlen,0.679,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
13,palm_2_bison,0.655,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
14,claude_2.0,0.651,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
15,deepseek_llm_chat_67b,0.645,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
16,llama_2_70b,0.609,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
17,claude_2.1,0.594,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
18,gpt_3.5_text_davinci_003,0.577,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
19,qwen1.5_14b,0.574,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
20,claude_instant_1.2,0.551,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
21,llama_3_8b,0.519,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
22,gpt_3.5_turbo_0613,0.502,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
23,gemma_7b,0.47,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
24,claude_3_sonnet_20240229,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
25,gpt_3.5_text_davinci_002,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
26,llama_65b,0.466,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
27,mistral_large_2402,0.46,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
28,cohere_command,0.421,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
29,dbrx_instructruct,0.419,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
30,mistral_v0.1_7b,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
31,mistral_small_2402,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
32,mistral_medium_2312,0.383,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
33,qwen1.5_7b,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
34,claude_3_haiku_20240307,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
35,yi_6b,0.351,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
36,llama_2_13b,0.332,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
37,jurassic_2_jumbo_178b,0.317,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
38,falcon_40b,0.306,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
39,phi_2,0.26,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
40,jurassic_2_grande_17b,0.253,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
41,llama_2_7b,0.234,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
42,luminous_supreme_70b,0.213,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
43,cohere_command_light,0.166,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
44,luminous_extended_30b,0.119,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
45,falcon_7b,0.1,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
46,olmo_7b,0.083,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
47,luminous_base_13b,0.072,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']",holistic
48,gpt_4_0613,0.768,narrativeqa,helm_lite_240610,[],knowledge
49,llama_3_70b,0.798,narrativeqa,helm_lite_240610,[],knowledge
50,mixtral_8x22b,0.779,narrativeqa,helm_lite_240610,[],knowledge
51,palmyra_x_v3_72b,0.706,narrativeqa,helm_lite_240610,[],knowledge
52,gpt_4_turbo_1106_preview,0.727,narrativeqa,helm_lite_240610,[],knowledge
53,palm_2_unicorn,0.583,narrativeqa,helm_lite_240610,[],knowledge
54,claude_3_opus_20240229,0.351,narrativeqa,helm_lite_240610,[],knowledge
55,qwen1.5_72b,0.601,narrativeqa,helm_lite_240610,[],knowledge
56,palmyra_x_v2_33b,0.752,narrativeqa,helm_lite_240610,[],knowledge
57,yi_34b,0.782,narrativeqa,helm_lite_240610,[],knowledge
58,qwen1.5_32b,0.589,narrativeqa,helm_lite_240610,[],knowledge
59,claude_v1.3,0.723,narrativeqa,helm_lite_240610,[],knowledge
60,mixtral_8x7b_32k_seqlen,0.767,narrativeqa,helm_lite_240610,[],knowledge
61,palm_2_bison,0.718,narrativeqa,helm_lite_240610,[],knowledge
62,claude_2.0,0.718,narrativeqa,helm_lite_240610,[],knowledge
63,deepseek_llm_chat_67b,0.581,narrativeqa,helm_lite_240610,[],knowledge
64,llama_2_70b,0.763,narrativeqa,helm_lite_240610,[],knowledge
65,claude_2.1,0.677,narrativeqa,helm_lite_240610,[],knowledge
66,gpt_3.5_text_davinci_003,0.731,narrativeqa,helm_lite_240610,[],knowledge
67,qwen1.5_14b,0.711,narrativeqa,helm_lite_240610,[],knowledge
68,claude_instant_1.2,0.616,narrativeqa,helm_lite_240610,[],knowledge
69,llama_3_8b,0.754,narrativeqa,helm_lite_240610,[],knowledge
70,gpt_3.5_turbo_0613,0.655,narrativeqa,helm_lite_240610,[],knowledge
71,gemma_7b,0.752,narrativeqa,helm_lite_240610,[],knowledge
72,claude_3_sonnet_20240229,0.111,narrativeqa,helm_lite_240610,[],knowledge
73,gpt_3.5_text_davinci_002,0.719,narrativeqa,helm_lite_240610,[],knowledge
74,llama_65b,0.755,narrativeqa,helm_lite_240610,[],knowledge
75,mistral_large_2402,0.454,narrativeqa,helm_lite_240610,[],knowledge
76,cohere_command,0.749,narrativeqa,helm_lite_240610,[],knowledge
77,dbrx_instructruct,0.488,narrativeqa,helm_lite_240610,[],knowledge
78,mistral_v0.1_7b,0.716,narrativeqa,helm_lite_240610,[],knowledge
79,mistral_small_2402,0.519,narrativeqa,helm_lite_240610,[],knowledge
80,mistral_medium_2312,0.449,narrativeqa,helm_lite_240610,[],knowledge
81,qwen1.5_7b,0.448,narrativeqa,helm_lite_240610,[],knowledge
82,claude_3_haiku_20240307,0.244,narrativeqa,helm_lite_240610,[],knowledge
83,yi_6b,0.702,narrativeqa,helm_lite_240610,[],knowledge
84,llama_2_13b,0.741,narrativeqa,helm_lite_240610,[],knowledge
85,jurassic_2_jumbo_178b,0.728,narrativeqa,helm_lite_240610,[],knowledge
86,falcon_40b,0.671,narrativeqa,helm_lite_240610,[],knowledge
87,phi_2,0.703,narrativeqa,helm_lite_240610,[],knowledge
88,jurassic_2_grande_17b,0.744,narrativeqa,helm_lite_240610,[],knowledge
89,llama_2_7b,0.686,narrativeqa,helm_lite_240610,[],knowledge
90,luminous_supreme_70b,0.743,narrativeqa,helm_lite_240610,[],knowledge
91,cohere_command_light,0.629,narrativeqa,helm_lite_240610,[],knowledge
92,luminous_extended_30b,0.684,narrativeqa,helm_lite_240610,[],knowledge
93,falcon_7b,0.621,narrativeqa,helm_lite_240610,[],knowledge
94,olmo_7b,0.597,narrativeqa,helm_lite_240610,[],knowledge
95,luminous_base_13b,0.633,narrativeqa,helm_lite_240610,[],knowledge
96,gpt_4_0613,0.79,naturalquestions_open,helm_lite_240610,[],knowledge
97,llama_3_70b,0.743,naturalquestions_open,helm_lite_240610,[],knowledge
98,mixtral_8x22b,0.726,naturalquestions_open,helm_lite_240610,[],knowledge
99,palmyra_x_v3_72b,0.685,naturalquestions_open,helm_lite_240610,[],knowledge
100,gpt_4_turbo_1106_preview,0.763,naturalquestions_open,helm_lite_240610,[],knowledge
101,palm_2_unicorn,0.674,naturalquestions_open,helm_lite_240610,[],knowledge
102,claude_3_opus_20240229,0.264,naturalquestions_open,helm_lite_240610,[],knowledge
103,qwen1.5_72b,0.758,naturalquestions_open,helm_lite_240610,[],knowledge
104,palmyra_x_v2_33b,0.752,naturalquestions_open,helm_lite_240610,[],knowledge
105,yi_34b,0.775,naturalquestions_open,helm_lite_240610,[],knowledge
106,qwen1.5_32b,0.777,naturalquestions_open,helm_lite_240610,[],knowledge
107,claude_v1.3,0.699,naturalquestions_open,helm_lite_240610,[],knowledge
108,mixtral_8x7b_32k_seqlen,0.699,naturalquestions_open,helm_lite_240610,[],knowledge
109,palm_2_bison,0.813,naturalquestions_open,helm_lite_240610,[],knowledge
110,claude_2.0,0.67,naturalquestions_open,helm_lite_240610,[],knowledge
111,deepseek_llm_chat_67b,0.733,naturalquestions_open,helm_lite_240610,[],knowledge
112,llama_2_70b,0.674,naturalquestions_open,helm_lite_240610,[],knowledge
113,claude_2.1,0.611,naturalquestions_open,helm_lite_240610,[],knowledge
114,gpt_3.5_text_davinci_003,0.77,naturalquestions_open,helm_lite_240610,[],knowledge
115,qwen1.5_14b,0.772,naturalquestions_open,helm_lite_240610,[],knowledge
116,claude_instant_1.2,0.731,naturalquestions_open,helm_lite_240610,[],knowledge
117,llama_3_8b,0.681,naturalquestions_open,helm_lite_240610,[],knowledge
118,gpt_3.5_turbo_0613,0.678,naturalquestions_open,helm_lite_240610,[],knowledge
119,gemma_7b,0.665,naturalquestions_open,helm_lite_240610,[],knowledge
120,claude_3_sonnet_20240229,0.072,naturalquestions_open,helm_lite_240610,[],knowledge
121,gpt_3.5_text_davinci_002,0.71,naturalquestions_open,helm_lite_240610,[],knowledge
122,llama_65b,0.672,naturalquestions_open,helm_lite_240610,[],knowledge
123,mistral_large_2402,0.485,naturalquestions_open,helm_lite_240610,[],knowledge
124,cohere_command,0.777,naturalquestions_open,helm_lite_240610,[],knowledge
125,dbrx_instructruct,0.55,naturalquestions_open,helm_lite_240610,[],knowledge
126,mistral_v0.1_7b,0.687,naturalquestions_open,helm_lite_240610,[],knowledge
127,mistral_small_2402,0.587,naturalquestions_open,helm_lite_240610,[],knowledge
128,mistral_medium_2312,0.468,naturalquestions_open,helm_lite_240610,[],knowledge
129,qwen1.5_7b,0.749,naturalquestions_open,helm_lite_240610,[],knowledge
130,claude_3_haiku_20240307,0.252,naturalquestions_open,helm_lite_240610,[],knowledge
131,yi_6b,0.748,naturalquestions_open,helm_lite_240610,[],knowledge
132,llama_2_13b,0.64,naturalquestions_open,helm_lite_240610,[],knowledge
133,jurassic_2_jumbo_178b,0.65,naturalquestions_open,helm_lite_240610,[],knowledge
134,falcon_40b,0.676,naturalquestions_open,helm_lite_240610,[],knowledge
135,phi_2,0.68,naturalquestions_open,helm_lite_240610,[],knowledge
136,jurassic_2_grande_17b,0.627,naturalquestions_open,helm_lite_240610,[],knowledge
137,llama_2_7b,0.612,naturalquestions_open,helm_lite_240610,[],knowledge
138,luminous_supreme_70b,0.656,naturalquestions_open,helm_lite_240610,[],knowledge
139,cohere_command_light,0.686,naturalquestions_open,helm_lite_240610,[],knowledge
140,luminous_extended_30b,0.611,naturalquestions_open,helm_lite_240610,[],knowledge
141,falcon_7b,0.58,naturalquestions_open,helm_lite_240610,[],knowledge
142,olmo_7b,0.603,naturalquestions_open,helm_lite_240610,[],knowledge
143,luminous_base_13b,0.577,naturalquestions_open,helm_lite_240610,[],knowledge
144,gpt_4_0613,0.457,naturalquestions_closed,helm_lite_240610,[],knowledge
145,llama_3_70b,0.475,naturalquestions_closed,helm_lite_240610,[],knowledge
146,mixtral_8x22b,0.478,naturalquestions_closed,helm_lite_240610,[],knowledge
147,palmyra_x_v3_72b,0.407,naturalquestions_closed,helm_lite_240610,[],knowledge
148,gpt_4_turbo_1106_preview,0.435,naturalquestions_closed,helm_lite_240610,[],knowledge
149,palm_2_unicorn,0.435,naturalquestions_closed,helm_lite_240610,[],knowledge
150,claude_3_opus_20240229,0.441,naturalquestions_closed,helm_lite_240610,[],knowledge
151,qwen1.5_72b,0.417,naturalquestions_closed,helm_lite_240610,[],knowledge
152,palmyra_x_v2_33b,0.428,naturalquestions_closed,helm_lite_240610,[],knowledge
153,yi_34b,0.443,naturalquestions_closed,helm_lite_240610,[],knowledge
154,qwen1.5_32b,0.353,naturalquestions_closed,helm_lite_240610,[],knowledge
155,claude_v1.3,0.409,naturalquestions_closed,helm_lite_240610,[],knowledge
156,mixtral_8x7b_32k_seqlen,0.427,naturalquestions_closed,helm_lite_240610,[],knowledge
157,palm_2_bison,0.39,naturalquestions_closed,helm_lite_240610,[],knowledge
158,claude_2.0,0.428,naturalquestions_closed,helm_lite_240610,[],knowledge
159,deepseek_llm_chat_67b,0.412,naturalquestions_closed,helm_lite_240610,[],knowledge
160,llama_2_70b,0.46,naturalquestions_closed,helm_lite_240610,[],knowledge
161,claude_2.1,0.375,naturalquestions_closed,helm_lite_240610,[],knowledge
162,gpt_3.5_text_davinci_003,0.413,naturalquestions_closed,helm_lite_240610,[],knowledge
163,qwen1.5_14b,0.3,naturalquestions_closed,helm_lite_240610,[],knowledge
164,claude_instant_1.2,0.343,naturalquestions_closed,helm_lite_240610,[],knowledge
165,llama_3_8b,0.378,naturalquestions_closed,helm_lite_240610,[],knowledge
166,gpt_3.5_turbo_0613,0.335,naturalquestions_closed,helm_lite_240610,[],knowledge
167,gemma_7b,0.336,naturalquestions_closed,helm_lite_240610,[],knowledge
168,claude_3_sonnet_20240229,0.028,naturalquestions_closed,helm_lite_240610,[],knowledge
169,gpt_3.5_text_davinci_002,0.394,naturalquestions_closed,helm_lite_240610,[],knowledge
170,llama_65b,0.433,naturalquestions_closed,helm_lite_240610,[],knowledge
171,mistral_large_2402,0.311,naturalquestions_closed,helm_lite_240610,[],knowledge
172,cohere_command,0.391,naturalquestions_closed,helm_lite_240610,[],knowledge
173,dbrx_instructruct,0.284,naturalquestions_closed,helm_lite_240610,[],knowledge
174,mistral_v0.1_7b,0.367,naturalquestions_closed,helm_lite_240610,[],knowledge
175,mistral_small_2402,0.304,naturalquestions_closed,helm_lite_240610,[],knowledge
176,mistral_medium_2312,0.29,naturalquestions_closed,helm_lite_240610,[],knowledge
177,qwen1.5_7b,0.27,naturalquestions_closed,helm_lite_240610,[],knowledge
178,claude_3_haiku_20240307,0.144,naturalquestions_closed,helm_lite_240610,[],knowledge
179,yi_6b,0.31,naturalquestions_closed,helm_lite_240610,[],knowledge
180,llama_2_13b,0.371,naturalquestions_closed,helm_lite_240610,[],knowledge
181,jurassic_2_jumbo_178b,0.385,naturalquestions_closed,helm_lite_240610,[],knowledge
182,falcon_40b,0.392,naturalquestions_closed,helm_lite_240610,[],knowledge
183,phi_2,0.155,naturalquestions_closed,helm_lite_240610,[],knowledge
184,jurassic_2_grande_17b,0.35,naturalquestions_closed,helm_lite_240610,[],knowledge
185,llama_2_7b,0.333,naturalquestions_closed,helm_lite_240610,[],knowledge
186,luminous_supreme_70b,0.299,naturalquestions_closed,helm_lite_240610,[],knowledge
187,cohere_command_light,0.195,naturalquestions_closed,helm_lite_240610,[],knowledge
188,luminous_extended_30b,0.253,naturalquestions_closed,helm_lite_240610,[],knowledge
189,falcon_7b,0.285,naturalquestions_closed,helm_lite_240610,[],knowledge
190,olmo_7b,0.259,naturalquestions_closed,helm_lite_240610,[],knowledge
191,luminous_base_13b,0.197,naturalquestions_closed,helm_lite_240610,[],knowledge
192,gpt_4_0613,0.96,openbookqa,helm_lite_240610,[],knowledge
193,llama_3_70b,0.934,openbookqa,helm_lite_240610,[],knowledge
194,mixtral_8x22b,0.882,openbookqa,helm_lite_240610,[],knowledge
195,palmyra_x_v3_72b,0.938,openbookqa,helm_lite_240610,[],knowledge
196,gpt_4_turbo_1106_preview,0.95,openbookqa,helm_lite_240610,[],knowledge
197,palm_2_unicorn,0.938,openbookqa,helm_lite_240610,[],knowledge
198,claude_3_opus_20240229,0.956,openbookqa,helm_lite_240610,[],knowledge
199,qwen1.5_72b,0.93,openbookqa,helm_lite_240610,[],knowledge
200,palmyra_x_v2_33b,0.878,openbookqa,helm_lite_240610,[],knowledge
201,yi_34b,0.92,openbookqa,helm_lite_240610,[],knowledge
202,qwen1.5_32b,0.932,openbookqa,helm_lite_240610,[],knowledge
203,claude_v1.3,0.908,openbookqa,helm_lite_240610,[],knowledge
204,mixtral_8x7b_32k_seqlen,0.868,openbookqa,helm_lite_240610,[],knowledge
205,palm_2_bison,0.878,openbookqa,helm_lite_240610,[],knowledge
206,claude_2.0,0.862,openbookqa,helm_lite_240610,[],knowledge
207,deepseek_llm_chat_67b,0.88,openbookqa,helm_lite_240610,[],knowledge
208,llama_2_70b,0.838,openbookqa,helm_lite_240610,[],knowledge
209,claude_2.1,0.872,openbookqa,helm_lite_240610,[],knowledge
210,gpt_3.5_text_davinci_003,0.828,openbookqa,helm_lite_240610,[],knowledge
211,qwen1.5_14b,0.862,openbookqa,helm_lite_240610,[],knowledge
212,claude_instant_1.2,0.844,openbookqa,helm_lite_240610,[],knowledge
213,llama_3_8b,0.766,openbookqa,helm_lite_240610,[],knowledge
214,gpt_3.5_turbo_0613,0.838,openbookqa,helm_lite_240610,[],knowledge
215,gemma_7b,0.808,openbookqa,helm_lite_240610,[],knowledge
216,claude_3_sonnet_20240229,0.918,openbookqa,helm_lite_240610,[],knowledge
217,gpt_3.5_text_davinci_002,0.796,openbookqa,helm_lite_240610,[],knowledge
218,llama_65b,0.754,openbookqa,helm_lite_240610,[],knowledge
219,mistral_large_2402,0.894,openbookqa,helm_lite_240610,[],knowledge
220,cohere_command,0.774,openbookqa,helm_lite_240610,[],knowledge
221,dbrx_instructruct,0.91,openbookqa,helm_lite_240610,[],knowledge
222,mistral_v0.1_7b,0.776,openbookqa,helm_lite_240610,[],knowledge
223,mistral_small_2402,0.862,openbookqa,helm_lite_240610,[],knowledge
224,mistral_medium_2312,0.83,openbookqa,helm_lite_240610,[],knowledge
225,qwen1.5_7b,0.806,openbookqa,helm_lite_240610,[],knowledge
226,claude_3_haiku_20240307,0.838,openbookqa,helm_lite_240610,[],knowledge
227,yi_6b,0.8,openbookqa,helm_lite_240610,[],knowledge
228,llama_2_13b,0.634,openbookqa,helm_lite_240610,[],knowledge
229,jurassic_2_jumbo_178b,0.688,openbookqa,helm_lite_240610,[],knowledge
230,falcon_40b,0.662,openbookqa,helm_lite_240610,[],knowledge
231,phi_2,0.798,openbookqa,helm_lite_240610,[],knowledge
232,jurassic_2_grande_17b,0.614,openbookqa,helm_lite_240610,[],knowledge
233,llama_2_7b,0.544,openbookqa,helm_lite_240610,[],knowledge
234,luminous_supreme_70b,0.284,openbookqa,helm_lite_240610,[],knowledge
235,cohere_command_light,0.398,openbookqa,helm_lite_240610,[],knowledge
236,luminous_extended_30b,0.272,openbookqa,helm_lite_240610,[],knowledge
237,falcon_7b,0.26,openbookqa,helm_lite_240610,[],knowledge
238,olmo_7b,0.222,openbookqa,helm_lite_240610,[],knowledge
239,luminous_base_13b,0.286,openbookqa,helm_lite_240610,[],knowledge
240,gpt_4_0613,0.735,mmlu,helm_lite_240610,[],knowledge
241,llama_3_70b,0.695,mmlu,helm_lite_240610,[],knowledge
242,mixtral_8x22b,0.701,mmlu,helm_lite_240610,[],knowledge
243,palmyra_x_v3_72b,0.702,mmlu,helm_lite_240610,[],knowledge
244,gpt_4_turbo_1106_preview,0.699,mmlu,helm_lite_240610,[],knowledge
245,palm_2_unicorn,0.702,mmlu,helm_lite_240610,[],knowledge
246,claude_3_opus_20240229,0.768,mmlu,helm_lite_240610,[],knowledge
247,qwen1.5_72b,0.647,mmlu,helm_lite_240610,[],knowledge
248,palmyra_x_v2_33b,0.621,mmlu,helm_lite_240610,[],knowledge
249,yi_34b,0.65,mmlu,helm_lite_240610,[],knowledge
250,qwen1.5_32b,0.628,mmlu,helm_lite_240610,[],knowledge
251,claude_v1.3,0.631,mmlu,helm_lite_240610,[],knowledge
252,mixtral_8x7b_32k_seqlen,0.649,mmlu,helm_lite_240610,[],knowledge
253,palm_2_bison,0.608,mmlu,helm_lite_240610,[],knowledge
254,claude_2.0,0.639,mmlu,helm_lite_240610,[],knowledge
255,deepseek_llm_chat_67b,0.641,mmlu,helm_lite_240610,[],knowledge
256,llama_2_70b,0.58,mmlu,helm_lite_240610,[],knowledge
257,claude_2.1,0.643,mmlu,helm_lite_240610,[],knowledge
258,gpt_3.5_text_davinci_003,0.555,mmlu,helm_lite_240610,[],knowledge
259,qwen1.5_14b,0.626,mmlu,helm_lite_240610,[],knowledge
260,claude_instant_1.2,0.631,mmlu,helm_lite_240610,[],knowledge
261,llama_3_8b,0.602,mmlu,helm_lite_240610,[],knowledge
262,gpt_3.5_turbo_0613,0.614,mmlu,helm_lite_240610,[],knowledge
263,gemma_7b,0.571,mmlu,helm_lite_240610,[],knowledge
264,claude_3_sonnet_20240229,0.652,mmlu,helm_lite_240610,[],knowledge
265,gpt_3.5_text_davinci_002,0.568,mmlu,helm_lite_240610,[],knowledge
266,llama_65b,0.584,mmlu,helm_lite_240610,[],knowledge
267,mistral_large_2402,0.638,mmlu,helm_lite_240610,[],knowledge
268,cohere_command,0.525,mmlu,helm_lite_240610,[],knowledge
269,dbrx_instructruct,0.643,mmlu,helm_lite_240610,[],knowledge
270,mistral_v0.1_7b,0.584,mmlu,helm_lite_240610,[],knowledge
271,mistral_small_2402,0.593,mmlu,helm_lite_240610,[],knowledge
272,mistral_medium_2312,0.618,mmlu,helm_lite_240610,[],knowledge
273,qwen1.5_7b,0.569,mmlu,helm_lite_240610,[],knowledge
274,claude_3_haiku_20240307,0.662,mmlu,helm_lite_240610,[],knowledge
275,yi_6b,0.53,mmlu,helm_lite_240610,[],knowledge
276,llama_2_13b,0.505,mmlu,helm_lite_240610,[],knowledge
277,jurassic_2_jumbo_178b,0.483,mmlu,helm_lite_240610,[],knowledge
278,falcon_40b,0.507,mmlu,helm_lite_240610,[],knowledge
279,phi_2,0.518,mmlu,helm_lite_240610,[],knowledge
280,jurassic_2_grande_17b,0.471,mmlu,helm_lite_240610,[],knowledge
281,llama_2_7b,0.425,mmlu,helm_lite_240610,[],knowledge
282,luminous_supreme_70b,0.316,mmlu,helm_lite_240610,[],knowledge
283,cohere_command_light,0.386,mmlu,helm_lite_240610,[],knowledge
284,luminous_extended_30b,0.248,mmlu,helm_lite_240610,[],knowledge
285,falcon_7b,0.288,mmlu,helm_lite_240610,[],knowledge
286,olmo_7b,0.305,mmlu,helm_lite_240610,[],knowledge
287,luminous_base_13b,0.243,mmlu,helm_lite_240610,[],knowledge
288,gpt_4_0613,0.802,math,helm_lite_240610,[],math
289,llama_3_70b,0.663,math,helm_lite_240610,[],math
290,mixtral_8x22b,0.656,math,helm_lite_240610,[],math
291,palmyra_x_v3_72b,0.723,math,helm_lite_240610,[],math
292,gpt_4_turbo_1106_preview,0.857,math,helm_lite_240610,[],math
293,palm_2_unicorn,0.674,math,helm_lite_240610,[],math
294,claude_3_opus_20240229,0.76,math,helm_lite_240610,[],math
295,qwen1.5_72b,0.683,math,helm_lite_240610,[],math
296,palmyra_x_v2_33b,0.58,math,helm_lite_240610,[],math
297,yi_34b,0.375,math,helm_lite_240610,[],math
298,qwen1.5_32b,0.733,math,helm_lite_240610,[],math
299,claude_v1.3,0.54,math,helm_lite_240610,[],math
300,mixtral_8x7b_32k_seqlen,0.494,math,helm_lite_240610,[],math
301,palm_2_bison,0.421,math,helm_lite_240610,[],math
302,claude_2.0,0.603,math,helm_lite_240610,[],math
303,deepseek_llm_chat_67b,0.615,math,helm_lite_240610,[],math
304,llama_2_70b,0.323,math,helm_lite_240610,[],math
305,claude_2.1,0.632,math,helm_lite_240610,[],math
306,gpt_3.5_text_davinci_003,0.449,math,helm_lite_240610,[],math
307,qwen1.5_14b,0.686,math,helm_lite_240610,[],math
308,claude_instant_1.2,0.499,math,helm_lite_240610,[],math
309,llama_3_8b,0.391,math,helm_lite_240610,[],math
310,gpt_3.5_turbo_0613,0.667,math,helm_lite_240610,[],math
311,gemma_7b,0.5,math,helm_lite_240610,[],math
312,claude_3_sonnet_20240229,0.084,math,helm_lite_240610,[],math
313,gpt_3.5_text_davinci_002,0.428,math,helm_lite_240610,[],math
314,llama_65b,0.257,math,helm_lite_240610,[],math
315,mistral_large_2402,0.75,math,helm_lite_240610,[],math
316,cohere_command,0.236,math,helm_lite_240610,[],math
317,dbrx_instructruct,0.358,math,helm_lite_240610,[],math
318,mistral_v0.1_7b,0.297,math,helm_lite_240610,[],math
319,mistral_small_2402,0.621,math,helm_lite_240610,[],math
320,mistral_medium_2312,0.565,math,helm_lite_240610,[],math
321,qwen1.5_7b,0.561,math,helm_lite_240610,[],math
322,claude_3_haiku_20240307,0.131,math,helm_lite_240610,[],math
323,yi_6b,0.126,math,helm_lite_240610,[],math
324,llama_2_13b,0.102,math,helm_lite_240610,[],math
325,jurassic_2_jumbo_178b,0.103,math,helm_lite_240610,[],math
326,falcon_40b,0.128,math,helm_lite_240610,[],math
327,phi_2,0.255,math,helm_lite_240610,[],math
328,jurassic_2_grande_17b,0.064,math,helm_lite_240610,[],math
329,llama_2_7b,0.097,math,helm_lite_240610,[],math
330,luminous_supreme_70b,0.078,math,helm_lite_240610,[],math
331,cohere_command_light,0.098,math,helm_lite_240610,[],math
332,luminous_extended_30b,0.04,math,helm_lite_240610,[],math
333,falcon_7b,0.044,math,helm_lite_240610,[],math
334,olmo_7b,0.029,math,helm_lite_240610,[],math
335,luminous_base_13b,0.026,math,helm_lite_240610,[],math
336,gpt_4_0613,0.932,gsm8k,helm_lite_240610,[],math
337,llama_3_70b,0.805,gsm8k,helm_lite_240610,[],math
338,mixtral_8x22b,0.8,gsm8k,helm_lite_240610,[],math
339,palmyra_x_v3_72b,0.831,gsm8k,helm_lite_240610,[],math
340,gpt_4_turbo_1106_preview,0.668,gsm8k,helm_lite_240610,[],math
341,palm_2_unicorn,0.831,gsm8k,helm_lite_240610,[],math
342,claude_3_opus_20240229,0.924,gsm8k,helm_lite_240610,[],math
343,qwen1.5_72b,0.799,gsm8k,helm_lite_240610,[],math
344,palmyra_x_v2_33b,0.735,gsm8k,helm_lite_240610,[],math
345,yi_34b,0.648,gsm8k,helm_lite_240610,[],math
346,qwen1.5_32b,0.773,gsm8k,helm_lite_240610,[],math
347,claude_v1.3,0.784,gsm8k,helm_lite_240610,[],math
348,mixtral_8x7b_32k_seqlen,0.622,gsm8k,helm_lite_240610,[],math
349,palm_2_bison,0.61,gsm8k,helm_lite_240610,[],math
350,claude_2.0,0.583,gsm8k,helm_lite_240610,[],math
351,deepseek_llm_chat_67b,0.795,gsm8k,helm_lite_240610,[],math
352,llama_2_70b,0.567,gsm8k,helm_lite_240610,[],math
353,claude_2.1,0.604,gsm8k,helm_lite_240610,[],math
354,gpt_3.5_text_davinci_003,0.615,gsm8k,helm_lite_240610,[],math
355,qwen1.5_14b,0.693,gsm8k,helm_lite_240610,[],math
356,claude_instant_1.2,0.721,gsm8k,helm_lite_240610,[],math
357,llama_3_8b,0.499,gsm8k,helm_lite_240610,[],math
358,gpt_3.5_turbo_0613,0.501,gsm8k,helm_lite_240610,[],math
359,gemma_7b,0.559,gsm8k,helm_lite_240610,[],math
360,claude_3_sonnet_20240229,0.907,gsm8k,helm_lite_240610,[],math
361,gpt_3.5_text_davinci_002,0.479,gsm8k,helm_lite_240610,[],math
362,llama_65b,0.489,gsm8k,helm_lite_240610,[],math
363,mistral_large_2402,0.694,gsm8k,helm_lite_240610,[],math
364,cohere_command,0.452,gsm8k,helm_lite_240610,[],math
365,dbrx_instructruct,0.671,gsm8k,helm_lite_240610,[],math
366,mistral_v0.1_7b,0.377,gsm8k,helm_lite_240610,[],math
367,mistral_small_2402,0.734,gsm8k,helm_lite_240610,[],math
368,mistral_medium_2312,0.706,gsm8k,helm_lite_240610,[],math
369,qwen1.5_7b,0.6,gsm8k,helm_lite_240610,[],math
370,claude_3_haiku_20240307,0.699,gsm8k,helm_lite_240610,[],math
371,yi_6b,0.375,gsm8k,helm_lite_240610,[],math
372,llama_2_13b,0.266,gsm8k,helm_lite_240610,[],math
373,jurassic_2_jumbo_178b,0.239,gsm8k,helm_lite_240610,[],math
374,falcon_40b,0.267,gsm8k,helm_lite_240610,[],math
375,phi_2,0.581,gsm8k,helm_lite_240610,[],math
376,jurassic_2_grande_17b,0.159,gsm8k,helm_lite_240610,[],math
377,llama_2_7b,0.154,gsm8k,helm_lite_240610,[],math
378,luminous_supreme_70b,0.137,gsm8k,helm_lite_240610,[],math
379,cohere_command_light,0.149,gsm8k,helm_lite_240610,[],math
380,luminous_extended_30b,0.075,gsm8k,helm_lite_240610,[],math
381,falcon_7b,0.055,gsm8k,helm_lite_240610,[],math
382,olmo_7b,0.044,gsm8k,helm_lite_240610,[],math
383,luminous_base_13b,0.028,gsm8k,helm_lite_240610,[],math
384,gpt_4_0613,0.713,legalbench,helm_lite_240610,[],knowledge
385,llama_3_70b,0.733,legalbench,helm_lite_240610,[],knowledge
386,mixtral_8x22b,0.708,legalbench,helm_lite_240610,[],knowledge
387,palmyra_x_v3_72b,0.709,legalbench,helm_lite_240610,[],knowledge
388,gpt_4_turbo_1106_preview,0.626,legalbench,helm_lite_240610,[],knowledge
389,palm_2_unicorn,0.677,legalbench,helm_lite_240610,[],knowledge
390,claude_3_opus_20240229,0.662,legalbench,helm_lite_240610,[],knowledge
391,qwen1.5_72b,0.694,legalbench,helm_lite_240610,[],knowledge
392,palmyra_x_v2_33b,0.644,legalbench,helm_lite_240610,[],knowledge
393,yi_34b,0.618,legalbench,helm_lite_240610,[],knowledge
394,qwen1.5_32b,0.636,legalbench,helm_lite_240610,[],knowledge
395,claude_v1.3,0.629,legalbench,helm_lite_240610,[],knowledge
396,mixtral_8x7b_32k_seqlen,0.63,legalbench,helm_lite_240610,[],knowledge
397,palm_2_bison,0.645,legalbench,helm_lite_240610,[],knowledge
398,claude_2.0,0.643,legalbench,helm_lite_240610,[],knowledge
399,deepseek_llm_chat_67b,0.637,legalbench,helm_lite_240610,[],knowledge
400,llama_2_70b,0.673,legalbench,helm_lite_240610,[],knowledge
401,claude_2.1,0.643,legalbench,helm_lite_240610,[],knowledge
402,gpt_3.5_text_davinci_003,0.622,legalbench,helm_lite_240610,[],knowledge
403,qwen1.5_14b,0.593,legalbench,helm_lite_240610,[],knowledge
404,claude_instant_1.2,0.586,legalbench,helm_lite_240610,[],knowledge
405,llama_3_8b,0.637,legalbench,helm_lite_240610,[],knowledge
406,gpt_3.5_turbo_0613,0.528,legalbench,helm_lite_240610,[],knowledge
407,gemma_7b,0.581,legalbench,helm_lite_240610,[],knowledge
408,claude_3_sonnet_20240229,0.49,legalbench,helm_lite_240610,[],knowledge
409,gpt_3.5_text_davinci_002,0.58,legalbench,helm_lite_240610,[],knowledge
410,llama_65b,0.48,legalbench,helm_lite_240610,[],knowledge
411,mistral_large_2402,0.479,legalbench,helm_lite_240610,[],knowledge
412,cohere_command,0.578,legalbench,helm_lite_240610,[],knowledge
413,dbrx_instructruct,0.426,legalbench,helm_lite_240610,[],knowledge
414,mistral_v0.1_7b,0.58,legalbench,helm_lite_240610,[],knowledge
415,mistral_small_2402,0.389,legalbench,helm_lite_240610,[],knowledge
416,mistral_medium_2312,0.452,legalbench,helm_lite_240610,[],knowledge
417,qwen1.5_7b,0.523,legalbench,helm_lite_240610,[],knowledge
418,claude_3_haiku_20240307,0.46,legalbench,helm_lite_240610,[],knowledge
419,yi_6b,0.519,legalbench,helm_lite_240610,[],knowledge
420,llama_2_13b,0.591,legalbench,helm_lite_240610,[],knowledge
421,jurassic_2_jumbo_178b,0.533,legalbench,helm_lite_240610,[],knowledge
422,falcon_40b,0.442,legalbench,helm_lite_240610,[],knowledge
423,phi_2,0.334,legalbench,helm_lite_240610,[],knowledge
424,jurassic_2_grande_17b,0.468,legalbench,helm_lite_240610,[],knowledge
425,llama_2_7b,0.502,legalbench,helm_lite_240610,[],knowledge
426,luminous_supreme_70b,0.452,legalbench,helm_lite_240610,[],knowledge
427,cohere_command_light,0.397,legalbench,helm_lite_240610,[],knowledge
428,luminous_extended_30b,0.421,legalbench,helm_lite_240610,[],knowledge
429,falcon_7b,0.346,legalbench,helm_lite_240610,[],knowledge
430,olmo_7b,0.341,legalbench,helm_lite_240610,[],knowledge
431,luminous_base_13b,0.332,legalbench,helm_lite_240610,[],knowledge
432,gpt_4_0613,0.815,medqa,helm_lite_240610,[],knowledge
433,llama_3_70b,0.777,medqa,helm_lite_240610,[],knowledge
434,mixtral_8x22b,0.704,medqa,helm_lite_240610,[],knowledge
435,palmyra_x_v3_72b,0.684,medqa,helm_lite_240610,[],knowledge
436,gpt_4_turbo_1106_preview,0.817,medqa,helm_lite_240610,[],knowledge
437,palm_2_unicorn,0.684,medqa,helm_lite_240610,[],knowledge
438,claude_3_opus_20240229,0.775,medqa,helm_lite_240610,[],knowledge
439,qwen1.5_72b,0.67,medqa,helm_lite_240610,[],knowledge
440,palmyra_x_v2_33b,0.598,medqa,helm_lite_240610,[],knowledge
441,yi_34b,0.656,medqa,helm_lite_240610,[],knowledge
442,qwen1.5_32b,0.656,medqa,helm_lite_240610,[],knowledge
443,claude_v1.3,0.618,medqa,helm_lite_240610,[],knowledge
444,mixtral_8x7b_32k_seqlen,0.652,medqa,helm_lite_240610,[],knowledge
445,palm_2_bison,0.547,medqa,helm_lite_240610,[],knowledge
446,claude_2.0,0.652,medqa,helm_lite_240610,[],knowledge
447,deepseek_llm_chat_67b,0.628,medqa,helm_lite_240610,[],knowledge
448,llama_2_70b,0.618,medqa,helm_lite_240610,[],knowledge
449,claude_2.1,0.644,medqa,helm_lite_240610,[],knowledge
450,gpt_3.5_text_davinci_003,0.531,medqa,helm_lite_240610,[],knowledge
451,qwen1.5_14b,0.515,medqa,helm_lite_240610,[],knowledge
452,claude_instant_1.2,0.559,medqa,helm_lite_240610,[],knowledge
453,llama_3_8b,0.581,medqa,helm_lite_240610,[],knowledge
454,gpt_3.5_turbo_0613,0.622,medqa,helm_lite_240610,[],knowledge
455,gemma_7b,0.513,medqa,helm_lite_240610,[],knowledge
456,claude_3_sonnet_20240229,0.684,medqa,helm_lite_240610,[],knowledge
457,gpt_3.5_text_davinci_002,0.525,medqa,helm_lite_240610,[],knowledge
458,llama_65b,0.507,medqa,helm_lite_240610,[],knowledge
459,mistral_large_2402,0.499,medqa,helm_lite_240610,[],knowledge
460,cohere_command,0.445,medqa,helm_lite_240610,[],knowledge
461,dbrx_instructruct,0.694,medqa,helm_lite_240610,[],knowledge
462,mistral_v0.1_7b,0.525,medqa,helm_lite_240610,[],knowledge
463,mistral_small_2402,0.616,medqa,helm_lite_240610,[],knowledge
464,mistral_medium_2312,0.61,medqa,helm_lite_240610,[],knowledge
465,qwen1.5_7b,0.479,medqa,helm_lite_240610,[],knowledge
466,claude_3_haiku_20240307,0.702,medqa,helm_lite_240610,[],knowledge
467,yi_6b,0.497,medqa,helm_lite_240610,[],knowledge
468,llama_2_13b,0.392,medqa,helm_lite_240610,[],knowledge
469,jurassic_2_jumbo_178b,0.431,medqa,helm_lite_240610,[],knowledge
470,falcon_40b,0.419,medqa,helm_lite_240610,[],knowledge
471,phi_2,0.41,medqa,helm_lite_240610,[],knowledge
472,jurassic_2_grande_17b,0.39,medqa,helm_lite_240610,[],knowledge
473,llama_2_7b,0.392,medqa,helm_lite_240610,[],knowledge
474,luminous_supreme_70b,0.276,medqa,helm_lite_240610,[],knowledge
475,cohere_command_light,0.312,medqa,helm_lite_240610,[],knowledge
476,luminous_extended_30b,0.276,medqa,helm_lite_240610,[],knowledge
477,falcon_7b,0.254,medqa,helm_lite_240610,[],knowledge
478,olmo_7b,0.229,medqa,helm_lite_240610,[],knowledge
479,luminous_base_13b,0.26,medqa,helm_lite_240610,[],knowledge
480,gpt_4_0613,0.211,wmt_2014,helm_lite_240610,[],mt
481,llama_3_70b,0.225,wmt_2014,helm_lite_240610,[],mt
482,mixtral_8x22b,0.209,wmt_2014,helm_lite_240610,[],mt
483,palmyra_x_v3_72b,0.262,wmt_2014,helm_lite_240610,[],mt
484,gpt_4_turbo_1106_preview,0.205,wmt_2014,helm_lite_240610,[],mt
485,palm_2_unicorn,0.26,wmt_2014,helm_lite_240610,[],mt
486,claude_3_opus_20240229,0.24,wmt_2014,helm_lite_240610,[],mt
487,qwen1.5_72b,0.201,wmt_2014,helm_lite_240610,[],mt
488,palmyra_x_v2_33b,0.239,wmt_2014,helm_lite_240610,[],mt
489,yi_34b,0.172,wmt_2014,helm_lite_240610,[],mt
490,qwen1.5_32b,0.193,wmt_2014,helm_lite_240610,[],mt
491,claude_v1.3,0.219,wmt_2014,helm_lite_240610,[],mt
492,mixtral_8x7b_32k_seqlen,0.19,wmt_2014,helm_lite_240610,[],mt
493,palm_2_bison,0.241,wmt_2014,helm_lite_240610,[],mt
494,claude_2.0,0.219,wmt_2014,helm_lite_240610,[],mt
495,deepseek_llm_chat_67b,0.186,wmt_2014,helm_lite_240610,[],mt
496,llama_2_70b,0.196,wmt_2014,helm_lite_240610,[],mt
497,claude_2.1,0.204,wmt_2014,helm_lite_240610,[],mt
498,gpt_3.5_text_davinci_003,0.191,wmt_2014,helm_lite_240610,[],mt
499,qwen1.5_14b,0.178,wmt_2014,helm_lite_240610,[],mt
500,claude_instant_1.2,0.194,wmt_2014,helm_lite_240610,[],mt
501,llama_3_8b,0.183,wmt_2014,helm_lite_240610,[],mt
502,gpt_3.5_turbo_0613,0.187,wmt_2014,helm_lite_240610,[],mt
503,gemma_7b,0.187,wmt_2014,helm_lite_240610,[],mt
504,claude_3_sonnet_20240229,0.218,wmt_2014,helm_lite_240610,[],mt
505,gpt_3.5_text_davinci_002,0.174,wmt_2014,helm_lite_240610,[],mt
506,llama_65b,0.189,wmt_2014,helm_lite_240610,[],mt
507,mistral_large_2402,0.182,wmt_2014,helm_lite_240610,[],mt
508,cohere_command,0.088,wmt_2014,helm_lite_240610,[],mt
509,dbrx_instructruct,0.131,wmt_2014,helm_lite_240610,[],mt
510,mistral_v0.1_7b,0.16,wmt_2014,helm_lite_240610,[],mt
511,mistral_small_2402,0.169,wmt_2014,helm_lite_240610,[],mt
512,mistral_medium_2312,0.169,wmt_2014,helm_lite_240610,[],mt
513,qwen1.5_7b,0.153,wmt_2014,helm_lite_240610,[],mt
514,claude_3_haiku_20240307,0.148,wmt_2014,helm_lite_240610,[],mt
515,yi_6b,0.117,wmt_2014,helm_lite_240610,[],mt
516,llama_2_13b,0.167,wmt_2014,helm_lite_240610,[],mt
517,jurassic_2_jumbo_178b,0.114,wmt_2014,helm_lite_240610,[],mt
518,falcon_40b,0.162,wmt_2014,helm_lite_240610,[],mt
519,phi_2,0.038,wmt_2014,helm_lite_240610,[],mt
520,jurassic_2_grande_17b,0.102,wmt_2014,helm_lite_240610,[],mt
521,llama_2_7b,0.144,wmt_2014,helm_lite_240610,[],mt
522,luminous_supreme_70b,0.102,wmt_2014,helm_lite_240610,[],mt
523,cohere_command_light,0.023,wmt_2014,helm_lite_240610,[],mt
524,luminous_extended_30b,0.083,wmt_2014,helm_lite_240610,[],mt
525,falcon_7b,0.094,wmt_2014,helm_lite_240610,[],mt
526,olmo_7b,0.097,wmt_2014,helm_lite_240610,[],mt
527,luminous_base_13b,0.066,wmt_2014,helm_lite_240610,[],mt
0,llama_2_70b,0.944,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
1,llama_65b,0.908,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
2,text_davinci_002,0.905,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
3,mistral_v0.1_7b,0.884,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
4,cohere_command_beta_52.4b,0.874,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
5,text_davinci_003,0.872,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
6,jurassic_2_jumbo_178b,0.824,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
7,llama_2_13b,0.823,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
8,tnlg_v2_530b,0.787,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
9,gpt_3.5_turbo_0613,0.783,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
10,llama_30b,0.781,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
11,anthropic_lm_v4_s3_52b,0.78,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
12,gpt_3.5_turbo_0301,0.76,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
13,jurassic_2_grande_17b,0.743,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
14,palmyra_x_43b,0.732,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
15,falcon_40b,0.729,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
16,falcon_instruct_40b,0.727,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
17,mpt_instruct_30b,0.716,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
18,mpt_30b,0.714,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
19,j1_grande_v2_beta_17b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
20,vicuna_v1.3_13b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
21,cohere_command_beta_6.1b,0.675,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
22,cohere_xlarge_v20221108_52.4b,0.664,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
23,luminous_supreme_70b,0.662,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
24,vicuna_v1.3_7b,0.625,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
25,opt_175b,0.609,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
26,llama_2_7b,0.607,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
27,llama_13b,0.595,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
28,instructpalmyra_30b,0.568,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
29,cohere_xlarge_v20220609_52.4b,0.56,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
30,jurassic_2_large_7.5b,0.553,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
31,davinci_175b,0.538,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
32,llama_7b,0.533,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
33,redpajama_incite_instruct_7b,0.524,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
34,j1_jumbo_v1_178b,0.517,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
35,glm_130b,0.512,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
36,luminous_extended_30b,0.485,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
37,opt_66b,0.448,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
38,bloom_176b,0.446,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
39,j1_grande_v1_17b,0.433,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
40,alpaca_7b,0.381,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
41,falcon_7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
42,redpajama_incite_base_7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
43,cohere_large_v20220720_13.1b,0.372,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
44,redpajama_incite_instruct_v1_3b,0.366,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
45,text_curie_001,0.36,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
46,gpt_neox_20b,0.351,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
47,luminous_base_13b,0.315,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
48,cohere_medium_v20221108_6.1b,0.312,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
49,redpajama_incite_base_v1_3b,0.311,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
50,tnlg_v2_6.7b,0.309,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
51,j1_large_v1_7.5b,0.285,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
52,gpt_j_6b,0.273,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
53,pythia_12b,0.257,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
54,curie_6.7b,0.247,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
55,falcon_instruct_7b,0.244,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
56,cohere_medium_v20220720_6.1b,0.23,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
57,text_babbage_001,0.229,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
58,t0pp_11b,0.197,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
59,pythia_6.9b,0.196,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
60,ul2_20b,0.167,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
61,t5_11b,0.131,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
62,babbage_1.3b,0.114,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
63,cohere_small_v20220720_410m,0.109,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
64,ada_350m,0.108,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
65,text_ada_001,0.107,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
66,yalm_100b,0.075,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions_closed', 'naturalquestions_open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms_marco_regular', 'ms_marco_trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']",holistic
67,llama_2_70b,0.582,mmlu,helm_classic_240130,[],knowledge
68,llama_65b,0.584,mmlu,helm_classic_240130,[],knowledge
69,text_davinci_002,0.568,mmlu,helm_classic_240130,[],knowledge
70,mistral_v0.1_7b,0.572,mmlu,helm_classic_240130,[],knowledge
71,cohere_command_beta_52.4b,0.452,mmlu,helm_classic_240130,[],knowledge
72,text_davinci_003,0.569,mmlu,helm_classic_240130,[],knowledge
73,jurassic_2_jumbo_178b,0.48,mmlu,helm_classic_240130,[],knowledge
74,llama_2_13b,0.507,mmlu,helm_classic_240130,[],knowledge
75,tnlg_v2_530b,0.469,mmlu,helm_classic_240130,[],knowledge
76,gpt_3.5_turbo_0613,0.391,mmlu,helm_classic_240130,[],knowledge
77,llama_30b,0.531,mmlu,helm_classic_240130,[],knowledge
78,anthropic_lm_v4_s3_52b,0.481,mmlu,helm_classic_240130,[],knowledge
79,gpt_3.5_turbo_0301,0.59,mmlu,helm_classic_240130,[],knowledge
80,jurassic_2_grande_17b,0.475,mmlu,helm_classic_240130,[],knowledge
81,palmyra_x_43b,0.609,mmlu,helm_classic_240130,[],knowledge
82,falcon_40b,0.509,mmlu,helm_classic_240130,[],knowledge
83,falcon_instruct_40b,0.497,mmlu,helm_classic_240130,[],knowledge
84,mpt_instruct_30b,0.444,mmlu,helm_classic_240130,[],knowledge
85,mpt_30b,0.437,mmlu,helm_classic_240130,[],knowledge
86,j1_grande_v2_beta_17b,0.445,mmlu,helm_classic_240130,[],knowledge
87,vicuna_v1.3_13b,0.462,mmlu,helm_classic_240130,[],knowledge
88,cohere_command_beta_6.1b,0.406,mmlu,helm_classic_240130,[],knowledge
89,cohere_xlarge_v20221108_52.4b,0.382,mmlu,helm_classic_240130,[],knowledge
90,luminous_supreme_70b,0.38,mmlu,helm_classic_240130,[],knowledge
91,vicuna_v1.3_7b,0.434,mmlu,helm_classic_240130,[],knowledge
92,opt_175b,0.318,mmlu,helm_classic_240130,[],knowledge
93,llama_2_7b,0.431,mmlu,helm_classic_240130,[],knowledge
94,llama_13b,0.422,mmlu,helm_classic_240130,[],knowledge
95,instructpalmyra_30b,0.403,mmlu,helm_classic_240130,[],knowledge
96,cohere_xlarge_v20220609_52.4b,0.353,mmlu,helm_classic_240130,[],knowledge
97,jurassic_2_large_7.5b,0.339,mmlu,helm_classic_240130,[],knowledge
98,davinci_175b,0.422,mmlu,helm_classic_240130,[],knowledge
99,llama_7b,0.321,mmlu,helm_classic_240130,[],knowledge
100,redpajama_incite_instruct_7b,0.363,mmlu,helm_classic_240130,[],knowledge
101,j1_jumbo_v1_178b,0.259,mmlu,helm_classic_240130,[],knowledge
102,glm_130b,0.344,mmlu,helm_classic_240130,[],knowledge
103,luminous_extended_30b,0.321,mmlu,helm_classic_240130,[],knowledge
104,opt_66b,0.276,mmlu,helm_classic_240130,[],knowledge
105,bloom_176b,0.299,mmlu,helm_classic_240130,[],knowledge
106,j1_grande_v1_17b,0.27,mmlu,helm_classic_240130,[],knowledge
107,alpaca_7b,0.385,mmlu,helm_classic_240130,[],knowledge
108,falcon_7b,0.286,mmlu,helm_classic_240130,[],knowledge
109,redpajama_incite_base_7b,0.302,mmlu,helm_classic_240130,[],knowledge
110,cohere_large_v20220720_13.1b,0.324,mmlu,helm_classic_240130,[],knowledge
111,redpajama_incite_instruct_v1_3b,0.257,mmlu,helm_classic_240130,[],knowledge
112,text_curie_001,0.237,mmlu,helm_classic_240130,[],knowledge
113,gpt_neox_20b,0.276,mmlu,helm_classic_240130,[],knowledge
114,luminous_base_13b,0.27,mmlu,helm_classic_240130,[],knowledge
115,cohere_medium_v20221108_6.1b,0.254,mmlu,helm_classic_240130,[],knowledge
116,redpajama_incite_base_v1_3b,0.263,mmlu,helm_classic_240130,[],knowledge
117,tnlg_v2_6.7b,0.242,mmlu,helm_classic_240130,[],knowledge
118,j1_large_v1_7.5b,0.241,mmlu,helm_classic_240130,[],knowledge
119,gpt_j_6b,0.249,mmlu,helm_classic_240130,[],knowledge
120,pythia_12b,0.274,mmlu,helm_classic_240130,[],knowledge
121,curie_6.7b,0.243,mmlu,helm_classic_240130,[],knowledge
122,falcon_instruct_7b,0.275,mmlu,helm_classic_240130,[],knowledge
123,cohere_medium_v20220720_6.1b,0.279,mmlu,helm_classic_240130,[],knowledge
124,text_babbage_001,0.229,mmlu,helm_classic_240130,[],knowledge
125,t0pp_11b,0.407,mmlu,helm_classic_240130,[],knowledge
126,pythia_6.9b,0.236,mmlu,helm_classic_240130,[],knowledge
127,ul2_20b,0.291,mmlu,helm_classic_240130,[],knowledge
128,t5_11b,0.29,mmlu,helm_classic_240130,[],knowledge
129,babbage_1.3b,0.235,mmlu,helm_classic_240130,[],knowledge
130,cohere_small_v20220720_410m,0.264,mmlu,helm_classic_240130,[],knowledge
131,ada_350m,0.243,mmlu,helm_classic_240130,[],knowledge
132,text_ada_001,0.238,mmlu,helm_classic_240130,[],knowledge
133,yalm_100b,0.243,mmlu,helm_classic_240130,[],knowledge
134,llama_2_70b,0.886,boolq,helm_classic_240130,[],knowledge
135,llama_65b,0.871,boolq,helm_classic_240130,[],knowledge
136,text_davinci_002,0.877,boolq,helm_classic_240130,[],knowledge
137,mistral_v0.1_7b,0.874,boolq,helm_classic_240130,[],knowledge
138,cohere_command_beta_52.4b,0.856,boolq,helm_classic_240130,[],knowledge
139,text_davinci_003,0.881,boolq,helm_classic_240130,[],knowledge
140,jurassic_2_jumbo_178b,0.829,boolq,helm_classic_240130,[],knowledge
141,llama_2_13b,0.811,boolq,helm_classic_240130,[],knowledge
142,tnlg_v2_530b,0.809,boolq,helm_classic_240130,[],knowledge
143,gpt_3.5_turbo_0613,0.87,boolq,helm_classic_240130,[],knowledge
144,llama_30b,0.861,boolq,helm_classic_240130,[],knowledge
145,anthropic_lm_v4_s3_52b,0.815,boolq,helm_classic_240130,[],knowledge
146,gpt_3.5_turbo_0301,0.74,boolq,helm_classic_240130,[],knowledge
147,jurassic_2_grande_17b,0.826,boolq,helm_classic_240130,[],knowledge
148,palmyra_x_43b,0.896,boolq,helm_classic_240130,[],knowledge
149,falcon_40b,0.819,boolq,helm_classic_240130,[],knowledge
150,falcon_instruct_40b,0.829,boolq,helm_classic_240130,[],knowledge
151,mpt_instruct_30b,0.85,boolq,helm_classic_240130,[],knowledge
152,mpt_30b,0.704,boolq,helm_classic_240130,[],knowledge
153,j1_grande_v2_beta_17b,0.812,boolq,helm_classic_240130,[],knowledge
154,vicuna_v1.3_13b,0.808,boolq,helm_classic_240130,[],knowledge
155,cohere_command_beta_6.1b,0.798,boolq,helm_classic_240130,[],knowledge
156,cohere_xlarge_v20221108_52.4b,0.762,boolq,helm_classic_240130,[],knowledge
157,luminous_supreme_70b,0.775,boolq,helm_classic_240130,[],knowledge
158,vicuna_v1.3_7b,0.76,boolq,helm_classic_240130,[],knowledge
159,opt_175b,0.793,boolq,helm_classic_240130,[],knowledge
160,llama_2_7b,0.762,boolq,helm_classic_240130,[],knowledge
161,llama_13b,0.714,boolq,helm_classic_240130,[],knowledge
162,instructpalmyra_30b,0.751,boolq,helm_classic_240130,[],knowledge
163,cohere_xlarge_v20220609_52.4b,0.718,boolq,helm_classic_240130,[],knowledge
164,jurassic_2_large_7.5b,0.742,boolq,helm_classic_240130,[],knowledge
165,davinci_175b,0.722,boolq,helm_classic_240130,[],knowledge
166,llama_7b,0.756,boolq,helm_classic_240130,[],knowledge
167,redpajama_incite_instruct_7b,0.705,boolq,helm_classic_240130,[],knowledge
168,j1_jumbo_v1_178b,0.776,boolq,helm_classic_240130,[],knowledge
169,glm_130b,0.784,boolq,helm_classic_240130,[],knowledge
170,luminous_extended_30b,0.767,boolq,helm_classic_240130,[],knowledge
171,opt_66b,0.76,boolq,helm_classic_240130,[],knowledge
172,bloom_176b,0.704,boolq,helm_classic_240130,[],knowledge
173,j1_grande_v1_17b,0.722,boolq,helm_classic_240130,[],knowledge
174,alpaca_7b,0.778,boolq,helm_classic_240130,[],knowledge
175,falcon_7b,0.753,boolq,helm_classic_240130,[],knowledge
176,redpajama_incite_base_7b,0.713,boolq,helm_classic_240130,[],knowledge
177,cohere_large_v20220720_13.1b,0.725,boolq,helm_classic_240130,[],knowledge
178,redpajama_incite_instruct_v1_3b,0.677,boolq,helm_classic_240130,[],knowledge
179,text_curie_001,0.62,boolq,helm_classic_240130,[],knowledge
180,gpt_neox_20b,0.683,boolq,helm_classic_240130,[],knowledge
181,luminous_base_13b,0.719,boolq,helm_classic_240130,[],knowledge
182,cohere_medium_v20221108_6.1b,0.7,boolq,helm_classic_240130,[],knowledge
183,redpajama_incite_base_v1_3b,0.685,boolq,helm_classic_240130,[],knowledge
184,tnlg_v2_6.7b,0.698,boolq,helm_classic_240130,[],knowledge
185,j1_large_v1_7.5b,0.683,boolq,helm_classic_240130,[],knowledge
186,gpt_j_6b,0.649,boolq,helm_classic_240130,[],knowledge
187,pythia_12b,0.662,boolq,helm_classic_240130,[],knowledge
188,curie_6.7b,0.656,boolq,helm_classic_240130,[],knowledge
189,falcon_instruct_7b,0.72,boolq,helm_classic_240130,[],knowledge
190,cohere_medium_v20220720_6.1b,0.659,boolq,helm_classic_240130,[],knowledge
191,text_babbage_001,0.451,boolq,helm_classic_240130,[],knowledge
192,t0pp_11b,0.0,boolq,helm_classic_240130,[],knowledge
193,pythia_6.9b,0.631,boolq,helm_classic_240130,[],knowledge
194,ul2_20b,0.746,boolq,helm_classic_240130,[],knowledge
195,t5_11b,0.761,boolq,helm_classic_240130,[],knowledge
196,babbage_1.3b,0.574,boolq,helm_classic_240130,[],knowledge
197,cohere_small_v20220720_410m,0.457,boolq,helm_classic_240130,[],knowledge
198,ada_350m,0.581,boolq,helm_classic_240130,[],knowledge
199,text_ada_001,0.464,boolq,helm_classic_240130,[],knowledge
200,yalm_100b,0.634,boolq,helm_classic_240130,[],knowledge
201,llama_2_70b,0.77,narrativeqa,helm_classic_240130,[],knowledge
202,llama_65b,0.755,narrativeqa,helm_classic_240130,[],knowledge
203,text_davinci_002,0.727,narrativeqa,helm_classic_240130,[],knowledge
204,mistral_v0.1_7b,0.716,narrativeqa,helm_classic_240130,[],knowledge
205,cohere_command_beta_52.4b,0.752,narrativeqa,helm_classic_240130,[],knowledge
206,text_davinci_003,0.727,narrativeqa,helm_classic_240130,[],knowledge
207,jurassic_2_jumbo_178b,0.733,narrativeqa,helm_classic_240130,[],knowledge
208,llama_2_13b,0.744,narrativeqa,helm_classic_240130,[],knowledge
209,tnlg_v2_530b,0.722,narrativeqa,helm_classic_240130,[],knowledge
210,gpt_3.5_turbo_0613,0.625,narrativeqa,helm_classic_240130,[],knowledge
211,llama_30b,0.752,narrativeqa,helm_classic_240130,[],knowledge
212,anthropic_lm_v4_s3_52b,0.728,narrativeqa,helm_classic_240130,[],knowledge
213,gpt_3.5_turbo_0301,0.663,narrativeqa,helm_classic_240130,[],knowledge
214,jurassic_2_grande_17b,0.737,narrativeqa,helm_classic_240130,[],knowledge
215,palmyra_x_43b,0.742,narrativeqa,helm_classic_240130,[],knowledge
216,falcon_40b,0.673,narrativeqa,helm_classic_240130,[],knowledge
217,falcon_instruct_40b,0.625,narrativeqa,helm_classic_240130,[],knowledge
218,mpt_instruct_30b,0.733,narrativeqa,helm_classic_240130,[],knowledge
219,mpt_30b,0.732,narrativeqa,helm_classic_240130,[],knowledge
220,j1_grande_v2_beta_17b,0.725,narrativeqa,helm_classic_240130,[],knowledge
221,vicuna_v1.3_13b,0.691,narrativeqa,helm_classic_240130,[],knowledge
222,cohere_command_beta_6.1b,0.709,narrativeqa,helm_classic_240130,[],knowledge
223,cohere_xlarge_v20221108_52.4b,0.672,narrativeqa,helm_classic_240130,[],knowledge
224,luminous_supreme_70b,0.711,narrativeqa,helm_classic_240130,[],knowledge
225,vicuna_v1.3_7b,0.643,narrativeqa,helm_classic_240130,[],knowledge
226,opt_175b,0.671,narrativeqa,helm_classic_240130,[],knowledge
227,llama_2_7b,0.691,narrativeqa,helm_classic_240130,[],knowledge
228,llama_13b,0.711,narrativeqa,helm_classic_240130,[],knowledge
229,instructpalmyra_30b,0.496,narrativeqa,helm_classic_240130,[],knowledge
230,cohere_xlarge_v20220609_52.4b,0.65,narrativeqa,helm_classic_240130,[],knowledge
232,davinci_175b,0.687,narrativeqa,helm_classic_240130,[],knowledge
233,llama_7b,0.669,narrativeqa,helm_classic_240130,[],knowledge
234,redpajama_incite_instruct_7b,0.638,narrativeqa,helm_classic_240130,[],knowledge
235,j1_jumbo_v1_178b,0.695,narrativeqa,helm_classic_240130,[],knowledge
236,glm_130b,0.706,narrativeqa,helm_classic_240130,[],knowledge
237,luminous_extended_30b,0.665,narrativeqa,helm_classic_240130,[],knowledge
238,opt_66b,0.638,narrativeqa,helm_classic_240130,[],knowledge
239,bloom_176b,0.662,narrativeqa,helm_classic_240130,[],knowledge
240,j1_grande_v1_17b,0.672,narrativeqa,helm_classic_240130,[],knowledge
241,alpaca_7b,0.396,narrativeqa,helm_classic_240130,[],knowledge
242,falcon_7b,0.621,narrativeqa,helm_classic_240130,[],knowledge
243,redpajama_incite_base_7b,0.617,narrativeqa,helm_classic_240130,[],knowledge
244,cohere_large_v20220720_13.1b,0.625,narrativeqa,helm_classic_240130,[],knowledge
245,redpajama_incite_instruct_v1_3b,0.638,narrativeqa,helm_classic_240130,[],knowledge
246,text_curie_001,0.582,narrativeqa,helm_classic_240130,[],knowledge
247,gpt_neox_20b,0.599,narrativeqa,helm_classic_240130,[],knowledge
248,luminous_base_13b,0.605,narrativeqa,helm_classic_240130,[],knowledge
249,cohere_medium_v20221108_6.1b,0.61,narrativeqa,helm_classic_240130,[],knowledge
250,redpajama_incite_base_v1_3b,0.555,narrativeqa,helm_classic_240130,[],knowledge
251,tnlg_v2_6.7b,0.631,narrativeqa,helm_classic_240130,[],knowledge
252,j1_large_v1_7.5b,0.623,narrativeqa,helm_classic_240130,[],knowledge
253,gpt_j_6b,0.545,narrativeqa,helm_classic_240130,[],knowledge
254,pythia_12b,0.596,narrativeqa,helm_classic_240130,[],knowledge
255,curie_6.7b,0.604,narrativeqa,helm_classic_240130,[],knowledge
256,falcon_instruct_7b,0.476,narrativeqa,helm_classic_240130,[],knowledge
257,cohere_medium_v20220720_6.1b,0.559,narrativeqa,helm_classic_240130,[],knowledge
258,text_babbage_001,0.429,narrativeqa,helm_classic_240130,[],knowledge
259,t0pp_11b,0.151,narrativeqa,helm_classic_240130,[],knowledge
260,pythia_6.9b,0.528,narrativeqa,helm_classic_240130,[],knowledge
261,ul2_20b,0.083,narrativeqa,helm_classic_240130,[],knowledge
262,t5_11b,0.086,narrativeqa,helm_classic_240130,[],knowledge
263,babbage_1.3b,0.491,narrativeqa,helm_classic_240130,[],knowledge
264,cohere_small_v20220720_410m,0.294,narrativeqa,helm_classic_240130,[],knowledge
265,ada_350m,0.326,narrativeqa,helm_classic_240130,[],knowledge
266,text_ada_001,0.238,narrativeqa,helm_classic_240130,[],knowledge
267,yalm_100b,0.252,narrativeqa,helm_classic_240130,[],knowledge
268,llama_2_70b,0.458,naturalquestions_closed,helm_classic_240130,[],knowledge
269,llama_65b,0.431,naturalquestions_closed,helm_classic_240130,[],knowledge
270,text_davinci_002,0.383,naturalquestions_closed,helm_classic_240130,[],knowledge
271,mistral_v0.1_7b,0.365,naturalquestions_closed,helm_classic_240130,[],knowledge
272,cohere_command_beta_52.4b,0.372,naturalquestions_closed,helm_classic_240130,[],knowledge
273,text_davinci_003,0.406,naturalquestions_closed,helm_classic_240130,[],knowledge
274,jurassic_2_jumbo_178b,0.385,naturalquestions_closed,helm_classic_240130,[],knowledge
275,llama_2_13b,0.376,naturalquestions_closed,helm_classic_240130,[],knowledge
276,tnlg_v2_530b,0.384,naturalquestions_closed,helm_classic_240130,[],knowledge
277,gpt_3.5_turbo_0613,0.348,naturalquestions_closed,helm_classic_240130,[],knowledge
278,llama_30b,0.408,naturalquestions_closed,helm_classic_240130,[],knowledge
279,anthropic_lm_v4_s3_52b,0.288,naturalquestions_closed,helm_classic_240130,[],knowledge
280,gpt_3.5_turbo_0301,0.39,naturalquestions_closed,helm_classic_240130,[],knowledge
281,jurassic_2_grande_17b,0.356,naturalquestions_closed,helm_classic_240130,[],knowledge
282,palmyra_x_43b,0.413,naturalquestions_closed,helm_classic_240130,[],knowledge
283,falcon_40b,0.392,naturalquestions_closed,helm_classic_240130,[],knowledge
284,falcon_instruct_40b,0.377,naturalquestions_closed,helm_classic_240130,[],knowledge
285,mpt_instruct_30b,0.304,naturalquestions_closed,helm_classic_240130,[],knowledge
286,mpt_30b,0.347,naturalquestions_closed,helm_classic_240130,[],knowledge
287,j1_grande_v2_beta_17b,0.337,naturalquestions_closed,helm_classic_240130,[],knowledge
288,vicuna_v1.3_13b,0.346,naturalquestions_closed,helm_classic_240130,[],knowledge
289,cohere_command_beta_6.1b,0.229,naturalquestions_closed,helm_classic_240130,[],knowledge
290,cohere_xlarge_v20221108_52.4b,0.361,naturalquestions_closed,helm_classic_240130,[],knowledge
291,luminous_supreme_70b,0.293,naturalquestions_closed,helm_classic_240130,[],knowledge
292,vicuna_v1.3_7b,0.287,naturalquestions_closed,helm_classic_240130,[],knowledge
293,opt_175b,0.297,naturalquestions_closed,helm_classic_240130,[],knowledge
294,llama_2_7b,0.337,naturalquestions_closed,helm_classic_240130,[],knowledge
295,llama_13b,0.346,naturalquestions_closed,helm_classic_240130,[],knowledge
296,instructpalmyra_30b,0.33,naturalquestions_closed,helm_classic_240130,[],knowledge
297,cohere_xlarge_v20220609_52.4b,0.312,naturalquestions_closed,helm_classic_240130,[],knowledge
298,jurassic_2_large_7.5b,0.274,naturalquestions_closed,helm_classic_240130,[],knowledge
299,davinci_175b,0.329,naturalquestions_closed,helm_classic_240130,[],knowledge
300,llama_7b,0.297,naturalquestions_closed,helm_classic_240130,[],knowledge
301,redpajama_incite_instruct_7b,0.232,naturalquestions_closed,helm_classic_240130,[],knowledge
302,j1_jumbo_v1_178b,0.293,naturalquestions_closed,helm_classic_240130,[],knowledge
303,glm_130b,0.148,naturalquestions_closed,helm_classic_240130,[],knowledge
304,luminous_extended_30b,0.254,naturalquestions_closed,helm_classic_240130,[],knowledge
305,opt_66b,0.258,naturalquestions_closed,helm_classic_240130,[],knowledge
306,bloom_176b,0.216,naturalquestions_closed,helm_classic_240130,[],knowledge
307,j1_grande_v1_17b,0.233,naturalquestions_closed,helm_classic_240130,[],knowledge
308,alpaca_7b,0.266,naturalquestions_closed,helm_classic_240130,[],knowledge
309,falcon_7b,0.285,naturalquestions_closed,helm_classic_240130,[],knowledge
310,redpajama_incite_base_7b,0.25,naturalquestions_closed,helm_classic_240130,[],knowledge
311,cohere_large_v20220720_13.1b,0.232,naturalquestions_closed,helm_classic_240130,[],knowledge
312,redpajama_incite_instruct_v1_3b,0.203,naturalquestions_closed,helm_classic_240130,[],knowledge
313,text_curie_001,0.175,naturalquestions_closed,helm_classic_240130,[],knowledge
314,gpt_neox_20b,0.193,naturalquestions_closed,helm_classic_240130,[],knowledge
315,luminous_base_13b,0.202,naturalquestions_closed,helm_classic_240130,[],knowledge
316,cohere_medium_v20221108_6.1b,0.199,naturalquestions_closed,helm_classic_240130,[],knowledge
317,redpajama_incite_base_v1_3b,0.207,naturalquestions_closed,helm_classic_240130,[],knowledge
318,tnlg_v2_6.7b,0.21,naturalquestions_closed,helm_classic_240130,[],knowledge
319,j1_large_v1_7.5b,0.19,naturalquestions_closed,helm_classic_240130,[],knowledge
320,gpt_j_6b,0.156,naturalquestions_closed,helm_classic_240130,[],knowledge
321,pythia_12b,0.175,naturalquestions_closed,helm_classic_240130,[],knowledge
322,curie_6.7b,0.199,naturalquestions_closed,helm_classic_240130,[],knowledge
323,falcon_instruct_7b,0.194,naturalquestions_closed,helm_classic_240130,[],knowledge
324,cohere_medium_v20220720_6.1b,0.177,naturalquestions_closed,helm_classic_240130,[],knowledge
325,text_babbage_001,0.07,naturalquestions_closed,helm_classic_240130,[],knowledge
326,t0pp_11b,0.039,naturalquestions_closed,helm_classic_240130,[],knowledge
327,pythia_6.9b,0.142,naturalquestions_closed,helm_classic_240130,[],knowledge
328,ul2_20b,0.204,naturalquestions_closed,helm_classic_240130,[],knowledge
329,t5_11b,0.194,naturalquestions_closed,helm_classic_240130,[],knowledge
330,babbage_1.3b,0.119,naturalquestions_closed,helm_classic_240130,[],knowledge
331,cohere_small_v20220720_410m,0.078,naturalquestions_closed,helm_classic_240130,[],knowledge
332,ada_350m,0.082,naturalquestions_closed,helm_classic_240130,[],knowledge
333,text_ada_001,0.025,naturalquestions_closed,helm_classic_240130,[],knowledge
334,yalm_100b,0.068,naturalquestions_closed,helm_classic_240130,[],knowledge
335,llama_2_70b,0.674,naturalquestions_open,helm_classic_240130,[],knowledge
336,llama_65b,0.672,naturalquestions_open,helm_classic_240130,[],knowledge
337,text_davinci_002,0.713,naturalquestions_open,helm_classic_240130,[],knowledge
338,mistral_v0.1_7b,0.687,naturalquestions_open,helm_classic_240130,[],knowledge
339,cohere_command_beta_52.4b,0.76,naturalquestions_open,helm_classic_240130,[],knowledge
340,text_davinci_003,0.77,naturalquestions_open,helm_classic_240130,[],knowledge
341,jurassic_2_jumbo_178b,0.669,naturalquestions_open,helm_classic_240130,[],knowledge
342,llama_2_13b,0.637,naturalquestions_open,helm_classic_240130,[],knowledge
343,tnlg_v2_530b,0.642,naturalquestions_open,helm_classic_240130,[],knowledge
344,gpt_3.5_turbo_0613,0.675,naturalquestions_open,helm_classic_240130,[],knowledge
345,llama_30b,0.666,naturalquestions_open,helm_classic_240130,[],knowledge
346,anthropic_lm_v4_s3_52b,0.686,naturalquestions_open,helm_classic_240130,[],knowledge
347,gpt_3.5_turbo_0301,0.624,naturalquestions_open,helm_classic_240130,[],knowledge
348,jurassic_2_grande_17b,0.639,naturalquestions_open,helm_classic_240130,[],knowledge
350,falcon_40b,0.675,naturalquestions_open,helm_classic_240130,[],knowledge
351,falcon_instruct_40b,0.666,naturalquestions_open,helm_classic_240130,[],knowledge
352,mpt_instruct_30b,0.697,naturalquestions_open,helm_classic_240130,[],knowledge
353,mpt_30b,0.673,naturalquestions_open,helm_classic_240130,[],knowledge
354,j1_grande_v2_beta_17b,0.625,naturalquestions_open,helm_classic_240130,[],knowledge
355,vicuna_v1.3_13b,0.686,naturalquestions_open,helm_classic_240130,[],knowledge
356,cohere_command_beta_6.1b,0.717,naturalquestions_open,helm_classic_240130,[],knowledge
357,cohere_xlarge_v20221108_52.4b,0.628,naturalquestions_open,helm_classic_240130,[],knowledge
358,luminous_supreme_70b,0.649,naturalquestions_open,helm_classic_240130,[],knowledge
359,vicuna_v1.3_7b,0.634,naturalquestions_open,helm_classic_240130,[],knowledge
360,opt_175b,0.615,naturalquestions_open,helm_classic_240130,[],knowledge
361,llama_2_7b,0.611,naturalquestions_open,helm_classic_240130,[],knowledge
362,llama_13b,0.614,naturalquestions_open,helm_classic_240130,[],knowledge
363,instructpalmyra_30b,0.682,naturalquestions_open,helm_classic_240130,[],knowledge
364,cohere_xlarge_v20220609_52.4b,0.595,naturalquestions_open,helm_classic_240130,[],knowledge
365,jurassic_2_large_7.5b,0.589,naturalquestions_open,helm_classic_240130,[],knowledge
366,davinci_175b,0.625,naturalquestions_open,helm_classic_240130,[],knowledge
367,llama_7b,0.589,naturalquestions_open,helm_classic_240130,[],knowledge
368,redpajama_incite_instruct_7b,0.659,naturalquestions_open,helm_classic_240130,[],knowledge
369,j1_jumbo_v1_178b,0.595,naturalquestions_open,helm_classic_240130,[],knowledge
370,glm_130b,0.642,naturalquestions_open,helm_classic_240130,[],knowledge
371,luminous_extended_30b,0.609,naturalquestions_open,helm_classic_240130,[],knowledge
372,opt_66b,0.596,naturalquestions_open,helm_classic_240130,[],knowledge
373,bloom_176b,0.621,naturalquestions_open,helm_classic_240130,[],knowledge
374,j1_grande_v1_17b,0.578,naturalquestions_open,helm_classic_240130,[],knowledge
375,alpaca_7b,0.592,naturalquestions_open,helm_classic_240130,[],knowledge
376,falcon_7b,0.579,naturalquestions_open,helm_classic_240130,[],knowledge
377,redpajama_incite_base_7b,0.586,naturalquestions_open,helm_classic_240130,[],knowledge
378,cohere_large_v20220720_13.1b,0.573,naturalquestions_open,helm_classic_240130,[],knowledge
379,redpajama_incite_instruct_v1_3b,0.637,naturalquestions_open,helm_classic_240130,[],knowledge
380,text_curie_001,0.571,naturalquestions_open,helm_classic_240130,[],knowledge
381,gpt_neox_20b,0.596,naturalquestions_open,helm_classic_240130,[],knowledge
382,luminous_base_13b,0.568,naturalquestions_open,helm_classic_240130,[],knowledge
383,cohere_medium_v20221108_6.1b,0.517,naturalquestions_open,helm_classic_240130,[],knowledge
384,redpajama_incite_base_v1_3b,0.52,naturalquestions_open,helm_classic_240130,[],knowledge
385,tnlg_v2_6.7b,0.561,naturalquestions_open,helm_classic_240130,[],knowledge
386,j1_large_v1_7.5b,0.532,naturalquestions_open,helm_classic_240130,[],knowledge
387,gpt_j_6b,0.559,naturalquestions_open,helm_classic_240130,[],knowledge
388,pythia_12b,0.581,naturalquestions_open,helm_classic_240130,[],knowledge
389,curie_6.7b,0.552,naturalquestions_open,helm_classic_240130,[],knowledge
390,falcon_instruct_7b,0.449,naturalquestions_open,helm_classic_240130,[],knowledge
391,cohere_medium_v20220720_6.1b,0.504,naturalquestions_open,helm_classic_240130,[],knowledge
392,text_babbage_001,0.33,naturalquestions_open,helm_classic_240130,[],knowledge
393,t0pp_11b,0.19,naturalquestions_open,helm_classic_240130,[],knowledge
394,pythia_6.9b,0.539,naturalquestions_open,helm_classic_240130,[],knowledge
395,ul2_20b,0.349,naturalquestions_open,helm_classic_240130,[],knowledge
396,t5_11b,0.477,naturalquestions_open,helm_classic_240130,[],knowledge
397,babbage_1.3b,0.451,naturalquestions_open,helm_classic_240130,[],knowledge
398,cohere_small_v20220720_410m,0.309,naturalquestions_open,helm_classic_240130,[],knowledge
399,ada_350m,0.365,naturalquestions_open,helm_classic_240130,[],knowledge
400,text_ada_001,0.149,naturalquestions_open,helm_classic_240130,[],knowledge
401,yalm_100b,0.227,naturalquestions_open,helm_classic_240130,[],knowledge
402,llama_2_70b,0.484,quac,helm_classic_240130,[],other
403,llama_65b,0.401,quac,helm_classic_240130,[],other
404,text_davinci_002,0.445,quac,helm_classic_240130,[],other
405,mistral_v0.1_7b,0.423,quac,helm_classic_240130,[],other
406,cohere_command_beta_52.4b,0.432,quac,helm_classic_240130,[],other
407,text_davinci_003,0.525,quac,helm_classic_240130,[],other
408,jurassic_2_jumbo_178b,0.435,quac,helm_classic_240130,[],other
409,llama_2_13b,0.424,quac,helm_classic_240130,[],other
410,tnlg_v2_530b,0.39,quac,helm_classic_240130,[],other
411,gpt_3.5_turbo_0613,0.485,quac,helm_classic_240130,[],other
412,llama_30b,0.39,quac,helm_classic_240130,[],other
413,anthropic_lm_v4_s3_52b,0.431,quac,helm_classic_240130,[],other
414,gpt_3.5_turbo_0301,0.512,quac,helm_classic_240130,[],other
415,jurassic_2_grande_17b,0.418,quac,helm_classic_240130,[],other
416,palmyra_x_43b,0.473,quac,helm_classic_240130,[],other
417,falcon_40b,0.307,quac,helm_classic_240130,[],other
418,falcon_instruct_40b,0.371,quac,helm_classic_240130,[],other
419,mpt_instruct_30b,0.327,quac,helm_classic_240130,[],other
420,mpt_30b,0.393,quac,helm_classic_240130,[],other
421,j1_grande_v2_beta_17b,0.392,quac,helm_classic_240130,[],other
422,vicuna_v1.3_13b,0.403,quac,helm_classic_240130,[],other
423,cohere_command_beta_6.1b,0.375,quac,helm_classic_240130,[],other
424,cohere_xlarge_v20221108_52.4b,0.374,quac,helm_classic_240130,[],other
425,luminous_supreme_70b,0.37,quac,helm_classic_240130,[],other
426,vicuna_v1.3_7b,0.392,quac,helm_classic_240130,[],other
427,opt_175b,0.36,quac,helm_classic_240130,[],other
428,llama_2_7b,0.406,quac,helm_classic_240130,[],other
429,llama_13b,0.347,quac,helm_classic_240130,[],other
430,instructpalmyra_30b,0.433,quac,helm_classic_240130,[],other
431,cohere_xlarge_v20220609_52.4b,0.361,quac,helm_classic_240130,[],other
433,davinci_175b,0.36,quac,helm_classic_240130,[],other
434,llama_7b,0.338,quac,helm_classic_240130,[],other
435,redpajama_incite_instruct_7b,0.26,quac,helm_classic_240130,[],other
436,j1_jumbo_v1_178b,0.358,quac,helm_classic_240130,[],other
437,glm_130b,0.272,quac,helm_classic_240130,[],other
438,luminous_extended_30b,0.349,quac,helm_classic_240130,[],other
439,opt_66b,0.357,quac,helm_classic_240130,[],other
440,bloom_176b,0.361,quac,helm_classic_240130,[],other
441,j1_grande_v1_17b,0.362,quac,helm_classic_240130,[],other
442,alpaca_7b,0.27,quac,helm_classic_240130,[],other
443,falcon_7b,0.332,quac,helm_classic_240130,[],other
444,redpajama_incite_base_7b,0.336,quac,helm_classic_240130,[],other
445,cohere_large_v20220720_13.1b,0.338,quac,helm_classic_240130,[],other
446,redpajama_incite_instruct_v1_3b,0.259,quac,helm_classic_240130,[],other
447,text_curie_001,0.358,quac,helm_classic_240130,[],other
448,gpt_neox_20b,0.326,quac,helm_classic_240130,[],other
449,luminous_base_13b,0.334,quac,helm_classic_240130,[],other
450,cohere_medium_v20221108_6.1b,0.314,quac,helm_classic_240130,[],other
451,redpajama_incite_base_v1_3b,0.309,quac,helm_classic_240130,[],other
452,tnlg_v2_6.7b,0.345,quac,helm_classic_240130,[],other
453,j1_large_v1_7.5b,0.328,quac,helm_classic_240130,[],other
454,gpt_j_6b,0.33,quac,helm_classic_240130,[],other
455,pythia_12b,0.313,quac,helm_classic_240130,[],other
456,curie_6.7b,0.321,quac,helm_classic_240130,[],other
457,falcon_instruct_7b,0.311,quac,helm_classic_240130,[],other
458,cohere_medium_v20220720_6.1b,0.279,quac,helm_classic_240130,[],other
459,text_babbage_001,0.284,quac,helm_classic_240130,[],other
460,t0pp_11b,0.121,quac,helm_classic_240130,[],other
461,pythia_6.9b,0.296,quac,helm_classic_240130,[],other
462,ul2_20b,0.144,quac,helm_classic_240130,[],other
463,t5_11b,0.116,quac,helm_classic_240130,[],other
464,babbage_1.3b,0.273,quac,helm_classic_240130,[],other
465,cohere_small_v20220720_410m,0.219,quac,helm_classic_240130,[],other
466,ada_350m,0.242,quac,helm_classic_240130,[],other
467,text_ada_001,0.176,quac,helm_classic_240130,[],other
468,yalm_100b,0.162,quac,helm_classic_240130,[],other
471,text_davinci_002,0.815,hellaswag,helm_classic_240130,[],reasoning
473,cohere_command_beta_52.4b,0.811,hellaswag,helm_classic_240130,[],reasoning
474,text_davinci_003,0.822,hellaswag,helm_classic_240130,[],reasoning
475,jurassic_2_jumbo_178b,0.788,hellaswag,helm_classic_240130,[],reasoning
477,tnlg_v2_530b,0.799,hellaswag,helm_classic_240130,[],reasoning
480,anthropic_lm_v4_s3_52b,0.807,hellaswag,helm_classic_240130,[],reasoning
482,jurassic_2_grande_17b,0.781,hellaswag,helm_classic_240130,[],reasoning
488,j1_grande_v2_beta_17b,0.764,hellaswag,helm_classic_240130,[],reasoning
490,cohere_command_beta_6.1b,0.752,hellaswag,helm_classic_240130,[],reasoning
491,cohere_xlarge_v20221108_52.4b,0.81,hellaswag,helm_classic_240130,[],reasoning
494,opt_175b,0.791,hellaswag,helm_classic_240130,[],reasoning
498,cohere_xlarge_v20220609_52.4b,0.811,hellaswag,helm_classic_240130,[],reasoning
499,jurassic_2_large_7.5b,0.729,hellaswag,helm_classic_240130,[],reasoning
500,davinci_175b,0.775,hellaswag,helm_classic_240130,[],reasoning
503,j1_jumbo_v1_178b,0.765,hellaswag,helm_classic_240130,[],reasoning
506,opt_66b,0.745,hellaswag,helm_classic_240130,[],reasoning
507,bloom_176b,0.744,hellaswag,helm_classic_240130,[],reasoning
508,j1_grande_v1_17b,0.739,hellaswag,helm_classic_240130,[],reasoning
512,cohere_large_v20220720_13.1b,0.736,hellaswag,helm_classic_240130,[],reasoning
514,text_curie_001,0.676,hellaswag,helm_classic_240130,[],reasoning
515,gpt_neox_20b,0.718,hellaswag,helm_classic_240130,[],reasoning
517,cohere_medium_v20221108_6.1b,0.726,hellaswag,helm_classic_240130,[],reasoning
519,tnlg_v2_6.7b,0.704,hellaswag,helm_classic_240130,[],reasoning
520,j1_large_v1_7.5b,0.7,hellaswag,helm_classic_240130,[],reasoning
521,gpt_j_6b,0.663,hellaswag,helm_classic_240130,[],reasoning
523,curie_6.7b,0.682,hellaswag,helm_classic_240130,[],reasoning
525,cohere_medium_v20220720_6.1b,0.706,hellaswag,helm_classic_240130,[],reasoning
526,text_babbage_001,0.561,hellaswag,helm_classic_240130,[],reasoning
531,babbage_1.3b,0.555,hellaswag,helm_classic_240130,[],reasoning
532,cohere_small_v20220720_410m,0.483,hellaswag,helm_classic_240130,[],reasoning
533,ada_350m,0.435,hellaswag,helm_classic_240130,[],reasoning
534,text_ada_001,0.429,hellaswag,helm_classic_240130,[],reasoning
538,text_davinci_002,0.594,openbookqa,helm_classic_240130,[],knowledge
540,cohere_command_beta_52.4b,0.582,openbookqa,helm_classic_240130,[],knowledge
541,text_davinci_003,0.646,openbookqa,helm_classic_240130,[],knowledge
542,jurassic_2_jumbo_178b,0.558,openbookqa,helm_classic_240130,[],knowledge
544,tnlg_v2_530b,0.562,openbookqa,helm_classic_240130,[],knowledge
547,anthropic_lm_v4_s3_52b,0.558,openbookqa,helm_classic_240130,[],knowledge
549,jurassic_2_grande_17b,0.542,openbookqa,helm_classic_240130,[],knowledge
555,j1_grande_v2_beta_17b,0.56,openbookqa,helm_classic_240130,[],knowledge
557,cohere_command_beta_6.1b,0.55,openbookqa,helm_classic_240130,[],knowledge
558,cohere_xlarge_v20221108_52.4b,0.588,openbookqa,helm_classic_240130,[],knowledge
561,opt_175b,0.586,openbookqa,helm_classic_240130,[],knowledge
565,cohere_xlarge_v20220609_52.4b,0.55,openbookqa,helm_classic_240130,[],knowledge
566,jurassic_2_large_7.5b,0.53,openbookqa,helm_classic_240130,[],knowledge
567,davinci_175b,0.586,openbookqa,helm_classic_240130,[],knowledge
570,j1_jumbo_v1_178b,0.534,openbookqa,helm_classic_240130,[],knowledge
573,opt_66b,0.534,openbookqa,helm_classic_240130,[],knowledge
574,bloom_176b,0.534,openbookqa,helm_classic_240130,[],knowledge
575,j1_grande_v1_17b,0.52,openbookqa,helm_classic_240130,[],knowledge
579,cohere_large_v20220720_13.1b,0.542,openbookqa,helm_classic_240130,[],knowledge
581,text_curie_001,0.514,openbookqa,helm_classic_240130,[],knowledge
582,gpt_neox_20b,0.524,openbookqa,helm_classic_240130,[],knowledge
584,cohere_medium_v20221108_6.1b,0.538,openbookqa,helm_classic_240130,[],knowledge
586,tnlg_v2_6.7b,0.478,openbookqa,helm_classic_240130,[],knowledge
587,j1_large_v1_7.5b,0.514,openbookqa,helm_classic_240130,[],knowledge
588,gpt_j_6b,0.514,openbookqa,helm_classic_240130,[],knowledge
590,curie_6.7b,0.502,openbookqa,helm_classic_240130,[],knowledge
592,cohere_medium_v20220720_6.1b,0.496,openbookqa,helm_classic_240130,[],knowledge
593,text_babbage_001,0.452,openbookqa,helm_classic_240130,[],knowledge
598,babbage_1.3b,0.438,openbookqa,helm_classic_240130,[],knowledge
599,cohere_small_v20220720_410m,0.348,openbookqa,helm_classic_240130,[],knowledge
600,ada_350m,0.38,openbookqa,helm_classic_240130,[],knowledge
601,text_ada_001,0.346,openbookqa,helm_classic_240130,[],knowledge
603,llama_2_70b,0.554,truthfulqa,helm_classic_240130,[],knowledge
604,llama_65b,0.508,truthfulqa,helm_classic_240130,[],knowledge
605,text_davinci_002,0.61,truthfulqa,helm_classic_240130,[],knowledge
606,mistral_v0.1_7b,0.422,truthfulqa,helm_classic_240130,[],knowledge
607,cohere_command_beta_52.4b,0.269,truthfulqa,helm_classic_240130,[],knowledge
608,text_davinci_003,0.593,truthfulqa,helm_classic_240130,[],knowledge
609,jurassic_2_jumbo_178b,0.437,truthfulqa,helm_classic_240130,[],knowledge
610,llama_2_13b,0.33,truthfulqa,helm_classic_240130,[],knowledge
611,tnlg_v2_530b,0.251,truthfulqa,helm_classic_240130,[],knowledge
612,gpt_3.5_turbo_0613,0.339,truthfulqa,helm_classic_240130,[],knowledge
613,llama_30b,0.344,truthfulqa,helm_classic_240130,[],knowledge
614,anthropic_lm_v4_s3_52b,0.368,truthfulqa,helm_classic_240130,[],knowledge
615,gpt_3.5_turbo_0301,0.609,truthfulqa,helm_classic_240130,[],knowledge
616,jurassic_2_grande_17b,0.348,truthfulqa,helm_classic_240130,[],knowledge
617,palmyra_x_43b,0.616,truthfulqa,helm_classic_240130,[],knowledge
618,falcon_40b,0.353,truthfulqa,helm_classic_240130,[],knowledge
619,falcon_instruct_40b,0.384,truthfulqa,helm_classic_240130,[],knowledge
620,mpt_instruct_30b,0.234,truthfulqa,helm_classic_240130,[],knowledge
621,mpt_30b,0.231,truthfulqa,helm_classic_240130,[],knowledge
622,j1_grande_v2_beta_17b,0.306,truthfulqa,helm_classic_240130,[],knowledge
623,vicuna_v1.3_13b,0.385,truthfulqa,helm_classic_240130,[],knowledge
624,cohere_command_beta_6.1b,0.203,truthfulqa,helm_classic_240130,[],knowledge
625,cohere_xlarge_v20221108_52.4b,0.169,truthfulqa,helm_classic_240130,[],knowledge
626,luminous_supreme_70b,0.222,truthfulqa,helm_classic_240130,[],knowledge
627,vicuna_v1.3_7b,0.292,truthfulqa,helm_classic_240130,[],knowledge
628,opt_175b,0.25,truthfulqa,helm_classic_240130,[],knowledge
629,llama_2_7b,0.272,truthfulqa,helm_classic_240130,[],knowledge
630,llama_13b,0.324,truthfulqa,helm_classic_240130,[],knowledge
631,instructpalmyra_30b,0.185,truthfulqa,helm_classic_240130,[],knowledge
632,cohere_xlarge_v20220609_52.4b,0.198,truthfulqa,helm_classic_240130,[],knowledge
633,jurassic_2_large_7.5b,0.245,truthfulqa,helm_classic_240130,[],knowledge
634,davinci_175b,0.194,truthfulqa,helm_classic_240130,[],knowledge
635,llama_7b,0.28,truthfulqa,helm_classic_240130,[],knowledge
636,redpajama_incite_instruct_7b,0.243,truthfulqa,helm_classic_240130,[],knowledge
637,j1_jumbo_v1_178b,0.175,truthfulqa,helm_classic_240130,[],knowledge
638,glm_130b,0.218,truthfulqa,helm_classic_240130,[],knowledge
639,luminous_extended_30b,0.221,truthfulqa,helm_classic_240130,[],knowledge
640,opt_66b,0.201,truthfulqa,helm_classic_240130,[],knowledge
641,bloom_176b,0.205,truthfulqa,helm_classic_240130,[],knowledge
642,j1_grande_v1_17b,0.193,truthfulqa,helm_classic_240130,[],knowledge
643,alpaca_7b,0.243,truthfulqa,helm_classic_240130,[],knowledge
644,falcon_7b,0.234,truthfulqa,helm_classic_240130,[],knowledge
645,redpajama_incite_base_7b,0.205,truthfulqa,helm_classic_240130,[],knowledge
646,cohere_large_v20220720_13.1b,0.181,truthfulqa,helm_classic_240130,[],knowledge
647,redpajama_incite_instruct_v1_3b,0.208,truthfulqa,helm_classic_240130,[],knowledge
648,text_curie_001,0.257,truthfulqa,helm_classic_240130,[],knowledge
649,gpt_neox_20b,0.216,truthfulqa,helm_classic_240130,[],knowledge
650,luminous_base_13b,0.182,truthfulqa,helm_classic_240130,[],knowledge
651,cohere_medium_v20221108_6.1b,0.215,truthfulqa,helm_classic_240130,[],knowledge
652,redpajama_incite_base_v1_3b,0.277,truthfulqa,helm_classic_240130,[],knowledge
653,tnlg_v2_6.7b,0.167,truthfulqa,helm_classic_240130,[],knowledge
654,j1_large_v1_7.5b,0.197,truthfulqa,helm_classic_240130,[],knowledge
655,gpt_j_6b,0.199,truthfulqa,helm_classic_240130,[],knowledge
656,pythia_12b,0.177,truthfulqa,helm_classic_240130,[],knowledge
657,curie_6.7b,0.232,truthfulqa,helm_classic_240130,[],knowledge
658,falcon_instruct_7b,0.213,truthfulqa,helm_classic_240130,[],knowledge
659,cohere_medium_v20220720_6.1b,0.19,truthfulqa,helm_classic_240130,[],knowledge
660,text_babbage_001,0.233,truthfulqa,helm_classic_240130,[],knowledge
661,t0pp_11b,0.377,truthfulqa,helm_classic_240130,[],knowledge
662,pythia_6.9b,0.213,truthfulqa,helm_classic_240130,[],knowledge
663,ul2_20b,0.193,truthfulqa,helm_classic_240130,[],knowledge
664,t5_11b,0.133,truthfulqa,helm_classic_240130,[],knowledge
665,babbage_1.3b,0.188,truthfulqa,helm_classic_240130,[],knowledge
666,cohere_small_v20220720_410m,0.217,truthfulqa,helm_classic_240130,[],knowledge
667,ada_350m,0.215,truthfulqa,helm_classic_240130,[],knowledge
668,text_ada_001,0.232,truthfulqa,helm_classic_240130,[],knowledge
669,yalm_100b,0.202,truthfulqa,helm_classic_240130,[],knowledge
672,text_davinci_002,0.421,ms_marco_regular,helm_classic_240130,[],other
674,cohere_command_beta_52.4b,0.472,ms_marco_regular,helm_classic_240130,[],other
675,text_davinci_003,0.368,ms_marco_regular,helm_classic_240130,[],other
676,jurassic_2_jumbo_178b,0.398,ms_marco_regular,helm_classic_240130,[],other
678,tnlg_v2_530b,0.377,ms_marco_regular,helm_classic_240130,[],other
683,jurassic_2_grande_17b,0.293,ms_marco_regular,helm_classic_240130,[],other
689,j1_grande_v2_beta_17b,0.285,ms_marco_regular,helm_classic_240130,[],other
691,cohere_command_beta_6.1b,0.434,ms_marco_regular,helm_classic_240130,[],other
692,cohere_xlarge_v20221108_52.4b,0.315,ms_marco_regular,helm_classic_240130,[],other
695,opt_175b,0.288,ms_marco_regular,helm_classic_240130,[],other
699,cohere_xlarge_v20220609_52.4b,0.273,ms_marco_regular,helm_classic_240130,[],other
700,jurassic_2_large_7.5b,0.247,ms_marco_regular,helm_classic_240130,[],other
701,davinci_175b,0.211,ms_marco_regular,helm_classic_240130,[],other
704,j1_jumbo_v1_178b,0.21,ms_marco_regular,helm_classic_240130,[],other
707,opt_66b,0.237,ms_marco_regular,helm_classic_240130,[],other
708,bloom_176b,0.236,ms_marco_regular,helm_classic_240130,[],other
709,j1_grande_v1_17b,0.161,ms_marco_regular,helm_classic_240130,[],other
713,cohere_large_v20220720_13.1b,0.19,ms_marco_regular,helm_classic_240130,[],other
715,text_curie_001,0.271,ms_marco_regular,helm_classic_240130,[],other
716,gpt_neox_20b,0.184,ms_marco_regular,helm_classic_240130,[],other
718,cohere_medium_v20221108_6.1b,0.175,ms_marco_regular,helm_classic_240130,[],other
720,tnlg_v2_6.7b,0.158,ms_marco_regular,helm_classic_240130,[],other
721,j1_large_v1_7.5b,0.147,ms_marco_regular,helm_classic_240130,[],other
722,gpt_j_6b,0.152,ms_marco_regular,helm_classic_240130,[],other
724,curie_6.7b,0.162,ms_marco_regular,helm_classic_240130,[],other
726,cohere_medium_v20220720_6.1b,0.152,ms_marco_regular,helm_classic_240130,[],other
727,text_babbage_001,0.208,ms_marco_regular,helm_classic_240130,[],other
732,babbage_1.3b,0.122,ms_marco_regular,helm_classic_240130,[],other
734,ada_350m,0.102,ms_marco_regular,helm_classic_240130,[],other
735,text_ada_001,0.134,ms_marco_regular,helm_classic_240130,[],other
739,text_davinci_002,0.664,ms_marco_trec,helm_classic_240130,[],other
741,cohere_command_beta_52.4b,0.762,ms_marco_trec,helm_classic_240130,[],other
742,text_davinci_003,0.644,ms_marco_trec,helm_classic_240130,[],other
743,jurassic_2_jumbo_178b,0.661,ms_marco_trec,helm_classic_240130,[],other
745,tnlg_v2_530b,0.643,ms_marco_trec,helm_classic_240130,[],other
750,jurassic_2_grande_17b,0.514,ms_marco_trec,helm_classic_240130,[],other
756,j1_grande_v2_beta_17b,0.46,ms_marco_trec,helm_classic_240130,[],other
758,cohere_command_beta_6.1b,0.709,ms_marco_trec,helm_classic_240130,[],other
759,cohere_xlarge_v20221108_52.4b,0.55,ms_marco_trec,helm_classic_240130,[],other
762,opt_175b,0.448,ms_marco_trec,helm_classic_240130,[],other
766,cohere_xlarge_v20220609_52.4b,0.459,ms_marco_trec,helm_classic_240130,[],other
767,jurassic_2_large_7.5b,0.464,ms_marco_trec,helm_classic_240130,[],other
768,davinci_175b,0.378,ms_marco_trec,helm_classic_240130,[],other
771,j1_jumbo_v1_178b,0.363,ms_marco_trec,helm_classic_240130,[],other
774,opt_66b,0.482,ms_marco_trec,helm_classic_240130,[],other
775,bloom_176b,0.386,ms_marco_trec,helm_classic_240130,[],other
776,j1_grande_v1_17b,0.341,ms_marco_trec,helm_classic_240130,[],other
780,cohere_large_v20220720_13.1b,0.33,ms_marco_trec,helm_classic_240130,[],other
782,text_curie_001,0.507,ms_marco_trec,helm_classic_240130,[],other
783,gpt_neox_20b,0.398,ms_marco_trec,helm_classic_240130,[],other
785,cohere_medium_v20221108_6.1b,0.373,ms_marco_trec,helm_classic_240130,[],other
787,tnlg_v2_6.7b,0.332,ms_marco_trec,helm_classic_240130,[],other
788,j1_large_v1_7.5b,0.292,ms_marco_trec,helm_classic_240130,[],other
789,gpt_j_6b,0.345,ms_marco_trec,helm_classic_240130,[],other
791,curie_6.7b,0.3,ms_marco_trec,helm_classic_240130,[],other
793,cohere_medium_v20220720_6.1b,0.374,ms_marco_trec,helm_classic_240130,[],other
794,text_babbage_001,0.449,ms_marco_trec,helm_classic_240130,[],other
799,babbage_1.3b,0.317,ms_marco_trec,helm_classic_240130,[],other
800,cohere_small_v20220720_410m,0.304,ms_marco_trec,helm_classic_240130,[],other
801,ada_350m,0.29,ms_marco_trec,helm_classic_240130,[],other
802,text_ada_001,0.302,ms_marco_trec,helm_classic_240130,[],other
806,text_davinci_002,0.153,cnn/dailymail,helm_classic_240130,[],other
808,cohere_command_beta_52.4b,0.161,cnn/dailymail,helm_classic_240130,[],other
809,text_davinci_003,0.156,cnn/dailymail,helm_classic_240130,[],other
810,jurassic_2_jumbo_178b,0.149,cnn/dailymail,helm_classic_240130,[],other
812,tnlg_v2_530b,0.161,cnn/dailymail,helm_classic_240130,[],other
815,anthropic_lm_v4_s3_52b,0.154,cnn/dailymail,helm_classic_240130,[],other
817,jurassic_2_grande_17b,0.144,cnn/dailymail,helm_classic_240130,[],other
818,palmyra_x_43b,0.049,cnn/dailymail,helm_classic_240130,[],other
823,j1_grande_v2_beta_17b,0.146,cnn/dailymail,helm_classic_240130,[],other
825,cohere_command_beta_6.1b,0.153,cnn/dailymail,helm_classic_240130,[],other
826,cohere_xlarge_v20221108_52.4b,0.153,cnn/dailymail,helm_classic_240130,[],other
827,luminous_supreme_70b,0.15,cnn/dailymail,helm_classic_240130,[],other
829,opt_175b,0.146,cnn/dailymail,helm_classic_240130,[],other
832,instructpalmyra_30b,0.152,cnn/dailymail,helm_classic_240130,[],other
833,cohere_xlarge_v20220609_52.4b,0.144,cnn/dailymail,helm_classic_240130,[],other
834,jurassic_2_large_7.5b,0.136,cnn/dailymail,helm_classic_240130,[],other
835,davinci_175b,0.127,cnn/dailymail,helm_classic_240130,[],other
838,j1_jumbo_v1_178b,0.144,cnn/dailymail,helm_classic_240130,[],other
839,glm_130b,0.154,cnn/dailymail,helm_classic_240130,[],other
840,luminous_extended_30b,0.139,cnn/dailymail,helm_classic_240130,[],other
841,opt_66b,0.136,cnn/dailymail,helm_classic_240130,[],other
842,bloom_176b,0.08,cnn/dailymail,helm_classic_240130,[],other
843,j1_grande_v1_17b,0.143,cnn/dailymail,helm_classic_240130,[],other
847,cohere_large_v20220720_13.1b,0.126,cnn/dailymail,helm_classic_240130,[],other
849,text_curie_001,0.152,cnn/dailymail,helm_classic_240130,[],other
850,gpt_neox_20b,0.123,cnn/dailymail,helm_classic_240130,[],other
851,luminous_base_13b,0.11,cnn/dailymail,helm_classic_240130,[],other
852,cohere_medium_v20221108_6.1b,0.121,cnn/dailymail,helm_classic_240130,[],other
854,tnlg_v2_6.7b,0.146,cnn/dailymail,helm_classic_240130,[],other
855,j1_large_v1_7.5b,0.134,cnn/dailymail,helm_classic_240130,[],other
856,gpt_j_6b,0.131,cnn/dailymail,helm_classic_240130,[],other
858,curie_6.7b,0.113,cnn/dailymail,helm_classic_240130,[],other
860,cohere_medium_v20220720_6.1b,0.077,cnn/dailymail,helm_classic_240130,[],other
861,text_babbage_001,0.151,cnn/dailymail,helm_classic_240130,[],other
862,t0pp_11b,0.122,cnn/dailymail,helm_classic_240130,[],other
864,ul2_20b,0.03,cnn/dailymail,helm_classic_240130,[],other
865,t5_11b,0.043,cnn/dailymail,helm_classic_240130,[],other
866,babbage_1.3b,0.079,cnn/dailymail,helm_classic_240130,[],other
867,cohere_small_v20220720_410m,0.063,cnn/dailymail,helm_classic_240130,[],other
868,ada_350m,0.09,cnn/dailymail,helm_classic_240130,[],other
869,text_ada_001,0.136,cnn/dailymail,helm_classic_240130,[],other
870,yalm_100b,0.017,cnn/dailymail,helm_classic_240130,[],other
873,text_davinci_002,0.144,xsum,helm_classic_240130,[],other
875,cohere_command_beta_52.4b,0.152,xsum,helm_classic_240130,[],other
876,text_davinci_003,0.124,xsum,helm_classic_240130,[],other
877,jurassic_2_jumbo_178b,0.182,xsum,helm_classic_240130,[],other
879,tnlg_v2_530b,0.169,xsum,helm_classic_240130,[],other
882,anthropic_lm_v4_s3_52b,0.134,xsum,helm_classic_240130,[],other
884,jurassic_2_grande_17b,0.167,xsum,helm_classic_240130,[],other
885,palmyra_x_43b,0.149,xsum,helm_classic_240130,[],other
890,j1_grande_v2_beta_17b,0.152,xsum,helm_classic_240130,[],other
892,cohere_command_beta_6.1b,0.122,xsum,helm_classic_240130,[],other
893,cohere_xlarge_v20221108_52.4b,0.153,xsum,helm_classic_240130,[],other
894,luminous_supreme_70b,0.136,xsum,helm_classic_240130,[],other
896,opt_175b,0.155,xsum,helm_classic_240130,[],other
899,instructpalmyra_30b,0.104,xsum,helm_classic_240130,[],other
900,cohere_xlarge_v20220609_52.4b,0.129,xsum,helm_classic_240130,[],other
901,jurassic_2_large_7.5b,0.142,xsum,helm_classic_240130,[],other
902,davinci_175b,0.126,xsum,helm_classic_240130,[],other
905,j1_jumbo_v1_178b,0.129,xsum,helm_classic_240130,[],other
906,glm_130b,0.132,xsum,helm_classic_240130,[],other
907,luminous_extended_30b,0.124,xsum,helm_classic_240130,[],other
908,opt_66b,0.126,xsum,helm_classic_240130,[],other
909,bloom_176b,0.03,xsum,helm_classic_240130,[],other
910,j1_grande_v1_17b,0.122,xsum,helm_classic_240130,[],other
914,cohere_large_v20220720_13.1b,0.108,xsum,helm_classic_240130,[],other
916,text_curie_001,0.076,xsum,helm_classic_240130,[],other
917,gpt_neox_20b,0.102,xsum,helm_classic_240130,[],other
918,luminous_base_13b,0.105,xsum,helm_classic_240130,[],other
919,cohere_medium_v20221108_6.1b,0.099,xsum,helm_classic_240130,[],other
921,tnlg_v2_6.7b,0.11,xsum,helm_classic_240130,[],other
922,j1_large_v1_7.5b,0.102,xsum,helm_classic_240130,[],other
923,gpt_j_6b,0.096,xsum,helm_classic_240130,[],other
925,curie_6.7b,0.091,xsum,helm_classic_240130,[],other
927,cohere_medium_v20220720_6.1b,0.087,xsum,helm_classic_240130,[],other
928,text_babbage_001,0.046,xsum,helm_classic_240130,[],other
929,t0pp_11b,0.09,xsum,helm_classic_240130,[],other
931,ul2_20b,0.058,xsum,helm_classic_240130,[],other
932,t5_11b,0.015,xsum,helm_classic_240130,[],other
933,babbage_1.3b,0.045,xsum,helm_classic_240130,[],other
934,cohere_small_v20220720_410m,0.033,xsum,helm_classic_240130,[],other
935,ada_350m,0.022,xsum,helm_classic_240130,[],other
936,text_ada_001,0.034,xsum,helm_classic_240130,[],other
937,yalm_100b,0.021,xsum,helm_classic_240130,[],other
938,llama_2_70b,0.961,imdb,helm_classic_240130,[],other
939,llama_65b,0.962,imdb,helm_classic_240130,[],other
940,text_davinci_002,0.948,imdb,helm_classic_240130,[],other
941,mistral_v0.1_7b,0.962,imdb,helm_classic_240130,[],other
942,cohere_command_beta_52.4b,0.96,imdb,helm_classic_240130,[],other
943,text_davinci_003,0.848,imdb,helm_classic_240130,[],other
944,jurassic_2_jumbo_178b,0.938,imdb,helm_classic_240130,[],other
945,llama_2_13b,0.962,imdb,helm_classic_240130,[],other
946,tnlg_v2_530b,0.941,imdb,helm_classic_240130,[],other
947,gpt_3.5_turbo_0613,0.943,imdb,helm_classic_240130,[],other
948,llama_30b,0.927,imdb,helm_classic_240130,[],other
949,anthropic_lm_v4_s3_52b,0.934,imdb,helm_classic_240130,[],other
950,gpt_3.5_turbo_0301,0.899,imdb,helm_classic_240130,[],other
951,jurassic_2_grande_17b,0.938,imdb,helm_classic_240130,[],other
952,palmyra_x_43b,0.935,imdb,helm_classic_240130,[],other
953,falcon_40b,0.959,imdb,helm_classic_240130,[],other
954,falcon_instruct_40b,0.959,imdb,helm_classic_240130,[],other
955,mpt_instruct_30b,0.956,imdb,helm_classic_240130,[],other
956,mpt_30b,0.959,imdb,helm_classic_240130,[],other
957,j1_grande_v2_beta_17b,0.957,imdb,helm_classic_240130,[],other
958,vicuna_v1.3_13b,0.762,imdb,helm_classic_240130,[],other
959,cohere_command_beta_6.1b,0.961,imdb,helm_classic_240130,[],other
960,cohere_xlarge_v20221108_52.4b,0.956,imdb,helm_classic_240130,[],other
961,luminous_supreme_70b,0.959,imdb,helm_classic_240130,[],other
962,vicuna_v1.3_7b,0.916,imdb,helm_classic_240130,[],other
963,opt_175b,0.947,imdb,helm_classic_240130,[],other
964,llama_2_7b,0.907,imdb,helm_classic_240130,[],other
965,llama_13b,0.928,imdb,helm_classic_240130,[],other
966,instructpalmyra_30b,0.94,imdb,helm_classic_240130,[],other
967,cohere_xlarge_v20220609_52.4b,0.956,imdb,helm_classic_240130,[],other
968,jurassic_2_large_7.5b,0.956,imdb,helm_classic_240130,[],other
969,davinci_175b,0.933,imdb,helm_classic_240130,[],other
970,llama_7b,0.947,imdb,helm_classic_240130,[],other
971,redpajama_incite_instruct_7b,0.927,imdb,helm_classic_240130,[],other
972,j1_jumbo_v1_178b,0.943,imdb,helm_classic_240130,[],other
973,glm_130b,0.955,imdb,helm_classic_240130,[],other
974,luminous_extended_30b,0.947,imdb,helm_classic_240130,[],other
975,opt_66b,0.917,imdb,helm_classic_240130,[],other
976,bloom_176b,0.945,imdb,helm_classic_240130,[],other
977,j1_grande_v1_17b,0.953,imdb,helm_classic_240130,[],other
978,alpaca_7b,0.738,imdb,helm_classic_240130,[],other
979,falcon_7b,0.836,imdb,helm_classic_240130,[],other
980,redpajama_incite_base_7b,0.752,imdb,helm_classic_240130,[],other
981,cohere_large_v20220720_13.1b,0.933,imdb,helm_classic_240130,[],other
982,redpajama_incite_instruct_v1_3b,0.894,imdb,helm_classic_240130,[],other
983,text_curie_001,0.923,imdb,helm_classic_240130,[],other
984,gpt_neox_20b,0.948,imdb,helm_classic_240130,[],other
985,luminous_base_13b,0.939,imdb,helm_classic_240130,[],other
986,cohere_medium_v20221108_6.1b,0.935,imdb,helm_classic_240130,[],other
987,redpajama_incite_base_v1_3b,0.907,imdb,helm_classic_240130,[],other
988,tnlg_v2_6.7b,0.927,imdb,helm_classic_240130,[],other
989,j1_large_v1_7.5b,0.956,imdb,helm_classic_240130,[],other
990,gpt_j_6b,0.939,imdb,helm_classic_240130,[],other
991,pythia_12b,0.931,imdb,helm_classic_240130,[],other
992,curie_6.7b,0.889,imdb,helm_classic_240130,[],other
993,falcon_instruct_7b,0.852,imdb,helm_classic_240130,[],other
994,cohere_medium_v20220720_6.1b,0.935,imdb,helm_classic_240130,[],other
995,text_babbage_001,0.913,imdb,helm_classic_240130,[],other
996,t0pp_11b,0.207,imdb,helm_classic_240130,[],other
997,pythia_6.9b,0.928,imdb,helm_classic_240130,[],other
998,ul2_20b,0.337,imdb,helm_classic_240130,[],other
999,t5_11b,0.379,imdb,helm_classic_240130,[],other
1000,babbage_1.3b,0.597,imdb,helm_classic_240130,[],other
1001,cohere_small_v20220720_410m,0.578,imdb,helm_classic_240130,[],other
1002,ada_350m,0.849,imdb,helm_classic_240130,[],other
1003,text_ada_001,0.822,imdb,helm_classic_240130,[],other
1004,yalm_100b,0.836,imdb,helm_classic_240130,[],other
1005,llama_2_70b,0.652,civilcomments,helm_classic_240130,[],other
1006,llama_65b,0.655,civilcomments,helm_classic_240130,[],other
1007,text_davinci_002,0.668,civilcomments,helm_classic_240130,[],other
1008,mistral_v0.1_7b,0.624,civilcomments,helm_classic_240130,[],other
1009,cohere_command_beta_52.4b,0.601,civilcomments,helm_classic_240130,[],other
1010,text_davinci_003,0.684,civilcomments,helm_classic_240130,[],other
1011,jurassic_2_jumbo_178b,0.57,civilcomments,helm_classic_240130,[],other
1012,llama_2_13b,0.588,civilcomments,helm_classic_240130,[],other
1013,tnlg_v2_530b,0.601,civilcomments,helm_classic_240130,[],other
1014,gpt_3.5_turbo_0613,0.696,civilcomments,helm_classic_240130,[],other
1015,llama_30b,0.549,civilcomments,helm_classic_240130,[],other
1016,anthropic_lm_v4_s3_52b,0.61,civilcomments,helm_classic_240130,[],other
1017,gpt_3.5_turbo_0301,0.674,civilcomments,helm_classic_240130,[],other
1018,jurassic_2_grande_17b,0.547,civilcomments,helm_classic_240130,[],other
1019,palmyra_x_43b,0.008,civilcomments,helm_classic_240130,[],other
1020,falcon_40b,0.552,civilcomments,helm_classic_240130,[],other
1021,falcon_instruct_40b,0.603,civilcomments,helm_classic_240130,[],other
1022,mpt_instruct_30b,0.573,civilcomments,helm_classic_240130,[],other
1023,mpt_30b,0.599,civilcomments,helm_classic_240130,[],other
1024,j1_grande_v2_beta_17b,0.546,civilcomments,helm_classic_240130,[],other
1025,vicuna_v1.3_13b,0.645,civilcomments,helm_classic_240130,[],other
1026,cohere_command_beta_6.1b,0.54,civilcomments,helm_classic_240130,[],other
1027,cohere_xlarge_v20221108_52.4b,0.524,civilcomments,helm_classic_240130,[],other
1028,luminous_supreme_70b,0.562,civilcomments,helm_classic_240130,[],other
1029,vicuna_v1.3_7b,0.62,civilcomments,helm_classic_240130,[],other
1030,opt_175b,0.505,civilcomments,helm_classic_240130,[],other
1031,llama_2_7b,0.562,civilcomments,helm_classic_240130,[],other
1032,llama_13b,0.6,civilcomments,helm_classic_240130,[],other
1033,instructpalmyra_30b,0.555,civilcomments,helm_classic_240130,[],other
1034,cohere_xlarge_v20220609_52.4b,0.532,civilcomments,helm_classic_240130,[],other
1035,jurassic_2_large_7.5b,0.57,civilcomments,helm_classic_240130,[],other
1036,davinci_175b,0.532,civilcomments,helm_classic_240130,[],other
1037,llama_7b,0.563,civilcomments,helm_classic_240130,[],other
1038,redpajama_incite_instruct_7b,0.664,civilcomments,helm_classic_240130,[],other
1039,j1_jumbo_v1_178b,0.553,civilcomments,helm_classic_240130,[],other
1040,glm_130b,0.5,civilcomments,helm_classic_240130,[],other
1041,luminous_extended_30b,0.524,civilcomments,helm_classic_240130,[],other
1042,opt_66b,0.506,civilcomments,helm_classic_240130,[],other
1043,bloom_176b,0.62,civilcomments,helm_classic_240130,[],other
1044,j1_grande_v1_17b,0.529,civilcomments,helm_classic_240130,[],other
1045,alpaca_7b,0.566,civilcomments,helm_classic_240130,[],other
1046,falcon_7b,0.514,civilcomments,helm_classic_240130,[],other
1047,redpajama_incite_base_7b,0.547,civilcomments,helm_classic_240130,[],other
1048,cohere_large_v20220720_13.1b,0.507,civilcomments,helm_classic_240130,[],other
1049,redpajama_incite_instruct_v1_3b,0.549,civilcomments,helm_classic_240130,[],other
1050,text_curie_001,0.537,civilcomments,helm_classic_240130,[],other
1051,gpt_neox_20b,0.516,civilcomments,helm_classic_240130,[],other
1052,luminous_base_13b,0.544,civilcomments,helm_classic_240130,[],other
1053,cohere_medium_v20221108_6.1b,0.5,civilcomments,helm_classic_240130,[],other
1054,redpajama_incite_base_v1_3b,0.549,civilcomments,helm_classic_240130,[],other
1055,tnlg_v2_6.7b,0.532,civilcomments,helm_classic_240130,[],other
1056,j1_large_v1_7.5b,0.532,civilcomments,helm_classic_240130,[],other
1057,gpt_j_6b,0.52,civilcomments,helm_classic_240130,[],other
1058,pythia_12b,0.531,civilcomments,helm_classic_240130,[],other
1059,curie_6.7b,0.539,civilcomments,helm_classic_240130,[],other
1060,falcon_instruct_7b,0.511,civilcomments,helm_classic_240130,[],other
1061,cohere_medium_v20220720_6.1b,0.504,civilcomments,helm_classic_240130,[],other
1062,text_babbage_001,0.499,civilcomments,helm_classic_240130,[],other
1063,t0pp_11b,0.234,civilcomments,helm_classic_240130,[],other
1064,pythia_6.9b,0.511,civilcomments,helm_classic_240130,[],other
1065,ul2_20b,0.521,civilcomments,helm_classic_240130,[],other
1066,t5_11b,0.509,civilcomments,helm_classic_240130,[],other
1067,babbage_1.3b,0.519,civilcomments,helm_classic_240130,[],other
1068,cohere_small_v20220720_410m,0.501,civilcomments,helm_classic_240130,[],other
1069,ada_350m,0.517,civilcomments,helm_classic_240130,[],other
1070,text_ada_001,0.503,civilcomments,helm_classic_240130,[],other
1071,yalm_100b,0.49,civilcomments,helm_classic_240130,[],other
1072,llama_2_70b,0.727,raft,helm_classic_240130,[],other
1073,llama_65b,0.702,raft,helm_classic_240130,[],other
1074,text_davinci_002,0.733,raft,helm_classic_240130,[],other
1075,mistral_v0.1_7b,0.707,raft,helm_classic_240130,[],other
1076,cohere_command_beta_52.4b,0.667,raft,helm_classic_240130,[],other
1077,text_davinci_003,0.759,raft,helm_classic_240130,[],other
1078,jurassic_2_jumbo_178b,0.746,raft,helm_classic_240130,[],other
1079,llama_2_13b,0.707,raft,helm_classic_240130,[],other
1080,tnlg_v2_530b,0.679,raft,helm_classic_240130,[],other
1081,gpt_3.5_turbo_0613,0.748,raft,helm_classic_240130,[],other
1082,llama_30b,0.752,raft,helm_classic_240130,[],other
1083,anthropic_lm_v4_s3_52b,0.699,raft,helm_classic_240130,[],other
1084,gpt_3.5_turbo_0301,0.768,raft,helm_classic_240130,[],other
1085,jurassic_2_grande_17b,0.712,raft,helm_classic_240130,[],other
1086,palmyra_x_43b,0.701,raft,helm_classic_240130,[],other
1087,falcon_40b,0.661,raft,helm_classic_240130,[],other
1088,falcon_instruct_40b,0.586,raft,helm_classic_240130,[],other
1089,mpt_instruct_30b,0.68,raft,helm_classic_240130,[],other
1090,mpt_30b,0.723,raft,helm_classic_240130,[],other
1091,j1_grande_v2_beta_17b,0.679,raft,helm_classic_240130,[],other
1092,vicuna_v1.3_13b,0.657,raft,helm_classic_240130,[],other
1093,cohere_command_beta_6.1b,0.634,raft,helm_classic_240130,[],other
1094,cohere_xlarge_v20221108_52.4b,0.624,raft,helm_classic_240130,[],other
1095,luminous_supreme_70b,0.653,raft,helm_classic_240130,[],other
1096,vicuna_v1.3_7b,0.693,raft,helm_classic_240130,[],other
1097,opt_175b,0.606,raft,helm_classic_240130,[],other
1098,llama_2_7b,0.643,raft,helm_classic_240130,[],other
1099,llama_13b,0.643,raft,helm_classic_240130,[],other
1100,instructpalmyra_30b,0.652,raft,helm_classic_240130,[],other
1101,cohere_xlarge_v20220609_52.4b,0.633,raft,helm_classic_240130,[],other
1102,jurassic_2_large_7.5b,0.622,raft,helm_classic_240130,[],other
1103,davinci_175b,0.642,raft,helm_classic_240130,[],other
1104,llama_7b,0.573,raft,helm_classic_240130,[],other
1105,redpajama_incite_instruct_7b,0.695,raft,helm_classic_240130,[],other
1106,j1_jumbo_v1_178b,0.681,raft,helm_classic_240130,[],other
1107,glm_130b,0.598,raft,helm_classic_240130,[],other
1108,luminous_extended_30b,0.523,raft,helm_classic_240130,[],other
1109,opt_66b,0.557,raft,helm_classic_240130,[],other
1110,bloom_176b,0.592,raft,helm_classic_240130,[],other
1111,j1_grande_v1_17b,0.658,raft,helm_classic_240130,[],other
1112,alpaca_7b,0.486,raft,helm_classic_240130,[],other
1113,falcon_7b,0.602,raft,helm_classic_240130,[],other
1114,redpajama_incite_base_7b,0.648,raft,helm_classic_240130,[],other
1115,cohere_large_v20220720_13.1b,0.596,raft,helm_classic_240130,[],other
1116,redpajama_incite_instruct_v1_3b,0.661,raft,helm_classic_240130,[],other
1117,text_curie_001,0.489,raft,helm_classic_240130,[],other
1118,gpt_neox_20b,0.505,raft,helm_classic_240130,[],other
1119,luminous_base_13b,0.473,raft,helm_classic_240130,[],other
1120,cohere_medium_v20221108_6.1b,0.591,raft,helm_classic_240130,[],other
1121,redpajama_incite_base_v1_3b,0.502,raft,helm_classic_240130,[],other
1122,tnlg_v2_6.7b,0.525,raft,helm_classic_240130,[],other
1123,j1_large_v1_7.5b,0.545,raft,helm_classic_240130,[],other
1124,gpt_j_6b,0.619,raft,helm_classic_240130,[],other
1125,pythia_12b,0.514,raft,helm_classic_240130,[],other
1126,curie_6.7b,0.49,raft,helm_classic_240130,[],other
1127,falcon_instruct_7b,0.523,raft,helm_classic_240130,[],other
1128,cohere_medium_v20220720_6.1b,0.52,raft,helm_classic_240130,[],other
1129,text_babbage_001,0.509,raft,helm_classic_240130,[],other
1130,t0pp_11b,0.118,raft,helm_classic_240130,[],other
1131,pythia_6.9b,0.502,raft,helm_classic_240130,[],other
1132,ul2_20b,0.404,raft,helm_classic_240130,[],other
1133,t5_11b,0.37,raft,helm_classic_240130,[],other
1134,babbage_1.3b,0.455,raft,helm_classic_240130,[],other
1135,cohere_small_v20220720_410m,0.492,raft,helm_classic_240130,[],other
1136,ada_350m,0.423,raft,helm_classic_240130,[],other
1137,text_ada_001,0.406,raft,helm_classic_240130,[],other
1138,yalm_100b,0.395,raft,helm_classic_240130,[],other
0,phi_1,1.1,grounding,biggen_240612,[],other
1,phi_1_5,2.425,grounding,biggen_240612,[],other
2,phi_2,3.05,grounding,biggen_240612,[],other
3,qwen1.5_0.5b,1.85,grounding,biggen_240612,[],other
4,qwen1.5_1.8b,2.425,grounding,biggen_240612,[],other
5,qwen1.5_4b,2.85,grounding,biggen_240612,[],other
6,gemma_2b,2.163,grounding,biggen_240612,[],other
7,olmo_1b,1.675,grounding,biggen_240612,[],other
8,qwen1.5_0.5b_chat,2.075,grounding,biggen_240612,[],other
9,qwen1.5_1.8b_chat,2.75,grounding,biggen_240612,[],other
10,qwen1.5_4b_chat,2.862,grounding,biggen_240612,[],other
11,phi_3_mini_4k_instruct,3.675,grounding,biggen_240612,[],other
12,phi_3_mini_128k_instruct,3.5,grounding,biggen_240612,[],other
13,gemma_2b_it,2.825,grounding,biggen_240612,[],other
14,gemma_1.1_2b_it,2.812,grounding,biggen_240612,[],other
15,gemma_7b,1.288,grounding,biggen_240612,[],other
16,mistral_7b_v0.1,3.15,grounding,biggen_240612,[],other
17,mistral_7b_v0.2,3.038,grounding,biggen_240612,[],other
18,qwen1.5_7b,2.9,grounding,biggen_240612,[],other
19,yi_6b,2.688,grounding,biggen_240612,[],other
20,llama_2_7b,2.325,grounding,biggen_240612,[],other
21,codellama_7b,1.875,grounding,biggen_240612,[],other
22,meta_llama_3_8b,3.025,grounding,biggen_240612,[],other
23,llemma_7b,2.237,grounding,biggen_240612,[],other
24,olmo_7b,2.075,grounding,biggen_240612,[],other
25,gemma_7b_it,3.212,grounding,biggen_240612,[],other
26,gemma_1.1_7b_it,3.5,grounding,biggen_240612,[],other
27,mistral_7b_instruct_v0.2,3.612,grounding,biggen_240612,[],other
28,qwen1.5_7b_chat,3.575,grounding,biggen_240612,[],other
29,yi_6b_chat,3.062,grounding,biggen_240612,[],other
30,llama_2_7b_chat,3.25,grounding,biggen_240612,[],other
31,codellama_7b_instruct,3.1,grounding,biggen_240612,[],other
32,meta_llama_3_8b_instruct,3.975,grounding,biggen_240612,[],other
33,olmo_7b_sft,2.825,grounding,biggen_240612,[],other
34,olmo_7b_instruct,2.925,grounding,biggen_240612,[],other
35,tulu_2_7b,2.788,grounding,biggen_240612,[],other
36,tulu_2_dpo_7b,3.2,grounding,biggen_240612,[],other
37,codetulu_2_7b,2.862,grounding,biggen_240612,[],other
38,orca_2_7b,2.3,grounding,biggen_240612,[],other
39,openchat_3.5_0106,3.575,grounding,biggen_240612,[],other
40,openhermes_2_mistral_7b,3.388,grounding,biggen_240612,[],other
41,openhermes_2.5_mistral_7b,3.3,grounding,biggen_240612,[],other
42,nous_hermes_2_mistral_7b_dpo,3.525,grounding,biggen_240612,[],other
43,starling_lm_7b_alpha,3.638,grounding,biggen_240612,[],other
44,starling_lm_7b_beta,3.737,grounding,biggen_240612,[],other
45,mistral_orpo_alpha,3.35,grounding,biggen_240612,[],other
46,mistral_orpo_beta,3.487,grounding,biggen_240612,[],other
47,zephyr_7b_beta,3.362,grounding,biggen_240612,[],other
48,qwen1.5_14b,3.413,grounding,biggen_240612,[],other
49,llama_2_13b,2.763,grounding,biggen_240612,[],other
50,codellama_13b,2.2,grounding,biggen_240612,[],other
51,solar_10.7b_v1.0,3.212,grounding,biggen_240612,[],other
52,qwen1.5_14b_chat,3.612,grounding,biggen_240612,[],other
53,solar_10.7b_instruct_v1.0,3.663,grounding,biggen_240612,[],other
54,aya_101,1.25,grounding,biggen_240612,[],other
55,llama_2_13b_chat,3.538,grounding,biggen_240612,[],other
56,codellama_13b_instruct,3.075,grounding,biggen_240612,[],other
57,tulu_2_13b,2.975,grounding,biggen_240612,[],other
58,tulu_2_dpo_13b,3.487,grounding,biggen_240612,[],other
59,codetulu_2_13b,3.1,grounding,biggen_240612,[],other
60,orca_2_13b,2.825,grounding,biggen_240612,[],other
61,yi_34b,3.388,grounding,biggen_240612,[],other
62,llemma_34b,2.812,grounding,biggen_240612,[],other
63,qwen1.5_32b,3.3,grounding,biggen_240612,[],other
64,codellama_34b,2.65,grounding,biggen_240612,[],other
65,mixtral_8x7b_v0.1,3.663,grounding,biggen_240612,[],other
66,yi_34b_chat,3.7,grounding,biggen_240612,[],other
67,nous_hermes_2_yi_34b,3.175,grounding,biggen_240612,[],other
68,codellama_34b_instruct,3.337,grounding,biggen_240612,[],other
69,codetulu_2_34b,3.275,grounding,biggen_240612,[],other
70,qwen1.5_32b_chat,3.712,grounding,biggen_240612,[],other
71,mixtral_8x7b_instruct_v0.1,3.862,grounding,biggen_240612,[],other
72,nous_hermes_2_mixtral_8x7b_sft,3.587,grounding,biggen_240612,[],other
73,nous_hermes_2_mixtral_8x7b_dpo,3.612,grounding,biggen_240612,[],other
74,c4ai_command_r_v01,3.688,grounding,biggen_240612,[],other
75,llama_2_70b,3.288,grounding,biggen_240612,[],other
76,codellama_70b,2.812,grounding,biggen_240612,[],other
77,mixtral_8x22b_v0.1_awq,3.475,grounding,biggen_240612,[],other
78,meta_llama_3_70b,3.263,grounding,biggen_240612,[],other
79,qwen1.5_72b,3.362,grounding,biggen_240612,[],other
80,llama_2_70b_chat,3.612,grounding,biggen_240612,[],other
81,codellama_70b_instruct,2.913,grounding,biggen_240612,[],other
82,tulu_2_dpo_70b,3.7,grounding,biggen_240612,[],other
83,c4ai_command_r_plus_gptq,3.788,grounding,biggen_240612,[],other
84,meta_llama_3_70b_instruct,4.013,grounding,biggen_240612,[],other
85,mixtral_8x22b_instruct_v0.1_awq,3.812,grounding,biggen_240612,[],other
86,zephyr_orpo_141b_a35b_v0.1_awq,3.425,grounding,biggen_240612,[],other
87,qwen1.5_72b_chat,3.938,grounding,biggen_240612,[],other
88,qwen_110b_chat,4.025,grounding,biggen_240612,[],other
89,gpt_3.5_turbo_1106,3.875,grounding,biggen_240612,[],other
90,gpt_3.5_turbo_0125,3.737,grounding,biggen_240612,[],other
91,gpt_4_1106_preview,4.237,grounding,biggen_240612,[],other
92,gpt_4_0125_preview,4.2,grounding,biggen_240612,[],other
93,gpt_4_turbo_2024_04_09,4.188,grounding,biggen_240612,[],other
94,gpt_4o_2024_05_13,4.088,grounding,biggen_240612,[],other
95,mistral_medium_hjpark,3.938,grounding,biggen_240612,[],other
96,mistral_large_hjpark,3.913,grounding,biggen_240612,[],other
97,gemini_1.0_pro,3.6,grounding,biggen_240612,[],other
98,gemini_pro_1.5,3.938,grounding,biggen_240612,[],other
99,gemini_flash_1.5,4.112,grounding,biggen_240612,[],other
100,claude_3_haiku_20240307,4.1,grounding,biggen_240612,[],other
101,claude_3_sonnet_20240229,4.05,grounding,biggen_240612,[],other
102,claude_3_opus_20240229,4.088,grounding,biggen_240612,[],other
103,phi_1,1.0,instruction_following,biggen_240612,[],other
104,phi_1_5,2.77,instruction_following,biggen_240612,[],other
105,phi_2,2.86,instruction_following,biggen_240612,[],other
106,qwen1.5_0.5b,2.06,instruction_following,biggen_240612,[],other
107,qwen1.5_1.8b,2.79,instruction_following,biggen_240612,[],other
108,qwen1.5_4b,2.82,instruction_following,biggen_240612,[],other
109,gemma_2b,2.61,instruction_following,biggen_240612,[],other
110,olmo_1b,1.7,instruction_following,biggen_240612,[],other
111,qwen1.5_0.5b_chat,2.36,instruction_following,biggen_240612,[],other
112,qwen1.5_1.8b_chat,3.09,instruction_following,biggen_240612,[],other
113,qwen1.5_4b_chat,2.99,instruction_following,biggen_240612,[],other
114,phi_3_mini_4k_instruct,3.82,instruction_following,biggen_240612,[],other
115,phi_3_mini_128k_instruct,3.66,instruction_following,biggen_240612,[],other
116,gemma_2b_it,3.12,instruction_following,biggen_240612,[],other
117,gemma_1.1_2b_it,3.21,instruction_following,biggen_240612,[],other
118,gemma_7b,1.53,instruction_following,biggen_240612,[],other
119,mistral_7b_v0.1,3.22,instruction_following,biggen_240612,[],other
120,mistral_7b_v0.2,3.31,instruction_following,biggen_240612,[],other
121,qwen1.5_7b,3.03,instruction_following,biggen_240612,[],other
122,yi_6b,2.77,instruction_following,biggen_240612,[],other
123,llama_2_7b,2.73,instruction_following,biggen_240612,[],other
124,codellama_7b,2.01,instruction_following,biggen_240612,[],other
125,meta_llama_3_8b,2.84,instruction_following,biggen_240612,[],other
126,llemma_7b,2.44,instruction_following,biggen_240612,[],other
127,olmo_7b,2.23,instruction_following,biggen_240612,[],other
128,gemma_7b_it,3.31,instruction_following,biggen_240612,[],other
129,gemma_1.1_7b_it,3.47,instruction_following,biggen_240612,[],other
130,mistral_7b_instruct_v0.2,3.74,instruction_following,biggen_240612,[],other
131,qwen1.5_7b_chat,3.83,instruction_following,biggen_240612,[],other
132,yi_6b_chat,3.5,instruction_following,biggen_240612,[],other
133,llama_2_7b_chat,3.55,instruction_following,biggen_240612,[],other
134,codellama_7b_instruct,3.26,instruction_following,biggen_240612,[],other
135,meta_llama_3_8b_instruct,3.75,instruction_following,biggen_240612,[],other
136,olmo_7b_sft,3.18,instruction_following,biggen_240612,[],other
137,olmo_7b_instruct,3.29,instruction_following,biggen_240612,[],other
138,tulu_2_7b,3.35,instruction_following,biggen_240612,[],other
139,tulu_2_dpo_7b,3.64,instruction_following,biggen_240612,[],other
140,codetulu_2_7b,3.11,instruction_following,biggen_240612,[],other
141,orca_2_7b,2.23,instruction_following,biggen_240612,[],other
142,openchat_3.5_0106,3.73,instruction_following,biggen_240612,[],other
143,openhermes_2_mistral_7b,3.53,instruction_following,biggen_240612,[],other
144,openhermes_2.5_mistral_7b,3.34,instruction_following,biggen_240612,[],other
145,nous_hermes_2_mistral_7b_dpo,3.61,instruction_following,biggen_240612,[],other
146,starling_lm_7b_alpha,3.62,instruction_following,biggen_240612,[],other
147,starling_lm_7b_beta,3.82,instruction_following,biggen_240612,[],other
148,mistral_orpo_alpha,3.53,instruction_following,biggen_240612,[],other
149,mistral_orpo_beta,3.76,instruction_following,biggen_240612,[],other
150,zephyr_7b_beta,3.69,instruction_following,biggen_240612,[],other
151,qwen1.5_14b,3.41,instruction_following,biggen_240612,[],other
152,llama_2_13b,2.99,instruction_following,biggen_240612,[],other
153,codellama_13b,2.08,instruction_following,biggen_240612,[],other
154,solar_10.7b_v1.0,3.53,instruction_following,biggen_240612,[],other
155,qwen1.5_14b_chat,3.84,instruction_following,biggen_240612,[],other
156,solar_10.7b_instruct_v1.0,3.73,instruction_following,biggen_240612,[],other
157,aya_101,1.33,instruction_following,biggen_240612,[],other
158,llama_2_13b_chat,3.72,instruction_following,biggen_240612,[],other
159,codellama_13b_instruct,3.13,instruction_following,biggen_240612,[],other
160,tulu_2_13b,3.4,instruction_following,biggen_240612,[],other
161,tulu_2_dpo_13b,3.65,instruction_following,biggen_240612,[],other
162,codetulu_2_13b,3.33,instruction_following,biggen_240612,[],other
163,orca_2_13b,2.45,instruction_following,biggen_240612,[],other
164,yi_34b,3.47,instruction_following,biggen_240612,[],other
165,llemma_34b,2.74,instruction_following,biggen_240612,[],other
166,qwen1.5_32b,3.63,instruction_following,biggen_240612,[],other
167,codellama_34b,2.49,instruction_following,biggen_240612,[],other
168,mixtral_8x7b_v0.1,3.45,instruction_following,biggen_240612,[],other
169,yi_34b_chat,3.79,instruction_following,biggen_240612,[],other
170,nous_hermes_2_yi_34b,3.65,instruction_following,biggen_240612,[],other
171,codellama_34b_instruct,3.5,instruction_following,biggen_240612,[],other
172,codetulu_2_34b,3.44,instruction_following,biggen_240612,[],other
173,qwen1.5_32b_chat,3.92,instruction_following,biggen_240612,[],other
174,mixtral_8x7b_instruct_v0.1,3.95,instruction_following,biggen_240612,[],other
175,nous_hermes_2_mixtral_8x7b_sft,3.7,instruction_following,biggen_240612,[],other
176,nous_hermes_2_mixtral_8x7b_dpo,3.83,instruction_following,biggen_240612,[],other
177,c4ai_command_r_v01,3.67,instruction_following,biggen_240612,[],other
178,llama_2_70b,3.4,instruction_following,biggen_240612,[],other
179,codellama_70b,2.46,instruction_following,biggen_240612,[],other
180,mixtral_8x22b_v0.1_awq,3.59,instruction_following,biggen_240612,[],other
181,meta_llama_3_70b,3.26,instruction_following,biggen_240612,[],other
182,qwen1.5_72b,3.5,instruction_following,biggen_240612,[],other
183,llama_2_70b_chat,3.71,instruction_following,biggen_240612,[],other
184,codellama_70b_instruct,2.53,instruction_following,biggen_240612,[],other
185,tulu_2_dpo_70b,3.79,instruction_following,biggen_240612,[],other
186,c4ai_command_r_plus_gptq,3.89,instruction_following,biggen_240612,[],other
187,meta_llama_3_70b_instruct,4.02,instruction_following,biggen_240612,[],other
188,mixtral_8x22b_instruct_v0.1_awq,3.91,instruction_following,biggen_240612,[],other
189,zephyr_orpo_141b_a35b_v0.1_awq,3.57,instruction_following,biggen_240612,[],other
190,qwen1.5_72b_chat,4.0,instruction_following,biggen_240612,[],other
191,qwen_110b_chat,3.89,instruction_following,biggen_240612,[],other
192,gpt_3.5_turbo_1106,3.73,instruction_following,biggen_240612,[],other
193,gpt_3.5_turbo_0125,3.74,instruction_following,biggen_240612,[],other
194,gpt_4_1106_preview,4.23,instruction_following,biggen_240612,[],other
195,gpt_4_0125_preview,4.12,instruction_following,biggen_240612,[],other
196,gpt_4_turbo_2024_04_09,4.04,instruction_following,biggen_240612,[],other
197,gpt_4o_2024_05_13,4.1,instruction_following,biggen_240612,[],other
198,mistral_medium_hjpark,3.88,instruction_following,biggen_240612,[],other
199,mistral_large_hjpark,3.82,instruction_following,biggen_240612,[],other
200,gemini_1.0_pro,3.67,instruction_following,biggen_240612,[],other
201,gemini_pro_1.5,3.91,instruction_following,biggen_240612,[],other
202,gemini_flash_1.5,3.78,instruction_following,biggen_240612,[],other
203,claude_3_haiku_20240307,4.0,instruction_following,biggen_240612,[],other
204,claude_3_sonnet_20240229,3.84,instruction_following,biggen_240612,[],other
205,claude_3_opus_20240229,4.0,instruction_following,biggen_240612,[],other
206,phi_1,1.0,planning,biggen_240612,[],other
207,phi_1_5,2.314,planning,biggen_240612,[],other
208,phi_2,2.6,planning,biggen_240612,[],other
209,qwen1.5_0.5b,1.471,planning,biggen_240612,[],other
210,qwen1.5_1.8b,2.214,planning,biggen_240612,[],other
211,qwen1.5_4b,2.557,planning,biggen_240612,[],other
212,gemma_2b,2.129,planning,biggen_240612,[],other
213,olmo_1b,1.343,planning,biggen_240612,[],other
214,qwen1.5_0.5b_chat,1.957,planning,biggen_240612,[],other
215,qwen1.5_1.8b_chat,2.629,planning,biggen_240612,[],other
216,qwen1.5_4b_chat,2.914,planning,biggen_240612,[],other
217,phi_3_mini_4k_instruct,3.486,planning,biggen_240612,[],other
218,phi_3_mini_128k_instruct,3.5,planning,biggen_240612,[],other
219,gemma_2b_it,3.0,planning,biggen_240612,[],other
220,gemma_1.1_2b_it,3.0,planning,biggen_240612,[],other
221,gemma_7b,1.171,planning,biggen_240612,[],other
222,mistral_7b_v0.1,3.029,planning,biggen_240612,[],other
223,mistral_7b_v0.2,2.871,planning,biggen_240612,[],other
224,qwen1.5_7b,2.814,planning,biggen_240612,[],other
225,yi_6b,2.271,planning,biggen_240612,[],other
226,llama_2_7b,2.4,planning,biggen_240612,[],other
227,codellama_7b,1.586,planning,biggen_240612,[],other
228,meta_llama_3_8b,2.414,planning,biggen_240612,[],other
229,llemma_7b,1.971,planning,biggen_240612,[],other
230,olmo_7b,1.757,planning,biggen_240612,[],other
231,gemma_7b_it,2.857,planning,biggen_240612,[],other
232,gemma_1.1_7b_it,3.143,planning,biggen_240612,[],other
233,mistral_7b_instruct_v0.2,3.7,planning,biggen_240612,[],other
234,qwen1.5_7b_chat,3.471,planning,biggen_240612,[],other
235,yi_6b_chat,3.171,planning,biggen_240612,[],other
236,llama_2_7b_chat,3.286,planning,biggen_240612,[],other
237,codellama_7b_instruct,2.914,planning,biggen_240612,[],other
238,meta_llama_3_8b_instruct,3.714,planning,biggen_240612,[],other
239,olmo_7b_sft,2.843,planning,biggen_240612,[],other
240,olmo_7b_instruct,2.986,planning,biggen_240612,[],other
241,tulu_2_7b,3.129,planning,biggen_240612,[],other
242,tulu_2_dpo_7b,3.229,planning,biggen_240612,[],other
243,codetulu_2_7b,2.929,planning,biggen_240612,[],other
244,orca_2_7b,1.3,planning,biggen_240612,[],other
245,openchat_3.5_0106,3.643,planning,biggen_240612,[],other
246,openhermes_2_mistral_7b,3.529,planning,biggen_240612,[],other
247,openhermes_2.5_mistral_7b,3.457,planning,biggen_240612,[],other
248,nous_hermes_2_mistral_7b_dpo,3.514,planning,biggen_240612,[],other
249,starling_lm_7b_alpha,3.557,planning,biggen_240612,[],other
250,starling_lm_7b_beta,3.671,planning,biggen_240612,[],other
251,mistral_orpo_alpha,3.329,planning,biggen_240612,[],other
252,mistral_orpo_beta,3.3,planning,biggen_240612,[],other
253,zephyr_7b_beta,3.571,planning,biggen_240612,[],other
254,qwen1.5_14b,2.9,planning,biggen_240612,[],other
255,llama_2_13b,2.629,planning,biggen_240612,[],other
256,codellama_13b,1.814,planning,biggen_240612,[],other
257,solar_10.7b_v1.0,3.057,planning,biggen_240612,[],other
258,qwen1.5_14b_chat,3.657,planning,biggen_240612,[],other
259,solar_10.7b_instruct_v1.0,3.614,planning,biggen_240612,[],other
260,aya_101,1.357,planning,biggen_240612,[],other
261,llama_2_13b_chat,3.4,planning,biggen_240612,[],other
262,codellama_13b_instruct,3.086,planning,biggen_240612,[],other
263,tulu_2_13b,3.371,planning,biggen_240612,[],other
264,tulu_2_dpo_13b,3.371,planning,biggen_240612,[],other
265,codetulu_2_13b,3.1,planning,biggen_240612,[],other
266,orca_2_13b,1.6,planning,biggen_240612,[],other
267,yi_34b,3.243,planning,biggen_240612,[],other
268,llemma_34b,2.529,planning,biggen_240612,[],other
269,qwen1.5_32b,3.229,planning,biggen_240612,[],other
270,codellama_34b,2.257,planning,biggen_240612,[],other
271,mixtral_8x7b_v0.1,3.286,planning,biggen_240612,[],other
272,yi_34b_chat,3.729,planning,biggen_240612,[],other
273,nous_hermes_2_yi_34b,3.543,planning,biggen_240612,[],other
274,codellama_34b_instruct,3.171,planning,biggen_240612,[],other
275,codetulu_2_34b,3.5,planning,biggen_240612,[],other
276,qwen1.5_32b_chat,3.829,planning,biggen_240612,[],other
277,mixtral_8x7b_instruct_v0.1,3.457,planning,biggen_240612,[],other
278,nous_hermes_2_mixtral_8x7b_sft,3.586,planning,biggen_240612,[],other
279,nous_hermes_2_mixtral_8x7b_dpo,3.657,planning,biggen_240612,[],other
280,c4ai_command_r_v01,3.643,planning,biggen_240612,[],other
281,llama_2_70b,3.2,planning,biggen_240612,[],other
282,codellama_70b,2.357,planning,biggen_240612,[],other
283,mixtral_8x22b_v0.1_awq,3.457,planning,biggen_240612,[],other
284,meta_llama_3_70b,2.8,planning,biggen_240612,[],other
285,qwen1.5_72b,3.186,planning,biggen_240612,[],other
286,llama_2_70b_chat,3.671,planning,biggen_240612,[],other
287,codellama_70b_instruct,2.5,planning,biggen_240612,[],other
288,tulu_2_dpo_70b,3.886,planning,biggen_240612,[],other
289,c4ai_command_r_plus_gptq,3.914,planning,biggen_240612,[],other
290,meta_llama_3_70b_instruct,3.929,planning,biggen_240612,[],other
291,mixtral_8x22b_instruct_v0.1_awq,3.729,planning,biggen_240612,[],other
292,zephyr_orpo_141b_a35b_v0.1_awq,3.8,planning,biggen_240612,[],other
293,qwen1.5_72b_chat,3.814,planning,biggen_240612,[],other
294,qwen_110b_chat,3.957,planning,biggen_240612,[],other
295,gpt_3.5_turbo_1106,3.871,planning,biggen_240612,[],other
296,gpt_3.5_turbo_0125,3.871,planning,biggen_240612,[],other
297,gpt_4_1106_preview,4.157,planning,biggen_240612,[],other
298,gpt_4_0125_preview,4.243,planning,biggen_240612,[],other
299,gpt_4_turbo_2024_04_09,4.029,planning,biggen_240612,[],other
300,gpt_4o_2024_05_13,4.086,planning,biggen_240612,[],other
301,mistral_medium_hjpark,3.914,planning,biggen_240612,[],other
302,mistral_large_hjpark,3.9,planning,biggen_240612,[],other
303,gemini_1.0_pro,3.714,planning,biggen_240612,[],other
304,gemini_pro_1.5,3.929,planning,biggen_240612,[],other
305,gemini_flash_1.5,3.771,planning,biggen_240612,[],other
306,claude_3_haiku_20240307,4.043,planning,biggen_240612,[],other
307,claude_3_sonnet_20240229,4.057,planning,biggen_240612,[],other
308,claude_3_opus_20240229,4.1,planning,biggen_240612,[],other
309,phi_1,1.0,reasoning,biggen_240612,[],reasoning
310,phi_1_5,2.13,reasoning,biggen_240612,[],reasoning
311,phi_2,2.7,reasoning,biggen_240612,[],reasoning
312,qwen1.5_0.5b,1.5,reasoning,biggen_240612,[],reasoning
313,qwen1.5_1.8b,1.83,reasoning,biggen_240612,[],reasoning
314,qwen1.5_4b,2.3,reasoning,biggen_240612,[],reasoning
315,gemma_2b,1.99,reasoning,biggen_240612,[],reasoning
316,olmo_1b,1.33,reasoning,biggen_240612,[],reasoning
317,qwen1.5_0.5b_chat,1.68,reasoning,biggen_240612,[],reasoning
318,qwen1.5_1.8b_chat,2.28,reasoning,biggen_240612,[],reasoning
319,qwen1.5_4b_chat,2.69,reasoning,biggen_240612,[],reasoning
320,phi_3_mini_4k_instruct,3.59,reasoning,biggen_240612,[],reasoning
321,phi_3_mini_128k_instruct,3.61,reasoning,biggen_240612,[],reasoning
322,gemma_2b_it,2.39,reasoning,biggen_240612,[],reasoning
323,gemma_1.1_2b_it,2.49,reasoning,biggen_240612,[],reasoning
324,gemma_7b,1.28,reasoning,biggen_240612,[],reasoning
325,mistral_7b_v0.1,2.75,reasoning,biggen_240612,[],reasoning
326,mistral_7b_v0.2,2.65,reasoning,biggen_240612,[],reasoning
327,qwen1.5_7b,2.37,reasoning,biggen_240612,[],reasoning
328,yi_6b,2.25,reasoning,biggen_240612,[],reasoning
329,llama_2_7b,2.03,reasoning,biggen_240612,[],reasoning
330,codellama_7b,1.57,reasoning,biggen_240612,[],reasoning
331,meta_llama_3_8b,2.32,reasoning,biggen_240612,[],reasoning
332,llemma_7b,2.07,reasoning,biggen_240612,[],reasoning
333,olmo_7b,1.76,reasoning,biggen_240612,[],reasoning
334,gemma_7b_it,2.88,reasoning,biggen_240612,[],reasoning
335,gemma_1.1_7b_it,3.05,reasoning,biggen_240612,[],reasoning
336,mistral_7b_instruct_v0.2,3.06,reasoning,biggen_240612,[],reasoning
337,qwen1.5_7b_chat,3.02,reasoning,biggen_240612,[],reasoning
338,yi_6b_chat,2.61,reasoning,biggen_240612,[],reasoning
339,llama_2_7b_chat,2.72,reasoning,biggen_240612,[],reasoning
340,codellama_7b_instruct,2.52,reasoning,biggen_240612,[],reasoning
341,meta_llama_3_8b_instruct,3.32,reasoning,biggen_240612,[],reasoning
342,olmo_7b_sft,2.37,reasoning,biggen_240612,[],reasoning
343,olmo_7b_instruct,2.38,reasoning,biggen_240612,[],reasoning
344,tulu_2_7b,2.57,reasoning,biggen_240612,[],reasoning
345,tulu_2_dpo_7b,2.68,reasoning,biggen_240612,[],reasoning
346,codetulu_2_7b,2.56,reasoning,biggen_240612,[],reasoning
347,orca_2_7b,1.75,reasoning,biggen_240612,[],reasoning
348,openchat_3.5_0106,3.23,reasoning,biggen_240612,[],reasoning
349,openhermes_2_mistral_7b,3.09,reasoning,biggen_240612,[],reasoning
350,openhermes_2.5_mistral_7b,3.12,reasoning,biggen_240612,[],reasoning
351,nous_hermes_2_mistral_7b_dpo,3.11,reasoning,biggen_240612,[],reasoning
352,starling_lm_7b_alpha,3.24,reasoning,biggen_240612,[],reasoning
353,starling_lm_7b_beta,3.46,reasoning,biggen_240612,[],reasoning
354,mistral_orpo_alpha,2.93,reasoning,biggen_240612,[],reasoning
355,mistral_orpo_beta,2.96,reasoning,biggen_240612,[],reasoning
356,zephyr_7b_beta,3.08,reasoning,biggen_240612,[],reasoning
357,qwen1.5_14b,2.77,reasoning,biggen_240612,[],reasoning
358,llama_2_13b,2.17,reasoning,biggen_240612,[],reasoning
359,codellama_13b,1.89,reasoning,biggen_240612,[],reasoning
360,solar_10.7b_v1.0,2.72,reasoning,biggen_240612,[],reasoning
361,qwen1.5_14b_chat,3.38,reasoning,biggen_240612,[],reasoning
362,solar_10.7b_instruct_v1.0,3.23,reasoning,biggen_240612,[],reasoning
363,aya_101,1.34,reasoning,biggen_240612,[],reasoning
364,llama_2_13b_chat,2.61,reasoning,biggen_240612,[],reasoning
365,codellama_13b_instruct,2.78,reasoning,biggen_240612,[],reasoning
366,tulu_2_13b,2.7,reasoning,biggen_240612,[],reasoning
367,tulu_2_dpo_13b,2.8,reasoning,biggen_240612,[],reasoning
368,codetulu_2_13b,2.62,reasoning,biggen_240612,[],reasoning
369,orca_2_13b,2.22,reasoning,biggen_240612,[],reasoning
370,yi_34b,3.06,reasoning,biggen_240612,[],reasoning
371,llemma_34b,2.56,reasoning,biggen_240612,[],reasoning
372,qwen1.5_32b,3.07,reasoning,biggen_240612,[],reasoning
373,codellama_34b,2.0,reasoning,biggen_240612,[],reasoning
374,mixtral_8x7b_v0.1,3.13,reasoning,biggen_240612,[],reasoning
375,yi_34b_chat,3.25,reasoning,biggen_240612,[],reasoning
376,nous_hermes_2_yi_34b,3.3,reasoning,biggen_240612,[],reasoning
377,codellama_34b_instruct,2.95,reasoning,biggen_240612,[],reasoning
378,codetulu_2_34b,2.97,reasoning,biggen_240612,[],reasoning
379,qwen1.5_32b_chat,3.47,reasoning,biggen_240612,[],reasoning
380,mixtral_8x7b_instruct_v0.1,3.58,reasoning,biggen_240612,[],reasoning
381,nous_hermes_2_mixtral_8x7b_sft,3.29,reasoning,biggen_240612,[],reasoning
382,nous_hermes_2_mixtral_8x7b_dpo,3.42,reasoning,biggen_240612,[],reasoning
383,c4ai_command_r_v01,3.25,reasoning,biggen_240612,[],reasoning
384,llama_2_70b,2.86,reasoning,biggen_240612,[],reasoning
385,codellama_70b,2.35,reasoning,biggen_240612,[],reasoning
386,mixtral_8x22b_v0.1_awq,3.48,reasoning,biggen_240612,[],reasoning
387,meta_llama_3_70b,2.88,reasoning,biggen_240612,[],reasoning
388,qwen1.5_72b,3.2,reasoning,biggen_240612,[],reasoning
389,llama_2_70b_chat,3.1,reasoning,biggen_240612,[],reasoning
390,codellama_70b_instruct,2.56,reasoning,biggen_240612,[],reasoning
391,tulu_2_dpo_70b,3.12,reasoning,biggen_240612,[],reasoning
392,c4ai_command_r_plus_gptq,3.48,reasoning,biggen_240612,[],reasoning
393,meta_llama_3_70b_instruct,3.77,reasoning,biggen_240612,[],reasoning
394,mixtral_8x22b_instruct_v0.1_awq,3.76,reasoning,biggen_240612,[],reasoning
395,zephyr_orpo_141b_a35b_v0.1_awq,3.42,reasoning,biggen_240612,[],reasoning
396,qwen1.5_72b_chat,3.65,reasoning,biggen_240612,[],reasoning
397,qwen_110b_chat,3.8,reasoning,biggen_240612,[],reasoning
398,gpt_3.5_turbo_1106,3.37,reasoning,biggen_240612,[],reasoning
399,gpt_3.5_turbo_0125,3.58,reasoning,biggen_240612,[],reasoning
400,gpt_4_1106_preview,4.15,reasoning,biggen_240612,[],reasoning
401,gpt_4_0125_preview,4.2,reasoning,biggen_240612,[],reasoning
402,gpt_4_turbo_2024_04_09,4.13,reasoning,biggen_240612,[],reasoning
403,gpt_4o_2024_05_13,4.03,reasoning,biggen_240612,[],reasoning
404,mistral_medium_hjpark,3.89,reasoning,biggen_240612,[],reasoning
405,mistral_large_hjpark,3.78,reasoning,biggen_240612,[],reasoning
406,gemini_1.0_pro,3.61,reasoning,biggen_240612,[],reasoning
407,gemini_pro_1.5,3.89,reasoning,biggen_240612,[],reasoning
408,gemini_flash_1.5,3.85,reasoning,biggen_240612,[],reasoning
409,claude_3_haiku_20240307,3.55,reasoning,biggen_240612,[],reasoning
410,claude_3_sonnet_20240229,3.82,reasoning,biggen_240612,[],reasoning
411,claude_3_opus_20240229,3.9,reasoning,biggen_240612,[],reasoning
412,phi_1,1.303,refinement,biggen_240612,[],other
413,phi_1_5,2.329,refinement,biggen_240612,[],other
414,phi_2,2.789,refinement,biggen_240612,[],other
415,qwen1.5_0.5b,1.934,refinement,biggen_240612,[],other
416,qwen1.5_1.8b,2.408,refinement,biggen_240612,[],other
417,qwen1.5_4b,2.447,refinement,biggen_240612,[],other
418,gemma_2b,1.934,refinement,biggen_240612,[],other
419,olmo_1b,1.737,refinement,biggen_240612,[],other
420,qwen1.5_0.5b_chat,1.776,refinement,biggen_240612,[],other
421,qwen1.5_1.8b_chat,2.553,refinement,biggen_240612,[],other
422,qwen1.5_4b_chat,2.579,refinement,biggen_240612,[],other
423,phi_3_mini_4k_instruct,3.763,refinement,biggen_240612,[],other
424,phi_3_mini_128k_instruct,3.539,refinement,biggen_240612,[],other
425,gemma_2b_it,2.724,refinement,biggen_240612,[],other
426,gemma_1.1_2b_it,2.947,refinement,biggen_240612,[],other
427,gemma_7b,1.474,refinement,biggen_240612,[],other
428,mistral_7b_v0.1,2.566,refinement,biggen_240612,[],other
429,mistral_7b_v0.2,2.579,refinement,biggen_240612,[],other
430,qwen1.5_7b,2.579,refinement,biggen_240612,[],other
431,yi_6b,2.434,refinement,biggen_240612,[],other
432,llama_2_7b,2.092,refinement,biggen_240612,[],other
433,codellama_7b,1.776,refinement,biggen_240612,[],other
434,meta_llama_3_8b,2.829,refinement,biggen_240612,[],other
435,llemma_7b,2.158,refinement,biggen_240612,[],other
436,olmo_7b,1.868,refinement,biggen_240612,[],other
437,gemma_7b_it,3.039,refinement,biggen_240612,[],other
438,gemma_1.1_7b_it,3.158,refinement,biggen_240612,[],other
439,mistral_7b_instruct_v0.2,3.355,refinement,biggen_240612,[],other
440,qwen1.5_7b_chat,3.132,refinement,biggen_240612,[],other
441,yi_6b_chat,2.803,refinement,biggen_240612,[],other
442,llama_2_7b_chat,2.987,refinement,biggen_240612,[],other
443,codellama_7b_instruct,2.671,refinement,biggen_240612,[],other
444,meta_llama_3_8b_instruct,3.408,refinement,biggen_240612,[],other
445,olmo_7b_sft,2.224,refinement,biggen_240612,[],other
446,olmo_7b_instruct,2.539,refinement,biggen_240612,[],other
447,tulu_2_7b,2.789,refinement,biggen_240612,[],other
448,tulu_2_dpo_7b,2.868,refinement,biggen_240612,[],other
449,codetulu_2_7b,2.763,refinement,biggen_240612,[],other
450,orca_2_7b,2.066,refinement,biggen_240612,[],other
451,openchat_3.5_0106,3.408,refinement,biggen_240612,[],other
452,openhermes_2_mistral_7b,3.079,refinement,biggen_240612,[],other
453,openhermes_2.5_mistral_7b,2.855,refinement,biggen_240612,[],other
454,nous_hermes_2_mistral_7b_dpo,3.158,refinement,biggen_240612,[],other
455,starling_lm_7b_alpha,3.092,refinement,biggen_240612,[],other
456,starling_lm_7b_beta,3.421,refinement,biggen_240612,[],other
457,mistral_orpo_alpha,3.184,refinement,biggen_240612,[],other
458,mistral_orpo_beta,2.987,refinement,biggen_240612,[],other
459,zephyr_7b_beta,3.158,refinement,biggen_240612,[],other
460,qwen1.5_14b,2.974,refinement,biggen_240612,[],other
461,llama_2_13b,2.382,refinement,biggen_240612,[],other
462,codellama_13b,1.697,refinement,biggen_240612,[],other
463,solar_10.7b_v1.0,3.092,refinement,biggen_240612,[],other
464,qwen1.5_14b_chat,3.25,refinement,biggen_240612,[],other
465,solar_10.7b_instruct_v1.0,3.289,refinement,biggen_240612,[],other
466,aya_101,1.882,refinement,biggen_240612,[],other
467,llama_2_13b_chat,3.066,refinement,biggen_240612,[],other
468,codellama_13b_instruct,2.526,refinement,biggen_240612,[],other
469,tulu_2_13b,2.803,refinement,biggen_240612,[],other
470,tulu_2_dpo_13b,3.118,refinement,biggen_240612,[],other
471,codetulu_2_13b,2.961,refinement,biggen_240612,[],other
472,orca_2_13b,2.092,refinement,biggen_240612,[],other
473,yi_34b,2.921,refinement,biggen_240612,[],other
474,llemma_34b,2.566,refinement,biggen_240612,[],other
475,qwen1.5_32b,2.921,refinement,biggen_240612,[],other
476,codellama_34b,2.289,refinement,biggen_240612,[],other
477,mixtral_8x7b_v0.1,3.013,refinement,biggen_240612,[],other
478,yi_34b_chat,3.342,refinement,biggen_240612,[],other
479,nous_hermes_2_yi_34b,3.342,refinement,biggen_240612,[],other
480,codellama_34b_instruct,2.776,refinement,biggen_240612,[],other
481,codetulu_2_34b,3.039,refinement,biggen_240612,[],other
482,qwen1.5_32b_chat,3.145,refinement,biggen_240612,[],other
483,mixtral_8x7b_instruct_v0.1,3.329,refinement,biggen_240612,[],other
484,nous_hermes_2_mixtral_8x7b_sft,3.039,refinement,biggen_240612,[],other
485,nous_hermes_2_mixtral_8x7b_dpo,3.303,refinement,biggen_240612,[],other
486,c4ai_command_r_v01,3.316,refinement,biggen_240612,[],other
487,llama_2_70b,2.895,refinement,biggen_240612,[],other
488,codellama_70b,2.408,refinement,biggen_240612,[],other
489,mixtral_8x22b_v0.1_awq,3.237,refinement,biggen_240612,[],other
490,meta_llama_3_70b,3.066,refinement,biggen_240612,[],other
491,qwen1.5_72b,3.013,refinement,biggen_240612,[],other
492,llama_2_70b_chat,3.303,refinement,biggen_240612,[],other
493,codellama_70b_instruct,2.25,refinement,biggen_240612,[],other
494,tulu_2_dpo_70b,3.382,refinement,biggen_240612,[],other
495,c4ai_command_r_plus_gptq,3.447,refinement,biggen_240612,[],other
496,meta_llama_3_70b_instruct,3.776,refinement,biggen_240612,[],other
497,mixtral_8x22b_instruct_v0.1_awq,3.684,refinement,biggen_240612,[],other
498,zephyr_orpo_141b_a35b_v0.1_awq,3.303,refinement,biggen_240612,[],other
499,qwen1.5_72b_chat,3.868,refinement,biggen_240612,[],other
500,qwen_110b_chat,3.842,refinement,biggen_240612,[],other
501,gpt_3.5_turbo_1106,3.105,refinement,biggen_240612,[],other
502,gpt_3.5_turbo_0125,3.539,refinement,biggen_240612,[],other
503,gpt_4_1106_preview,4.263,refinement,biggen_240612,[],other
504,gpt_4_0125_preview,3.961,refinement,biggen_240612,[],other
505,gpt_4_turbo_2024_04_09,4.0,refinement,biggen_240612,[],other
506,gpt_4o_2024_05_13,3.855,refinement,biggen_240612,[],other
507,mistral_medium_hjpark,3.632,refinement,biggen_240612,[],other
508,mistral_large_hjpark,3.684,refinement,biggen_240612,[],other
509,gemini_1.0_pro,2.816,refinement,biggen_240612,[],other
510,gemini_pro_1.5,3.553,refinement,biggen_240612,[],other
511,gemini_flash_1.5,3.513,refinement,biggen_240612,[],other
512,claude_3_haiku_20240307,3.566,refinement,biggen_240612,[],other
513,claude_3_sonnet_20240229,3.658,refinement,biggen_240612,[],other
514,claude_3_opus_20240229,3.947,refinement,biggen_240612,[],other
515,phi_1,1.391,safety,biggen_240612,[],other
516,phi_1_5,2.87,safety,biggen_240612,[],other
517,phi_2,3.406,safety,biggen_240612,[],other
518,qwen1.5_0.5b,2.029,safety,biggen_240612,[],other
519,qwen1.5_1.8b,2.42,safety,biggen_240612,[],other
520,qwen1.5_4b,3.13,safety,biggen_240612,[],other
521,gemma_2b,2.42,safety,biggen_240612,[],other
522,olmo_1b,2.072,safety,biggen_240612,[],other
523,qwen1.5_0.5b_chat,2.594,safety,biggen_240612,[],other
524,qwen1.5_1.8b_chat,2.696,safety,biggen_240612,[],other
525,qwen1.5_4b_chat,3.362,safety,biggen_240612,[],other
526,phi_3_mini_4k_instruct,4.101,safety,biggen_240612,[],other
527,phi_3_mini_128k_instruct,3.986,safety,biggen_240612,[],other
528,gemma_2b_it,3.928,safety,biggen_240612,[],other
529,gemma_1.1_2b_it,3.884,safety,biggen_240612,[],other
530,gemma_7b,2.029,safety,biggen_240612,[],other
531,mistral_7b_v0.1,3.29,safety,biggen_240612,[],other
532,mistral_7b_v0.2,3.304,safety,biggen_240612,[],other
533,qwen1.5_7b,3.087,safety,biggen_240612,[],other
534,yi_6b,3.101,safety,biggen_240612,[],other
535,llama_2_7b,3.188,safety,biggen_240612,[],other
536,codellama_7b,2.377,safety,biggen_240612,[],other
537,meta_llama_3_8b,2.899,safety,biggen_240612,[],other
538,llemma_7b,2.435,safety,biggen_240612,[],other
539,olmo_7b,2.623,safety,biggen_240612,[],other
540,gemma_7b_it,3.768,safety,biggen_240612,[],other
541,gemma_1.1_7b_it,4.043,safety,biggen_240612,[],other
542,mistral_7b_instruct_v0.2,3.986,safety,biggen_240612,[],other
543,qwen1.5_7b_chat,3.928,safety,biggen_240612,[],other
544,yi_6b_chat,3.609,safety,biggen_240612,[],other
545,llama_2_7b_chat,4.261,safety,biggen_240612,[],other
546,codellama_7b_instruct,3.841,safety,biggen_240612,[],other
547,meta_llama_3_8b_instruct,3.652,safety,biggen_240612,[],other
548,olmo_7b_sft,3.435,safety,biggen_240612,[],other
549,olmo_7b_instruct,3.188,safety,biggen_240612,[],other
550,tulu_2_7b,3.797,safety,biggen_240612,[],other
551,tulu_2_dpo_7b,3.797,safety,biggen_240612,[],other
552,codetulu_2_7b,3.348,safety,biggen_240612,[],other
553,orca_2_7b,2.58,safety,biggen_240612,[],other
554,openchat_3.5_0106,3.971,safety,biggen_240612,[],other
555,openhermes_2_mistral_7b,3.203,safety,biggen_240612,[],other
556,openhermes_2.5_mistral_7b,3.101,safety,biggen_240612,[],other
557,nous_hermes_2_mistral_7b_dpo,3.333,safety,biggen_240612,[],other
558,starling_lm_7b_alpha,3.797,safety,biggen_240612,[],other
559,starling_lm_7b_beta,3.841,safety,biggen_240612,[],other
560,mistral_orpo_alpha,3.826,safety,biggen_240612,[],other
561,mistral_orpo_beta,3.609,safety,biggen_240612,[],other
562,zephyr_7b_beta,3.725,safety,biggen_240612,[],other
563,qwen1.5_14b,2.536,safety,biggen_240612,[],other
564,llama_2_13b,3.319,safety,biggen_240612,[],other
565,codellama_13b,2.304,safety,biggen_240612,[],other
566,solar_10.7b_v1.0,3.652,safety,biggen_240612,[],other
567,qwen1.5_14b_chat,4.058,safety,biggen_240612,[],other
568,solar_10.7b_instruct_v1.0,3.826,safety,biggen_240612,[],other
569,aya_101,1.58,safety,biggen_240612,[],other
570,llama_2_13b_chat,4.29,safety,biggen_240612,[],other
571,codellama_13b_instruct,4.116,safety,biggen_240612,[],other
572,tulu_2_13b,3.87,safety,biggen_240612,[],other
573,tulu_2_dpo_13b,3.928,safety,biggen_240612,[],other
574,codetulu_2_13b,3.42,safety,biggen_240612,[],other
575,orca_2_13b,2.913,safety,biggen_240612,[],other
576,yi_34b,3.464,safety,biggen_240612,[],other
577,llemma_34b,2.884,safety,biggen_240612,[],other
578,qwen1.5_32b,3.377,safety,biggen_240612,[],other
579,codellama_34b,2.536,safety,biggen_240612,[],other
580,mixtral_8x7b_v0.1,3.855,safety,biggen_240612,[],other
581,yi_34b_chat,4.087,safety,biggen_240612,[],other
582,nous_hermes_2_yi_34b,3.507,safety,biggen_240612,[],other
583,codellama_34b_instruct,4.145,safety,biggen_240612,[],other
584,codetulu_2_34b,3.739,safety,biggen_240612,[],other
585,qwen1.5_32b_chat,4.116,safety,biggen_240612,[],other
586,mixtral_8x7b_instruct_v0.1,3.884,safety,biggen_240612,[],other
587,nous_hermes_2_mixtral_8x7b_sft,3.551,safety,biggen_240612,[],other
588,nous_hermes_2_mixtral_8x7b_dpo,3.667,safety,biggen_240612,[],other
589,c4ai_command_r_v01,3.913,safety,biggen_240612,[],other
590,llama_2_70b,3.913,safety,biggen_240612,[],other
591,codellama_70b,2.754,safety,biggen_240612,[],other
592,mixtral_8x22b_v0.1_awq,3.754,safety,biggen_240612,[],other
593,meta_llama_3_70b,3.058,safety,biggen_240612,[],other
594,qwen1.5_72b,3.957,safety,biggen_240612,[],other
595,llama_2_70b_chat,4.536,safety,biggen_240612,[],other
596,codellama_70b_instruct,4.043,safety,biggen_240612,[],other
597,tulu_2_dpo_70b,3.913,safety,biggen_240612,[],other
598,c4ai_command_r_plus_gptq,3.986,safety,biggen_240612,[],other
599,meta_llama_3_70b_instruct,3.87,safety,biggen_240612,[],other
600,mixtral_8x22b_instruct_v0.1_awq,3.899,safety,biggen_240612,[],other
601,zephyr_orpo_141b_a35b_v0.1_awq,3.435,safety,biggen_240612,[],other
602,qwen1.5_72b_chat,4.0,safety,biggen_240612,[],other
603,qwen_110b_chat,3.971,safety,biggen_240612,[],other
604,gpt_3.5_turbo_1106,4.13,safety,biggen_240612,[],other
605,gpt_3.5_turbo_0125,3.957,safety,biggen_240612,[],other
606,gpt_4_1106_preview,4.594,safety,biggen_240612,[],other
607,gpt_4_0125_preview,4.203,safety,biggen_240612,[],other
608,gpt_4_turbo_2024_04_09,4.116,safety,biggen_240612,[],other
609,gpt_4o_2024_05_13,4.043,safety,biggen_240612,[],other
610,mistral_medium_hjpark,4.13,safety,biggen_240612,[],other
611,mistral_large_hjpark,4.087,safety,biggen_240612,[],other
612,gemini_1.0_pro,4.043,safety,biggen_240612,[],other
613,gemini_pro_1.5,3.971,safety,biggen_240612,[],other
614,gemini_flash_1.5,4.203,safety,biggen_240612,[],other
615,claude_3_haiku_20240307,4.29,safety,biggen_240612,[],other
616,claude_3_sonnet_20240229,4.362,safety,biggen_240612,[],other
617,claude_3_opus_20240229,4.551,safety,biggen_240612,[],other
618,phi_1,1.01,theory_of_mind,biggen_240612,[],reasoning
619,phi_1_5,2.7,theory_of_mind,biggen_240612,[],reasoning
620,phi_2,3.0,theory_of_mind,biggen_240612,[],reasoning
621,qwen1.5_0.5b,1.75,theory_of_mind,biggen_240612,[],reasoning
622,qwen1.5_1.8b,2.36,theory_of_mind,biggen_240612,[],reasoning
623,qwen1.5_4b,2.61,theory_of_mind,biggen_240612,[],reasoning
624,gemma_2b,2.24,theory_of_mind,biggen_240612,[],reasoning
625,olmo_1b,1.44,theory_of_mind,biggen_240612,[],reasoning
626,qwen1.5_0.5b_chat,2.26,theory_of_mind,biggen_240612,[],reasoning
627,qwen1.5_1.8b_chat,3.03,theory_of_mind,biggen_240612,[],reasoning
628,qwen1.5_4b_chat,2.89,theory_of_mind,biggen_240612,[],reasoning
629,phi_3_mini_4k_instruct,3.78,theory_of_mind,biggen_240612,[],reasoning
630,phi_3_mini_128k_instruct,3.66,theory_of_mind,biggen_240612,[],reasoning
631,gemma_2b_it,3.16,theory_of_mind,biggen_240612,[],reasoning
632,gemma_1.1_2b_it,3.15,theory_of_mind,biggen_240612,[],reasoning
633,gemma_7b,1.17,theory_of_mind,biggen_240612,[],reasoning
634,mistral_7b_v0.1,2.97,theory_of_mind,biggen_240612,[],reasoning
635,mistral_7b_v0.2,3.1,theory_of_mind,biggen_240612,[],reasoning
636,qwen1.5_7b,2.68,theory_of_mind,biggen_240612,[],reasoning
637,yi_6b,2.74,theory_of_mind,biggen_240612,[],reasoning
638,llama_2_7b,2.37,theory_of_mind,biggen_240612,[],reasoning
639,codellama_7b,1.77,theory_of_mind,biggen_240612,[],reasoning
640,meta_llama_3_8b,2.57,theory_of_mind,biggen_240612,[],reasoning
641,llemma_7b,2.02,theory_of_mind,biggen_240612,[],reasoning
642,olmo_7b,1.97,theory_of_mind,biggen_240612,[],reasoning
643,gemma_7b_it,3.19,theory_of_mind,biggen_240612,[],reasoning
644,gemma_1.1_7b_it,3.354,theory_of_mind,biggen_240612,[],reasoning
645,mistral_7b_instruct_v0.2,3.68,theory_of_mind,biggen_240612,[],reasoning
646,qwen1.5_7b_chat,3.67,theory_of_mind,biggen_240612,[],reasoning
647,yi_6b_chat,3.545,theory_of_mind,biggen_240612,[],reasoning
648,llama_2_7b_chat,3.6,theory_of_mind,biggen_240612,[],reasoning
649,codellama_7b_instruct,3.23,theory_of_mind,biggen_240612,[],reasoning
650,meta_llama_3_8b_instruct,3.65,theory_of_mind,biggen_240612,[],reasoning
651,olmo_7b_sft,2.85,theory_of_mind,biggen_240612,[],reasoning
652,olmo_7b_instruct,3.29,theory_of_mind,biggen_240612,[],reasoning
653,tulu_2_7b,3.17,theory_of_mind,biggen_240612,[],reasoning
654,tulu_2_dpo_7b,3.59,theory_of_mind,biggen_240612,[],reasoning
655,codetulu_2_7b,3.09,theory_of_mind,biggen_240612,[],reasoning
656,orca_2_7b,2.23,theory_of_mind,biggen_240612,[],reasoning
657,openchat_3.5_0106,3.56,theory_of_mind,biggen_240612,[],reasoning
658,openhermes_2_mistral_7b,3.3,theory_of_mind,biggen_240612,[],reasoning
659,openhermes_2.5_mistral_7b,3.35,theory_of_mind,biggen_240612,[],reasoning
660,nous_hermes_2_mistral_7b_dpo,3.51,theory_of_mind,biggen_240612,[],reasoning
661,starling_lm_7b_alpha,3.47,theory_of_mind,biggen_240612,[],reasoning
662,starling_lm_7b_beta,3.68,theory_of_mind,biggen_240612,[],reasoning
663,mistral_orpo_alpha,3.47,theory_of_mind,biggen_240612,[],reasoning
664,mistral_orpo_beta,3.47,theory_of_mind,biggen_240612,[],reasoning
665,zephyr_7b_beta,3.64,theory_of_mind,biggen_240612,[],reasoning
666,qwen1.5_14b,3.01,theory_of_mind,biggen_240612,[],reasoning
667,llama_2_13b,2.61,theory_of_mind,biggen_240612,[],reasoning
668,codellama_13b,1.98,theory_of_mind,biggen_240612,[],reasoning
669,solar_10.7b_v1.0,3.21,theory_of_mind,biggen_240612,[],reasoning
670,qwen1.5_14b_chat,3.51,theory_of_mind,biggen_240612,[],reasoning
671,solar_10.7b_instruct_v1.0,3.66,theory_of_mind,biggen_240612,[],reasoning
672,aya_101,1.37,theory_of_mind,biggen_240612,[],reasoning
673,llama_2_13b_chat,3.65,theory_of_mind,biggen_240612,[],reasoning
674,codellama_13b_instruct,3.25,theory_of_mind,biggen_240612,[],reasoning
675,tulu_2_13b,3.23,theory_of_mind,biggen_240612,[],reasoning
676,tulu_2_dpo_13b,3.61,theory_of_mind,biggen_240612,[],reasoning
677,codetulu_2_13b,3.31,theory_of_mind,biggen_240612,[],reasoning
678,orca_2_13b,2.77,theory_of_mind,biggen_240612,[],reasoning
679,yi_34b,3.26,theory_of_mind,biggen_240612,[],reasoning
680,llemma_34b,2.51,theory_of_mind,biggen_240612,[],reasoning
681,qwen1.5_32b,3.24,theory_of_mind,biggen_240612,[],reasoning
682,codellama_34b,2.56,theory_of_mind,biggen_240612,[],reasoning
683,mixtral_8x7b_v0.1,3.35,theory_of_mind,biggen_240612,[],reasoning
684,yi_34b_chat,3.84,theory_of_mind,biggen_240612,[],reasoning
685,nous_hermes_2_yi_34b,3.43,theory_of_mind,biggen_240612,[],reasoning
686,codellama_34b_instruct,3.34,theory_of_mind,biggen_240612,[],reasoning
687,codetulu_2_34b,3.45,theory_of_mind,biggen_240612,[],reasoning
688,qwen1.5_32b_chat,3.78,theory_of_mind,biggen_240612,[],reasoning
689,mixtral_8x7b_instruct_v0.1,3.8,theory_of_mind,biggen_240612,[],reasoning
690,nous_hermes_2_mixtral_8x7b_sft,3.47,theory_of_mind,biggen_240612,[],reasoning
691,nous_hermes_2_mixtral_8x7b_dpo,3.63,theory_of_mind,biggen_240612,[],reasoning
692,c4ai_command_r_v01,3.74,theory_of_mind,biggen_240612,[],reasoning
693,llama_2_70b,3.25,theory_of_mind,biggen_240612,[],reasoning
694,codellama_70b,2.3,theory_of_mind,biggen_240612,[],reasoning
695,mixtral_8x22b_v0.1_awq,3.39,theory_of_mind,biggen_240612,[],reasoning
696,meta_llama_3_70b,2.9,theory_of_mind,biggen_240612,[],reasoning
697,qwen1.5_72b,3.17,theory_of_mind,biggen_240612,[],reasoning
698,llama_2_70b_chat,3.75,theory_of_mind,biggen_240612,[],reasoning
699,codellama_70b_instruct,2.44,theory_of_mind,biggen_240612,[],reasoning
700,tulu_2_dpo_70b,3.79,theory_of_mind,biggen_240612,[],reasoning
701,c4ai_command_r_plus_gptq,3.87,theory_of_mind,biggen_240612,[],reasoning
702,meta_llama_3_70b_instruct,3.92,theory_of_mind,biggen_240612,[],reasoning
703,mixtral_8x22b_instruct_v0.1_awq,3.74,theory_of_mind,biggen_240612,[],reasoning
704,zephyr_orpo_141b_a35b_v0.1_awq,3.48,theory_of_mind,biggen_240612,[],reasoning
705,qwen1.5_72b_chat,3.92,theory_of_mind,biggen_240612,[],reasoning
706,qwen_110b_chat,3.94,theory_of_mind,biggen_240612,[],reasoning
707,gpt_3.5_turbo_1106,3.74,theory_of_mind,biggen_240612,[],reasoning
708,gpt_3.5_turbo_0125,3.8,theory_of_mind,biggen_240612,[],reasoning
709,gpt_4_1106_preview,4.07,theory_of_mind,biggen_240612,[],reasoning
710,gpt_4_0125_preview,4.21,theory_of_mind,biggen_240612,[],reasoning
711,gpt_4_turbo_2024_04_09,4.03,theory_of_mind,biggen_240612,[],reasoning
712,gpt_4o_2024_05_13,4.04,theory_of_mind,biggen_240612,[],reasoning
713,mistral_medium_hjpark,3.85,theory_of_mind,biggen_240612,[],reasoning
714,mistral_large_hjpark,3.93,theory_of_mind,biggen_240612,[],reasoning
715,gemini_1.0_pro,3.83,theory_of_mind,biggen_240612,[],reasoning
716,gemini_pro_1.5,3.96,theory_of_mind,biggen_240612,[],reasoning
717,gemini_flash_1.5,3.89,theory_of_mind,biggen_240612,[],reasoning
718,claude_3_haiku_20240307,3.97,theory_of_mind,biggen_240612,[],reasoning
719,claude_3_sonnet_20240229,3.97,theory_of_mind,biggen_240612,[],reasoning
720,claude_3_opus_20240229,4.08,theory_of_mind,biggen_240612,[],reasoning
721,phi_1,1.012,tool_usage,biggen_240612,[],other
722,phi_1_5,1.3,tool_usage,biggen_240612,[],other
723,phi_2,1.675,tool_usage,biggen_240612,[],other
724,qwen1.5_0.5b,1.15,tool_usage,biggen_240612,[],other
725,qwen1.5_1.8b,1.413,tool_usage,biggen_240612,[],other
726,qwen1.5_4b,1.688,tool_usage,biggen_240612,[],other
727,gemma_2b,1.35,tool_usage,biggen_240612,[],other
728,olmo_1b,1.087,tool_usage,biggen_240612,[],other
729,qwen1.5_0.5b_chat,1.25,tool_usage,biggen_240612,[],other
730,qwen1.5_1.8b_chat,1.688,tool_usage,biggen_240612,[],other
731,qwen1.5_4b_chat,2.05,tool_usage,biggen_240612,[],other
732,phi_3_mini_4k_instruct,3.112,tool_usage,biggen_240612,[],other
733,phi_3_mini_128k_instruct,2.7,tool_usage,biggen_240612,[],other
734,gemma_2b_it,1.812,tool_usage,biggen_240612,[],other
735,gemma_1.1_2b_it,1.675,tool_usage,biggen_240612,[],other
736,gemma_7b,1.025,tool_usage,biggen_240612,[],other
737,mistral_7b_v0.1,2.038,tool_usage,biggen_240612,[],other
738,mistral_7b_v0.2,1.962,tool_usage,biggen_240612,[],other
739,qwen1.5_7b,2.212,tool_usage,biggen_240612,[],other
740,yi_6b,1.425,tool_usage,biggen_240612,[],other
741,llama_2_7b,1.337,tool_usage,biggen_240612,[],other
742,codellama_7b,1.387,tool_usage,biggen_240612,[],other
743,meta_llama_3_8b,1.738,tool_usage,biggen_240612,[],other
744,llemma_7b,1.575,tool_usage,biggen_240612,[],other
745,olmo_7b,1.15,tool_usage,biggen_240612,[],other
746,gemma_7b_it,2.125,tool_usage,biggen_240612,[],other
747,gemma_1.1_7b_it,2.562,tool_usage,biggen_240612,[],other
748,mistral_7b_instruct_v0.2,3.175,tool_usage,biggen_240612,[],other
749,qwen1.5_7b_chat,3.013,tool_usage,biggen_240612,[],other
750,yi_6b_chat,2.05,tool_usage,biggen_240612,[],other
751,llama_2_7b_chat,2.075,tool_usage,biggen_240612,[],other
752,codellama_7b_instruct,2.288,tool_usage,biggen_240612,[],other
753,meta_llama_3_8b_instruct,3.263,tool_usage,biggen_240612,[],other
754,olmo_7b_sft,1.887,tool_usage,biggen_240612,[],other
755,olmo_7b_instruct,1.875,tool_usage,biggen_240612,[],other
756,tulu_2_7b,2.062,tool_usage,biggen_240612,[],other
757,tulu_2_dpo_7b,2.325,tool_usage,biggen_240612,[],other
758,codetulu_2_7b,2.65,tool_usage,biggen_240612,[],other
759,orca_2_7b,1.462,tool_usage,biggen_240612,[],other
760,openchat_3.5_0106,2.9,tool_usage,biggen_240612,[],other
761,openhermes_2_mistral_7b,2.663,tool_usage,biggen_240612,[],other
762,openhermes_2.5_mistral_7b,2.65,tool_usage,biggen_240612,[],other
763,nous_hermes_2_mistral_7b_dpo,2.837,tool_usage,biggen_240612,[],other
764,starling_lm_7b_alpha,2.95,tool_usage,biggen_240612,[],other
765,starling_lm_7b_beta,3.388,tool_usage,biggen_240612,[],other
766,mistral_orpo_alpha,2.675,tool_usage,biggen_240612,[],other
767,mistral_orpo_beta,2.775,tool_usage,biggen_240612,[],other
768,zephyr_7b_beta,3.175,tool_usage,biggen_240612,[],other
769,qwen1.5_14b,2.788,tool_usage,biggen_240612,[],other
770,llama_2_13b,1.575,tool_usage,biggen_240612,[],other
771,codellama_13b,1.525,tool_usage,biggen_240612,[],other
772,solar_10.7b_v1.0,2.312,tool_usage,biggen_240612,[],other
773,qwen1.5_14b_chat,3.075,tool_usage,biggen_240612,[],other
774,solar_10.7b_instruct_v1.0,3.188,tool_usage,biggen_240612,[],other
775,aya_101,1.163,tool_usage,biggen_240612,[],other
776,llama_2_13b_chat,2.3,tool_usage,biggen_240612,[],other
777,codellama_13b_instruct,2.388,tool_usage,biggen_240612,[],other
778,tulu_2_13b,2.5,tool_usage,biggen_240612,[],other
779,tulu_2_dpo_13b,2.763,tool_usage,biggen_240612,[],other
780,codetulu_2_13b,3.013,tool_usage,biggen_240612,[],other
781,orca_2_13b,2.013,tool_usage,biggen_240612,[],other
782,yi_34b,2.3,tool_usage,biggen_240612,[],other
783,llemma_34b,1.887,tool_usage,biggen_240612,[],other
784,qwen1.5_32b,2.712,tool_usage,biggen_240612,[],other
785,codellama_34b,1.875,tool_usage,biggen_240612,[],other
786,mixtral_8x7b_v0.1,2.538,tool_usage,biggen_240612,[],other
787,yi_34b_chat,3.075,tool_usage,biggen_240612,[],other
788,nous_hermes_2_yi_34b,3.013,tool_usage,biggen_240612,[],other
789,codellama_34b_instruct,2.487,tool_usage,biggen_240612,[],other
790,codetulu_2_34b,3.2,tool_usage,biggen_240612,[],other
791,qwen1.5_32b_chat,3.55,tool_usage,biggen_240612,[],other
792,mixtral_8x7b_instruct_v0.1,3.237,tool_usage,biggen_240612,[],other
793,nous_hermes_2_mixtral_8x7b_sft,3.288,tool_usage,biggen_240612,[],other
794,nous_hermes_2_mixtral_8x7b_dpo,3.413,tool_usage,biggen_240612,[],other
795,c4ai_command_r_v01,2.987,tool_usage,biggen_240612,[],other
796,llama_2_70b,2.487,tool_usage,biggen_240612,[],other
797,codellama_70b,2.138,tool_usage,biggen_240612,[],other
798,mixtral_8x22b_v0.1_awq,2.875,tool_usage,biggen_240612,[],other
799,meta_llama_3_70b,2.388,tool_usage,biggen_240612,[],other
800,qwen1.5_72b,2.875,tool_usage,biggen_240612,[],other
801,llama_2_70b_chat,2.875,tool_usage,biggen_240612,[],other
802,codellama_70b_instruct,1.712,tool_usage,biggen_240612,[],other
803,tulu_2_dpo_70b,3.5,tool_usage,biggen_240612,[],other
804,c4ai_command_r_plus_gptq,3.475,tool_usage,biggen_240612,[],other
805,meta_llama_3_70b_instruct,3.625,tool_usage,biggen_240612,[],other
806,mixtral_8x22b_instruct_v0.1_awq,3.462,tool_usage,biggen_240612,[],other
807,zephyr_orpo_141b_a35b_v0.1_awq,3.062,tool_usage,biggen_240612,[],other
808,qwen1.5_72b_chat,3.388,tool_usage,biggen_240612,[],other
809,qwen_110b_chat,3.438,tool_usage,biggen_240612,[],other
810,gpt_3.5_turbo_1106,3.062,tool_usage,biggen_240612,[],other
811,gpt_3.5_turbo_0125,2.987,tool_usage,biggen_240612,[],other
812,gpt_4_1106_preview,3.7,tool_usage,biggen_240612,[],other
813,gpt_4_0125_preview,3.675,tool_usage,biggen_240612,[],other
814,gpt_4_turbo_2024_04_09,3.712,tool_usage,biggen_240612,[],other
815,gpt_4o_2024_05_13,3.775,tool_usage,biggen_240612,[],other
816,mistral_medium_hjpark,3.737,tool_usage,biggen_240612,[],other
817,mistral_large_hjpark,3.638,tool_usage,biggen_240612,[],other
818,gemini_1.0_pro,3.138,tool_usage,biggen_240612,[],other
819,gemini_pro_1.5,3.337,tool_usage,biggen_240612,[],other
820,gemini_flash_1.5,3.337,tool_usage,biggen_240612,[],other
821,claude_3_haiku_20240307,3.775,tool_usage,biggen_240612,[],other
822,claude_3_sonnet_20240229,3.663,tool_usage,biggen_240612,[],other
823,claude_3_opus_20240229,3.775,tool_usage,biggen_240612,[],other
0,aya_101,0.029411764705882353,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
1,c4ai_command_r_plus_gptq,0.8382352941176471,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
2,c4ai_command_r_v01,0.6948529411764706,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
3,claude_3_haiku_20240307,0.9252450980392157,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
4,claude_3_opus_20240229,0.9681372549019608,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
5,claude_3_sonnet_20240229,0.9240196078431373,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
6,codellama_13b,0.07598039215686275,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
7,codellama_13b_instruct,0.4276960784313726,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
8,codellama_34b,0.1482843137254902,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
9,codellama_34b_instruct,0.5098039215686274,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
10,codellama_70b,0.18872549019607843,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
11,codellama_70b_instruct,0.27450980392156865,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
12,codellama_7b,0.05514705882352941,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
13,codellama_7b_instruct,0.36519607843137253,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
14,codetulu_2_13b,0.43137254901960786,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
15,codetulu_2_34b,0.5441176470588235,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
16,codetulu_2_7b,0.32598039215686275,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
17,gemini_1.0_pro,0.7107843137254902,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
18,gemini_flash_1.5,0.866421568627451,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
19,gemini_pro_1.5,0.8676470588235294,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
20,gemma_1.1_2b_it,0.33578431372549017,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
21,gemma_1.1_7b_it,0.5551470588235294,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
22,gemma_2b,0.09803921568627451,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
23,gemma_2b_it,0.3333333333333333,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
24,gemma_7b,0.013480392156862746,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
25,gemma_7b_it,0.40931372549019607,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
26,gpt_3.5_turbo_0125,0.7757352941176471,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
27,gpt_3.5_turbo_1106,0.758578431372549,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
28,gpt_4_0125_preview,0.9779411764705882,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
29,gpt_4_1106_preview,0.9889705882352942,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
30,gpt_4_turbo_2024_04_09,0.9558823529411765,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
31,gpt_4o_2024_05_13,0.9436274509803921,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
32,llama_2_13b,0.20220588235294118,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
33,llama_2_13b_chat,0.5968137254901961,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
34,llama_2_70b,0.4656862745098039,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
35,llama_2_70b_chat,0.7205882352941176,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
36,llama_2_7b,0.1446078431372549,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
37,llama_2_7b_chat,0.5355392156862745,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
38,llemma_34b,0.21200980392156862,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
39,llemma_7b,0.11029411764705882,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
40,meta_llama_3_70b,0.36887254901960786,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
41,meta_llama_3_70b_instruct,0.875,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
42,meta_llama_3_8b,0.2377450980392157,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
43,meta_llama_3_8b_instruct,0.7328431372549019,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
44,mistral_7b_instruct_v0.2,0.7156862745098039,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
45,mistral_7b_v0.1,0.3272058823529412,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
46,mistral_7b_v0.2,0.3137254901960784,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
47,mistral_large_hjpark,0.8762254901960784,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
48,mistral_medium_hjpark,0.8970588235294118,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
49,mistral_orpo_alpha,0.5392156862745098,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
50,mistral_orpo_beta,0.5477941176470589,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
51,mixtral_8x22b_instruct_v0.1_awq,0.8198529411764706,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
52,mixtral_8x22b_v0.1_awq,0.5968137254901961,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
53,mixtral_8x7b_instruct_v0.1,0.7647058823529411,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
54,mixtral_8x7b_v0.1,0.5453431372549019,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
55,nous_hermes_2_mistral_7b_dpo,0.571078431372549,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
56,nous_hermes_2_mixtral_8x7b_dpo,0.7095588235294118,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
57,nous_hermes_2_mixtral_8x7b_sft,0.6262254901960784,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
58,nous_hermes_2_yi_34b,0.5906862745098039,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
59,olmo_1b,0.028186274509803922,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
60,olmo_7b,0.07107843137254902,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
61,olmo_7b_instruct,0.30269607843137253,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
62,olmo_7b_sft,0.2549019607843137,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
63,openchat_3.5_0106,0.6825980392156863,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
64,openhermes_2.5_mistral_7b,0.4583333333333333,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
65,openhermes_2_mistral_7b,0.5122549019607843,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
66,orca_2_13b,0.17401960784313725,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
67,orca_2_7b,0.08700980392156862,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
68,phi_1,0.0,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
69,phi_1_5,0.15318627450980393,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
70,phi_2,0.29044117647058826,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
71,phi_3_mini_128k_instruct,0.6911764705882353,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
72,phi_3_mini_4k_instruct,0.7867647058823529,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
73,qwen1.5_0.5b,0.0428921568627451,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
74,qwen1.5_0.5b_chat,0.07965686274509803,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
75,qwen1.5_1.8b,0.12867647058823528,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
76,qwen1.5_1.8b_chat,0.21691176470588236,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
77,qwen1.5_14b,0.3946078431372549,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
78,qwen1.5_14b_chat,0.7267156862745098,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
79,qwen1.5_32b,0.4791666666666667,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
80,qwen1.5_32b_chat,0.8149509803921569,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
81,qwen1.5_4b,0.21323529411764705,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
82,qwen1.5_4b_chat,0.29411764705882354,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
83,qwen1.5_72b,0.5294117647058824,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
84,qwen1.5_72b_chat,0.8713235294117647,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
85,qwen1.5_7b,0.2610294117647059,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
86,qwen1.5_7b_chat,0.6580882352941176,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
87,qwen_110b_chat,0.8848039215686274,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
88,solar_10.7b_instruct_v1.0,0.6862745098039216,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
89,solar_10.7b_v1.0,0.43995098039215685,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
90,starling_lm_7b_alpha,0.6139705882352942,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
91,starling_lm_7b_beta,0.7573529411764706,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
92,tulu_2_13b,0.4313725490196078,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
93,tulu_2_7b,0.3553921568627451,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
94,tulu_2_dpo_13b,0.5833333333333333,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
95,tulu_2_dpo_70b,0.7708333333333334,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
96,tulu_2_dpo_7b,0.4767156862745098,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
97,yi_34b,0.46078431372549017,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
98,yi_34b_chat,0.7720588235294118,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
99,yi_6b,0.17892156862745098,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
100,yi_6b_chat,0.4117647058823529,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
101,zephyr_7b_beta,0.6200980392156863,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
102,zephyr_orpo_141b_a35b_v0.1_awq,0.6311274509803921,biggen_mwr,biggen_240612,"['grounding', 'instruction_following', 'planning', 'reasoning', 'refinement', 'safety', 'theory_of_mind', 'tool_usage']",
0,gpt_4o_0513,35.7,wildbench_mix,wildbench_240612,[],holistic
1,gpt_4_turbo_0409,34.6,wildbench_mix,wildbench_240612,[],holistic
2,gpt_4_turbo_0125,29.9,wildbench_mix,wildbench_240612,[],holistic
3,gemini_1.5_pro,27.8,wildbench_mix,wildbench_240612,[],holistic
4,llama_3_70b_inst,21.0,wildbench_mix,wildbench_240612,[],holistic
5,claude_3_opus,20.1,wildbench_mix,wildbench_240612,[],holistic
6,gemini_1.5_flash,17.4,wildbench_mix,wildbench_240612,[],holistic
7,yi_1.5_34b_chat,16.8,wildbench_mix,wildbench_240612,[],holistic
8,llama3_inst_8b_simpo,14.0,wildbench_mix,wildbench_240612,[],holistic
9,claude_3_sonnet,7.2,wildbench_mix,wildbench_240612,[],holistic
10,qwen1.5_72b_chat,4.4,wildbench_mix,wildbench_240612,[],holistic
11,command_r_plus,0.4,wildbench_mix,wildbench_240612,[],holistic
12,claude_3_haiku,-8.5,wildbench_mix,wildbench_240612,[],holistic
13,mistral_large,-10.5,wildbench_mix,wildbench_240612,[],holistic
14,starlinglm_7b_beta,-11.9,wildbench_mix,wildbench_240612,[],holistic
15,llama_3_8b_inst,-14.6,wildbench_mix,wildbench_240612,[],holistic
16,command_r,-16.0,wildbench_mix,wildbench_240612,[],holistic
17,mixtral_8x7b_inst,-18.8,wildbench_mix,wildbench_240612,[],holistic
18,dbrx_instruct,-21.6,wildbench_mix,wildbench_240612,[],holistic
19,yi_1.5_6b_chat,-24.3,wildbench_mix,wildbench_240612,[],holistic
20,mistral_7b_inst_v0.2,-25.0,wildbench_mix,wildbench_240612,[],holistic
21,tulu_2_dpo_70b,-25.4,wildbench_mix,wildbench_240612,[],holistic
22,llama_2_70b_chat,-26.8,wildbench_mix,wildbench_240612,[],holistic
23,qwen1.5_7b_chat,-27.0,wildbench_mix,wildbench_240612,[],holistic
24,phi_3_medium_128k,-33.3,wildbench_mix,wildbench_240612,[],holistic
25,gpt_3.5_turbo_0125,-33.5,wildbench_mix,wildbench_240612,[],holistic
26,llama_2_7b_chat,-48.0,wildbench_mix,wildbench_240612,[],holistic
27,gemma_7b_it,-57.0,wildbench_mix,wildbench_240612,[],holistic
28,gemma_2b_it,-74.1,wildbench_mix,wildbench_240612,[],holistic
29,gpt_4o_0513,1.5,wildbench_gpt4t,wildbench_240612,[],holistic
30,gpt_4_turbo_0409,0.0,wildbench_gpt4t,wildbench_240612,[],holistic
31,gpt_4_turbo_0125,4.4,wildbench_gpt4t,wildbench_240612,[],holistic
32,gemini_1.5_pro,-4.4,wildbench_gpt4t,wildbench_240612,[],holistic
33,llama_3_70b_inst,-19.0,wildbench_gpt4t,wildbench_240612,[],holistic
34,claude_3_opus,-20.4,wildbench_gpt4t,wildbench_240612,[],holistic
35,gemini_1.5_flash,-16.6,wildbench_gpt4t,wildbench_240612,[],holistic
36,yi_1.5_34b_chat,-18.3,wildbench_gpt4t,wildbench_240612,[],holistic
37,llama3_inst_8b_simpo,-22.5,wildbench_gpt4t,wildbench_240612,[],holistic
38,claude_3_sonnet,-31.6,wildbench_gpt4t,wildbench_240612,[],holistic
39,qwen1.5_72b_chat,-34.8,wildbench_gpt4t,wildbench_240612,[],holistic
40,command_r_plus,-36.3,wildbench_gpt4t,wildbench_240612,[],holistic
41,claude_3_haiku,-46.9,wildbench_gpt4t,wildbench_240612,[],holistic
42,mistral_large,-48.1,wildbench_gpt4t,wildbench_240612,[],holistic
43,starlinglm_7b_beta,-48.7,wildbench_gpt4t,wildbench_240612,[],holistic
44,llama_3_8b_inst,-49.8,wildbench_gpt4t,wildbench_240612,[],holistic
45,command_r,-48.4,wildbench_gpt4t,wildbench_240612,[],holistic
46,mixtral_8x7b_inst,-53.4,wildbench_gpt4t,wildbench_240612,[],holistic
47,dbrx_instruct,-57.3,wildbench_gpt4t,wildbench_240612,[],holistic
48,yi_1.5_6b_chat,-55.0,wildbench_gpt4t,wildbench_240612,[],holistic
49,mistral_7b_inst_v0.2,-58.1,wildbench_gpt4t,wildbench_240612,[],holistic
50,tulu_2_dpo_70b,-59.3,wildbench_gpt4t,wildbench_240612,[],holistic
51,llama_2_70b_chat,-56.9,wildbench_gpt4t,wildbench_240612,[],holistic
52,qwen1.5_7b_chat,-57.7,wildbench_gpt4t,wildbench_240612,[],holistic
53,phi_3_medium_128k,-66.4,wildbench_gpt4t,wildbench_240612,[],holistic
54,gpt_3.5_turbo_0125,-66.3,wildbench_gpt4t,wildbench_240612,[],holistic
55,llama_2_7b_chat,-71.8,wildbench_gpt4t,wildbench_240612,[],holistic
56,gemma_7b_it,-78.4,wildbench_gpt4t,wildbench_240612,[],holistic
57,gemma_2b_it,-87.8,wildbench_gpt4t,wildbench_240612,[],holistic
58,gpt_4o_0513,46.3,wildbench_haiku,wildbench_240612,[],holistic
59,gpt_4_turbo_0409,45.3,wildbench_haiku,wildbench_240612,[],holistic
60,gpt_4_turbo_0125,38.8,wildbench_haiku,wildbench_240612,[],holistic
61,gemini_1.5_pro,37.9,wildbench_haiku,wildbench_240612,[],holistic
62,llama_3_70b_inst,31.9,wildbench_haiku,wildbench_240612,[],holistic
63,claude_3_opus,34.3,wildbench_haiku,wildbench_240612,[],holistic
64,gemini_1.5_flash,26.3,wildbench_haiku,wildbench_240612,[],holistic
65,yi_1.5_34b_chat,24.1,wildbench_haiku,wildbench_240612,[],holistic
66,llama3_inst_8b_simpo,18.9,wildbench_haiku,wildbench_240612,[],holistic
67,claude_3_sonnet,19.4,wildbench_haiku,wildbench_240612,[],holistic
68,qwen1.5_72b_chat,13.1,wildbench_haiku,wildbench_240612,[],holistic
69,command_r_plus,7.4,wildbench_haiku,wildbench_240612,[],holistic
70,claude_3_haiku,0.0,wildbench_haiku,wildbench_240612,[],holistic
71,mistral_large,-4.0,wildbench_haiku,wildbench_240612,[],holistic
72,starlinglm_7b_beta,-5.0,wildbench_haiku,wildbench_240612,[],holistic
73,llama_3_8b_inst,-9.7,wildbench_haiku,wildbench_240612,[],holistic
74,command_r,-12.7,wildbench_haiku,wildbench_240612,[],holistic
75,mixtral_8x7b_inst,-13.5,wildbench_haiku,wildbench_240612,[],holistic
76,dbrx_instruct,-16.3,wildbench_haiku,wildbench_240612,[],holistic
77,yi_1.5_6b_chat,-19.9,wildbench_haiku,wildbench_240612,[],holistic
78,mistral_7b_inst_v0.2,-22.4,wildbench_haiku,wildbench_240612,[],holistic
79,tulu_2_dpo_70b,-20.3,wildbench_haiku,wildbench_240612,[],holistic
80,llama_2_70b_chat,-23.6,wildbench_haiku,wildbench_240612,[],holistic
81,qwen1.5_7b_chat,-23.0,wildbench_haiku,wildbench_240612,[],holistic
82,phi_3_medium_128k,-30.0,wildbench_haiku,wildbench_240612,[],holistic
83,gpt_3.5_turbo_0125,-30.0,wildbench_haiku,wildbench_240612,[],holistic
84,llama_2_7b_chat,-44.6,wildbench_haiku,wildbench_240612,[],holistic
85,gemma_7b_it,-55.8,wildbench_haiku,wildbench_240612,[],holistic
86,gemma_2b_it,-73.6,wildbench_haiku,wildbench_240612,[],holistic
87,gpt_4o_0513,59.3,wildbench_llama2,wildbench_240612,[],holistic
88,gpt_4_turbo_0409,58.4,wildbench_llama2,wildbench_240612,[],holistic
89,gpt_4_turbo_0125,55.2,wildbench_llama2,wildbench_240612,[],holistic
90,gemini_1.5_pro,50.0,wildbench_llama2,wildbench_240612,[],holistic
91,llama_3_70b_inst,50.2,wildbench_llama2,wildbench_240612,[],holistic
92,claude_3_opus,46.3,wildbench_llama2,wildbench_240612,[],holistic
93,gemini_1.5_flash,42.5,wildbench_llama2,wildbench_240612,[],holistic
94,yi_1.5_34b_chat,44.5,wildbench_llama2,wildbench_240612,[],holistic
95,llama3_inst_8b_simpo,45.7,wildbench_llama2,wildbench_240612,[],holistic
96,claude_3_sonnet,33.9,wildbench_llama2,wildbench_240612,[],holistic
97,qwen1.5_72b_chat,34.7,wildbench_llama2,wildbench_240612,[],holistic
98,command_r_plus,30.2,wildbench_llama2,wildbench_240612,[],holistic
99,claude_3_haiku,21.4,wildbench_llama2,wildbench_240612,[],holistic
100,mistral_large,20.5,wildbench_llama2,wildbench_240612,[],holistic
101,starlinglm_7b_beta,18.0,wildbench_llama2,wildbench_240612,[],holistic
102,llama_3_8b_inst,15.7,wildbench_llama2,wildbench_240612,[],holistic
103,command_r,13.1,wildbench_llama2,wildbench_240612,[],holistic
104,mixtral_8x7b_inst,10.4,wildbench_llama2,wildbench_240612,[],holistic
105,dbrx_instruct,8.7,wildbench_llama2,wildbench_240612,[],holistic
106,yi_1.5_6b_chat,2.1,wildbench_llama2,wildbench_240612,[],holistic
107,mistral_7b_inst_v0.2,5.5,wildbench_llama2,wildbench_240612,[],holistic
108,tulu_2_dpo_70b,3.3,wildbench_llama2,wildbench_240612,[],holistic
109,llama_2_70b_chat,0.0,wildbench_llama2,wildbench_240612,[],holistic
110,qwen1.5_7b_chat,-0.2,wildbench_llama2,wildbench_240612,[],holistic
111,phi_3_medium_128k,-3.6,wildbench_llama2,wildbench_240612,[],holistic
112,gpt_3.5_turbo_0125,-4.1,wildbench_llama2,wildbench_240612,[],holistic
113,llama_2_7b_chat,-27.8,wildbench_llama2,wildbench_240612,[],holistic
114,gemma_7b_it,-36.8,wildbench_llama2,wildbench_240612,[],holistic
115,gemma_2b_it,-60.8,wildbench_llama2,wildbench_240612,[],holistic
116,gpt_4o_0513,65.3,wb_score,wildbench_240612,[],holistic
117,gpt_4_turbo_0409,64.7,wb_score,wildbench_240612,[],holistic
118,gpt_4_turbo_0125,63.3,wb_score,wildbench_240612,[],holistic
119,gemini_1.5_pro,55.7,wb_score,wildbench_240612,[],holistic
120,llama_3_70b_inst,60.4,wb_score,wildbench_240612,[],holistic
121,claude_3_opus,63.1,wb_score,wildbench_240612,[],holistic
122,gemini_1.5_flash,53.1,wb_score,wildbench_240612,[],holistic
123,yi_1.5_34b_chat,57.8,wb_score,wildbench_240612,[],holistic
124,llama3_inst_8b_simpo,53.9,wb_score,wildbench_240612,[],holistic
125,claude_3_sonnet,55.5,wb_score,wildbench_240612,[],holistic
126,qwen1.5_72b_chat,56.5,wb_score,wildbench_240612,[],holistic
127,command_r_plus,51.4,wb_score,wildbench_240612,[],holistic
128,claude_3_haiku,50.4,wb_score,wildbench_240612,[],holistic
129,mistral_large,54.2,wb_score,wildbench_240612,[],holistic
130,starlinglm_7b_beta,46.8,wb_score,wildbench_240612,[],holistic
131,llama_3_8b_inst,45.7,wb_score,wildbench_240612,[],holistic
132,command_r,45.7,wb_score,wildbench_240612,[],holistic
133,mixtral_8x7b_inst,47.8,wb_score,wildbench_240612,[],holistic
134,dbrx_instruct,48.9,wb_score,wildbench_240612,[],holistic
135,yi_1.5_6b_chat,39.6,wb_score,wildbench_240612,[],holistic
136,mistral_7b_inst_v0.2,43.4,wb_score,wildbench_240612,[],holistic
137,tulu_2_dpo_70b,45.2,wb_score,wildbench_240612,[],holistic
138,llama_2_70b_chat,39.2,wb_score,wildbench_240612,[],holistic
139,qwen1.5_7b_chat,40.0,wb_score,wildbench_240612,[],holistic
140,phi_3_medium_128k,42.1,wb_score,wildbench_240612,[],holistic
141,gpt_3.5_turbo_0125,42.1,wb_score,wildbench_240612,[],holistic
142,llama_2_7b_chat,27.6,wb_score,wildbench_240612,[],holistic
143,gemma_7b_it,23.9,wb_score,wildbench_240612,[],holistic
144,gemma_2b_it,6.2,wb_score,wildbench_240612,[],holistic
145,gpt_4o_0513,1293.0,arena_elo,wildbench_240612,[],holistic
146,gpt_4_turbo_0409,1251.0,arena_elo,wildbench_240612,[],holistic
147,gpt_4_turbo_0125,1239.0,arena_elo,wildbench_240612,[],holistic
149,llama_3_70b_inst,1213.0,arena_elo,wildbench_240612,[],holistic
150,claude_3_opus,1232.0,arena_elo,wildbench_240612,[],holistic
154,claude_3_sonnet,1187.0,arena_elo,wildbench_240612,[],holistic
155,qwen1.5_72b_chat,1143.0,arena_elo,wildbench_240612,[],holistic
156,command_r_plus,1155.0,arena_elo,wildbench_240612,[],holistic
157,claude_3_haiku,1169.0,arena_elo,wildbench_240612,[],holistic
158,mistral_large,1158.0,arena_elo,wildbench_240612,[],holistic
159,starlinglm_7b_beta,1111.0,arena_elo,wildbench_240612,[],holistic
160,llama_3_8b_inst,1144.0,arena_elo,wildbench_240612,[],holistic
161,command_r,1106.0,arena_elo,wildbench_240612,[],holistic
162,mixtral_8x7b_inst,1114.0,arena_elo,wildbench_240612,[],holistic
163,dbrx_instruct,1106.0,arena_elo,wildbench_240612,[],holistic
165,mistral_7b_inst_v0.2,1071.0,arena_elo,wildbench_240612,[],holistic
166,tulu_2_dpo_70b,1099.0,arena_elo,wildbench_240612,[],holistic
167,llama_2_70b_chat,1070.0,arena_elo,wildbench_240612,[],holistic
168,qwen1.5_7b_chat,1059.0,arena_elo,wildbench_240612,[],holistic
170,gpt_3.5_turbo_0125,1105.0,arena_elo,wildbench_240612,[],holistic
171,llama_2_7b_chat,1012.0,arena_elo,wildbench_240612,[],holistic
172,gemma_7b_it,1047.0,arena_elo,wildbench_240612,[],holistic
173,gemma_2b_it,980.0,arena_elo,wildbench_240612,[],holistic
175,gpt_4_turbo_0409,82.6,arena_hard,wildbench_240612,[],holistic
176,gpt_4_turbo_0125,78.0,arena_hard,wildbench_240612,[],holistic
178,llama_3_70b_inst,41.1,arena_hard,wildbench_240612,[],holistic
179,claude_3_opus,60.4,arena_hard,wildbench_240612,[],holistic
182,llama3_inst_8b_simpo,33.8,arena_hard,wildbench_240612,[],holistic
183,claude_3_sonnet,46.8,arena_hard,wildbench_240612,[],holistic
184,qwen1.5_72b_chat,36.1,arena_hard,wildbench_240612,[],holistic
185,command_r_plus,33.1,arena_hard,wildbench_240612,[],holistic
186,claude_3_haiku,41.5,arena_hard,wildbench_240612,[],holistic
187,mistral_large,37.7,arena_hard,wildbench_240612,[],holistic
188,starlinglm_7b_beta,23.0,arena_hard,wildbench_240612,[],holistic
189,llama_3_8b_inst,20.6,arena_hard,wildbench_240612,[],holistic
190,command_r,17.0,arena_hard,wildbench_240612,[],holistic
191,mixtral_8x7b_inst,23.4,arena_hard,wildbench_240612,[],holistic
192,dbrx_instruct,23.9,arena_hard,wildbench_240612,[],holistic
195,tulu_2_dpo_70b,15.0,arena_hard,wildbench_240612,[],holistic
196,llama_2_70b_chat,11.6,arena_hard,wildbench_240612,[],holistic
199,gpt_3.5_turbo_0125,23.3,arena_hard,wildbench_240612,[],holistic
200,llama_2_7b_chat,4.6,arena_hard,wildbench_240612,[],holistic
201,gemma_7b_it,7.5,arena_hard,wildbench_240612,[],holistic
202,gemma_2b_it,3.0,arena_hard,wildbench_240612,[],holistic
203,gpt_4o_0513,57.5,alpacaeval2_lc,wildbench_240612,[],holistic
204,gpt_4_turbo_0409,55.0,alpacaeval2_lc,wildbench_240612,[],holistic
207,llama_3_70b_inst,34.4,alpacaeval2_lc,wildbench_240612,[],holistic
208,claude_3_opus,40.5,alpacaeval2_lc,wildbench_240612,[],holistic
211,llama3_inst_8b_simpo,44.7,alpacaeval2_lc,wildbench_240612,[],holistic
212,claude_3_sonnet,34.9,alpacaeval2_lc,wildbench_240612,[],holistic
213,qwen1.5_72b_chat,36.6,alpacaeval2_lc,wildbench_240612,[],holistic
216,mistral_large,32.7,alpacaeval2_lc,wildbench_240612,[],holistic
218,llama_3_8b_inst,22.9,alpacaeval2_lc,wildbench_240612,[],holistic
220,mixtral_8x7b_inst,23.7,alpacaeval2_lc,wildbench_240612,[],holistic
221,dbrx_instruct,25.4,alpacaeval2_lc,wildbench_240612,[],holistic
223,mistral_7b_inst_v0.2,17.1,alpacaeval2_lc,wildbench_240612,[],holistic
224,tulu_2_dpo_70b,21.2,alpacaeval2_lc,wildbench_240612,[],holistic
225,llama_2_70b_chat,14.7,alpacaeval2_lc,wildbench_240612,[],holistic
226,qwen1.5_7b_chat,14.7,alpacaeval2_lc,wildbench_240612,[],holistic
229,llama_2_7b_chat,5.4,alpacaeval2_lc,wildbench_240612,[],holistic
230,gemma_7b_it,10.4,alpacaeval2_lc,wildbench_240612,[],holistic
231,gemma_2b_it,5.4,alpacaeval2_lc,wildbench_240612,[],holistic
232,gpt_4o_0513,51.3,alpacav2,wildbench_240612,[],holistic
233,gpt_4_turbo_0409,46.1,alpacav2,wildbench_240612,[],holistic
236,llama_3_70b_inst,33.2,alpacav2,wildbench_240612,[],holistic
237,claude_3_opus,29.1,alpacav2,wildbench_240612,[],holistic
240,llama3_inst_8b_simpo,40.5,alpacav2,wildbench_240612,[],holistic
241,claude_3_sonnet,25.6,alpacav2,wildbench_240612,[],holistic
242,qwen1.5_72b_chat,26.5,alpacav2,wildbench_240612,[],holistic
245,mistral_large,21.4,alpacav2,wildbench_240612,[],holistic
247,llama_3_8b_inst,22.6,alpacav2,wildbench_240612,[],holistic
249,mixtral_8x7b_inst,18.3,alpacav2,wildbench_240612,[],holistic
250,dbrx_instruct,18.4,alpacav2,wildbench_240612,[],holistic
252,mistral_7b_inst_v0.2,14.7,alpacav2,wildbench_240612,[],holistic
253,tulu_2_dpo_70b,16.0,alpacav2,wildbench_240612,[],holistic
254,llama_2_70b_chat,13.9,alpacav2,wildbench_240612,[],holistic
255,qwen1.5_7b_chat,11.8,alpacav2,wildbench_240612,[],holistic
258,llama_2_7b_chat,5.0,alpacav2,wildbench_240612,[],holistic
259,gemma_7b_it,6.9,alpacav2,wildbench_240612,[],holistic
260,gemma_2b_it,3.4,alpacav2,wildbench_240612,[],holistic
0,gpt_4,4.41,agentbench_overall,agentbench_240720,[],agent
1,claude_v1.3,2.77,agentbench_overall,agentbench_240720,[],agent
2,gpt_3.5_turbo,2.55,agentbench_overall,agentbench_240720,[],agent
3,text_davinci_003,2.1,agentbench_overall,agentbench_240720,[],agent
4,claude_instant_v1.1,1.9,agentbench_overall,agentbench_240720,[],agent
5,text_davinci_002,1.46,agentbench_overall,agentbench_240720,[],agent
6,text_bison_001,1.39,agentbench_overall,agentbench_240720,[],agent
7,chatglm2_v0.2,1.31,agentbench_overall,agentbench_240720,[],agent
8,openchat_v3.2,1.15,agentbench_overall,agentbench_240720,[],agent
9,wizardlm_30b,0.83,agentbench_overall,agentbench_240720,[],agent
10,vicuna_13b,0.62,agentbench_overall,agentbench_240720,[],agent
11,wizardlm_13b,0.59,agentbench_overall,agentbench_240720,[],agent
12,llama2_13b_chat,0.55,agentbench_overall,agentbench_240720,[],agent
13,codegeex2_6b,0.53,agentbench_overall,agentbench_240720,[],agent
14,openchat_8192,0.51,agentbench_overall,agentbench_240720,[],agent
15,baichuan_13b_chat,0.36,agentbench_overall,agentbench_240720,[],agent
16,koala_13b,0.34,agentbench_overall,agentbench_240720,[],agent
17,llama2_7b_chat,0.31,agentbench_overall,agentbench_240720,[],agent
18,chatglm_6b,0.31,agentbench_overall,agentbench_240720,[],agent
19,vicuna_7b,0.24,agentbench_overall,agentbench_240720,[],agent
20,internlm_chat_7b,0.23,agentbench_overall,agentbench_240720,[],agent
21,baichuan_7b,0.22,agentbench_overall,agentbench_240720,[],agent
22,wizardcoder,0.21,agentbench_overall,agentbench_240720,[],agent
23,dolly_v2_12b,0.15,agentbench_overall,agentbench_240720,[],agent
24,oasst_sft_4_pythia_12b,0.07,agentbench_overall,agentbench_240720,[],agent
25,gpt_4,36.81,agentbench_os,agentbench_240720,[],agent
26,claude_v1.3,13.19,agentbench_os,agentbench_240720,[],agent
27,gpt_3.5_turbo,32.64,agentbench_os,agentbench_240720,[],agent
28,text_davinci_003,22.92,agentbench_os,agentbench_240720,[],agent
29,claude_instant_v1.1,14.58,agentbench_os,agentbench_240720,[],agent
30,text_davinci_002,4.86,agentbench_os,agentbench_240720,[],agent
31,text_bison_001,4.17,agentbench_os,agentbench_240720,[],agent
32,chatglm2_v0.2,14.58,agentbench_os,agentbench_240720,[],agent
33,openchat_v3.2,9.72,agentbench_os,agentbench_240720,[],agent
34,wizardlm_30b,14.58,agentbench_os,agentbench_240720,[],agent
35,vicuna_13b,8.33,agentbench_os,agentbench_240720,[],agent
36,wizardlm_13b,9.72,agentbench_os,agentbench_240720,[],agent
37,llama2_13b_chat,10.42,agentbench_os,agentbench_240720,[],agent
38,codegeex2_6b,12.5,agentbench_os,agentbench_240720,[],agent
39,openchat_8192,10.42,agentbench_os,agentbench_240720,[],agent
40,baichuan_13b_chat,11.81,agentbench_os,agentbench_240720,[],agent
41,koala_13b,2.78,agentbench_os,agentbench_240720,[],agent
42,llama2_7b_chat,10.42,agentbench_os,agentbench_240720,[],agent
43,chatglm_6b,4.86,agentbench_os,agentbench_240720,[],agent
44,vicuna_7b,6.25,agentbench_os,agentbench_240720,[],agent
45,internlm_chat_7b,3.47,agentbench_os,agentbench_240720,[],agent
46,baichuan_7b,4.17,agentbench_os,agentbench_240720,[],agent
47,wizardcoder,3.47,agentbench_os,agentbench_240720,[],agent
48,dolly_v2_12b,0.0,agentbench_os,agentbench_240720,[],agent
49,oasst_sft_4_pythia_12b,2.78,agentbench_os,agentbench_240720,[],agent
50,gpt_4,33.67,agentbench_db,agentbench_240720,[],agent
51,claude_v1.3,16.75,agentbench_db,agentbench_240720,[],agent
52,gpt_3.5_turbo,15.0,agentbench_db,agentbench_240720,[],agent
53,text_davinci_003,16.33,agentbench_db,agentbench_240720,[],agent
54,claude_instant_v1.1,8.0,agentbench_db,agentbench_240720,[],agent
55,text_davinci_002,13.67,agentbench_db,agentbench_240720,[],agent
56,text_bison_001,12.75,agentbench_db,agentbench_240720,[],agent
57,chatglm2_v0.2,13.67,agentbench_db,agentbench_240720,[],agent
58,openchat_v3.2,5.33,agentbench_db,agentbench_240720,[],agent
59,wizardlm_30b,12.67,agentbench_db,agentbench_240720,[],agent
60,vicuna_13b,11.33,agentbench_db,agentbench_240720,[],agent
61,wizardlm_13b,13.0,agentbench_db,agentbench_240720,[],agent
62,llama2_13b_chat,4.5,agentbench_db,agentbench_240720,[],agent
63,codegeex2_6b,6.5,agentbench_db,agentbench_240720,[],agent
64,openchat_8192,2.67,agentbench_db,agentbench_240720,[],agent
65,baichuan_13b_chat,3.0,agentbench_db,agentbench_240720,[],agent
66,koala_13b,5.33,agentbench_db,agentbench_240720,[],agent
67,llama2_7b_chat,2.75,agentbench_db,agentbench_240720,[],agent
68,chatglm_6b,0.33,agentbench_db,agentbench_240720,[],agent
69,vicuna_7b,3.33,agentbench_db,agentbench_240720,[],agent
70,internlm_chat_7b,6.33,agentbench_db,agentbench_240720,[],agent
71,baichuan_7b,0.0,agentbench_db,agentbench_240720,[],agent
72,wizardcoder,0.0,agentbench_db,agentbench_240720,[],agent
73,dolly_v2_12b,0.0,agentbench_db,agentbench_240720,[],agent
74,oasst_sft_4_pythia_12b,0.0,agentbench_db,agentbench_240720,[],agent
75,gpt_4,52.14,agentbench_kg,agentbench_240720,[],agent
76,claude_v1.3,36.22,agentbench_kg,agentbench_240720,[],agent
77,gpt_3.5_turbo,27.2,agentbench_kg,agentbench_240720,[],agent
78,text_davinci_003,30.82,agentbench_kg,agentbench_240720,[],agent
79,claude_instant_v1.1,29.67,agentbench_kg,agentbench_240720,[],agent
80,text_davinci_002,18.87,agentbench_kg,agentbench_240720,[],agent
81,text_bison_001,17.12,agentbench_kg,agentbench_240720,[],agent
82,chatglm2_v0.2,6.85,agentbench_kg,agentbench_240720,[],agent
83,openchat_v3.2,6.84,agentbench_kg,agentbench_240720,[],agent
84,wizardlm_30b,2.33,agentbench_kg,agentbench_240720,[],agent
85,vicuna_13b,1.24,agentbench_kg,agentbench_240720,[],agent
86,wizardlm_13b,0.44,agentbench_kg,agentbench_240720,[],agent
87,llama2_13b_chat,3.11,agentbench_kg,agentbench_240720,[],agent
88,codegeex2_6b,6.35,agentbench_kg,agentbench_240720,[],agent
89,openchat_8192,0.59,agentbench_kg,agentbench_240720,[],agent
90,baichuan_13b_chat,6.27,agentbench_kg,agentbench_240720,[],agent
91,koala_13b,0.0,agentbench_kg,agentbench_240720,[],agent
92,llama2_7b_chat,1.89,agentbench_kg,agentbench_240720,[],agent
93,chatglm_6b,0.0,agentbench_kg,agentbench_240720,[],agent
94,vicuna_7b,0.0,agentbench_kg,agentbench_240720,[],agent
95,internlm_chat_7b,0.0,agentbench_kg,agentbench_240720,[],agent
96,baichuan_7b,0.46,agentbench_kg,agentbench_240720,[],agent
97,wizardcoder,2.78,agentbench_kg,agentbench_240720,[],agent
98,dolly_v2_12b,0.0,agentbench_kg,agentbench_240720,[],agent
99,oasst_sft_4_pythia_12b,0.0,agentbench_kg,agentbench_240720,[],agent
100,gpt_4,50.0,agentbench_dcg,agentbench_240720,[],agent
101,claude_v1.3,30.0,agentbench_dcg,agentbench_240720,[],agent
102,gpt_3.5_turbo,30.0,agentbench_dcg,agentbench_240720,[],agent
103,text_davinci_003,15.0,agentbench_dcg,agentbench_240720,[],agent
104,claude_instant_v1.1,35.0,agentbench_dcg,agentbench_240720,[],agent
105,text_davinci_002,25.0,agentbench_dcg,agentbench_240720,[],agent
106,text_bison_001,20.0,agentbench_dcg,agentbench_240720,[],agent
107,chatglm2_v0.2,10.0,agentbench_dcg,agentbench_240720,[],agent
108,openchat_v3.2,0.0,agentbench_dcg,agentbench_240720,[],agent
109,wizardlm_30b,10.0,agentbench_dcg,agentbench_240720,[],agent
110,vicuna_13b,0.0,agentbench_dcg,agentbench_240720,[],agent
111,wizardlm_13b,0.0,agentbench_dcg,agentbench_240720,[],agent
112,llama2_13b_chat,0.0,agentbench_dcg,agentbench_240720,[],agent
113,codegeex2_6b,0.0,agentbench_dcg,agentbench_240720,[],agent
114,openchat_8192,10.0,agentbench_dcg,agentbench_240720,[],agent
115,baichuan_13b_chat,0.0,agentbench_dcg,agentbench_240720,[],agent
116,koala_13b,0.0,agentbench_dcg,agentbench_240720,[],agent
117,llama2_7b_chat,0.0,agentbench_dcg,agentbench_240720,[],agent
118,chatglm_6b,0.0,agentbench_dcg,agentbench_240720,[],agent
119,vicuna_7b,0.0,agentbench_dcg,agentbench_240720,[],agent
120,internlm_chat_7b,0.0,agentbench_dcg,agentbench_240720,[],agent
121,baichuan_7b,0.0,agentbench_dcg,agentbench_240720,[],agent
122,wizardcoder,0.0,agentbench_dcg,agentbench_240720,[],agent
123,dolly_v2_12b,0.0,agentbench_dcg,agentbench_240720,[],agent
124,oasst_sft_4_pythia_12b,0.0,agentbench_dcg,agentbench_240720,[],agent
125,gpt_4,17.6,agentbench_ltp,agentbench_240720,[],agent
126,claude_v1.3,6.39,agentbench_ltp,agentbench_240720,[],agent
127,gpt_3.5_turbo,14.85,agentbench_ltp,agentbench_240720,[],agent
128,text_davinci_003,5.21,agentbench_ltp,agentbench_240720,[],agent
129,claude_instant_v1.1,6.08,agentbench_ltp,agentbench_240720,[],agent
130,text_davinci_002,2.32,agentbench_ltp,agentbench_240720,[],agent
131,text_bison_001,0.12,agentbench_ltp,agentbench_240720,[],agent
132,chatglm2_v0.2,12.62,agentbench_ltp,agentbench_240720,[],agent
133,openchat_v3.2,9.54,agentbench_ltp,agentbench_240720,[],agent
134,wizardlm_30b,4.47,agentbench_ltp,agentbench_240720,[],agent
135,vicuna_13b,7.97,agentbench_ltp,agentbench_240720,[],agent
136,wizardlm_13b,4.06,agentbench_ltp,agentbench_240720,[],agent
137,llama2_13b_chat,3.69,agentbench_ltp,agentbench_240720,[],agent
138,codegeex2_6b,1.94,agentbench_ltp,agentbench_240720,[],agent
139,openchat_8192,0.0,agentbench_ltp,agentbench_240720,[],agent
140,baichuan_13b_chat,1.3,agentbench_ltp,agentbench_240720,[],agent
141,koala_13b,2.46,agentbench_ltp,agentbench_240720,[],agent
142,llama2_7b_chat,2.1,agentbench_ltp,agentbench_240720,[],agent
143,chatglm_6b,6.69,agentbench_ltp,agentbench_240720,[],agent
144,vicuna_7b,3.08,agentbench_ltp,agentbench_240720,[],agent
145,internlm_chat_7b,3.41,agentbench_ltp,agentbench_240720,[],agent
146,baichuan_7b,2.29,agentbench_ltp,agentbench_240720,[],agent
147,wizardcoder,1.32,agentbench_ltp,agentbench_240720,[],agent
148,dolly_v2_12b,3.36,agentbench_ltp,agentbench_240720,[],agent
149,oasst_sft_4_pythia_12b,1.48,agentbench_ltp,agentbench_240720,[],agent
150,gpt_4,78.0,agentbench_hh,agentbench_240720,[],agent
151,claude_v1.3,52.0,agentbench_hh,agentbench_240720,[],agent
152,gpt_3.5_turbo,14.0,agentbench_hh,agentbench_240720,[],agent
153,text_davinci_003,20.0,agentbench_hh,agentbench_240720,[],agent
154,claude_instant_v1.1,26.0,agentbench_hh,agentbench_240720,[],agent
155,text_davinci_002,14.0,agentbench_hh,agentbench_240720,[],agent
156,text_bison_001,4.0,agentbench_hh,agentbench_240720,[],agent
157,chatglm2_v0.2,6.0,agentbench_hh,agentbench_240720,[],agent
158,openchat_v3.2,8.0,agentbench_hh,agentbench_240720,[],agent
159,wizardlm_30b,6.0,agentbench_hh,agentbench_240720,[],agent
160,vicuna_13b,0.0,agentbench_hh,agentbench_240720,[],agent
161,wizardlm_13b,6.0,agentbench_hh,agentbench_240720,[],agent
162,llama2_13b_chat,2.0,agentbench_hh,agentbench_240720,[],agent
163,codegeex2_6b,0.0,agentbench_hh,agentbench_240720,[],agent
164,openchat_8192,4.0,agentbench_hh,agentbench_240720,[],agent
165,baichuan_13b_chat,0.0,agentbench_hh,agentbench_240720,[],agent
166,koala_13b,0.0,agentbench_hh,agentbench_240720,[],agent
167,llama2_7b_chat,0.0,agentbench_hh,agentbench_240720,[],agent
168,chatglm_6b,0.0,agentbench_hh,agentbench_240720,[],agent
169,vicuna_7b,0.0,agentbench_hh,agentbench_240720,[],agent
170,internlm_chat_7b,0.0,agentbench_hh,agentbench_240720,[],agent
171,baichuan_7b,0.0,agentbench_hh,agentbench_240720,[],agent
172,wizardcoder,0.0,agentbench_hh,agentbench_240720,[],agent
173,dolly_v2_12b,0.0,agentbench_hh,agentbench_240720,[],agent
174,oasst_sft_4_pythia_12b,0.0,agentbench_hh,agentbench_240720,[],agent
175,gpt_4,58.6,agentbench_ws,agentbench_240720,[],agent
176,claude_v1.3,59.26,agentbench_ws,agentbench_240720,[],agent
177,gpt_3.5_turbo,67.21,agentbench_ws,agentbench_240720,[],agent
178,text_davinci_003,61.43,agentbench_ws,agentbench_240720,[],agent
179,claude_instant_v1.1,44.22,agentbench_ws,agentbench_240720,[],agent
180,text_davinci_002,60.15,agentbench_ws,agentbench_240720,[],agent
181,text_bison_001,46.06,agentbench_ws,agentbench_240720,[],agent
182,chatglm2_v0.2,19.35,agentbench_ws,agentbench_240720,[],agent
183,openchat_v3.2,50.17,agentbench_ws,agentbench_240720,[],agent
184,wizardlm_30b,10.6,agentbench_ws,agentbench_240720,[],agent
185,vicuna_13b,12.57,agentbench_ws,agentbench_240720,[],agent
186,wizardlm_13b,1.2,agentbench_ws,agentbench_240720,[],agent
187,llama2_13b_chat,3.12,agentbench_ws,agentbench_240720,[],agent
188,codegeex2_6b,11.8,agentbench_ws,agentbench_240720,[],agent
189,openchat_8192,6.68,agentbench_ws,agentbench_240720,[],agent
190,baichuan_13b_chat,5.74,agentbench_ws,agentbench_240720,[],agent
191,koala_13b,5.96,agentbench_ws,agentbench_240720,[],agent
192,llama2_7b_chat,2.22,agentbench_ws,agentbench_240720,[],agent
193,chatglm_6b,0.5,agentbench_ws,agentbench_240720,[],agent
194,vicuna_7b,6.4,agentbench_ws,agentbench_240720,[],agent
195,internlm_chat_7b,0.0,agentbench_ws,agentbench_240720,[],agent
196,baichuan_7b,2.84,agentbench_ws,agentbench_240720,[],agent
197,wizardcoder,0.0,agentbench_ws,agentbench_240720,[],agent
198,dolly_v2_12b,0.38,agentbench_ws,agentbench_240720,[],agent
199,oasst_sft_4_pythia_12b,0.0,agentbench_ws,agentbench_240720,[],agent
200,gpt_4,22.59,agentbench_wb,agentbench_240720,[],agent
201,claude_v1.3,20.97,agentbench_wb,agentbench_240720,[],agent
202,gpt_3.5_turbo,15.69,agentbench_wb,agentbench_240720,[],agent
203,text_davinci_003,15.52,agentbench_wb,agentbench_240720,[],agent
204,claude_instant_v1.1,0.77,agentbench_wb,agentbench_240720,[],agent
205,text_davinci_002,1.11,agentbench_wb,agentbench_240720,[],agent
206,text_bison_001,20.46,agentbench_wb,agentbench_240720,[],agent
207,chatglm2_v0.2,12.87,agentbench_wb,agentbench_240720,[],agent
208,openchat_v3.2,14.92,agentbench_wb,agentbench_240720,[],agent
209,wizardlm_30b,3.07,agentbench_wb,agentbench_240720,[],agent
210,vicuna_13b,3.92,agentbench_wb,agentbench_240720,[],agent
211,wizardlm_13b,5.8,agentbench_wb,agentbench_240720,[],agent
212,llama2_13b_chat,11.94,agentbench_wb,agentbench_240720,[],agent
213,codegeex2_6b,5.37,agentbench_wb,agentbench_240720,[],agent
214,openchat_8192,7.08,agentbench_wb,agentbench_240720,[],agent
215,baichuan_13b_chat,2.3,agentbench_wb,agentbench_240720,[],agent
216,koala_13b,8.1,agentbench_wb,agentbench_240720,[],agent
217,llama2_7b_chat,3.75,agentbench_wb,agentbench_240720,[],agent
218,chatglm_6b,4.94,agentbench_wb,agentbench_240720,[],agent
219,vicuna_7b,0.17,agentbench_wb,agentbench_240720,[],agent
220,internlm_chat_7b,0.17,agentbench_wb,agentbench_240720,[],agent
221,baichuan_7b,5.8,agentbench_wb,agentbench_240720,[],agent
222,wizardcoder,6.65,agentbench_wb,agentbench_240720,[],agent
223,dolly_v2_12b,4.43,agentbench_wb,agentbench_240720,[],agent
224,oasst_sft_4_pythia_12b,0.34,agentbench_wb,agentbench_240720,[],agent
0,pythia_1b,31.4,arc_c,olmes_260624,[],reasoning
1,olmo_1b,38.6,arc_c,olmes_260624,[],reasoning
2,tinyllama_1.1b,38.1,arc_c,olmes_260624,[],reasoning
3,pythia_6.7b,44.6,arc_c,olmes_260624,[],reasoning
4,rpj_incite_7b,45.3,arc_c,olmes_260624,[],reasoning
5,stablelm2_1.6b,50.6,arc_c,olmes_260624,[],reasoning
6,olmo_7b,46.4,arc_c,olmes_260624,[],reasoning
7,mpt_7b,45.7,arc_c,olmes_260624,[],reasoning
8,falcon_7b,49.7,arc_c,olmes_260624,[],reasoning
9,llama2_7b,54.2,arc_c,olmes_260624,[],reasoning
10,llama2_13b,67.3,arc_c,olmes_260624,[],reasoning
11,olmo_1.7_7b,66.9,arc_c,olmes_260624,[],reasoning
12,llama3_8b,79.3,arc_c,olmes_260624,[],reasoning
13,mistral_7b_v0.1,78.6,arc_c,olmes_260624,[],reasoning
14,llama3_70b,93.7,arc_c,olmes_260624,[],reasoning
15,pythia_1b,63.4,arc_e,olmes_260624,[],reasoning
16,olmo_1b,68.3,arc_e,olmes_260624,[],reasoning
17,tinyllama_1.1b,69.5,arc_e,olmes_260624,[],reasoning
18,pythia_6.7b,72.6,arc_e,olmes_260624,[],reasoning
19,rpj_incite_7b,78.8,arc_e,olmes_260624,[],reasoning
20,stablelm2_1.6b,75.3,arc_e,olmes_260624,[],reasoning
21,olmo_7b,78.9,arc_e,olmes_260624,[],reasoning
22,mpt_7b,78.0,arc_e,olmes_260624,[],reasoning
23,falcon_7b,80.6,arc_e,olmes_260624,[],reasoning
24,llama2_7b,84.0,arc_e,olmes_260624,[],reasoning
25,llama2_13b,85.9,arc_e,olmes_260624,[],reasoning
26,olmo_1.7_7b,83.6,arc_e,olmes_260624,[],reasoning
27,llama3_8b,92.4,arc_e,olmes_260624,[],reasoning
28,mistral_7b_v0.1,90.8,arc_e,olmes_260624,[],reasoning
29,llama3_70b,97.7,arc_e,olmes_260624,[],reasoning
30,pythia_1b,56.8,boolq,olmes_260624,[],knowledge
31,olmo_1b,51.3,boolq,olmes_260624,[],knowledge
32,tinyllama_1.1b,63.6,boolq,olmes_260624,[],knowledge
33,pythia_6.7b,68.7,boolq,olmes_260624,[],knowledge
34,rpj_incite_7b,72.0,boolq,olmes_260624,[],knowledge
35,stablelm2_1.6b,82.3,boolq,olmes_260624,[],knowledge
36,olmo_7b,78.7,boolq,olmes_260624,[],knowledge
37,mpt_7b,82.4,boolq,olmes_260624,[],knowledge
38,falcon_7b,78.2,boolq,olmes_260624,[],knowledge
39,llama2_7b,86.1,boolq,olmes_260624,[],knowledge
40,llama2_13b,86.7,boolq,olmes_260624,[],knowledge
41,olmo_1.7_7b,85.9,boolq,olmes_260624,[],knowledge
42,llama3_8b,87.5,boolq,olmes_260624,[],knowledge
43,mistral_7b_v0.1,89.3,boolq,olmes_260624,[],knowledge
44,llama3_70b,91.7,boolq,olmes_260624,[],knowledge
45,pythia_1b,50.9,csqa,olmes_260624,[],knowledge
46,olmo_1b,62.2,csqa,olmes_260624,[],knowledge
47,tinyllama_1.1b,61.1,csqa,olmes_260624,[],knowledge
48,pythia_6.7b,62.1,csqa,olmes_260624,[],knowledge
49,rpj_incite_7b,69.2,csqa,olmes_260624,[],knowledge
50,stablelm2_1.6b,70.4,csqa,olmes_260624,[],knowledge
51,olmo_7b,70.8,csqa,olmes_260624,[],knowledge
52,mpt_7b,70.9,csqa,olmes_260624,[],knowledge
53,falcon_7b,73.4,csqa,olmes_260624,[],knowledge
54,llama2_7b,74.2,csqa,olmes_260624,[],knowledge
55,llama2_13b,74.0,csqa,olmes_260624,[],knowledge
56,olmo_1.7_7b,85.8,csqa,olmes_260624,[],knowledge
57,llama3_8b,73.9,csqa,olmes_260624,[],knowledge
58,mistral_7b_v0.1,72.4,csqa,olmes_260624,[],knowledge
59,llama3_70b,83.2,csqa,olmes_260624,[],knowledge
60,pythia_1b,48.0,hellaswag,olmes_260624,[],reasoning
61,olmo_1b,65.2,hellaswag,olmes_260624,[],reasoning
62,tinyllama_1.1b,60.8,hellaswag,olmes_260624,[],reasoning
63,pythia_6.7b,66.1,hellaswag,olmes_260624,[],reasoning
64,rpj_incite_7b,72.8,hellaswag,olmes_260624,[],reasoning
65,stablelm2_1.6b,70.3,hellaswag,olmes_260624,[],reasoning
66,olmo_7b,78.1,hellaswag,olmes_260624,[],reasoning
67,mpt_7b,79.6,hellaswag,olmes_260624,[],reasoning
68,falcon_7b,79.0,hellaswag,olmes_260624,[],reasoning
69,llama2_7b,78.9,hellaswag,olmes_260624,[],reasoning
70,llama2_13b,83.9,hellaswag,olmes_260624,[],reasoning
71,olmo_1.7_7b,80.1,hellaswag,olmes_260624,[],reasoning
72,llama3_8b,81.8,hellaswag,olmes_260624,[],reasoning
73,mistral_7b_v0.1,83.0,hellaswag,olmes_260624,[],reasoning
74,llama3_70b,89.5,hellaswag,olmes_260624,[],reasoning
75,pythia_1b,31.1,mmlu,olmes_260624,[],knowledge
76,olmo_1b,33.4,mmlu,olmes_260624,[],knowledge
77,tinyllama_1.1b,33.6,mmlu,olmes_260624,[],knowledge
78,pythia_6.7b,37.7,mmlu,olmes_260624,[],knowledge
79,rpj_incite_7b,40.1,mmlu,olmes_260624,[],knowledge
80,stablelm2_1.6b,40.4,mmlu,olmes_260624,[],knowledge
81,olmo_7b,40.5,mmlu,olmes_260624,[],knowledge
82,mpt_7b,40.6,mmlu,olmes_260624,[],knowledge
83,falcon_7b,42.1,mmlu,olmes_260624,[],knowledge
84,llama2_7b,46.2,mmlu,olmes_260624,[],knowledge
85,llama2_13b,55.8,mmlu,olmes_260624,[],knowledge
86,olmo_1.7_7b,54.4,mmlu,olmes_260624,[],knowledge
87,llama3_8b,66.6,mmlu,olmes_260624,[],knowledge
88,mistral_7b_v0.1,64.0,mmlu,olmes_260624,[],knowledge
89,llama3_70b,79.8,mmlu,olmes_260624,[],knowledge
90,pythia_1b,40.4,openbookqa,olmes_260624,[],knowledge
91,olmo_1b,47.6,openbookqa,olmes_260624,[],knowledge
92,tinyllama_1.1b,45.0,openbookqa,olmes_260624,[],knowledge
93,pythia_6.7b,50.4,openbookqa,olmes_260624,[],knowledge
94,rpj_incite_7b,49.0,openbookqa,olmes_260624,[],knowledge
95,stablelm2_1.6b,56.6,openbookqa,olmes_260624,[],knowledge
96,olmo_7b,55.8,openbookqa,olmes_260624,[],knowledge
97,mpt_7b,52.4,openbookqa,olmes_260624,[],knowledge
98,falcon_7b,55.2,openbookqa,olmes_260624,[],knowledge
99,llama2_7b,57.8,openbookqa,olmes_260624,[],knowledge
100,llama2_13b,65.4,openbookqa,olmes_260624,[],knowledge
101,olmo_1.7_7b,68.6,openbookqa,olmes_260624,[],knowledge
102,llama3_8b,77.2,openbookqa,olmes_260624,[],knowledge
103,mistral_7b_v0.1,80.6,openbookqa,olmes_260624,[],knowledge
104,llama3_70b,93.4,openbookqa,olmes_260624,[],knowledge
105,pythia_1b,68.9,piqa,olmes_260624,[],reasoning
106,olmo_1b,74.1,piqa,olmes_260624,[],reasoning
107,tinyllama_1.1b,71.7,piqa,olmes_260624,[],reasoning
108,pythia_6.7b,74.9,piqa,olmes_260624,[],reasoning
109,rpj_incite_7b,75.9,piqa,olmes_260624,[],reasoning
110,stablelm2_1.6b,75.6,piqa,olmes_260624,[],reasoning
111,olmo_7b,78.5,piqa,olmes_260624,[],reasoning
112,mpt_7b,79.2,piqa,olmes_260624,[],reasoning
113,falcon_7b,79.0,piqa,olmes_260624,[],reasoning
114,llama2_7b,77.5,piqa,olmes_260624,[],reasoning
115,llama2_13b,80.2,piqa,olmes_260624,[],reasoning
116,olmo_1.7_7b,80.3,piqa,olmes_260624,[],reasoning
117,llama3_8b,81.6,piqa,olmes_260624,[],reasoning
118,mistral_7b_v0.1,82.8,piqa,olmes_260624,[],reasoning
119,llama3_70b,91.6,piqa,olmes_260624,[],reasoning
120,pythia_1b,46.4,siqa,olmes_260624,[],other
121,olmo_1b,51.5,siqa,olmes_260624,[],other
122,tinyllama_1.1b,50.4,siqa,olmes_260624,[],other
123,pythia_6.7b,51.7,siqa,olmes_260624,[],other
124,rpj_incite_7b,56.6,siqa,olmes_260624,[],other
125,stablelm2_1.6b,64.3,siqa,olmes_260624,[],other
126,olmo_7b,56.5,siqa,olmes_260624,[],other
127,mpt_7b,57.4,siqa,olmes_260624,[],other
128,falcon_7b,60.1,siqa,olmes_260624,[],other
129,llama2_7b,59.6,siqa,olmes_260624,[],other
130,llama2_13b,65.9,siqa,olmes_260624,[],other
131,olmo_1.7_7b,76.1,siqa,olmes_260624,[],other
132,llama3_8b,70.2,siqa,olmes_260624,[],other
133,mistral_7b_v0.1,71.3,siqa,olmes_260624,[],other
134,llama3_70b,78.9,siqa,olmes_260624,[],other
135,pythia_1b,52.7,winogrande,olmes_260624,[],reasoning
136,olmo_1b,59.3,winogrande,olmes_260624,[],reasoning
137,tinyllama_1.1b,60.1,winogrande,olmes_260624,[],reasoning
138,pythia_6.7b,62.3,winogrande,olmes_260624,[],reasoning
139,rpj_incite_7b,68.0,winogrande,olmes_260624,[],reasoning
140,stablelm2_1.6b,65.7,winogrande,olmes_260624,[],reasoning
141,olmo_7b,68.5,winogrande,olmes_260624,[],reasoning
142,mpt_7b,70.2,winogrande,olmes_260624,[],reasoning
143,falcon_7b,71.3,winogrande,olmes_260624,[],reasoning
144,llama2_7b,71.7,winogrande,olmes_260624,[],reasoning
145,llama2_13b,74.9,winogrande,olmes_260624,[],reasoning
146,olmo_1.7_7b,73.6,winogrande,olmes_260624,[],reasoning
147,llama3_8b,76.2,winogrande,olmes_260624,[],reasoning
148,mistral_7b_v0.1,77.9,winogrande,olmes_260624,[],reasoning
149,llama3_70b,84.1,winogrande,olmes_260624,[],reasoning
150,pythia_1b,49.0,olmes_average,olmes_260624,[],holistic
151,olmo_1b,55.1,olmes_average,olmes_260624,[],holistic
152,tinyllama_1.1b,55.4,olmes_average,olmes_260624,[],holistic
153,pythia_6.7b,59.1,olmes_average,olmes_260624,[],holistic
154,rpj_incite_7b,62.8,olmes_average,olmes_260624,[],holistic
155,stablelm2_1.6b,65.1,olmes_average,olmes_260624,[],holistic
156,olmo_7b,65.3,olmes_average,olmes_260624,[],holistic
157,mpt_7b,65.6,olmes_average,olmes_260624,[],holistic
158,falcon_7b,66.9,olmes_average,olmes_260624,[],holistic
159,llama2_7b,69.0,olmes_average,olmes_260624,[],holistic
160,llama2_13b,74.0,olmes_average,olmes_260624,[],holistic
161,olmo_1.7_7b,75.5,olmes_average,olmes_260624,[],holistic
162,llama3_8b,78.7,olmes_average,olmes_260624,[],holistic
163,mistral_7b_v0.1,79.1,olmes_average,olmes_260624,[],holistic
164,llama3_70b,88.4,olmes_average,olmes_260624,[],holistic
0,llama_2_70b,0.3753,mmlu_pro,mmlu_pro_240610,[],knowledge
1,llama_3_8b,0.3536,mmlu_pro,mmlu_pro_240610,[],knowledge
2,deepseekmath_instruct,0.353,mmlu_pro,mmlu_pro_240610,[],knowledge
3,gemma_7b,0.3373,mmlu_pro,mmlu_pro_240610,[],knowledge
4,mistral_7b_v0.1,0.3088,mmlu_pro,mmlu_pro_240610,[],knowledge
5,mistral_7b_instruct_v0.2,0.3084,mmlu_pro,mmlu_pro_240610,[],knowledge
6,mistral_7b_v0.2,0.3043,mmlu_pro,mmlu_pro_240610,[],knowledge
7,qwen1.5_7b_chat,0.2906,mmlu_pro,mmlu_pro_240610,[],knowledge
8,yi_6b_chat,0.2884,mmlu_pro,mmlu_pro_240610,[],knowledge
9,yi_6b,0.2651,mmlu_pro,mmlu_pro_240610,[],knowledge
10,mistral_7b_instruct_v0.1,0.2575,mmlu_pro,mmlu_pro_240610,[],knowledge
11,llama_2_13b,0.2534,mmlu_pro,mmlu_pro_240610,[],knowledge
12,llemma_7b,0.2345,mmlu_pro,mmlu_pro_240610,[],knowledge
13,llama_2_7b,0.2032,mmlu_pro,mmlu_pro_240610,[],knowledge
14,gpt_4o,0.7255,mmlu_pro,mmlu_pro_240610,[],knowledge
15,claude_3_opus,0.6845,mmlu_pro,mmlu_pro_240610,[],knowledge
16,gpt_4_turbo,0.6371,mmlu_pro,mmlu_pro_240610,[],knowledge
17,gemini_1.5_flash,0.5912,mmlu_pro,mmlu_pro_240610,[],knowledge
18,yi_large,0.5753,mmlu_pro,mmlu_pro_240610,[],knowledge
19,claude_3_sonnet,0.568,mmlu_pro,mmlu_pro_240610,[],knowledge
20,llama_3_70b_instruct,0.562,mmlu_pro,mmlu_pro_240610,[],knowledge
21,deepseek_v2,0.5481,mmlu_pro,mmlu_pro_240610,[],knowledge
22,phi_3_medium_4k_instruct,0.5348,mmlu_pro,mmlu_pro_240610,[],knowledge
23,llama_3_70b,0.5278,mmlu_pro,mmlu_pro_240610,[],knowledge
24,qwen1.5_72b_chat,0.5162,mmlu_pro,mmlu_pro_240610,[],knowledge
25,mammoth2_8x7b_plus,0.504,mmlu_pro,mmlu_pro_240610,[],knowledge
26,qwen1.5_110b,0.4993,mmlu_pro,mmlu_pro_240610,[],knowledge
27,mammoth2_8b_plus,0.4335,mmlu_pro,mmlu_pro_240610,[],knowledge
28,mixtral_8x7b_instruct_v0.1,0.4327,mmlu_pro,mmlu_pro_240610,[],knowledge
29,phi_3_mini_4k_instruct,0.4317,mmlu_pro,mmlu_pro_240610,[],knowledge
30,yi_34b,0.4303,mmlu_pro,mmlu_pro_240610,[],knowledge
31,mixtral_8x7b_v0.1,0.4103,mmlu_pro,mmlu_pro_240610,[],knowledge
32,llama_3_8b_instruct,0.4098,mmlu_pro,mmlu_pro_240610,[],knowledge
33,mammoth2_7b_plus,0.4085,mmlu_pro,mmlu_pro_240610,[],knowledge
34,qwen1.5_14b_chat,0.3802,mmlu_pro,mmlu_pro_240610,[],knowledge
35,c4ai_command_r_v01,0.379,mmlu_pro,mmlu_pro_240610,[],knowledge
0,claude_3_5_sonnet_20240620,61.16,livebench_average,livebench_240701,[],holistic
1,gpt_4o_2024_05_13,54.96,livebench_average,livebench_240701,[],holistic
2,gpt_4_turbo_2024_04_09,53.0,livebench_average,livebench_240701,[],holistic
3,gpt_4_1106_preview,52.17,livebench_average,livebench_240701,[],holistic
4,claude_3_opus_20240229,50.75,livebench_average,livebench_240701,[],holistic
5,gpt_4_0125_preview,49.39,livebench_average,livebench_240701,[],holistic
6,deepseek_coder_v2,46.79,livebench_average,livebench_240701,[],holistic
7,gemini_1.5_pro_api_0514,44.35,livebench_average,livebench_240701,[],holistic
8,gemma_2_27b_it,41.22,livebench_average,livebench_240701,[],holistic
9,gemini_1.5_flash_api_0514,40.89,livebench_average,livebench_240701,[],holistic
10,qwen2_72b_instruct,40.16,livebench_average,livebench_240701,[],holistic
11,acm_rewrite_qwen2_72b_chat,39.6,livebench_average,livebench_240701,[],holistic
12,mistral_large_2402,38.92,livebench_average,livebench_240701,[],holistic
13,deepseek_chat_v2,38.39,livebench_average,livebench_240701,[],holistic
14,claude_3_sonnet_20240229,38.08,livebench_average,livebench_240701,[],holistic
15,meta_llama_3_70b_instruct,37.38,livebench_average,livebench_240701,[],holistic
16,claude_3_haiku_20240307,35.32,livebench_average,livebench_240701,[],holistic
17,mixtral_8x22b_instruct_v0.1,34.84,livebench_average,livebench_240701,[],holistic
18,gpt_3.5_turbo_0125,34.43,livebench_average,livebench_240701,[],holistic
19,gpt_3.5_turbo_1106,34.14,livebench_average,livebench_240701,[],holistic
20,command_r_plus,32.86,livebench_average,livebench_240701,[],holistic
21,mistral_small_2402,32.8,livebench_average,livebench_240701,[],holistic
22,gemma_2_9b_it,31.57,livebench_average,livebench_240701,[],holistic
23,phi_3_medium_4k_instruct,30.33,livebench_average,livebench_240701,[],holistic
24,phi_3_medium_128k_instruct,29.64,livebench_average,livebench_240701,[],holistic
25,deepseek_coder_v2_lite_instruct,29.15,livebench_average,livebench_240701,[],holistic
26,qwen1.5_110b_chat,28.96,livebench_average,livebench_240701,[],holistic
27,qwen1.5_72b_chat,28.89,livebench_average,livebench_240701,[],holistic
28,command_r,27.23,livebench_average,livebench_240701,[],holistic
29,phi_3_small_128k_instruct,27.19,livebench_average,livebench_240701,[],holistic
30,meta_llama_3_8b_instruct,26.67,livebench_average,livebench_240701,[],holistic
31,qwen2_7b_instruct,26.45,livebench_average,livebench_240701,[],holistic
32,phi_3_small_8k_instruct,26.24,livebench_average,livebench_240701,[],holistic
33,openhermes_2.5_mistral_7b,23.3,livebench_average,livebench_240701,[],holistic
34,mixtral_8x7b_instruct_v0.1,22.5,livebench_average,livebench_240701,[],holistic
35,mistral_7b_instruct_v0.2,19.33,livebench_average,livebench_240701,[],holistic
36,phi_3_mini_4k_instruct,19.27,livebench_average,livebench_240701,[],holistic
37,zephyr_7b_alpha,19.22,livebench_average,livebench_240701,[],holistic
38,phi_3_mini_128k_instruct,18.04,livebench_average,livebench_240701,[],holistic
39,zephyr_7b_beta,17.32,livebench_average,livebench_240701,[],holistic
40,deepseek_v2_lite_chat,17.14,livebench_average,livebench_240701,[],holistic
41,qwen1.5_7b_chat,16.5,livebench_average,livebench_240701,[],holistic
42,starling_lm_7b_beta,16.44,livebench_average,livebench_240701,[],holistic
43,vicuna_7b_v1.5_16k,13.71,livebench_average,livebench_240701,[],holistic
44,vicuna_7b_v1.5,11.73,livebench_average,livebench_240701,[],holistic
45,qwen1.5_4b_chat,11.13,livebench_average,livebench_240701,[],holistic
46,llama_2_7b_chat,10.25,livebench_average,livebench_240701,[],holistic
47,qwen2_1.5b_instruct,9.96,livebench_average,livebench_240701,[],holistic
48,yi_6b_chat,8.79,livebench_average,livebench_240701,[],holistic
49,qwen2_0.5b_instruct,6.78,livebench_average,livebench_240701,[],holistic
50,qwen1.5_1.8b_chat,6.09,livebench_average,livebench_240701,[],holistic
51,qwen1.5_0.5b_chat,5.26,livebench_average,livebench_240701,[],holistic
52,claude_3_5_sonnet_20240620,64.0,reasoning_average,livebench_240701,[],reasoning
53,gpt_4o_2024_05_13,55.0,reasoning_average,livebench_240701,[],reasoning
54,gpt_4_turbo_2024_04_09,54.0,reasoning_average,livebench_240701,[],reasoning
55,gpt_4_1106_preview,52.0,reasoning_average,livebench_240701,[],reasoning
56,claude_3_opus_20240229,41.0,reasoning_average,livebench_240701,[],reasoning
57,gpt_4_0125_preview,48.0,reasoning_average,livebench_240701,[],reasoning
58,deepseek_coder_v2,49.0,reasoning_average,livebench_240701,[],reasoning
59,gemini_1.5_pro_api_0514,33.0,reasoning_average,livebench_240701,[],reasoning
60,gemma_2_27b_it,31.0,reasoning_average,livebench_240701,[],reasoning
61,gemini_1.5_flash_api_0514,30.0,reasoning_average,livebench_240701,[],reasoning
62,qwen2_72b_instruct,42.0,reasoning_average,livebench_240701,[],reasoning
63,acm_rewrite_qwen2_72b_chat,37.0,reasoning_average,livebench_240701,[],reasoning
64,mistral_large_2402,35.0,reasoning_average,livebench_240701,[],reasoning
65,deepseek_chat_v2,29.0,reasoning_average,livebench_240701,[],reasoning
66,claude_3_sonnet_20240229,26.0,reasoning_average,livebench_240701,[],reasoning
67,meta_llama_3_70b_instruct,31.0,reasoning_average,livebench_240701,[],reasoning
68,claude_3_haiku_20240307,26.0,reasoning_average,livebench_240701,[],reasoning
69,mixtral_8x22b_instruct_v0.1,29.0,reasoning_average,livebench_240701,[],reasoning
70,gpt_3.5_turbo_0125,26.0,reasoning_average,livebench_240701,[],reasoning
71,gpt_3.5_turbo_1106,28.0,reasoning_average,livebench_240701,[],reasoning
72,command_r_plus,32.0,reasoning_average,livebench_240701,[],reasoning
73,mistral_small_2402,28.0,reasoning_average,livebench_240701,[],reasoning
74,gemma_2_9b_it,19.0,reasoning_average,livebench_240701,[],reasoning
75,phi_3_medium_4k_instruct,35.0,reasoning_average,livebench_240701,[],reasoning
76,phi_3_medium_128k_instruct,31.0,reasoning_average,livebench_240701,[],reasoning
77,deepseek_coder_v2_lite_instruct,22.0,reasoning_average,livebench_240701,[],reasoning
78,qwen1.5_110b_chat,26.0,reasoning_average,livebench_240701,[],reasoning
79,qwen1.5_72b_chat,21.0,reasoning_average,livebench_240701,[],reasoning
80,command_r,28.0,reasoning_average,livebench_240701,[],reasoning
81,phi_3_small_128k_instruct,36.0,reasoning_average,livebench_240701,[],reasoning
82,meta_llama_3_8b_instruct,25.0,reasoning_average,livebench_240701,[],reasoning
83,qwen2_7b_instruct,20.0,reasoning_average,livebench_240701,[],reasoning
84,phi_3_small_8k_instruct,23.0,reasoning_average,livebench_240701,[],reasoning
85,openhermes_2.5_mistral_7b,17.0,reasoning_average,livebench_240701,[],reasoning
86,mixtral_8x7b_instruct_v0.1,18.0,reasoning_average,livebench_240701,[],reasoning
87,mistral_7b_instruct_v0.2,13.0,reasoning_average,livebench_240701,[],reasoning
88,phi_3_mini_4k_instruct,19.0,reasoning_average,livebench_240701,[],reasoning
89,zephyr_7b_alpha,17.0,reasoning_average,livebench_240701,[],reasoning
90,phi_3_mini_128k_instruct,10.0,reasoning_average,livebench_240701,[],reasoning
91,zephyr_7b_beta,16.0,reasoning_average,livebench_240701,[],reasoning
92,deepseek_v2_lite_chat,13.0,reasoning_average,livebench_240701,[],reasoning
93,qwen1.5_7b_chat,13.0,reasoning_average,livebench_240701,[],reasoning
94,starling_lm_7b_beta,19.0,reasoning_average,livebench_240701,[],reasoning
95,vicuna_7b_v1.5_16k,15.0,reasoning_average,livebench_240701,[],reasoning
96,vicuna_7b_v1.5,12.0,reasoning_average,livebench_240701,[],reasoning
97,qwen1.5_4b_chat,13.0,reasoning_average,livebench_240701,[],reasoning
98,llama_2_7b_chat,5.0,reasoning_average,livebench_240701,[],reasoning
99,qwen2_1.5b_instruct,8.0,reasoning_average,livebench_240701,[],reasoning
100,yi_6b_chat,8.0,reasoning_average,livebench_240701,[],reasoning
101,qwen2_0.5b_instruct,3.0,reasoning_average,livebench_240701,[],reasoning
102,qwen1.5_1.8b_chat,5.0,reasoning_average,livebench_240701,[],reasoning
103,qwen1.5_0.5b_chat,4.0,reasoning_average,livebench_240701,[],reasoning
104,claude_3_5_sonnet_20240620,63.21,coding_average,livebench_240701,[],code
105,gpt_4o_2024_05_13,46.37,coding_average,livebench_240701,[],code
106,gpt_4_turbo_2024_04_09,47.05,coding_average,livebench_240701,[],code
107,gpt_4_1106_preview,44.37,coding_average,livebench_240701,[],code
108,claude_3_opus_20240229,40.05,coding_average,livebench_240701,[],code
109,gpt_4_0125_preview,44.05,coding_average,livebench_240701,[],code
110,deepseek_coder_v2,41.05,coding_average,livebench_240701,[],code
111,gemini_1.5_pro_api_0514,32.79,coding_average,livebench_240701,[],code
112,gemma_2_27b_it,36.74,coding_average,livebench_240701,[],code
113,gemini_1.5_flash_api_0514,39.05,coding_average,livebench_240701,[],code
114,qwen2_72b_instruct,31.79,coding_average,livebench_240701,[],code
115,acm_rewrite_qwen2_72b_chat,39.05,coding_average,livebench_240701,[],code
116,mistral_large_2402,26.84,coding_average,livebench_240701,[],code
117,deepseek_chat_v2,33.47,coding_average,livebench_240701,[],code
118,claude_3_sonnet_20240229,25.21,coding_average,livebench_240701,[],code
119,meta_llama_3_70b_instruct,20.95,coding_average,livebench_240701,[],code
120,claude_3_haiku_20240307,24.53,coding_average,livebench_240701,[],code
121,mixtral_8x22b_instruct_v0.1,33.11,coding_average,livebench_240701,[],code
122,gpt_3.5_turbo_0125,29.16,coding_average,livebench_240701,[],code
123,gpt_3.5_turbo_1106,26.84,coding_average,livebench_240701,[],code
124,command_r_plus,20.26,coding_average,livebench_240701,[],code
125,mistral_small_2402,24.21,coding_average,livebench_240701,[],code
126,gemma_2_9b_it,22.21,coding_average,livebench_240701,[],code
127,phi_3_medium_4k_instruct,20.58,coding_average,livebench_240701,[],code
128,phi_3_medium_128k_instruct,21.58,coding_average,livebench_240701,[],code
129,deepseek_coder_v2_lite_instruct,26.84,coding_average,livebench_240701,[],code
130,qwen1.5_110b_chat,22.21,coding_average,livebench_240701,[],code
131,qwen1.5_72b_chat,22.89,coding_average,livebench_240701,[],code
132,command_r,14.95,coding_average,livebench_240701,[],code
133,phi_3_small_128k_instruct,25.84,coding_average,livebench_240701,[],code
134,meta_llama_3_8b_instruct,18.26,coding_average,livebench_240701,[],code
135,qwen2_7b_instruct,29.21,coding_average,livebench_240701,[],code
136,phi_3_small_8k_instruct,19.58,coding_average,livebench_240701,[],code
137,openhermes_2.5_mistral_7b,11.63,coding_average,livebench_240701,[],code
138,mixtral_8x7b_instruct_v0.1,11.32,coding_average,livebench_240701,[],code
139,mistral_7b_instruct_v0.2,11.63,coding_average,livebench_240701,[],code
140,phi_3_mini_4k_instruct,14.95,coding_average,livebench_240701,[],code
141,zephyr_7b_alpha,11.32,coding_average,livebench_240701,[],code
142,phi_3_mini_128k_instruct,11.63,coding_average,livebench_240701,[],code
143,zephyr_7b_beta,8.32,coding_average,livebench_240701,[],code
144,deepseek_v2_lite_chat,8.63,coding_average,livebench_240701,[],code
145,qwen1.5_7b_chat,6.63,coding_average,livebench_240701,[],code
146,starling_lm_7b_beta,18.26,coding_average,livebench_240701,[],code
147,vicuna_7b_v1.5_16k,1.32,coding_average,livebench_240701,[],code
148,vicuna_7b_v1.5,1.0,coding_average,livebench_240701,[],code
149,qwen1.5_4b_chat,4.0,coding_average,livebench_240701,[],code
150,llama_2_7b_chat,0.0,coding_average,livebench_240701,[],code
151,qwen2_1.5b_instruct,5.63,coding_average,livebench_240701,[],code
152,yi_6b_chat,1.32,coding_average,livebench_240701,[],code
153,qwen2_0.5b_instruct,2.0,coding_average,livebench_240701,[],code
154,qwen1.5_1.8b_chat,0.0,coding_average,livebench_240701,[],code
155,qwen1.5_0.5b_chat,0.0,coding_average,livebench_240701,[],code
156,claude_3_5_sonnet_20240620,53.75,mathematics_average,livebench_240701,[],math
157,gpt_4o_2024_05_13,49.88,mathematics_average,livebench_240701,[],math
158,gpt_4_turbo_2024_04_09,48.99,mathematics_average,livebench_240701,[],math
159,gpt_4_1106_preview,47.55,mathematics_average,livebench_240701,[],math
160,claude_3_opus_20240229,46.54,mathematics_average,livebench_240701,[],math
161,gpt_4_0125_preview,42.75,mathematics_average,livebench_240701,[],math
162,deepseek_coder_v2,52.19,mathematics_average,livebench_240701,[],math
163,gemini_1.5_pro_api_0514,42.07,mathematics_average,livebench_240701,[],math
164,gemma_2_27b_it,36.23,mathematics_average,livebench_240701,[],math
165,gemini_1.5_flash_api_0514,38.54,mathematics_average,livebench_240701,[],math
166,qwen2_72b_instruct,43.44,mathematics_average,livebench_240701,[],math
167,acm_rewrite_qwen2_72b_chat,40.32,mathematics_average,livebench_240701,[],math
168,mistral_large_2402,32.2,mathematics_average,livebench_240701,[],math
169,deepseek_chat_v2,33.23,mathematics_average,livebench_240701,[],math
170,claude_3_sonnet_20240229,29.65,mathematics_average,livebench_240701,[],math
171,meta_llama_3_70b_instruct,32.31,mathematics_average,livebench_240701,[],math
172,claude_3_haiku_20240307,25.72,mathematics_average,livebench_240701,[],math
173,mixtral_8x22b_instruct_v0.1,26.94,mathematics_average,livebench_240701,[],math
174,gpt_3.5_turbo_0125,25.54,mathematics_average,livebench_240701,[],math
175,gpt_3.5_turbo_1106,28.13,mathematics_average,livebench_240701,[],math
176,command_r_plus,24.85,mathematics_average,livebench_240701,[],math
177,mistral_small_2402,26.76,mathematics_average,livebench_240701,[],math
178,gemma_2_9b_it,23.98,mathematics_average,livebench_240701,[],math
179,phi_3_medium_4k_instruct,27.54,mathematics_average,livebench_240701,[],math
180,phi_3_medium_128k_instruct,24.25,mathematics_average,livebench_240701,[],math
181,deepseek_coder_v2_lite_instruct,34.09,mathematics_average,livebench_240701,[],math
182,qwen1.5_110b_chat,25.58,mathematics_average,livebench_240701,[],math
183,qwen1.5_72b_chat,26.82,mathematics_average,livebench_240701,[],math
184,command_r,16.92,mathematics_average,livebench_240701,[],math
185,phi_3_small_128k_instruct,24.84,mathematics_average,livebench_240701,[],math
186,meta_llama_3_8b_instruct,17.58,mathematics_average,livebench_240701,[],math
187,qwen2_7b_instruct,25.83,mathematics_average,livebench_240701,[],math
188,phi_3_small_8k_instruct,24.15,mathematics_average,livebench_240701,[],math
189,openhermes_2.5_mistral_7b,20.1,mathematics_average,livebench_240701,[],math
190,mixtral_8x7b_instruct_v0.1,18.97,mathematics_average,livebench_240701,[],math
191,mistral_7b_instruct_v0.2,16.04,mathematics_average,livebench_240701,[],math
192,phi_3_mini_4k_instruct,19.88,mathematics_average,livebench_240701,[],math
193,zephyr_7b_alpha,9.61,mathematics_average,livebench_240701,[],math
194,phi_3_mini_128k_instruct,21.48,mathematics_average,livebench_240701,[],math
195,zephyr_7b_beta,11.23,mathematics_average,livebench_240701,[],math
196,deepseek_v2_lite_chat,11.99,mathematics_average,livebench_240701,[],math
197,qwen1.5_7b_chat,12.86,mathematics_average,livebench_240701,[],math
198,starling_lm_7b_beta,13.82,mathematics_average,livebench_240701,[],math
199,vicuna_7b_v1.5_16k,6.61,mathematics_average,livebench_240701,[],math
200,vicuna_7b_v1.5,4.33,mathematics_average,livebench_240701,[],math
201,qwen1.5_4b_chat,7.08,mathematics_average,livebench_240701,[],math
202,llama_2_7b_chat,4.78,mathematics_average,livebench_240701,[],math
203,qwen2_1.5b_instruct,7.16,mathematics_average,livebench_240701,[],math
204,yi_6b_chat,7.14,mathematics_average,livebench_240701,[],math
205,qwen2_0.5b_instruct,4.22,mathematics_average,livebench_240701,[],math
206,qwen1.5_1.8b_chat,2.14,mathematics_average,livebench_240701,[],math
207,qwen1.5_0.5b_chat,3.39,mathematics_average,livebench_240701,[],math
208,claude_3_5_sonnet_20240620,56.74,data_analysis_average,livebench_240701,[],knowledge
209,gpt_4o_2024_05_13,52.41,data_analysis_average,livebench_240701,[],knowledge
210,gpt_4_turbo_2024_04_09,51.32,data_analysis_average,livebench_240701,[],knowledge
211,gpt_4_1106_preview,51.33,data_analysis_average,livebench_240701,[],knowledge
212,claude_3_opus_20240229,54.32,data_analysis_average,livebench_240701,[],knowledge
213,gpt_4_0125_preview,54.06,data_analysis_average,livebench_240701,[],knowledge
214,deepseek_coder_v2,38.25,data_analysis_average,livebench_240701,[],knowledge
215,gemini_1.5_pro_api_0514,52.81,data_analysis_average,livebench_240701,[],knowledge
216,gemma_2_27b_it,43.58,data_analysis_average,livebench_240701,[],knowledge
217,gemini_1.5_flash_api_0514,44.03,data_analysis_average,livebench_240701,[],knowledge
218,qwen2_72b_instruct,26.24,data_analysis_average,livebench_240701,[],knowledge
219,acm_rewrite_qwen2_72b_chat,26.19,data_analysis_average,livebench_240701,[],knowledge
220,mistral_large_2402,42.55,data_analysis_average,livebench_240701,[],knowledge
221,deepseek_chat_v2,38.03,data_analysis_average,livebench_240701,[],knowledge
222,claude_3_sonnet_20240229,44.56,data_analysis_average,livebench_240701,[],knowledge
223,meta_llama_3_70b_instruct,42.41,data_analysis_average,livebench_240701,[],knowledge
224,claude_3_haiku_20240307,41.54,data_analysis_average,livebench_240701,[],knowledge
225,mixtral_8x22b_instruct_v0.1,30.33,data_analysis_average,livebench_240701,[],knowledge
226,gpt_3.5_turbo_0125,41.21,data_analysis_average,livebench_240701,[],knowledge
227,gpt_3.5_turbo_1106,41.7,data_analysis_average,livebench_240701,[],knowledge
228,command_r_plus,24.6,data_analysis_average,livebench_240701,[],knowledge
229,mistral_small_2402,31.88,data_analysis_average,livebench_240701,[],knowledge
230,gemma_2_9b_it,35.06,data_analysis_average,livebench_240701,[],knowledge
231,phi_3_medium_4k_instruct,31.63,data_analysis_average,livebench_240701,[],knowledge
232,phi_3_medium_128k_instruct,32.12,data_analysis_average,livebench_240701,[],knowledge
233,deepseek_coder_v2_lite_instruct,33.0,data_analysis_average,livebench_240701,[],knowledge
234,qwen1.5_110b_chat,31.45,data_analysis_average,livebench_240701,[],knowledge
235,qwen1.5_72b_chat,32.98,data_analysis_average,livebench_240701,[],knowledge
236,command_r,31.69,data_analysis_average,livebench_240701,[],knowledge
237,phi_3_small_128k_instruct,27.33,data_analysis_average,livebench_240701,[],knowledge
238,meta_llama_3_8b_instruct,23.33,data_analysis_average,livebench_240701,[],knowledge
239,qwen2_7b_instruct,28.75,data_analysis_average,livebench_240701,[],knowledge
240,phi_3_small_8k_instruct,27.5,data_analysis_average,livebench_240701,[],knowledge
241,openhermes_2.5_mistral_7b,26.92,data_analysis_average,livebench_240701,[],knowledge
242,mixtral_8x7b_instruct_v0.1,28.13,data_analysis_average,livebench_240701,[],knowledge
243,mistral_7b_instruct_v0.2,14.62,data_analysis_average,livebench_240701,[],knowledge
244,phi_3_mini_4k_instruct,14.67,data_analysis_average,livebench_240701,[],knowledge
245,zephyr_7b_alpha,17.4,data_analysis_average,livebench_240701,[],knowledge
246,phi_3_mini_128k_instruct,8.69,data_analysis_average,livebench_240701,[],knowledge
247,zephyr_7b_beta,15.75,data_analysis_average,livebench_240701,[],knowledge
248,deepseek_v2_lite_chat,18.19,data_analysis_average,livebench_240701,[],knowledge
249,qwen1.5_7b_chat,16.23,data_analysis_average,livebench_240701,[],knowledge
250,starling_lm_7b_beta,2.0,data_analysis_average,livebench_240701,[],knowledge
251,vicuna_7b_v1.5_16k,9.27,data_analysis_average,livebench_240701,[],knowledge
252,vicuna_7b_v1.5,2.67,data_analysis_average,livebench_240701,[],knowledge
253,qwen1.5_4b_chat,9.13,data_analysis_average,livebench_240701,[],knowledge
254,llama_2_7b_chat,0.0,data_analysis_average,livebench_240701,[],knowledge
255,qwen2_1.5b_instruct,10.01,data_analysis_average,livebench_240701,[],knowledge
256,yi_6b_chat,4.38,data_analysis_average,livebench_240701,[],knowledge
257,qwen2_0.5b_instruct,2.0,data_analysis_average,livebench_240701,[],knowledge
258,qwen1.5_1.8b_chat,3.33,data_analysis_average,livebench_240701,[],knowledge
259,qwen1.5_0.5b_chat,0.0,data_analysis_average,livebench_240701,[],knowledge
260,claude_3_5_sonnet_20240620,56.94,language_average,livebench_240701,[],other
261,gpt_4o_2024_05_13,53.94,language_average,livebench_240701,[],other
262,gpt_4_turbo_2024_04_09,45.26,language_average,livebench_240701,[],other
263,gpt_4_1106_preview,48.37,language_average,livebench_240701,[],other
264,claude_3_opus_20240229,51.72,language_average,livebench_240701,[],other
265,gpt_4_0125_preview,43.55,language_average,livebench_240701,[],other
266,deepseek_coder_v2,33.04,language_average,livebench_240701,[],other
267,gemini_1.5_pro_api_0514,38.25,language_average,livebench_240701,[],other
268,gemma_2_27b_it,32.4,language_average,livebench_240701,[],other
269,gemini_1.5_flash_api_0514,30.69,language_average,livebench_240701,[],other
270,qwen2_72b_instruct,29.21,language_average,livebench_240701,[],other
271,acm_rewrite_qwen2_72b_chat,30.03,language_average,livebench_240701,[],other
272,mistral_large_2402,28.74,language_average,livebench_240701,[],other
273,deepseek_chat_v2,32.29,language_average,livebench_240701,[],other
274,claude_3_sonnet_20240229,38.08,language_average,livebench_240701,[],other
275,meta_llama_3_70b_instruct,34.11,language_average,livebench_240701,[],other
276,claude_3_haiku_20240307,30.07,language_average,livebench_240701,[],other
277,mixtral_8x22b_instruct_v0.1,26.48,language_average,livebench_240701,[],other
278,gpt_3.5_turbo_0125,24.22,language_average,livebench_240701,[],other
279,gpt_3.5_turbo_1106,28.63,language_average,livebench_240701,[],other
280,command_r_plus,23.92,language_average,livebench_240701,[],other
281,mistral_small_2402,22.06,language_average,livebench_240701,[],other
282,gemma_2_9b_it,27.64,language_average,livebench_240701,[],other
283,phi_3_medium_4k_instruct,13.91,language_average,livebench_240701,[],other
284,phi_3_medium_128k_instruct,12.76,language_average,livebench_240701,[],other
285,deepseek_coder_v2_lite_instruct,10.64,language_average,livebench_240701,[],other
286,qwen1.5_110b_chat,13.22,language_average,livebench_240701,[],other
287,qwen1.5_72b_chat,11.37,language_average,livebench_240701,[],other
288,command_r,14.64,language_average,livebench_240701,[],other
289,phi_3_small_128k_instruct,12.28,language_average,livebench_240701,[],other
290,meta_llama_3_8b_instruct,18.72,language_average,livebench_240701,[],other
291,qwen2_7b_instruct,10.21,language_average,livebench_240701,[],other
292,phi_3_small_8k_instruct,14.96,language_average,livebench_240701,[],other
293,openhermes_2.5_mistral_7b,11.37,language_average,livebench_240701,[],other
294,mixtral_8x7b_instruct_v0.1,13.76,language_average,livebench_240701,[],other
295,mistral_7b_instruct_v0.2,9.05,language_average,livebench_240701,[],other
296,phi_3_mini_4k_instruct,7.1,language_average,livebench_240701,[],other
297,zephyr_7b_alpha,7.2,language_average,livebench_240701,[],other
298,phi_3_mini_128k_instruct,6.8,language_average,livebench_240701,[],other
299,zephyr_7b_beta,4.28,language_average,livebench_240701,[],other
300,deepseek_v2_lite_chat,9.2,language_average,livebench_240701,[],other
301,qwen1.5_7b_chat,6.18,language_average,livebench_240701,[],other
302,starling_lm_7b_beta,7.26,language_average,livebench_240701,[],other
303,vicuna_7b_v1.5_16k,7.92,language_average,livebench_240701,[],other
304,vicuna_7b_v1.5,8.66,language_average,livebench_240701,[],other
305,qwen1.5_4b_chat,5.8,language_average,livebench_240701,[],other
306,llama_2_7b_chat,6.86,language_average,livebench_240701,[],other
307,qwen2_1.5b_instruct,3.05,language_average,livebench_240701,[],other
308,yi_6b_chat,4.69,language_average,livebench_240701,[],other
309,qwen2_0.5b_instruct,2.8,language_average,livebench_240701,[],other
310,qwen1.5_1.8b_chat,3.16,language_average,livebench_240701,[],other
311,qwen1.5_0.5b_chat,2.88,language_average,livebench_240701,[],other
312,claude_3_5_sonnet_20240620,72.3,if_average,livebench_240701,[],other
313,gpt_4o_2024_05_13,72.17,if_average,livebench_240701,[],other
314,gpt_4_turbo_2024_04_09,71.39,if_average,livebench_240701,[],other
315,gpt_4_1106_preview,69.39,if_average,livebench_240701,[],other
316,claude_3_opus_20240229,70.87,if_average,livebench_240701,[],other
317,gpt_4_0125_preview,63.92,if_average,livebench_240701,[],other
318,deepseek_coder_v2,67.18,if_average,livebench_240701,[],other
319,gemini_1.5_pro_api_0514,67.2,if_average,livebench_240701,[],other
320,gemma_2_27b_it,67.37,if_average,livebench_240701,[],other
321,gemini_1.5_flash_api_0514,63.01,if_average,livebench_240701,[],other
322,qwen2_72b_instruct,68.27,if_average,livebench_240701,[],other
323,acm_rewrite_qwen2_72b_chat,65.0,if_average,livebench_240701,[],other
324,mistral_large_2402,68.19,if_average,livebench_240701,[],other
325,deepseek_chat_v2,64.34,if_average,livebench_240701,[],other
326,claude_3_sonnet_20240229,65.0,if_average,livebench_240701,[],other
327,meta_llama_3_70b_instruct,63.5,if_average,livebench_240701,[],other
328,claude_3_haiku_20240307,64.03,if_average,livebench_240701,[],other
329,mixtral_8x22b_instruct_v0.1,63.17,if_average,livebench_240701,[],other
330,gpt_3.5_turbo_0125,60.47,if_average,livebench_240701,[],other
331,gpt_3.5_turbo_1106,51.53,if_average,livebench_240701,[],other
332,command_r_plus,71.51,if_average,livebench_240701,[],other
333,mistral_small_2402,63.91,if_average,livebench_240701,[],other
334,gemma_2_9b_it,61.55,if_average,livebench_240701,[],other
335,phi_3_medium_4k_instruct,53.3,if_average,livebench_240701,[],other
336,phi_3_medium_128k_instruct,56.15,if_average,livebench_240701,[],other
337,deepseek_coder_v2_lite_instruct,48.34,if_average,livebench_240701,[],other
338,qwen1.5_110b_chat,55.26,if_average,livebench_240701,[],other
339,qwen1.5_72b_chat,58.25,if_average,livebench_240701,[],other
340,command_r,57.16,if_average,livebench_240701,[],other
341,phi_3_small_128k_instruct,36.88,if_average,livebench_240701,[],other
342,meta_llama_3_8b_instruct,57.14,if_average,livebench_240701,[],other
343,qwen2_7b_instruct,44.74,if_average,livebench_240701,[],other
344,phi_3_small_8k_instruct,48.24,if_average,livebench_240701,[],other
345,openhermes_2.5_mistral_7b,52.78,if_average,livebench_240701,[],other
346,mixtral_8x7b_instruct_v0.1,44.81,if_average,livebench_240701,[],other
347,mistral_7b_instruct_v0.2,51.65,if_average,livebench_240701,[],other
348,phi_3_mini_4k_instruct,40.05,if_average,livebench_240701,[],other
349,zephyr_7b_alpha,52.79,if_average,livebench_240701,[],other
350,phi_3_mini_128k_instruct,49.65,if_average,livebench_240701,[],other
351,zephyr_7b_beta,48.32,if_average,livebench_240701,[],other
352,deepseek_v2_lite_chat,41.83,if_average,livebench_240701,[],other
353,qwen1.5_7b_chat,44.12,if_average,livebench_240701,[],other
354,starling_lm_7b_beta,38.32,if_average,livebench_240701,[],other
355,vicuna_7b_v1.5_16k,42.12,if_average,livebench_240701,[],other
356,vicuna_7b_v1.5,41.75,if_average,livebench_240701,[],other
357,qwen1.5_4b_chat,27.75,if_average,livebench_240701,[],other
358,llama_2_7b_chat,44.88,if_average,livebench_240701,[],other
359,qwen2_1.5b_instruct,25.9,if_average,livebench_240701,[],other
360,yi_6b_chat,27.22,if_average,livebench_240701,[],other
361,qwen2_0.5b_instruct,26.63,if_average,livebench_240701,[],other
362,qwen1.5_1.8b_chat,22.9,if_average,livebench_240701,[],other
363,qwen1.5_0.5b_chat,21.3,if_average,livebench_240701,[],other