diff --git "a/data/benchmarks/helm_mmlu.json" "b/data/benchmarks/helm_mmlu.json" new file mode 100644--- /dev/null +++ "b/data/benchmarks/helm_mmlu.json" @@ -0,0 +1,3401 @@ +{ + "models": [ + { + "model_id": "01-ai/yi-34b", + "name": "Yi 34B", + "developer": "01-ai", + "scores": { + "MMLU All Subjects": 0.762, + "Abstract Algebra": 0.4, + "Anatomy": 0.748, + "College Physics": 0.5, + "Computer Security": 0.83, + "Econometrics": 0.588, + "Global Facts": 0.53, + "Jurisprudence": 0.898, + "Philosophy": 0.82, + "Professional Psychology": 0.835, + "Us Foreign Policy": 0.91, + "Astronomy": 0.901, + "Business Ethics": 0.75, + "Clinical Knowledge": 0.8, + "Conceptual Physics": 0.77, + "Electrical Engineering": 0.779, + "Elementary Mathematics": 0.656, + "Formal Logic": 0.548, + "High School World History": 0.907, + "Human Sexuality": 0.87, + "International Law": 0.909, + "Logical Fallacies": 0.883, + "Machine Learning": 0.58, + "Management": 0.893, + "Marketing": 0.936, + "Medical Genetics": 0.87, + "Miscellaneous": 0.902, + "Moral Scenarios": 0.606, + "Nutrition": 0.869, + "Prehistory": 0.877, + "Public Relations": 0.745, + "Security Studies": 0.833, + "Sociology": 0.9, + "Virology": 0.572, + "World Religions": 0.877, + "Mean win rate": 0.315 + } + }, + { + "model_id": "01-ai/yi-6b", + "name": "Yi 6B", + "developer": "01-ai", + "scores": { + "MMLU All Subjects": 0.64, + "Abstract Algebra": 0.3, + "Anatomy": 0.6, + "College Physics": 0.422, + "Computer Security": 0.73, + "Econometrics": 0.351, + "Global Facts": 0.43, + "Jurisprudence": 0.796, + "Philosophy": 0.678, + "Professional Psychology": 0.668, + "Us Foreign Policy": 0.87, + "Astronomy": 0.684, + "Business Ethics": 0.67, + "Clinical Knowledge": 0.66, + "Conceptual Physics": 0.621, + "Electrical Engineering": 0.662, + "Elementary Mathematics": 0.452, + "Formal Logic": 0.452, + "High School World History": 0.785, + "Human Sexuality": 0.763, + "International Law": 0.769, + "Logical Fallacies": 0.779, + "Machine Learning": 0.411, + "Management": 0.806, + "Marketing": 0.893, + "Medical Genetics": 0.77, + "Miscellaneous": 0.796, + "Moral Scenarios": 0.335, + "Nutrition": 0.739, + "Prehistory": 0.713, + "Public Relations": 0.718, + "Security Studies": 0.735, + "Sociology": 0.831, + "Virology": 0.452, + "World Religions": 0.836, + "Mean win rate": 0.651 + } + }, + { + "model_id": "01-ai/yi-large-preview", + "name": "Yi Large Preview", + "developer": "01-ai", + "scores": { + "MMLU All Subjects": 0.793, + "Abstract Algebra": 0.6, + "Anatomy": 0.83, + "College Physics": 0.569, + "Computer Security": 0.86, + "Econometrics": 0.728, + "Global Facts": 0.52, + "Jurisprudence": 0.852, + "Philosophy": 0.842, + "Professional Psychology": 0.853, + "Us Foreign Policy": 0.85, + "Astronomy": 0.914, + "Business Ethics": 0.8, + "Clinical Knowledge": 0.857, + "Conceptual Physics": 0.864, + "Electrical Engineering": 0.779, + "Elementary Mathematics": 0.685, + "Formal Logic": 0.603, + "High School World History": 0.928, + "Human Sexuality": 0.901, + "International Law": 0.917, + "Logical Fallacies": 0.865, + "Machine Learning": 0.616, + "Management": 0.903, + "Marketing": 0.927, + "Medical Genetics": 0.83, + "Miscellaneous": 0.916, + "Moral Scenarios": 0.831, + "Nutrition": 0.846, + "Prehistory": 0.892, + "Public Relations": 0.827, + "Security Studies": 0.82, + "Sociology": 0.881, + "Virology": 0.59, + "World Religions": 0.871, + "Mean win rate": 0.258 + } + }, + { + "model_id": "ai21/jamba-1.5-large", + "name": "Jamba 1.5 Large", + "developer": "ai21", + "scores": { + "MMLU All Subjects": 0.782, + "Abstract Algebra": 0.53, + "Anatomy": 0.793, + "College Physics": 0.51, + "Computer Security": 0.8, + "Econometrics": 0.614, + "Global Facts": 0.54, + "Jurisprudence": 0.87, + "Philosophy": 0.849, + "Professional Psychology": 0.842, + "Us Foreign Policy": 0.92, + "Astronomy": 0.882, + "Business Ethics": 0.77, + "Clinical Knowledge": 0.849, + "Conceptual Physics": 0.779, + "Electrical Engineering": 0.793, + "Elementary Mathematics": 0.656, + "Formal Logic": 0.619, + "High School World History": 0.911, + "Human Sexuality": 0.832, + "International Law": 0.884, + "Logical Fallacies": 0.859, + "Machine Learning": 0.688, + "Management": 0.864, + "Marketing": 0.94, + "Medical Genetics": 0.89, + "Miscellaneous": 0.931, + "Moral Scenarios": 0.686, + "Nutrition": 0.869, + "Prehistory": 0.892, + "Public Relations": 0.755, + "Security Studies": 0.771, + "Sociology": 0.93, + "Virology": 0.554, + "World Religions": 0.865, + "Mean win rate": 0.147 + } + }, + { + "model_id": "ai21/jamba-1.5-mini", + "name": "Jamba 1.5 Mini", + "developer": "ai21", + "scores": { + "MMLU All Subjects": 0.699, + "Abstract Algebra": 0.33, + "Anatomy": 0.711, + "College Physics": 0.48, + "Computer Security": 0.73, + "Econometrics": 0.491, + "Global Facts": 0.43, + "Jurisprudence": 0.88, + "Philosophy": 0.752, + "Professional Psychology": 0.76, + "Us Foreign Policy": 0.9, + "Astronomy": 0.822, + "Business Ethics": 0.76, + "Clinical Knowledge": 0.74, + "Conceptual Physics": 0.677, + "Electrical Engineering": 0.683, + "Elementary Mathematics": 0.553, + "Formal Logic": 0.452, + "High School World History": 0.84, + "Human Sexuality": 0.809, + "International Law": 0.893, + "Logical Fallacies": 0.81, + "Machine Learning": 0.509, + "Management": 0.825, + "Marketing": 0.915, + "Medical Genetics": 0.69, + "Miscellaneous": 0.902, + "Moral Scenarios": 0.269, + "Nutrition": 0.801, + "Prehistory": 0.824, + "Public Relations": 0.727, + "Security Studies": 0.755, + "Sociology": 0.876, + "Virology": 0.578, + "World Religions": 0.842, + "Mean win rate": 0.206 + } + }, + { + "model_id": "ai21/jamba-instruct", + "name": "Jamba Instruct", + "developer": "ai21", + "scores": { + "MMLU All Subjects": 0.659, + "Abstract Algebra": 0.36, + "Anatomy": 0.615, + "College Physics": 0.422, + "Computer Security": 0.76, + "Econometrics": 0.439, + "Global Facts": 0.4, + "Jurisprudence": 0.796, + "Philosophy": 0.749, + "Professional Psychology": 0.716, + "Us Foreign Policy": 0.91, + "Astronomy": 0.73, + "Business Ethics": 0.6, + "Clinical Knowledge": 0.702, + "Conceptual Physics": 0.677, + "Electrical Engineering": 0.621, + "Elementary Mathematics": 0.497, + "Formal Logic": 0.444, + "High School World History": 0.797, + "Human Sexuality": 0.794, + "International Law": 0.835, + "Logical Fallacies": 0.706, + "Machine Learning": 0.536, + "Management": 0.786, + "Marketing": 0.885, + "Medical Genetics": 0.67, + "Miscellaneous": 0.865, + "Moral Scenarios": 0.465, + "Nutrition": 0.745, + "Prehistory": 0.796, + "Public Relations": 0.682, + "Security Studies": 0.743, + "Sociology": 0.891, + "Virology": 0.53, + "World Religions": 0.813, + "Mean win rate": 0.887 + } + }, + { + "model_id": "allenai/olmo-1.7-7b", + "name": "OLMo 1.7 7B", + "developer": "allenai", + "scores": { + "MMLU All Subjects": 0.538, + "Abstract Algebra": 0.33, + "Anatomy": 0.496, + "College Physics": 0.333, + "Computer Security": 0.65, + "Econometrics": 0.404, + "Global Facts": 0.34, + "Jurisprudence": 0.565, + "Philosophy": 0.592, + "Professional Psychology": 0.526, + "Us Foreign Policy": 0.76, + "Astronomy": 0.526, + "Business Ethics": 0.59, + "Clinical Knowledge": 0.57, + "Conceptual Physics": 0.434, + "Electrical Engineering": 0.517, + "Elementary Mathematics": 0.307, + "Formal Logic": 0.325, + "High School World History": 0.713, + "Human Sexuality": 0.595, + "International Law": 0.612, + "Logical Fallacies": 0.607, + "Machine Learning": 0.375, + "Management": 0.689, + "Marketing": 0.769, + "Medical Genetics": 0.56, + "Miscellaneous": 0.734, + "Moral Scenarios": 0.335, + "Nutrition": 0.608, + "Prehistory": 0.593, + "Public Relations": 0.6, + "Security Studies": 0.522, + "Sociology": 0.751, + "Virology": 0.452, + "World Religions": 0.731, + "Mean win rate": 0.196 + } + }, + { + "model_id": "allenai/olmo-7b", + "name": "OLMo 7B", + "developer": "allenai", + "scores": { + "MMLU All Subjects": 0.295, + "Abstract Algebra": 0.26, + "Anatomy": 0.222, + "College Physics": 0.294, + "Computer Security": 0.3, + "Econometrics": 0.325, + "Global Facts": 0.32, + "Jurisprudence": 0.25, + "Philosophy": 0.325, + "Professional Psychology": 0.232, + "Us Foreign Policy": 0.26, + "Astronomy": 0.342, + "Business Ethics": 0.24, + "Clinical Knowledge": 0.26, + "Conceptual Physics": 0.319, + "Electrical Engineering": 0.29, + "Elementary Mathematics": 0.254, + "Formal Logic": 0.278, + "High School World History": 0.253, + "Human Sexuality": 0.267, + "International Law": 0.306, + "Logical Fallacies": 0.264, + "Machine Learning": 0.286, + "Management": 0.272, + "Marketing": 0.269, + "Medical Genetics": 0.28, + "Miscellaneous": 0.292, + "Moral Scenarios": 0.265, + "Nutrition": 0.34, + "Prehistory": 0.318, + "Public Relations": 0.345, + "Security Studies": 0.408, + "Sociology": 0.383, + "Virology": 0.416, + "World Religions": 0.234, + "Mean win rate": 0.68 + } + }, + { + "model_id": "amazon/nova-lite-v1:0", + "name": "Amazon Nova Lite", + "developer": "amazon", + "scores": { + "MMLU All Subjects": 0.77, + "Abstract Algebra": 0.52, + "Anatomy": 0.719, + "College Physics": 0.608, + "Computer Security": 0.79, + "Econometrics": 0.675, + "Global Facts": 0.55, + "Jurisprudence": 0.852, + "Philosophy": 0.817, + "Professional Psychology": 0.812, + "Us Foreign Policy": 0.92, + "Astronomy": 0.862, + "Business Ethics": 0.73, + "Clinical Knowledge": 0.8, + "Conceptual Physics": 0.796, + "Electrical Engineering": 0.779, + "Elementary Mathematics": 0.757, + "Formal Logic": 0.643, + "High School World History": 0.886, + "Human Sexuality": 0.84, + "International Law": 0.843, + "Logical Fallacies": 0.81, + "Machine Learning": 0.509, + "Management": 0.864, + "Marketing": 0.889, + "Medical Genetics": 0.9, + "Miscellaneous": 0.872, + "Moral Scenarios": 0.694, + "Nutrition": 0.788, + "Prehistory": 0.849, + "Public Relations": 0.682, + "Security Studies": 0.788, + "Sociology": 0.896, + "Virology": 0.542, + "World Religions": 0.871, + "Mean win rate": 0.987 + } + }, + { + "model_id": "amazon/nova-micro-v1:0", + "name": "Amazon Nova Micro", + "developer": "amazon", + "scores": { + "MMLU All Subjects": 0.708, + "Abstract Algebra": 0.42, + "Anatomy": 0.726, + "College Physics": 0.5, + "Computer Security": 0.77, + "Econometrics": 0.57, + "Global Facts": 0.44, + "Jurisprudence": 0.815, + "Philosophy": 0.733, + "Professional Psychology": 0.739, + "Us Foreign Policy": 0.9, + "Astronomy": 0.822, + "Business Ethics": 0.71, + "Clinical Knowledge": 0.751, + "Conceptual Physics": 0.706, + "Electrical Engineering": 0.683, + "Elementary Mathematics": 0.55, + "Formal Logic": 0.508, + "High School World History": 0.84, + "Human Sexuality": 0.824, + "International Law": 0.843, + "Logical Fallacies": 0.798, + "Machine Learning": 0.562, + "Management": 0.816, + "Marketing": 0.91, + "Medical Genetics": 0.82, + "Miscellaneous": 0.83, + "Moral Scenarios": 0.464, + "Nutrition": 0.778, + "Prehistory": 0.787, + "Public Relations": 0.673, + "Security Studies": 0.718, + "Sociology": 0.846, + "Virology": 0.524, + "World Religions": 0.825, + "Mean win rate": 1.0 + } + }, + { + "model_id": "amazon/nova-pro-v1:0", + "name": "Amazon Nova Pro", + "developer": "amazon", + "scores": { + "MMLU All Subjects": 0.82, + "Abstract Algebra": 0.69, + "Anatomy": 0.807, + "College Physics": 0.647, + "Computer Security": 0.84, + "Econometrics": 0.702, + "Global Facts": 0.54, + "Jurisprudence": 0.861, + "Philosophy": 0.826, + "Professional Psychology": 0.864, + "Us Foreign Policy": 0.93, + "Astronomy": 0.895, + "Business Ethics": 0.81, + "Clinical Knowledge": 0.875, + "Conceptual Physics": 0.851, + "Electrical Engineering": 0.8, + "Elementary Mathematics": 0.831, + "Formal Logic": 0.714, + "High School World History": 0.928, + "Human Sexuality": 0.885, + "International Law": 0.901, + "Logical Fallacies": 0.871, + "Machine Learning": 0.625, + "Management": 0.922, + "Marketing": 0.923, + "Medical Genetics": 0.87, + "Miscellaneous": 0.912, + "Moral Scenarios": 0.76, + "Nutrition": 0.866, + "Prehistory": 0.926, + "Public Relations": 0.8, + "Security Studies": 0.849, + "Sociology": 0.905, + "Virology": 0.59, + "World Religions": 0.877, + "Mean win rate": 0.975 + } + }, + { + "model_id": "anthropic/claude-2.1", + "name": "Claude 2.1", + "developer": "anthropic", + "scores": { + "MMLU All Subjects": 0.735, + "Abstract Algebra": 0.4, + "Anatomy": 0.726, + "College Physics": 0.5, + "Computer Security": 0.81, + "Econometrics": 0.596, + "Global Facts": 0.55, + "Jurisprudence": 0.87, + "Philosophy": 0.794, + "Professional Psychology": 0.797, + "Us Foreign Policy": 0.92, + "Astronomy": 0.855, + "Business Ethics": 0.73, + "Clinical Knowledge": 0.785, + "Conceptual Physics": 0.766, + "Electrical Engineering": 0.724, + "Elementary Mathematics": 0.521, + "Formal Logic": 0.5, + "High School World History": 0.903, + "Human Sexuality": 0.847, + "International Law": 0.901, + "Logical Fallacies": 0.834, + "Machine Learning": 0.482, + "Management": 0.825, + "Marketing": 0.923, + "Medical Genetics": 0.81, + "Miscellaneous": 0.88, + "Moral Scenarios": 0.52, + "Nutrition": 0.781, + "Prehistory": 0.821, + "Public Relations": 0.773, + "Security Studies": 0.812, + "Sociology": 0.886, + "Virology": 0.554, + "World Religions": 0.854, + "Mean win rate": 0.048 + } + }, + { + "model_id": "anthropic/claude-3-5-haiku-20241022", + "name": "claude-3-5-haiku-20241022", + "developer": "anthropic", + "scores": { + "MMLU All Subjects": 0.743, + "Abstract Algebra": 0.47, + "Anatomy": 0.793, + "College Physics": 0.52, + "Computer Security": 0.84, + "Econometrics": 0.596, + "Global Facts": 0.5, + "Jurisprudence": 0.861, + "Philosophy": 0.823, + "Professional Psychology": 0.825, + "Us Foreign Policy": 0.94, + "Astronomy": 0.829, + "Business Ethics": 0.8, + "Clinical Knowledge": 0.823, + "Conceptual Physics": 0.723, + "Electrical Engineering": 0.717, + "Elementary Mathematics": 0.561, + "Formal Logic": 0.619, + "High School World History": 0.882, + "Human Sexuality": 0.885, + "International Law": 0.884, + "Logical Fallacies": 0.822, + "Machine Learning": 0.518, + "Management": 0.845, + "Marketing": 0.897, + "Medical Genetics": 0.83, + "Miscellaneous": 0.905, + "Moral Scenarios": 0.476, + "Nutrition": 0.846, + "Prehistory": 0.877, + "Public Relations": 0.727, + "Security Studies": 0.792, + "Sociology": 0.905, + "Virology": 0.566, + "World Religions": 0.865, + "Mean win rate": 0.128 + } + }, + { + "model_id": "anthropic/claude-3-5-sonnet-20240620", + "name": "Claude 3.5 Sonnet 20240620", + "developer": "anthropic", + "scores": { + "MMLU All Subjects": 0.865, + "Abstract Algebra": 0.75, + "Anatomy": 0.844, + "College Physics": 0.696, + "Computer Security": 0.89, + "Econometrics": 0.807, + "Global Facts": 0.72, + "Jurisprudence": 0.889, + "Philosophy": 0.891, + "Professional Psychology": 0.922, + "Us Foreign Policy": 0.96, + "Astronomy": 0.961, + "Business Ethics": 0.85, + "Clinical Knowledge": 0.913, + "Conceptual Physics": 0.885, + "Electrical Engineering": 0.828, + "Elementary Mathematics": 0.892, + "Formal Logic": 0.698, + "High School World History": 0.954, + "Human Sexuality": 0.939, + "International Law": 0.959, + "Logical Fallacies": 0.926, + "Machine Learning": 0.786, + "Management": 0.942, + "Marketing": 0.949, + "Medical Genetics": 0.98, + "Miscellaneous": 0.962, + "Moral Scenarios": 0.882, + "Nutrition": 0.912, + "Prehistory": 0.951, + "Public Relations": 0.855, + "Security Studies": 0.878, + "Sociology": 0.96, + "Virology": 0.602, + "World Religions": 0.924, + "Mean win rate": 0.17 + } + }, + { + "model_id": "anthropic/claude-3-5-sonnet-20241022", + "name": "Claude 3.5 Sonnet 20241022", + "developer": "anthropic", + "scores": { + "MMLU All Subjects": 0.873, + "Abstract Algebra": 0.78, + "Anatomy": 0.859, + "College Physics": 0.775, + "Computer Security": 0.87, + "Econometrics": 0.807, + "Global Facts": 0.8, + "Jurisprudence": 0.898, + "Philosophy": 0.891, + "Professional Psychology": 0.922, + "Us Foreign Policy": 0.96, + "Astronomy": 0.974, + "Business Ethics": 0.83, + "Clinical Knowledge": 0.928, + "Conceptual Physics": 0.906, + "Electrical Engineering": 0.848, + "Elementary Mathematics": 0.918, + "Formal Logic": 0.786, + "High School World History": 0.958, + "Human Sexuality": 0.939, + "International Law": 0.959, + "Logical Fallacies": 0.914, + "Machine Learning": 0.839, + "Management": 0.932, + "Marketing": 0.953, + "Medical Genetics": 0.96, + "Miscellaneous": 0.964, + "Moral Scenarios": 0.888, + "Nutrition": 0.922, + "Prehistory": 0.941, + "Public Relations": 0.8, + "Security Studies": 0.882, + "Sociology": 0.955, + "Virology": 0.584, + "World Religions": 0.901, + "Mean win rate": 0.311 + } + }, + { + "model_id": "anthropic/claude-3-haiku-20240307", + "name": "Claude 3 Haiku 20240307", + "developer": "anthropic", + "scores": { + "MMLU All Subjects": 0.738, + "Abstract Algebra": 0.42, + "Anatomy": 0.711, + "College Physics": 0.48, + "Computer Security": 0.79, + "Econometrics": 0.632, + "Global Facts": 0.47, + "Jurisprudence": 0.861, + "Philosophy": 0.814, + "Professional Psychology": 0.802, + "Us Foreign Policy": 0.95, + "Astronomy": 0.901, + "Business Ethics": 0.78, + "Clinical Knowledge": 0.789, + "Conceptual Physics": 0.715, + "Electrical Engineering": 0.69, + "Elementary Mathematics": 0.558, + "Formal Logic": 0.579, + "High School World History": 0.878, + "Human Sexuality": 0.824, + "International Law": 0.901, + "Logical Fallacies": 0.791, + "Machine Learning": 0.589, + "Management": 0.874, + "Marketing": 0.91, + "Medical Genetics": 0.8, + "Miscellaneous": 0.893, + "Moral Scenarios": 0.502, + "Nutrition": 0.83, + "Prehistory": 0.824, + "Public Relations": 0.755, + "Security Studies": 0.808, + "Sociology": 0.9, + "Virology": 0.542, + "World Religions": 0.871, + "Mean win rate": 0.28 + } + }, + { + "model_id": "anthropic/claude-3-opus-20240229", + "name": "Claude 3 Opus 20240229", + "developer": "anthropic", + "scores": { + "MMLU All Subjects": 0.846, + "Abstract Algebra": 0.64, + "Anatomy": 0.8, + "College Physics": 0.716, + "Computer Security": 0.85, + "Econometrics": 0.789, + "Global Facts": 0.66, + "Jurisprudence": 0.88, + "Philosophy": 0.9, + "Professional Psychology": 0.904, + "Us Foreign Policy": 0.96, + "Astronomy": 0.967, + "Business Ethics": 0.86, + "Clinical Knowledge": 0.879, + "Conceptual Physics": 0.881, + "Electrical Engineering": 0.814, + "Elementary Mathematics": 0.862, + "Formal Logic": 0.698, + "High School World History": 0.941, + "Human Sexuality": 0.908, + "International Law": 0.901, + "Logical Fallacies": 0.896, + "Machine Learning": 0.741, + "Management": 0.942, + "Marketing": 0.944, + "Medical Genetics": 0.93, + "Miscellaneous": 0.951, + "Moral Scenarios": 0.826, + "Nutrition": 0.925, + "Prehistory": 0.941, + "Public Relations": 0.827, + "Security Studies": 0.886, + "Sociology": 0.94, + "Virology": 0.578, + "World Religions": 0.901, + "Mean win rate": 0.014 + } + }, + { + "model_id": "anthropic/claude-3-sonnet-20240229", + "name": "Claude 3 Sonnet 20240229", + "developer": "anthropic", + "scores": { + "MMLU All Subjects": 0.759, + "Abstract Algebra": 0.39, + "Anatomy": 0.711, + "College Physics": 0.559, + "Computer Security": 0.79, + "Econometrics": 0.64, + "Global Facts": 0.53, + "Jurisprudence": 0.861, + "Philosophy": 0.852, + "Professional Psychology": 0.814, + "Us Foreign Policy": 0.94, + "Astronomy": 0.855, + "Business Ethics": 0.82, + "Clinical Knowledge": 0.804, + "Conceptual Physics": 0.774, + "Electrical Engineering": 0.703, + "Elementary Mathematics": 0.635, + "Formal Logic": 0.579, + "High School World History": 0.895, + "Human Sexuality": 0.809, + "International Law": 0.909, + "Logical Fallacies": 0.853, + "Machine Learning": 0.643, + "Management": 0.922, + "Marketing": 0.85, + "Medical Genetics": 0.79, + "Miscellaneous": 0.872, + "Moral Scenarios": 0.626, + "Nutrition": 0.82, + "Prehistory": 0.864, + "Public Relations": 0.782, + "Security Studies": 0.865, + "Sociology": 0.905, + "Virology": 0.578, + "World Religions": 0.871, + "Mean win rate": 0.082 + } + }, + { + "model_id": "anthropic/claude-instant-1.2", + "name": "Claude Instant 1.2", + "developer": "anthropic", + "scores": { + "MMLU All Subjects": 0.688, + "Abstract Algebra": 0.37, + "Anatomy": 0.637, + "College Physics": 0.49, + "Computer Security": 0.76, + "Econometrics": 0.614, + "Global Facts": 0.38, + "Jurisprudence": 0.833, + "Philosophy": 0.756, + "Professional Psychology": 0.724, + "Us Foreign Policy": 0.9, + "Astronomy": 0.743, + "Business Ethics": 0.7, + "Clinical Knowledge": 0.709, + "Conceptual Physics": 0.613, + "Electrical Engineering": 0.641, + "Elementary Mathematics": 0.45, + "Formal Logic": 0.444, + "High School World History": 0.878, + "Human Sexuality": 0.794, + "International Law": 0.851, + "Logical Fallacies": 0.81, + "Machine Learning": 0.67, + "Management": 0.835, + "Marketing": 0.885, + "Medical Genetics": 0.71, + "Miscellaneous": 0.828, + "Moral Scenarios": 0.488, + "Nutrition": 0.735, + "Prehistory": 0.762, + "Public Relations": 0.627, + "Security Studies": 0.784, + "Sociology": 0.841, + "Virology": 0.548, + "World Religions": 0.784, + "Mean win rate": 0.186 + } + }, + { + "model_id": "cohere/command-r", + "name": "Command R", + "developer": "cohere", + "scores": { + "MMLU All Subjects": 0.652, + "Abstract Algebra": 0.33, + "Anatomy": 0.615, + "College Physics": 0.382, + "Computer Security": 0.78, + "Econometrics": 0.456, + "Global Facts": 0.42, + "Jurisprudence": 0.796, + "Philosophy": 0.685, + "Professional Psychology": 0.681, + "Us Foreign Policy": 0.82, + "Astronomy": 0.743, + "Business Ethics": 0.63, + "Clinical Knowledge": 0.751, + "Conceptual Physics": 0.528, + "Electrical Engineering": 0.593, + "Elementary Mathematics": 0.437, + "Formal Logic": 0.405, + "High School World History": 0.84, + "Human Sexuality": 0.763, + "International Law": 0.802, + "Logical Fallacies": 0.798, + "Machine Learning": 0.446, + "Management": 0.796, + "Marketing": 0.872, + "Medical Genetics": 0.81, + "Miscellaneous": 0.848, + "Moral Scenarios": 0.451, + "Nutrition": 0.703, + "Prehistory": 0.728, + "Public Relations": 0.7, + "Security Studies": 0.714, + "Sociology": 0.866, + "Virology": 0.542, + "World Religions": 0.813, + "Mean win rate": 0.959 + } + }, + { + "model_id": "cohere/command-r-plus", + "name": "Command R Plus", + "developer": "cohere", + "scores": { + "MMLU All Subjects": 0.694, + "Abstract Algebra": 0.21, + "Anatomy": 0.644, + "College Physics": 0.52, + "Computer Security": 0.74, + "Econometrics": 0.561, + "Global Facts": 0.5, + "Jurisprudence": 0.806, + "Philosophy": 0.695, + "Professional Psychology": 0.735, + "Us Foreign Policy": 0.89, + "Astronomy": 0.783, + "Business Ethics": 0.77, + "Clinical Knowledge": 0.743, + "Conceptual Physics": 0.591, + "Electrical Engineering": 0.71, + "Elementary Mathematics": 0.474, + "Formal Logic": 0.484, + "High School World History": 0.827, + "Human Sexuality": 0.786, + "International Law": 0.835, + "Logical Fallacies": 0.791, + "Machine Learning": 0.518, + "Management": 0.835, + "Marketing": 0.927, + "Medical Genetics": 0.77, + "Miscellaneous": 0.844, + "Moral Scenarios": 0.585, + "Nutrition": 0.742, + "Prehistory": 0.821, + "Public Relations": 0.709, + "Security Studies": 0.751, + "Sociology": 0.876, + "Virology": 0.56, + "World Religions": 0.842, + "Mean win rate": 0.825 + } + }, + { + "model_id": "databricks/dbrx-instruct", + "name": "DBRX Instruct", + "developer": "databricks", + "scores": { + "MMLU All Subjects": 0.741, + "Abstract Algebra": 0.34, + "Anatomy": 0.667, + "College Physics": 0.539, + "Computer Security": 0.83, + "Econometrics": 0.605, + "Global Facts": 0.46, + "Jurisprudence": 0.843, + "Philosophy": 0.804, + "Professional Psychology": 0.801, + "Us Foreign Policy": 0.93, + "Astronomy": 0.836, + "Business Ethics": 0.78, + "Clinical Knowledge": 0.789, + "Conceptual Physics": 0.74, + "Electrical Engineering": 0.71, + "Elementary Mathematics": 0.563, + "Formal Logic": 0.563, + "High School World History": 0.903, + "Human Sexuality": 0.878, + "International Law": 0.884, + "Logical Fallacies": 0.847, + "Machine Learning": 0.625, + "Management": 0.854, + "Marketing": 0.94, + "Medical Genetics": 0.85, + "Miscellaneous": 0.911, + "Moral Scenarios": 0.465, + "Nutrition": 0.814, + "Prehistory": 0.84, + "Public Relations": 0.691, + "Security Studies": 0.804, + "Sociology": 0.896, + "Virology": 0.566, + "World Religions": 0.871, + "Mean win rate": 0.537 + } + }, + { + "model_id": "deepseek-ai/deepseek-llm-67b-chat", + "name": "DeepSeek LLM Chat 67B", + "developer": "deepseek-ai", + "scores": { + "MMLU All Subjects": 0.725, + "Abstract Algebra": 0.44, + "Anatomy": 0.667, + "College Physics": 0.363, + "Computer Security": 0.79, + "Econometrics": 0.553, + "Global Facts": 0.46, + "Jurisprudence": 0.852, + "Philosophy": 0.801, + "Professional Psychology": 0.809, + "Us Foreign Policy": 0.91, + "Astronomy": 0.822, + "Business Ethics": 0.86, + "Clinical Knowledge": 0.785, + "Conceptual Physics": 0.723, + "Electrical Engineering": 0.669, + "Elementary Mathematics": 0.548, + "Formal Logic": 0.548, + "High School World History": 0.911, + "Human Sexuality": 0.84, + "International Law": 0.851, + "Logical Fallacies": 0.847, + "Machine Learning": 0.562, + "Management": 0.903, + "Marketing": 0.923, + "Medical Genetics": 0.73, + "Miscellaneous": 0.904, + "Moral Scenarios": 0.544, + "Nutrition": 0.781, + "Prehistory": 0.858, + "Public Relations": 0.7, + "Security Studies": 0.796, + "Sociology": 0.876, + "Virology": 0.554, + "World Religions": 0.865, + "Mean win rate": 0.387 + } + }, + { + "model_id": "deepseek-ai/deepseek-v3", + "name": "DeepSeek v3", + "developer": "deepseek-ai", + "scores": { + "MMLU All Subjects": 0.872, + "Abstract Algebra": 0.84, + "Anatomy": 0.867, + "College Physics": 0.814, + "Computer Security": 0.86, + "Econometrics": 0.746, + "Global Facts": 0.68, + "Jurisprudence": 0.898, + "Philosophy": 0.9, + "Professional Psychology": 0.887, + "Us Foreign Policy": 0.92, + "Astronomy": 0.921, + "Business Ethics": 0.89, + "Clinical Knowledge": 0.913, + "Conceptual Physics": 0.94, + "Electrical Engineering": 0.869, + "Elementary Mathematics": 0.942, + "Formal Logic": 0.77, + "High School World History": 0.928, + "Human Sexuality": 0.924, + "International Law": 0.95, + "Logical Fallacies": 0.914, + "Machine Learning": 0.786, + "Management": 0.903, + "Marketing": 0.949, + "Medical Genetics": 0.96, + "Miscellaneous": 0.949, + "Moral Scenarios": 0.808, + "Nutrition": 0.918, + "Prehistory": 0.923, + "Public Relations": 0.809, + "Security Studies": 0.837, + "Sociology": 0.955, + "Virology": 0.596, + "World Religions": 0.912, + "Mean win rate": 0.215 + } + }, + { + "model_id": "google/gemini-1.0-pro-001", + "name": "Gemini 1.0 Pro 001", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.7, + "Abstract Algebra": 0.34, + "Anatomy": 0.652, + "College Physics": 0.333, + "Computer Security": 0.84, + "Econometrics": 0.553, + "Global Facts": 0.49, + "Jurisprudence": 0.861, + "Philosophy": 0.762, + "Professional Psychology": 0.752, + "Us Foreign Policy": 0.89, + "Astronomy": 0.796, + "Business Ethics": 0.69, + "Clinical Knowledge": 0.758, + "Conceptual Physics": 0.706, + "Electrical Engineering": 0.69, + "Elementary Mathematics": 0.476, + "Formal Logic": 0.468, + "High School World History": 0.865, + "Human Sexuality": 0.618, + "International Law": 0.876, + "Logical Fallacies": 0.804, + "Machine Learning": 0.527, + "Management": 0.845, + "Marketing": 0.91, + "Medical Genetics": 0.8, + "Miscellaneous": 0.851, + "Moral Scenarios": 0.46, + "Nutrition": 0.788, + "Prehistory": 0.802, + "Public Relations": 0.691, + "Security Studies": 0.804, + "Sociology": 0.9, + "Virology": 0.536, + "World Religions": 0.86, + "Mean win rate": 0.677 + } + }, + { + "model_id": "google/gemini-1.5-flash-001", + "name": "Gemini 1.5 Flash 001", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.779, + "Abstract Algebra": 0.58, + "Anatomy": 0.8, + "College Physics": 0.696, + "Computer Security": 0.79, + "Econometrics": 0.614, + "Global Facts": 0.53, + "Jurisprudence": 0.889, + "Philosophy": 0.791, + "Professional Psychology": 0.828, + "Us Foreign Policy": 0.93, + "Astronomy": 0.882, + "Business Ethics": 0.81, + "Clinical Knowledge": 0.834, + "Conceptual Physics": 0.851, + "Electrical Engineering": 0.8, + "Elementary Mathematics": 0.754, + "Formal Logic": 0.627, + "High School World History": 0.907, + "Human Sexuality": 0.374, + "International Law": 0.901, + "Logical Fallacies": 0.853, + "Machine Learning": 0.571, + "Management": 0.864, + "Marketing": 0.94, + "Medical Genetics": 0.86, + "Miscellaneous": 0.886, + "Moral Scenarios": 0.637, + "Nutrition": 0.82, + "Prehistory": 0.867, + "Public Relations": 0.764, + "Security Studies": 0.808, + "Sociology": 0.915, + "Virology": 0.566, + "World Religions": 0.883, + "Mean win rate": 0.47 + } + }, + { + "model_id": "google/gemini-1.5-flash-002", + "name": "Gemini 1.5 Flash 002", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.739, + "Abstract Algebra": 0.63, + "Anatomy": 0.793, + "College Physics": 0.637, + "Computer Security": 0.72, + "Econometrics": 0.675, + "Global Facts": 0.47, + "Jurisprudence": 0.852, + "Philosophy": 0.797, + "Professional Psychology": 0.806, + "Us Foreign Policy": 0.81, + "Astronomy": 0.895, + "Business Ethics": 0.27, + "Clinical Knowledge": 0.792, + "Conceptual Physics": 0.851, + "Electrical Engineering": 0.772, + "Elementary Mathematics": 0.704, + "Formal Logic": 0.595, + "High School World History": 0.869, + "Human Sexuality": 0.847, + "International Law": 0.752, + "Logical Fallacies": 0.859, + "Machine Learning": 0.616, + "Management": 0.893, + "Marketing": 0.953, + "Medical Genetics": 0.89, + "Miscellaneous": 0.9, + "Moral Scenarios": 0.676, + "Nutrition": 0.588, + "Prehistory": 0.762, + "Public Relations": 0.7, + "Security Studies": 0.547, + "Sociology": 0.851, + "Virology": 0.524, + "World Religions": 0.865, + "Mean win rate": 0.817 + } + }, + { + "model_id": "google/gemini-1.5-flash-preview-0514", + "name": "Gemini 1.5 Flash 0514 preview", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.778, + "Abstract Algebra": 0.56, + "Anatomy": 0.807, + "College Physics": 0.667, + "Computer Security": 0.77, + "Econometrics": 0.64, + "Global Facts": 0.55, + "Jurisprudence": 0.889, + "Philosophy": 0.807, + "Professional Psychology": 0.825, + "Us Foreign Policy": 0.93, + "Astronomy": 0.868, + "Business Ethics": 0.82, + "Clinical Knowledge": 0.838, + "Conceptual Physics": 0.855, + "Electrical Engineering": 0.814, + "Elementary Mathematics": 0.778, + "Formal Logic": 0.611, + "High School World History": 0.907, + "Human Sexuality": 0.374, + "International Law": 0.876, + "Logical Fallacies": 0.853, + "Machine Learning": 0.562, + "Management": 0.854, + "Marketing": 0.936, + "Medical Genetics": 0.86, + "Miscellaneous": 0.884, + "Moral Scenarios": 0.631, + "Nutrition": 0.801, + "Prehistory": 0.867, + "Public Relations": 0.773, + "Security Studies": 0.812, + "Sociology": 0.9, + "Virology": 0.566, + "World Religions": 0.871, + "Mean win rate": 0.713 + } + }, + { + "model_id": "google/gemini-1.5-pro-001", + "name": "Gemini 1.5 Pro 001", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.827, + "Abstract Algebra": 0.75, + "Anatomy": 0.83, + "College Physics": 0.745, + "Computer Security": 0.83, + "Econometrics": 0.728, + "Global Facts": 0.66, + "Jurisprudence": 0.889, + "Philosophy": 0.871, + "Professional Psychology": 0.894, + "Us Foreign Policy": 0.93, + "Astronomy": 0.914, + "Business Ethics": 0.8, + "Clinical Knowledge": 0.853, + "Conceptual Physics": 0.949, + "Electrical Engineering": 0.745, + "Elementary Mathematics": 0.939, + "Formal Logic": 0.706, + "High School World History": 0.924, + "Human Sexuality": 0.374, + "International Law": 0.917, + "Logical Fallacies": 0.896, + "Machine Learning": 0.652, + "Management": 0.922, + "Marketing": 0.932, + "Medical Genetics": 0.91, + "Miscellaneous": 0.958, + "Moral Scenarios": 0.739, + "Nutrition": 0.879, + "Prehistory": 0.87, + "Public Relations": 0.818, + "Security Studies": 0.873, + "Sociology": 0.92, + "Virology": 0.554, + "World Religions": 0.854, + "Mean win rate": 0.349 + } + }, + { + "model_id": "google/gemini-1.5-pro-002", + "name": "Gemini 1.5 Pro 002", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.869, + "Abstract Algebra": 0.82, + "Anatomy": 0.83, + "College Physics": 0.863, + "Computer Security": 0.85, + "Econometrics": 0.693, + "Global Facts": 0.77, + "Jurisprudence": 0.898, + "Philosophy": 0.887, + "Professional Psychology": 0.912, + "Us Foreign Policy": 0.94, + "Astronomy": 0.934, + "Business Ethics": 0.84, + "Clinical Knowledge": 0.906, + "Conceptual Physics": 0.945, + "Electrical Engineering": 0.855, + "Elementary Mathematics": 0.942, + "Formal Logic": 0.754, + "High School World History": 0.937, + "Human Sexuality": 0.878, + "International Law": 0.917, + "Logical Fallacies": 0.902, + "Machine Learning": 0.83, + "Management": 0.903, + "Marketing": 0.962, + "Medical Genetics": 0.92, + "Miscellaneous": 0.959, + "Moral Scenarios": 0.792, + "Nutrition": 0.886, + "Prehistory": 0.926, + "Public Relations": 0.809, + "Security Studies": 0.857, + "Sociology": 0.95, + "Virology": 0.566, + "World Religions": 0.889, + "Mean win rate": 0.334 + } + }, + { + "model_id": "google/gemini-1.5-pro-preview-0409", + "name": "Gemini 1.5 Pro 0409 preview", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.81, + "Abstract Algebra": 0.6, + "Anatomy": 0.77, + "College Physics": 0.804, + "Computer Security": 0.81, + "Econometrics": 0.737, + "Global Facts": 0.66, + "Jurisprudence": 0.87, + "Philosophy": 0.846, + "Professional Psychology": 0.866, + "Us Foreign Policy": 0.94, + "Astronomy": 0.914, + "Business Ethics": 0.8, + "Clinical Knowledge": 0.868, + "Conceptual Physics": 0.915, + "Electrical Engineering": 0.772, + "Elementary Mathematics": 0.884, + "Formal Logic": 0.643, + "High School World History": 0.924, + "Human Sexuality": 0.397, + "International Law": 0.917, + "Logical Fallacies": 0.859, + "Machine Learning": 0.67, + "Management": 0.874, + "Marketing": 0.953, + "Medical Genetics": 0.91, + "Miscellaneous": 0.928, + "Moral Scenarios": 0.696, + "Nutrition": 0.846, + "Prehistory": 0.886, + "Public Relations": 0.755, + "Security Studies": 0.849, + "Sociology": 0.925, + "Virology": 0.584, + "World Religions": 0.877, + "Mean win rate": 0.118 + } + }, + { + "model_id": "google/gemini-2.0-flash-exp", + "name": "Gemini 2.0 Flash Experimental", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.797, + "Abstract Algebra": 0.72, + "Anatomy": 0.807, + "College Physics": 0.696, + "Computer Security": 0.83, + "Econometrics": 0.693, + "Global Facts": 0.66, + "Jurisprudence": 0.898, + "Philosophy": 0.887, + "Professional Psychology": 0.876, + "Us Foreign Policy": 0.78, + "Astronomy": 0.928, + "Business Ethics": 0.73, + "Clinical Knowledge": 0.879, + "Conceptual Physics": 0.813, + "Electrical Engineering": 0.834, + "Elementary Mathematics": 0.857, + "Formal Logic": 0.571, + "High School World History": 0.743, + "Human Sexuality": 0.901, + "International Law": 0.645, + "Logical Fallacies": 0.914, + "Machine Learning": 0.759, + "Management": 0.718, + "Marketing": 0.944, + "Medical Genetics": 0.89, + "Miscellaneous": 0.939, + "Moral Scenarios": 0.815, + "Nutrition": 0.856, + "Prehistory": 0.898, + "Public Relations": 0.791, + "Security Studies": 0.69, + "Sociology": 0.786, + "Virology": 0.554, + "World Religions": 0.731, + "Mean win rate": 0.567 + } + }, + { + "model_id": "google/gemma-2-27b", + "name": "Gemma 2 27B", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.757, + "Abstract Algebra": 0.4, + "Anatomy": 0.77, + "College Physics": 0.5, + "Computer Security": 0.84, + "Econometrics": 0.667, + "Global Facts": 0.43, + "Jurisprudence": 0.861, + "Philosophy": 0.849, + "Professional Psychology": 0.84, + "Us Foreign Policy": 0.95, + "Astronomy": 0.829, + "Business Ethics": 0.78, + "Clinical Knowledge": 0.808, + "Conceptual Physics": 0.834, + "Electrical Engineering": 0.738, + "Elementary Mathematics": 0.558, + "Formal Logic": 0.516, + "High School World History": 0.89, + "Human Sexuality": 0.84, + "International Law": 0.843, + "Logical Fallacies": 0.865, + "Machine Learning": 0.625, + "Management": 0.864, + "Marketing": 0.94, + "Medical Genetics": 0.87, + "Miscellaneous": 0.885, + "Moral Scenarios": 0.394, + "Nutrition": 0.824, + "Prehistory": 0.877, + "Public Relations": 0.745, + "Security Studies": 0.808, + "Sociology": 0.9, + "Virology": 0.56, + "World Religions": 0.924, + "Mean win rate": 0.05 + } + }, + { + "model_id": "google/gemma-2-9b", + "name": "Gemma 2 9B", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.721, + "Abstract Algebra": 0.4, + "Anatomy": 0.704, + "College Physics": 0.5, + "Computer Security": 0.81, + "Econometrics": 0.579, + "Global Facts": 0.53, + "Jurisprudence": 0.833, + "Philosophy": 0.772, + "Professional Psychology": 0.788, + "Us Foreign Policy": 0.9, + "Astronomy": 0.789, + "Business Ethics": 0.77, + "Clinical Knowledge": 0.777, + "Conceptual Physics": 0.732, + "Electrical Engineering": 0.724, + "Elementary Mathematics": 0.577, + "Formal Logic": 0.492, + "High School World History": 0.865, + "Human Sexuality": 0.809, + "International Law": 0.835, + "Logical Fallacies": 0.816, + "Machine Learning": 0.509, + "Management": 0.874, + "Marketing": 0.919, + "Medical Genetics": 0.84, + "Miscellaneous": 0.844, + "Moral Scenarios": 0.295, + "Nutrition": 0.775, + "Prehistory": 0.812, + "Public Relations": 0.736, + "Security Studies": 0.78, + "Sociology": 0.9, + "Virology": 0.53, + "World Religions": 0.86, + "Mean win rate": 0.265 + } + }, + { + "model_id": "google/gemma-7b", + "name": "Gemma 7B", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.661, + "Abstract Algebra": 0.28, + "Anatomy": 0.563, + "College Physics": 0.412, + "Computer Security": 0.75, + "Econometrics": 0.474, + "Global Facts": 0.42, + "Jurisprudence": 0.769, + "Philosophy": 0.727, + "Professional Psychology": 0.712, + "Us Foreign Policy": 0.87, + "Astronomy": 0.717, + "Business Ethics": 0.65, + "Clinical Knowledge": 0.698, + "Conceptual Physics": 0.621, + "Electrical Engineering": 0.628, + "Elementary Mathematics": 0.516, + "Formal Logic": 0.508, + "High School World History": 0.857, + "Human Sexuality": 0.733, + "International Law": 0.835, + "Logical Fallacies": 0.742, + "Machine Learning": 0.554, + "Management": 0.864, + "Marketing": 0.885, + "Medical Genetics": 0.7, + "Miscellaneous": 0.838, + "Moral Scenarios": 0.377, + "Nutrition": 0.778, + "Prehistory": 0.756, + "Public Relations": 0.682, + "Security Studies": 0.735, + "Sociology": 0.841, + "Virology": 0.548, + "World Religions": 0.842, + "Mean win rate": 0.824 + } + }, + { + "model_id": "google/text-bison@001", + "name": "PaLM-2 Bison", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.692, + "Abstract Algebra": 0.39, + "Anatomy": 0.644, + "College Physics": 0.51, + "Computer Security": 0.74, + "Econometrics": 0.518, + "Global Facts": 0.38, + "Jurisprudence": 0.769, + "Philosophy": 0.736, + "Professional Psychology": 0.761, + "Us Foreign Policy": 0.87, + "Astronomy": 0.803, + "Business Ethics": 0.76, + "Clinical Knowledge": 0.725, + "Conceptual Physics": 0.694, + "Electrical Engineering": 0.69, + "Elementary Mathematics": 0.487, + "Formal Logic": 0.5, + "High School World History": 0.869, + "Human Sexuality": 0.84, + "International Law": 0.835, + "Logical Fallacies": 0.853, + "Machine Learning": 0.562, + "Management": 0.893, + "Marketing": 0.893, + "Medical Genetics": 0.75, + "Miscellaneous": 0.866, + "Moral Scenarios": 0.369, + "Nutrition": 0.709, + "Prehistory": 0.812, + "Public Relations": 0.691, + "Security Studies": 0.812, + "Sociology": 0.92, + "Virology": 0.494, + "World Religions": 0.883, + "Mean win rate": 0.192 + } + }, + { + "model_id": "google/text-unicorn@001", + "name": "PaLM-2 Unicorn", + "developer": "google", + "scores": { + "MMLU All Subjects": 0.786, + "Abstract Algebra": 0.51, + "Anatomy": 0.733, + "College Physics": 0.549, + "Computer Security": 0.77, + "Econometrics": 0.649, + "Global Facts": 0.53, + "Jurisprudence": 0.88, + "Philosophy": 0.836, + "Professional Psychology": 0.858, + "Us Foreign Policy": 0.96, + "Astronomy": 0.862, + "Business Ethics": 0.83, + "Clinical Knowledge": 0.804, + "Conceptual Physics": 0.809, + "Electrical Engineering": 0.772, + "Elementary Mathematics": 0.661, + "Formal Logic": 0.659, + "High School World History": 0.911, + "Human Sexuality": 0.924, + "International Law": 0.909, + "Logical Fallacies": 0.877, + "Machine Learning": 0.625, + "Management": 0.903, + "Marketing": 0.94, + "Medical Genetics": 0.83, + "Miscellaneous": 0.894, + "Moral Scenarios": 0.562, + "Nutrition": 0.856, + "Prehistory": 0.87, + "Public Relations": 0.773, + "Security Studies": 0.829, + "Sociology": 0.91, + "Virology": 0.572, + "World Religions": 0.877, + "Mean win rate": 0.142 + } + }, + { + "model_id": "meta/llama-2-13b", + "name": "Llama 2 13B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.554, + "Abstract Algebra": 0.27, + "Anatomy": 0.496, + "College Physics": 0.235, + "Computer Security": 0.69, + "Econometrics": 0.307, + "Global Facts": 0.38, + "Jurisprudence": 0.704, + "Philosophy": 0.672, + "Professional Psychology": 0.567, + "Us Foreign Policy": 0.83, + "Astronomy": 0.546, + "Business Ethics": 0.55, + "Clinical Knowledge": 0.592, + "Conceptual Physics": 0.413, + "Electrical Engineering": 0.49, + "Elementary Mathematics": 0.307, + "Formal Logic": 0.381, + "High School World History": 0.705, + "Human Sexuality": 0.618, + "International Law": 0.752, + "Logical Fallacies": 0.687, + "Machine Learning": 0.286, + "Management": 0.738, + "Marketing": 0.786, + "Medical Genetics": 0.57, + "Miscellaneous": 0.748, + "Moral Scenarios": 0.407, + "Nutrition": 0.627, + "Prehistory": 0.654, + "Public Relations": 0.6, + "Security Studies": 0.608, + "Sociology": 0.761, + "Virology": 0.476, + "World Religions": 0.76, + "Mean win rate": 0.502 + } + }, + { + "model_id": "meta/llama-2-70b", + "name": "Llama 2 70B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.695, + "Abstract Algebra": 0.31, + "Anatomy": 0.607, + "College Physics": 0.363, + "Computer Security": 0.77, + "Econometrics": 0.43, + "Global Facts": 0.47, + "Jurisprudence": 0.824, + "Philosophy": 0.791, + "Professional Psychology": 0.76, + "Us Foreign Policy": 0.92, + "Astronomy": 0.829, + "Business Ethics": 0.73, + "Clinical Knowledge": 0.717, + "Conceptual Physics": 0.668, + "Electrical Engineering": 0.634, + "Elementary Mathematics": 0.421, + "Formal Logic": 0.468, + "High School World History": 0.882, + "Human Sexuality": 0.84, + "International Law": 0.868, + "Logical Fallacies": 0.791, + "Machine Learning": 0.491, + "Management": 0.845, + "Marketing": 0.889, + "Medical Genetics": 0.72, + "Miscellaneous": 0.857, + "Moral Scenarios": 0.45, + "Nutrition": 0.758, + "Prehistory": 0.84, + "Public Relations": 0.745, + "Security Studies": 0.796, + "Sociology": 0.9, + "Virology": 0.53, + "World Religions": 0.854, + "Mean win rate": 0.508 + } + }, + { + "model_id": "meta/llama-2-7b", + "name": "Llama 2 7B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.458, + "Abstract Algebra": 0.29, + "Anatomy": 0.452, + "College Physics": 0.196, + "Computer Security": 0.59, + "Econometrics": 0.316, + "Global Facts": 0.29, + "Jurisprudence": 0.519, + "Philosophy": 0.592, + "Professional Psychology": 0.459, + "Us Foreign Policy": 0.64, + "Astronomy": 0.408, + "Business Ethics": 0.48, + "Clinical Knowledge": 0.453, + "Conceptual Physics": 0.434, + "Electrical Engineering": 0.407, + "Elementary Mathematics": 0.254, + "Formal Logic": 0.27, + "High School World History": 0.662, + "Human Sexuality": 0.557, + "International Law": 0.628, + "Logical Fallacies": 0.466, + "Machine Learning": 0.402, + "Management": 0.563, + "Marketing": 0.697, + "Medical Genetics": 0.53, + "Miscellaneous": 0.632, + "Moral Scenarios": 0.238, + "Nutrition": 0.497, + "Prehistory": 0.503, + "Public Relations": 0.509, + "Security Studies": 0.433, + "Sociology": 0.617, + "Virology": 0.392, + "World Religions": 0.713, + "Mean win rate": 0.681 + } + }, + { + "model_id": "meta/llama-3-70b", + "name": "Llama 3 70B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.793, + "Abstract Algebra": 0.43, + "Anatomy": 0.785, + "College Physics": 0.529, + "Computer Security": 0.85, + "Econometrics": 0.693, + "Global Facts": 0.49, + "Jurisprudence": 0.861, + "Philosophy": 0.865, + "Professional Psychology": 0.871, + "Us Foreign Policy": 0.94, + "Astronomy": 0.921, + "Business Ethics": 0.83, + "Clinical Knowledge": 0.845, + "Conceptual Physics": 0.838, + "Electrical Engineering": 0.766, + "Elementary Mathematics": 0.632, + "Formal Logic": 0.651, + "High School World History": 0.941, + "Human Sexuality": 0.878, + "International Law": 0.901, + "Logical Fallacies": 0.865, + "Machine Learning": 0.714, + "Management": 0.913, + "Marketing": 0.94, + "Medical Genetics": 0.89, + "Miscellaneous": 0.917, + "Moral Scenarios": 0.598, + "Nutrition": 0.876, + "Prehistory": 0.91, + "Public Relations": 0.727, + "Security Studies": 0.833, + "Sociology": 0.93, + "Virology": 0.59, + "World Religions": 0.906, + "Mean win rate": 0.524 + } + }, + { + "model_id": "meta/llama-3-8b", + "name": "Llama 3 8B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.668, + "Abstract Algebra": 0.33, + "Anatomy": 0.696, + "College Physics": 0.451, + "Computer Security": 0.8, + "Econometrics": 0.518, + "Global Facts": 0.34, + "Jurisprudence": 0.741, + "Philosophy": 0.743, + "Professional Psychology": 0.711, + "Us Foreign Policy": 0.88, + "Astronomy": 0.711, + "Business Ethics": 0.65, + "Clinical Knowledge": 0.751, + "Conceptual Physics": 0.557, + "Electrical Engineering": 0.669, + "Elementary Mathematics": 0.426, + "Formal Logic": 0.468, + "High School World History": 0.823, + "Human Sexuality": 0.748, + "International Law": 0.843, + "Logical Fallacies": 0.755, + "Machine Learning": 0.545, + "Management": 0.874, + "Marketing": 0.885, + "Medical Genetics": 0.83, + "Miscellaneous": 0.831, + "Moral Scenarios": 0.416, + "Nutrition": 0.761, + "Prehistory": 0.738, + "Public Relations": 0.736, + "Security Studies": 0.771, + "Sociology": 0.866, + "Virology": 0.566, + "World Religions": 0.819, + "Mean win rate": 0.733 + } + }, + { + "model_id": "meta/llama-3.1-405b-instruct-turbo", + "name": "Llama 3.1 Instruct Turbo 405B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.845, + "Abstract Algebra": 0.7, + "Anatomy": 0.822, + "College Physics": 0.696, + "Computer Security": 0.81, + "Econometrics": 0.746, + "Global Facts": 0.71, + "Jurisprudence": 0.87, + "Philosophy": 0.878, + "Professional Psychology": 0.861, + "Us Foreign Policy": 0.94, + "Astronomy": 0.921, + "Business Ethics": 0.81, + "Clinical Knowledge": 0.879, + "Conceptual Physics": 0.877, + "Electrical Engineering": 0.821, + "Elementary Mathematics": 0.828, + "Formal Logic": 0.698, + "High School World History": 0.941, + "Human Sexuality": 0.855, + "International Law": 0.95, + "Logical Fallacies": 0.92, + "Machine Learning": 0.795, + "Management": 0.893, + "Marketing": 0.962, + "Medical Genetics": 0.93, + "Miscellaneous": 0.939, + "Moral Scenarios": 0.876, + "Nutrition": 0.928, + "Prehistory": 0.929, + "Public Relations": 0.818, + "Security Studies": 0.857, + "Sociology": 0.94, + "Virology": 0.572, + "World Religions": 0.906, + "Mean win rate": 0.33 + } + }, + { + "model_id": "meta/llama-3.1-70b-instruct-turbo", + "name": "Llama 3.1 Instruct Turbo 70B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.801, + "Abstract Algebra": 0.55, + "Anatomy": 0.8, + "College Physics": 0.559, + "Computer Security": 0.8, + "Econometrics": 0.675, + "Global Facts": 0.61, + "Jurisprudence": 0.889, + "Philosophy": 0.833, + "Professional Psychology": 0.846, + "Us Foreign Policy": 0.93, + "Astronomy": 0.908, + "Business Ethics": 0.72, + "Clinical Knowledge": 0.845, + "Conceptual Physics": 0.834, + "Electrical Engineering": 0.745, + "Elementary Mathematics": 0.701, + "Formal Logic": 0.675, + "High School World History": 0.937, + "Human Sexuality": 0.855, + "International Law": 0.926, + "Logical Fallacies": 0.84, + "Machine Learning": 0.696, + "Management": 0.913, + "Marketing": 0.936, + "Medical Genetics": 0.93, + "Miscellaneous": 0.913, + "Moral Scenarios": 0.834, + "Nutrition": 0.889, + "Prehistory": 0.88, + "Public Relations": 0.709, + "Security Studies": 0.849, + "Sociology": 0.92, + "Virology": 0.578, + "World Religions": 0.895, + "Mean win rate": 0.021 + } + }, + { + "model_id": "meta/llama-3.1-8b-instruct-turbo", + "name": "Llama 3.1 Instruct Turbo 8B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.561, + "Abstract Algebra": 0.26, + "Anatomy": 0.459, + "College Physics": 0.363, + "Computer Security": 0.71, + "Econometrics": 0.351, + "Global Facts": 0.26, + "Jurisprudence": 0.731, + "Philosophy": 0.64, + "Professional Psychology": 0.649, + "Us Foreign Policy": 0.79, + "Astronomy": 0.645, + "Business Ethics": 0.65, + "Clinical Knowledge": 0.615, + "Conceptual Physics": 0.528, + "Electrical Engineering": 0.441, + "Elementary Mathematics": 0.429, + "Formal Logic": 0.444, + "High School World History": 0.515, + "Human Sexuality": 0.733, + "International Law": 0.694, + "Logical Fallacies": 0.742, + "Machine Learning": 0.384, + "Management": 0.709, + "Marketing": 0.833, + "Medical Genetics": 0.66, + "Miscellaneous": 0.653, + "Moral Scenarios": 0.368, + "Nutrition": 0.712, + "Prehistory": 0.728, + "Public Relations": 0.664, + "Security Studies": 0.576, + "Sociology": 0.701, + "Virology": 0.446, + "World Religions": 0.789, + "Mean win rate": 0.475 + } + }, + { + "model_id": "meta/llama-3.2-11b-vision-instruct-turbo", + "name": "Llama 3.2 Vision Instruct Turbo 11B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.565, + "Abstract Algebra": 0.28, + "Anatomy": 0.533, + "College Physics": 0.333, + "Computer Security": 0.71, + "Econometrics": 0.395, + "Global Facts": 0.25, + "Jurisprudence": 0.722, + "Philosophy": 0.646, + "Professional Psychology": 0.649, + "Us Foreign Policy": 0.78, + "Astronomy": 0.671, + "Business Ethics": 0.64, + "Clinical Knowledge": 0.638, + "Conceptual Physics": 0.536, + "Electrical Engineering": 0.51, + "Elementary Mathematics": 0.458, + "Formal Logic": 0.46, + "High School World History": 0.502, + "Human Sexuality": 0.763, + "International Law": 0.711, + "Logical Fallacies": 0.742, + "Machine Learning": 0.375, + "Management": 0.728, + "Marketing": 0.838, + "Medical Genetics": 0.7, + "Miscellaneous": 0.644, + "Moral Scenarios": 0.328, + "Nutrition": 0.752, + "Prehistory": 0.744, + "Public Relations": 0.645, + "Security Studies": 0.567, + "Sociology": 0.627, + "Virology": 0.446, + "World Religions": 0.696, + "Mean win rate": 0.897 + } + }, + { + "model_id": "meta/llama-3.2-90b-vision-instruct-turbo", + "name": "Llama 3.2 Vision Instruct Turbo 90B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.803, + "Abstract Algebra": 0.52, + "Anatomy": 0.8, + "College Physics": 0.539, + "Computer Security": 0.81, + "Econometrics": 0.684, + "Global Facts": 0.6, + "Jurisprudence": 0.88, + "Philosophy": 0.839, + "Professional Psychology": 0.843, + "Us Foreign Policy": 0.93, + "Astronomy": 0.921, + "Business Ethics": 0.76, + "Clinical Knowledge": 0.845, + "Conceptual Physics": 0.826, + "Electrical Engineering": 0.759, + "Elementary Mathematics": 0.688, + "Formal Logic": 0.683, + "High School World History": 0.941, + "Human Sexuality": 0.87, + "International Law": 0.934, + "Logical Fallacies": 0.834, + "Machine Learning": 0.688, + "Management": 0.913, + "Marketing": 0.944, + "Medical Genetics": 0.92, + "Miscellaneous": 0.913, + "Moral Scenarios": 0.841, + "Nutrition": 0.889, + "Prehistory": 0.886, + "Public Relations": 0.718, + "Security Studies": 0.853, + "Sociology": 0.92, + "Virology": 0.584, + "World Religions": 0.901, + "Mean win rate": 0.773 + } + }, + { + "model_id": "meta/llama-3.3-70b-instruct-turbo", + "name": "Llama 3.3 Instruct Turbo 70B", + "developer": "meta", + "scores": { + "MMLU All Subjects": 0.791, + "Abstract Algebra": 0.5, + "Anatomy": 0.778, + "College Physics": 0.52, + "Computer Security": 0.8, + "Econometrics": 0.719, + "Global Facts": 0.58, + "Jurisprudence": 0.87, + "Philosophy": 0.83, + "Professional Psychology": 0.845, + "Us Foreign Policy": 0.93, + "Astronomy": 0.888, + "Business Ethics": 0.8, + "Clinical Knowledge": 0.83, + "Conceptual Physics": 0.821, + "Electrical Engineering": 0.745, + "Elementary Mathematics": 0.672, + "Formal Logic": 0.675, + "High School World History": 0.907, + "Human Sexuality": 0.855, + "International Law": 0.884, + "Logical Fallacies": 0.816, + "Machine Learning": 0.714, + "Management": 0.903, + "Marketing": 0.927, + "Medical Genetics": 0.9, + "Miscellaneous": 0.914, + "Moral Scenarios": 0.698, + "Nutrition": 0.882, + "Prehistory": 0.895, + "Public Relations": 0.727, + "Security Studies": 0.845, + "Sociology": 0.92, + "Virology": 0.566, + "World Religions": 0.883, + "Mean win rate": 0.722 + } + }, + { + "model_id": "microsoft/phi-2", + "name": "Phi-2", + "developer": "microsoft", + "scores": { + "MMLU All Subjects": 0.584, + "Abstract Algebra": 0.31, + "Anatomy": 0.437, + "College Physics": 0.382, + "Computer Security": 0.73, + "Econometrics": 0.342, + "Global Facts": 0.35, + "Jurisprudence": 0.694, + "Philosophy": 0.598, + "Professional Psychology": 0.572, + "Us Foreign Policy": 0.78, + "Astronomy": 0.605, + "Business Ethics": 0.59, + "Clinical Knowledge": 0.619, + "Conceptual Physics": 0.519, + "Electrical Engineering": 0.545, + "Elementary Mathematics": 0.463, + "Formal Logic": 0.389, + "High School World History": 0.73, + "Human Sexuality": 0.733, + "International Law": 0.752, + "Logical Fallacies": 0.767, + "Machine Learning": 0.5, + "Management": 0.748, + "Marketing": 0.833, + "Medical Genetics": 0.62, + "Miscellaneous": 0.688, + "Moral Scenarios": 0.231, + "Nutrition": 0.627, + "Prehistory": 0.605, + "Public Relations": 0.673, + "Security Studies": 0.702, + "Sociology": 0.816, + "Virology": 0.47, + "World Religions": 0.702, + "Mean win rate": 0.824 + } + }, + { + "model_id": "microsoft/phi-3-medium-4k-instruct", + "name": "Phi-3 14B", + "developer": "microsoft", + "scores": { + "MMLU All Subjects": 0.775, + "Abstract Algebra": 0.5, + "Anatomy": 0.719, + "College Physics": 0.529, + "Computer Security": 0.79, + "Econometrics": 0.614, + "Global Facts": 0.5, + "Jurisprudence": 0.88, + "Philosophy": 0.804, + "Professional Psychology": 0.835, + "Us Foreign Policy": 0.95, + "Astronomy": 0.849, + "Business Ethics": 0.8, + "Clinical Knowledge": 0.826, + "Conceptual Physics": 0.809, + "Electrical Engineering": 0.683, + "Elementary Mathematics": 0.709, + "Formal Logic": 0.587, + "High School World History": 0.903, + "Human Sexuality": 0.863, + "International Law": 0.934, + "Logical Fallacies": 0.828, + "Machine Learning": 0.696, + "Management": 0.864, + "Marketing": 0.919, + "Medical Genetics": 0.91, + "Miscellaneous": 0.894, + "Moral Scenarios": 0.639, + "Nutrition": 0.837, + "Prehistory": 0.867, + "Public Relations": 0.755, + "Security Studies": 0.829, + "Sociology": 0.891, + "Virology": 0.554, + "World Religions": 0.865, + "Mean win rate": 0.015 + } + }, + { + "model_id": "microsoft/phi-3-small-8k-instruct", + "name": "Phi-3 7B", + "developer": "microsoft", + "scores": { + "MMLU All Subjects": 0.757, + "Abstract Algebra": 0.44, + "Anatomy": 0.726, + "College Physics": 0.559, + "Computer Security": 0.77, + "Econometrics": 0.596, + "Global Facts": 0.52, + "Jurisprudence": 0.843, + "Philosophy": 0.82, + "Professional Psychology": 0.835, + "Us Foreign Policy": 0.95, + "Astronomy": 0.849, + "Business Ethics": 0.77, + "Clinical Knowledge": 0.83, + "Conceptual Physics": 0.779, + "Electrical Engineering": 0.69, + "Elementary Mathematics": 0.619, + "Formal Logic": 0.595, + "High School World History": 0.848, + "Human Sexuality": 0.817, + "International Law": 0.851, + "Logical Fallacies": 0.81, + "Machine Learning": 0.652, + "Management": 0.903, + "Marketing": 0.897, + "Medical Genetics": 0.84, + "Miscellaneous": 0.871, + "Moral Scenarios": 0.711, + "Nutrition": 0.833, + "Prehistory": 0.858, + "Public Relations": 0.727, + "Security Studies": 0.804, + "Sociology": 0.886, + "Virology": 0.548, + "World Religions": 0.825, + "Mean win rate": 0.708 + } + }, + { + "model_id": "mistralai/mistral-7b-instruct-v0.3", + "name": "Mistral Instruct v0.3 7B", + "developer": "mistralai", + "scores": { + "MMLU All Subjects": 0.599, + "Abstract Algebra": 0.27, + "Anatomy": 0.585, + "College Physics": 0.343, + "Computer Security": 0.7, + "Econometrics": 0.421, + "Global Facts": 0.33, + "Jurisprudence": 0.713, + "Philosophy": 0.659, + "Professional Psychology": 0.641, + "Us Foreign Policy": 0.79, + "Astronomy": 0.638, + "Business Ethics": 0.57, + "Clinical Knowledge": 0.687, + "Conceptual Physics": 0.549, + "Electrical Engineering": 0.572, + "Elementary Mathematics": 0.402, + "Formal Logic": 0.397, + "High School World History": 0.759, + "Human Sexuality": 0.702, + "International Law": 0.76, + "Logical Fallacies": 0.712, + "Machine Learning": 0.455, + "Management": 0.767, + "Marketing": 0.842, + "Medical Genetics": 0.75, + "Miscellaneous": 0.785, + "Moral Scenarios": 0.393, + "Nutrition": 0.676, + "Prehistory": 0.673, + "Public Relations": 0.636, + "Security Studies": 0.682, + "Sociology": 0.806, + "Virology": 0.47, + "World Religions": 0.825, + "Mean win rate": 0.509 + } + }, + { + "model_id": "mistralai/mistral-7b-v0.1", + "name": "Mistral v0.1 7B", + "developer": "mistralai", + "scores": { + "MMLU All Subjects": 0.566, + "Abstract Algebra": 0.25, + "Anatomy": 0.467, + "College Physics": 0.314, + "Computer Security": 0.69, + "Econometrics": 0.351, + "Global Facts": 0.29, + "Jurisprudence": 0.667, + "Philosophy": 0.63, + "Professional Psychology": 0.578, + "Us Foreign Policy": 0.79, + "Astronomy": 0.599, + "Business Ethics": 0.56, + "Clinical Knowledge": 0.653, + "Conceptual Physics": 0.451, + "Electrical Engineering": 0.538, + "Elementary Mathematics": 0.32, + "Formal Logic": 0.365, + "High School World History": 0.726, + "Human Sexuality": 0.702, + "International Law": 0.76, + "Logical Fallacies": 0.693, + "Machine Learning": 0.438, + "Management": 0.709, + "Marketing": 0.833, + "Medical Genetics": 0.68, + "Miscellaneous": 0.72, + "Moral Scenarios": 0.33, + "Nutrition": 0.657, + "Prehistory": 0.642, + "Public Relations": 0.6, + "Security Studies": 0.731, + "Sociology": 0.831, + "Virology": 0.44, + "World Religions": 0.789, + "Mean win rate": 0.213 + } + }, + { + "model_id": "mistralai/mistral-large-2402", + "name": "Mistral Large 2402", + "developer": "mistralai", + "scores": { + "MMLU All Subjects": 0.688, + "Abstract Algebra": 0.45, + "Anatomy": 0.674, + "College Physics": 0.373, + "Computer Security": 0.8, + "Econometrics": 0.64, + "Global Facts": 0.34, + "Jurisprudence": 0.815, + "Philosophy": 0.794, + "Professional Psychology": 0.809, + "Us Foreign Policy": 0.92, + "Astronomy": 0.842, + "Business Ethics": 0.67, + "Clinical Knowledge": 0.751, + "Conceptual Physics": 0.574, + "Electrical Engineering": 0.545, + "Elementary Mathematics": 0.508, + "Formal Logic": 0.532, + "High School World History": 0.886, + "Human Sexuality": 0.847, + "International Law": 0.868, + "Logical Fallacies": 0.81, + "Machine Learning": 0.562, + "Management": 0.854, + "Marketing": 0.897, + "Medical Genetics": 0.74, + "Miscellaneous": 0.9, + "Moral Scenarios": 0.579, + "Nutrition": 0.791, + "Prehistory": 0.904, + "Public Relations": 0.709, + "Security Studies": 0.824, + "Sociology": 0.93, + "Virology": 0.554, + "World Religions": 0.883, + "Mean win rate": 0.464 + } + }, + { + "model_id": "mistralai/mistral-large-2407", + "name": "Mistral Large 2 2407", + "developer": "mistralai", + "scores": { + "MMLU All Subjects": 0.8, + "Abstract Algebra": 0.7, + "Anatomy": 0.785, + "College Physics": 0.559, + "Computer Security": 0.81, + "Econometrics": 0.693, + "Global Facts": 0.56, + "Jurisprudence": 0.861, + "Philosophy": 0.826, + "Professional Psychology": 0.861, + "Us Foreign Policy": 0.9, + "Astronomy": 0.921, + "Business Ethics": 0.79, + "Clinical Knowledge": 0.864, + "Conceptual Physics": 0.864, + "Electrical Engineering": 0.793, + "Elementary Mathematics": 0.799, + "Formal Logic": 0.579, + "High School World History": 0.92, + "Human Sexuality": 0.924, + "International Law": 0.926, + "Logical Fallacies": 0.847, + "Machine Learning": 0.661, + "Management": 0.883, + "Marketing": 0.94, + "Medical Genetics": 0.9, + "Miscellaneous": 0.936, + "Moral Scenarios": 0.839, + "Nutrition": 0.827, + "Prehistory": 0.92, + "Public Relations": 0.764, + "Security Studies": 0.865, + "Sociology": 0.91, + "Virology": 0.59, + "World Religions": 0.865, + "Mean win rate": 0.24 + } + }, + { + "model_id": "mistralai/mistral-small-2402", + "name": "Mistral Small 2402", + "developer": "mistralai", + "scores": { + "MMLU All Subjects": 0.687, + "Abstract Algebra": 0.26, + "Anatomy": 0.674, + "College Physics": 0.402, + "Computer Security": 0.77, + "Econometrics": 0.614, + "Global Facts": 0.45, + "Jurisprudence": 0.833, + "Philosophy": 0.765, + "Professional Psychology": 0.768, + "Us Foreign Policy": 0.89, + "Astronomy": 0.77, + "Business Ethics": 0.71, + "Clinical Knowledge": 0.766, + "Conceptual Physics": 0.685, + "Electrical Engineering": 0.628, + "Elementary Mathematics": 0.415, + "Formal Logic": 0.516, + "High School World History": 0.857, + "Human Sexuality": 0.824, + "International Law": 0.826, + "Logical Fallacies": 0.804, + "Machine Learning": 0.562, + "Management": 0.786, + "Marketing": 0.906, + "Medical Genetics": 0.75, + "Miscellaneous": 0.844, + "Moral Scenarios": 0.575, + "Nutrition": 0.761, + "Prehistory": 0.802, + "Public Relations": 0.773, + "Security Studies": 0.788, + "Sociology": 0.871, + "Virology": 0.542, + "World Religions": 0.848, + "Mean win rate": 0.54 + } + }, + { + "model_id": "mistralai/mixtral-8x22b", + "name": "Mixtral 8x22B", + "developer": "mistralai", + "scores": { + "MMLU All Subjects": 0.778, + "Abstract Algebra": 0.48, + "Anatomy": 0.741, + "College Physics": 0.569, + "Computer Security": 0.84, + "Econometrics": 0.667, + "Global Facts": 0.56, + "Jurisprudence": 0.852, + "Philosophy": 0.842, + "Professional Psychology": 0.845, + "Us Foreign Policy": 0.95, + "Astronomy": 0.882, + "Business Ethics": 0.74, + "Clinical Knowledge": 0.819, + "Conceptual Physics": 0.796, + "Electrical Engineering": 0.766, + "Elementary Mathematics": 0.622, + "Formal Logic": 0.627, + "High School World History": 0.895, + "Human Sexuality": 0.885, + "International Law": 0.917, + "Logical Fallacies": 0.877, + "Machine Learning": 0.661, + "Management": 0.883, + "Marketing": 0.915, + "Medical Genetics": 0.85, + "Miscellaneous": 0.899, + "Moral Scenarios": 0.646, + "Nutrition": 0.866, + "Prehistory": 0.87, + "Public Relations": 0.755, + "Security Studies": 0.865, + "Sociology": 0.92, + "Virology": 0.596, + "World Religions": 0.901, + "Mean win rate": 0.598 + } + }, + { + "model_id": "mistralai/mixtral-8x7b-32kseqlen", + "name": "Mixtral 8x7B 32K seqlen", + "developer": "mistralai", + "scores": { + "MMLU All Subjects": 0.717, + "Abstract Algebra": 0.38, + "Anatomy": 0.696, + "College Physics": 0.51, + "Computer Security": 0.81, + "Econometrics": 0.605, + "Global Facts": 0.46, + "Jurisprudence": 0.833, + "Philosophy": 0.797, + "Professional Psychology": 0.779, + "Us Foreign Policy": 0.93, + "Astronomy": 0.829, + "Business Ethics": 0.72, + "Clinical Knowledge": 0.785, + "Conceptual Physics": 0.681, + "Electrical Engineering": 0.676, + "Elementary Mathematics": 0.476, + "Formal Logic": 0.532, + "High School World History": 0.886, + "Human Sexuality": 0.87, + "International Law": 0.86, + "Logical Fallacies": 0.767, + "Machine Learning": 0.509, + "Management": 0.845, + "Marketing": 0.923, + "Medical Genetics": 0.76, + "Miscellaneous": 0.881, + "Moral Scenarios": 0.444, + "Nutrition": 0.83, + "Prehistory": 0.849, + "Public Relations": 0.682, + "Security Studies": 0.792, + "Sociology": 0.871, + "Virology": 0.506, + "World Religions": 0.871, + "Mean win rate": 0.689 + } + }, + { + "model_id": "mistralai/open-mistral-nemo-2407", + "name": "Mistral NeMo 2402", + "developer": "mistralai", + "scores": { + "MMLU All Subjects": 0.653, + "Abstract Algebra": 0.29, + "Anatomy": 0.607, + "College Physics": 0.373, + "Computer Security": 0.81, + "Econometrics": 0.561, + "Global Facts": 0.4, + "Jurisprudence": 0.796, + "Philosophy": 0.733, + "Professional Psychology": 0.588, + "Us Foreign Policy": 0.89, + "Astronomy": 0.691, + "Business Ethics": 0.49, + "Clinical Knowledge": 0.736, + "Conceptual Physics": 0.647, + "Electrical Engineering": 0.531, + "Elementary Mathematics": 0.439, + "Formal Logic": 0.405, + "High School World History": 0.848, + "Human Sexuality": 0.702, + "International Law": 0.769, + "Logical Fallacies": 0.791, + "Machine Learning": 0.402, + "Management": 0.796, + "Marketing": 0.889, + "Medical Genetics": 0.78, + "Miscellaneous": 0.861, + "Moral Scenarios": 0.381, + "Nutrition": 0.709, + "Prehistory": 0.765, + "Public Relations": 0.718, + "Security Studies": 0.771, + "Sociology": 0.726, + "Virology": 0.56, + "World Religions": 0.789, + "Mean win rate": 0.215 + } + }, + { + "model_id": "openai/gpt-3.5-turbo-0125", + "name": "GPT-3.5 Turbo 0125", + "developer": "openai", + "scores": { + "MMLU All Subjects": 0.673, + "Abstract Algebra": 0.31, + "Anatomy": 0.696, + "College Physics": 0.471, + "Computer Security": 0.78, + "Econometrics": 0.474, + "Global Facts": 0.39, + "Jurisprudence": 0.806, + "Philosophy": 0.746, + "Professional Psychology": 0.722, + "Us Foreign Policy": 0.89, + "Astronomy": 0.75, + "Business Ethics": 0.75, + "Clinical Knowledge": 0.755, + "Conceptual Physics": 0.634, + "Electrical Engineering": 0.669, + "Elementary Mathematics": 0.534, + "Formal Logic": 0.444, + "High School World History": 0.819, + "Human Sexuality": 0.779, + "International Law": 0.81, + "Logical Fallacies": 0.779, + "Machine Learning": 0.455, + "Management": 0.835, + "Marketing": 0.91, + "Medical Genetics": 0.73, + "Miscellaneous": 0.89, + "Moral Scenarios": 0.355, + "Nutrition": 0.748, + "Prehistory": 0.735, + "Public Relations": 0.727, + "Security Studies": 0.751, + "Sociology": 0.861, + "Virology": 0.536, + "World Religions": 0.842, + "Mean win rate": 0.493 + } + }, + { + "model_id": "openai/gpt-3.5-turbo-0613", + "name": "gpt-3.5-turbo-0613", + "developer": "openai", + "scores": { + "MMLU All Subjects": 0.689, + "Abstract Algebra": 0.38, + "Anatomy": 0.659, + "College Physics": 0.461, + "Computer Security": 0.81, + "Econometrics": 0.5, + "Global Facts": 0.37, + "Jurisprudence": 0.806, + "Philosophy": 0.759, + "Professional Psychology": 0.732, + "Us Foreign Policy": 0.88, + "Astronomy": 0.763, + "Business Ethics": 0.75, + "Clinical Knowledge": 0.777, + "Conceptual Physics": 0.613, + "Electrical Engineering": 0.648, + "Elementary Mathematics": 0.5, + "Formal Logic": 0.397, + "High School World History": 0.857, + "Human Sexuality": 0.786, + "International Law": 0.843, + "Logical Fallacies": 0.791, + "Machine Learning": 0.455, + "Management": 0.845, + "Marketing": 0.91, + "Medical Genetics": 0.8, + "Miscellaneous": 0.893, + "Moral Scenarios": 0.404, + "Nutrition": 0.758, + "Prehistory": 0.787, + "Public Relations": 0.745, + "Security Studies": 0.8, + "Sociology": 0.871, + "Virology": 0.542, + "World Religions": 0.836, + "Mean win rate": 0.589 + } + }, + { + "model_id": "openai/gpt-4-0613", + "name": "GPT-4 0613", + "developer": "openai", + "scores": { + "MMLU All Subjects": 0.824, + "Abstract Algebra": 0.63, + "Anatomy": 0.8, + "College Physics": 0.627, + "Computer Security": 0.86, + "Econometrics": 0.684, + "Global Facts": 0.62, + "Jurisprudence": 0.889, + "Philosophy": 0.859, + "Professional Psychology": 0.891, + "Us Foreign Policy": 0.95, + "Astronomy": 0.934, + "Business Ethics": 0.79, + "Clinical Knowledge": 0.845, + "Conceptual Physics": 0.868, + "Electrical Engineering": 0.786, + "Elementary Mathematics": 0.807, + "Formal Logic": 0.643, + "High School World History": 0.945, + "Human Sexuality": 0.908, + "International Law": 0.917, + "Logical Fallacies": 0.871, + "Machine Learning": 0.759, + "Management": 0.932, + "Marketing": 0.962, + "Medical Genetics": 0.94, + "Miscellaneous": 0.949, + "Moral Scenarios": 0.902, + "Nutrition": 0.892, + "Prehistory": 0.926, + "Public Relations": 0.745, + "Security Studies": 0.861, + "Sociology": 0.93, + "Virology": 0.596, + "World Religions": 0.877, + "Mean win rate": 0.517 + } + }, + { + "model_id": "openai/gpt-4-1106-preview", + "name": "GPT-4 Turbo 1106 preview", + "developer": "openai", + "scores": { + "MMLU All Subjects": 0.796, + "Abstract Algebra": 0.53, + "Anatomy": 0.807, + "College Physics": 0.402, + "Computer Security": 0.86, + "Econometrics": 0.675, + "Global Facts": 0.58, + "Jurisprudence": 0.889, + "Philosophy": 0.852, + "Professional Psychology": 0.887, + "Us Foreign Policy": 0.96, + "Astronomy": 0.941, + "Business Ethics": 0.78, + "Clinical Knowledge": 0.864, + "Conceptual Physics": 0.894, + "Electrical Engineering": 0.772, + "Elementary Mathematics": 0.638, + "Formal Logic": 0.651, + "High School World History": 0.958, + "Human Sexuality": 0.908, + "International Law": 0.926, + "Logical Fallacies": 0.865, + "Machine Learning": 0.723, + "Management": 0.913, + "Marketing": 0.932, + "Medical Genetics": 0.93, + "Miscellaneous": 0.946, + "Moral Scenarios": 0.816, + "Nutrition": 0.879, + "Prehistory": 0.917, + "Public Relations": 0.782, + "Security Studies": 0.841, + "Sociology": 0.925, + "Virology": 0.59, + "World Religions": 0.854, + "Mean win rate": 0.416 + } + }, + { + "model_id": "openai/gpt-4-turbo-2024-04-09", + "name": "GPT-4 Turbo 2024-04-09", + "developer": "openai", + "scores": { + "MMLU All Subjects": 0.813, + "Abstract Algebra": 0.56, + "Anatomy": 0.822, + "College Physics": 0.539, + "Computer Security": 0.83, + "Econometrics": 0.675, + "Global Facts": 0.58, + "Jurisprudence": 0.88, + "Philosophy": 0.868, + "Professional Psychology": 0.873, + "Us Foreign Policy": 0.96, + "Astronomy": 0.941, + "Business Ethics": 0.82, + "Clinical Knowledge": 0.83, + "Conceptual Physics": 0.894, + "Electrical Engineering": 0.752, + "Elementary Mathematics": 0.72, + "Formal Logic": 0.706, + "High School World History": 0.941, + "Human Sexuality": 0.901, + "International Law": 0.942, + "Logical Fallacies": 0.871, + "Machine Learning": 0.741, + "Management": 0.883, + "Marketing": 0.949, + "Medical Genetics": 0.92, + "Miscellaneous": 0.945, + "Moral Scenarios": 0.803, + "Nutrition": 0.892, + "Prehistory": 0.92, + "Public Relations": 0.755, + "Security Studies": 0.8, + "Sociology": 0.915, + "Virology": 0.602, + "World Religions": 0.848, + "Mean win rate": 0.351 + } + }, + { + "model_id": "openai/gpt-4o-2024-05-13", + "name": "GPT-4o 2024-05-13", + "developer": "openai", + "scores": { + "MMLU All Subjects": 0.842, + "Abstract Algebra": 0.66, + "Anatomy": 0.911, + "College Physics": 0.686, + "Computer Security": 0.85, + "Econometrics": 0.693, + "Global Facts": 0.64, + "Jurisprudence": 0.898, + "Philosophy": 0.9, + "Professional Psychology": 0.905, + "Us Foreign Policy": 0.96, + "Astronomy": 0.941, + "Business Ethics": 0.85, + "Clinical Knowledge": 0.894, + "Conceptual Physics": 0.911, + "Electrical Engineering": 0.807, + "Elementary Mathematics": 0.741, + "Formal Logic": 0.683, + "High School World History": 0.945, + "Human Sexuality": 0.908, + "International Law": 0.934, + "Logical Fallacies": 0.883, + "Machine Learning": 0.768, + "Management": 0.942, + "Marketing": 0.936, + "Medical Genetics": 0.96, + "Miscellaneous": 0.954, + "Moral Scenarios": 0.841, + "Nutrition": 0.899, + "Prehistory": 0.938, + "Public Relations": 0.809, + "Security Studies": 0.837, + "Sociology": 0.94, + "Virology": 0.596, + "World Religions": 0.889, + "Mean win rate": 0.671 + } + }, + { + "model_id": "openai/gpt-4o-2024-08-06", + "name": "GPT-4o 2024-08-06", + "developer": "openai", + "scores": { + "MMLU All Subjects": 0.843, + "Abstract Algebra": 0.58, + "Anatomy": 0.911, + "College Physics": 0.686, + "Computer Security": 0.85, + "Econometrics": 0.711, + "Global Facts": 0.69, + "Jurisprudence": 0.907, + "Philosophy": 0.894, + "Professional Psychology": 0.899, + "Us Foreign Policy": 0.95, + "Astronomy": 0.947, + "Business Ethics": 0.89, + "Clinical Knowledge": 0.894, + "Conceptual Physics": 0.923, + "Electrical Engineering": 0.793, + "Elementary Mathematics": 0.775, + "Formal Logic": 0.675, + "High School World History": 0.941, + "Human Sexuality": 0.901, + "International Law": 0.942, + "Logical Fallacies": 0.902, + "Machine Learning": 0.777, + "Management": 0.913, + "Marketing": 0.94, + "Medical Genetics": 0.98, + "Miscellaneous": 0.958, + "Moral Scenarios": 0.802, + "Nutrition": 0.905, + "Prehistory": 0.935, + "Public Relations": 0.782, + "Security Studies": 0.833, + "Sociology": 0.945, + "Virology": 0.578, + "World Religions": 0.883, + "Mean win rate": 0.52 + } + }, + { + "model_id": "openai/gpt-4o-mini-2024-07-18", + "name": "GPT-4o mini 2024-07-18", + "developer": "openai", + "scores": { + "MMLU All Subjects": 0.767, + "Abstract Algebra": 0.42, + "Anatomy": 0.77, + "College Physics": 0.559, + "Computer Security": 0.85, + "Econometrics": 0.649, + "Global Facts": 0.45, + "Jurisprudence": 0.87, + "Philosophy": 0.772, + "Professional Psychology": 0.833, + "Us Foreign Policy": 0.91, + "Astronomy": 0.849, + "Business Ethics": 0.79, + "Clinical Knowledge": 0.845, + "Conceptual Physics": 0.791, + "Electrical Engineering": 0.731, + "Elementary Mathematics": 0.651, + "Formal Logic": 0.556, + "High School World History": 0.903, + "Human Sexuality": 0.863, + "International Law": 0.926, + "Logical Fallacies": 0.871, + "Machine Learning": 0.616, + "Management": 0.845, + "Marketing": 0.927, + "Medical Genetics": 0.89, + "Miscellaneous": 0.913, + "Moral Scenarios": 0.485, + "Nutrition": 0.827, + "Prehistory": 0.833, + "Public Relations": 0.791, + "Security Studies": 0.788, + "Sociology": 0.9, + "Virology": 0.536, + "World Religions": 0.86, + "Mean win rate": 0.774 + } + }, + { + "model_id": "qwen/qwen1.5-110b-chat", + "name": "Qwen1.5 Chat 110B", + "developer": "qwen", + "scores": { + "MMLU All Subjects": 0.768, + "Abstract Algebra": 0.57, + "Anatomy": 0.696, + "College Physics": 0.51, + "Computer Security": 0.82, + "Econometrics": 0.64, + "Global Facts": 0.51, + "Jurisprudence": 0.833, + "Philosophy": 0.823, + "Professional Psychology": 0.82, + "Us Foreign Policy": 0.87, + "Astronomy": 0.901, + "Business Ethics": 0.8, + "Clinical Knowledge": 0.766, + "Conceptual Physics": 0.838, + "Electrical Engineering": 0.752, + "Elementary Mathematics": 0.669, + "Formal Logic": 0.643, + "High School World History": 0.903, + "Human Sexuality": 0.855, + "International Law": 0.876, + "Logical Fallacies": 0.828, + "Machine Learning": 0.634, + "Management": 0.835, + "Marketing": 0.919, + "Medical Genetics": 0.85, + "Miscellaneous": 0.934, + "Moral Scenarios": 0.783, + "Nutrition": 0.804, + "Prehistory": 0.867, + "Public Relations": 0.773, + "Security Studies": 0.735, + "Sociology": 0.866, + "Virology": 0.542, + "World Religions": 0.871, + "Mean win rate": 0.875 + } + }, + { + "model_id": "qwen/qwen1.5-14b", + "name": "Qwen1.5 14B", + "developer": "qwen", + "scores": { + "MMLU All Subjects": 0.686, + "Abstract Algebra": 0.4, + "Anatomy": 0.637, + "College Physics": 0.48, + "Computer Security": 0.84, + "Econometrics": 0.561, + "Global Facts": 0.49, + "Jurisprudence": 0.769, + "Philosophy": 0.717, + "Professional Psychology": 0.699, + "Us Foreign Policy": 0.87, + "Astronomy": 0.724, + "Business Ethics": 0.75, + "Clinical Knowledge": 0.736, + "Conceptual Physics": 0.694, + "Electrical Engineering": 0.683, + "Elementary Mathematics": 0.603, + "Formal Logic": 0.492, + "High School World History": 0.84, + "Human Sexuality": 0.756, + "International Law": 0.826, + "Logical Fallacies": 0.736, + "Machine Learning": 0.509, + "Management": 0.816, + "Marketing": 0.893, + "Medical Genetics": 0.76, + "Miscellaneous": 0.835, + "Moral Scenarios": 0.368, + "Nutrition": 0.742, + "Prehistory": 0.71, + "Public Relations": 0.655, + "Security Studies": 0.8, + "Sociology": 0.841, + "Virology": 0.458, + "World Religions": 0.842, + "Mean win rate": 0.796 + } + }, + { + "model_id": "qwen/qwen1.5-32b", + "name": "Qwen1.5 32B", + "developer": "qwen", + "scores": { + "MMLU All Subjects": 0.744, + "Abstract Algebra": 0.4, + "Anatomy": 0.644, + "College Physics": 0.51, + "Computer Security": 0.77, + "Econometrics": 0.561, + "Global Facts": 0.47, + "Jurisprudence": 0.843, + "Philosophy": 0.826, + "Professional Psychology": 0.75, + "Us Foreign Policy": 0.91, + "Astronomy": 0.855, + "Business Ethics": 0.77, + "Clinical Knowledge": 0.781, + "Conceptual Physics": 0.766, + "Electrical Engineering": 0.731, + "Elementary Mathematics": 0.685, + "Formal Logic": 0.524, + "High School World History": 0.869, + "Human Sexuality": 0.847, + "International Law": 0.884, + "Logical Fallacies": 0.822, + "Machine Learning": 0.616, + "Management": 0.874, + "Marketing": 0.936, + "Medical Genetics": 0.85, + "Miscellaneous": 0.884, + "Moral Scenarios": 0.545, + "Nutrition": 0.81, + "Prehistory": 0.83, + "Public Relations": 0.664, + "Security Studies": 0.829, + "Sociology": 0.881, + "Virology": 0.578, + "World Religions": 0.854, + "Mean win rate": 0.624 + } + }, + { + "model_id": "qwen/qwen1.5-72b", + "name": "Qwen1.5 72B", + "developer": "qwen", + "scores": { + "MMLU All Subjects": 0.774, + "Abstract Algebra": 0.44, + "Anatomy": 0.733, + "College Physics": 0.559, + "Computer Security": 0.81, + "Econometrics": 0.544, + "Global Facts": 0.56, + "Jurisprudence": 0.824, + "Philosophy": 0.83, + "Professional Psychology": 0.809, + "Us Foreign Policy": 0.94, + "Astronomy": 0.868, + "Business Ethics": 0.79, + "Clinical Knowledge": 0.834, + "Conceptual Physics": 0.821, + "Electrical Engineering": 0.779, + "Elementary Mathematics": 0.696, + "Formal Logic": 0.556, + "High School World History": 0.899, + "Human Sexuality": 0.878, + "International Law": 0.909, + "Logical Fallacies": 0.853, + "Machine Learning": 0.67, + "Management": 0.854, + "Marketing": 0.949, + "Medical Genetics": 0.87, + "Miscellaneous": 0.921, + "Moral Scenarios": 0.669, + "Nutrition": 0.859, + "Prehistory": 0.88, + "Public Relations": 0.755, + "Security Studies": 0.824, + "Sociology": 0.9, + "Virology": 0.584, + "World Religions": 0.883, + "Mean win rate": 0.65 + } + }, + { + "model_id": "qwen/qwen1.5-7b", + "name": "Qwen1.5 7B", + "developer": "qwen", + "scores": { + "MMLU All Subjects": 0.626, + "Abstract Algebra": 0.39, + "Anatomy": 0.526, + "College Physics": 0.471, + "Computer Security": 0.76, + "Econometrics": 0.447, + "Global Facts": 0.4, + "Jurisprudence": 0.778, + "Philosophy": 0.691, + "Professional Psychology": 0.603, + "Us Foreign Policy": 0.84, + "Astronomy": 0.671, + "Business Ethics": 0.69, + "Clinical Knowledge": 0.691, + "Conceptual Physics": 0.579, + "Electrical Engineering": 0.572, + "Elementary Mathematics": 0.5, + "Formal Logic": 0.397, + "High School World History": 0.789, + "Human Sexuality": 0.695, + "International Law": 0.76, + "Logical Fallacies": 0.706, + "Machine Learning": 0.411, + "Management": 0.816, + "Marketing": 0.863, + "Medical Genetics": 0.69, + "Miscellaneous": 0.765, + "Moral Scenarios": 0.372, + "Nutrition": 0.696, + "Prehistory": 0.688, + "Public Relations": 0.627, + "Security Studies": 0.727, + "Sociology": 0.836, + "Virology": 0.488, + "World Religions": 0.778, + "Mean win rate": 0.843 + } + }, + { + "model_id": "qwen/qwen2-72b-instruct", + "name": "Qwen2 Instruct 72B", + "developer": "qwen", + "scores": { + "MMLU All Subjects": 0.824, + "Abstract Algebra": 0.67, + "Anatomy": 0.793, + "College Physics": 0.598, + "Computer Security": 0.85, + "Econometrics": 0.737, + "Global Facts": 0.58, + "Jurisprudence": 0.87, + "Philosophy": 0.859, + "Professional Psychology": 0.886, + "Us Foreign Policy": 0.94, + "Astronomy": 0.934, + "Business Ethics": 0.82, + "Clinical Knowledge": 0.868, + "Conceptual Physics": 0.872, + "Electrical Engineering": 0.793, + "Elementary Mathematics": 0.825, + "Formal Logic": 0.667, + "High School World History": 0.932, + "Human Sexuality": 0.893, + "International Law": 0.893, + "Logical Fallacies": 0.914, + "Machine Learning": 0.768, + "Management": 0.903, + "Marketing": 0.953, + "Medical Genetics": 0.9, + "Miscellaneous": 0.943, + "Moral Scenarios": 0.815, + "Nutrition": 0.902, + "Prehistory": 0.914, + "Public Relations": 0.745, + "Security Studies": 0.837, + "Sociology": 0.935, + "Virology": 0.56, + "World Religions": 0.848, + "Mean win rate": 0.826 + } + }, + { + "model_id": "qwen/qwen2.5-72b-instruct-turbo", + "name": "Qwen2.5 Instruct Turbo 72B", + "developer": "qwen", + "scores": { + "MMLU All Subjects": 0.834, + "Abstract Algebra": 0.68, + "Anatomy": 0.822, + "College Physics": 0.588, + "Computer Security": 0.86, + "Econometrics": 0.728, + "Global Facts": 0.61, + "Jurisprudence": 0.87, + "Philosophy": 0.839, + "Professional Psychology": 0.864, + "Us Foreign Policy": 0.96, + "Astronomy": 0.934, + "Business Ethics": 0.85, + "Clinical Knowledge": 0.872, + "Conceptual Physics": 0.885, + "Electrical Engineering": 0.8, + "Elementary Mathematics": 0.87, + "Formal Logic": 0.73, + "High School World History": 0.92, + "Human Sexuality": 0.878, + "International Law": 0.893, + "Logical Fallacies": 0.89, + "Machine Learning": 0.777, + "Management": 0.913, + "Marketing": 0.953, + "Medical Genetics": 0.92, + "Miscellaneous": 0.932, + "Moral Scenarios": 0.787, + "Nutrition": 0.886, + "Prehistory": 0.91, + "Public Relations": 0.782, + "Security Studies": 0.849, + "Sociology": 0.925, + "Virology": 0.584, + "World Religions": 0.901, + "Mean win rate": 0.548 + } + }, + { + "model_id": "qwen/qwen2.5-7b-instruct-turbo", + "name": "Qwen2.5 Instruct Turbo 7B", + "developer": "qwen", + "scores": { + "MMLU All Subjects": 0.729, + "Abstract Algebra": 0.49, + "Anatomy": 0.689, + "College Physics": 0.51, + "Computer Security": 0.79, + "Econometrics": 0.64, + "Global Facts": 0.42, + "Jurisprudence": 0.796, + "Philosophy": 0.746, + "Professional Psychology": 0.757, + "Us Foreign Policy": 0.86, + "Astronomy": 0.836, + "Business Ethics": 0.82, + "Clinical Knowledge": 0.785, + "Conceptual Physics": 0.736, + "Electrical Engineering": 0.717, + "Elementary Mathematics": 0.643, + "Formal Logic": 0.587, + "High School World History": 0.878, + "Human Sexuality": 0.794, + "International Law": 0.86, + "Logical Fallacies": 0.773, + "Machine Learning": 0.554, + "Management": 0.845, + "Marketing": 0.919, + "Medical Genetics": 0.85, + "Miscellaneous": 0.852, + "Moral Scenarios": 0.511, + "Nutrition": 0.778, + "Prehistory": 0.836, + "Public Relations": 0.709, + "Security Studies": 0.682, + "Sociology": 0.861, + "Virology": 0.578, + "World Religions": 0.83, + "Mean win rate": 0.887 + } + }, + { + "model_id": "snowflake/snowflake-arctic-instruct", + "name": "Arctic Instruct", + "developer": "snowflake", + "scores": { + "MMLU All Subjects": 0.677, + "Abstract Algebra": 0.35, + "Anatomy": 0.652, + "College Physics": 0.461, + "Computer Security": 0.84, + "Econometrics": 0.5, + "Global Facts": 0.39, + "Jurisprudence": 0.741, + "Philosophy": 0.752, + "Professional Psychology": 0.724, + "Us Foreign Policy": 0.88, + "Astronomy": 0.763, + "Business Ethics": 0.69, + "Clinical Knowledge": 0.781, + "Conceptual Physics": 0.634, + "Electrical Engineering": 0.662, + "Elementary Mathematics": 0.481, + "Formal Logic": 0.444, + "High School World History": 0.827, + "Human Sexuality": 0.847, + "International Law": 0.826, + "Logical Fallacies": 0.779, + "Machine Learning": 0.473, + "Management": 0.796, + "Marketing": 0.902, + "Medical Genetics": 0.76, + "Miscellaneous": 0.875, + "Moral Scenarios": 0.28, + "Nutrition": 0.725, + "Prehistory": 0.79, + "Public Relations": 0.664, + "Security Studies": 0.78, + "Sociology": 0.891, + "Virology": 0.536, + "World Religions": 0.854, + "Mean win rate": 0.565 + } + }, + { + "model_id": "upstage/solar-pro-241126", + "name": "Solar Pro", + "developer": "upstage", + "scores": { + "MMLU All Subjects": 0.776, + "Abstract Algebra": 0.46, + "Anatomy": 0.719, + "College Physics": 0.559, + "Computer Security": 0.82, + "Econometrics": 0.605, + "Global Facts": 0.5, + "Jurisprudence": 0.898, + "Philosophy": 0.817, + "Professional Psychology": 0.85, + "Us Foreign Policy": 0.97, + "Astronomy": 0.868, + "Business Ethics": 0.8, + "Clinical Knowledge": 0.808, + "Conceptual Physics": 0.826, + "Electrical Engineering": 0.697, + "Elementary Mathematics": 0.611, + "Formal Logic": 0.579, + "High School World History": 0.907, + "Human Sexuality": 0.847, + "International Law": 0.901, + "Logical Fallacies": 0.865, + "Machine Learning": 0.616, + "Management": 0.864, + "Marketing": 0.953, + "Medical Genetics": 0.91, + "Miscellaneous": 0.888, + "Moral Scenarios": 0.811, + "Nutrition": 0.859, + "Prehistory": 0.867, + "Public Relations": 0.764, + "Security Studies": 0.82, + "Sociology": 0.886, + "Virology": 0.572, + "World Religions": 0.883, + "Mean win rate": 0.462 + } + }, + { + "model_id": "writer/palmyra-x-004", + "name": "Palmyra-X-004", + "developer": "writer", + "scores": { + "MMLU All Subjects": 0.813, + "Abstract Algebra": 0.75, + "Anatomy": 0.822, + "College Physics": 0.647, + "Computer Security": 0.82, + "Econometrics": 0.684, + "Global Facts": 0.62, + "Jurisprudence": 0.843, + "Philosophy": 0.83, + "Professional Psychology": 0.845, + "Us Foreign Policy": 0.92, + "Astronomy": 0.928, + "Business Ethics": 0.76, + "Clinical Knowledge": 0.879, + "Conceptual Physics": 0.885, + "Electrical Engineering": 0.793, + "Elementary Mathematics": 0.841, + "Formal Logic": 0.579, + "High School World History": 0.911, + "Human Sexuality": 0.924, + "International Law": 0.901, + "Logical Fallacies": 0.877, + "Machine Learning": 0.679, + "Management": 0.903, + "Marketing": 0.932, + "Medical Genetics": 0.87, + "Miscellaneous": 0.934, + "Moral Scenarios": 0.825, + "Nutrition": 0.869, + "Prehistory": 0.917, + "Public Relations": 0.791, + "Security Studies": 0.849, + "Sociology": 0.915, + "Virology": 0.584, + "World Religions": 0.842, + "Mean win rate": 0.629 + } + }, + { + "model_id": "writer/palmyra-x-v3", + "name": "Palmyra X V3 72B", + "developer": "writer", + "scores": { + "MMLU All Subjects": 0.786, + "Abstract Algebra": 0.53, + "Anatomy": 0.733, + "College Physics": 0.549, + "Computer Security": 0.78, + "Econometrics": 0.649, + "Global Facts": 0.53, + "Jurisprudence": 0.88, + "Philosophy": 0.836, + "Professional Psychology": 0.858, + "Us Foreign Policy": 0.96, + "Astronomy": 0.862, + "Business Ethics": 0.83, + "Clinical Knowledge": 0.804, + "Conceptual Physics": 0.809, + "Electrical Engineering": 0.772, + "Elementary Mathematics": 0.661, + "Formal Logic": 0.659, + "High School World History": 0.911, + "Human Sexuality": 0.924, + "International Law": 0.909, + "Logical Fallacies": 0.877, + "Machine Learning": 0.625, + "Management": 0.903, + "Marketing": 0.94, + "Medical Genetics": 0.83, + "Miscellaneous": 0.894, + "Moral Scenarios": 0.562, + "Nutrition": 0.856, + "Prehistory": 0.87, + "Public Relations": 0.773, + "Security Studies": 0.833, + "Sociology": 0.91, + "Virology": 0.572, + "World Religions": 0.877, + "Mean win rate": 0.325 + } + } + ] +} \ No newline at end of file