{ "developer": "ai21", "models": [ { "id": "ai21/J1-Grande-v1-17B", "name": "J1-Grande v1 17B", "developer": "ai21", "evaluator_relationship": null, "benchmark_scores": { "helm_classic/Mean win rate": 0.433, "helm_classic/MMLU": 0.27, "helm_classic/BoolQ": 0.722, "helm_classic/NarrativeQA": 0.672, "helm_classic/NaturalQuestions (open-book)": 0.578, "helm_classic/QuAC": 0.362, "helm_classic/HellaSwag": 0.739, "helm_classic/OpenbookQA": 0.52, "helm_classic/TruthfulQA": 0.193, "helm_classic/MS MARCO (TREC)": 0.341, "helm_classic/CNN/DailyMail": 0.143, "helm_classic/XSUM": 0.122, "helm_classic/IMDB": 0.953, "helm_classic/CivilComments": 0.529, "helm_classic/RAFT": 0.658 } }, { "id": "ai21/J1-Grande-v2-beta-17B", "name": "J1-Grande v2 beta 17B", "developer": "ai21", "evaluator_relationship": null, "benchmark_scores": { "helm_classic/Mean win rate": 0.706, "helm_classic/MMLU": 0.445, "helm_classic/BoolQ": 0.812, "helm_classic/NarrativeQA": 0.725, "helm_classic/NaturalQuestions (open-book)": 0.625, "helm_classic/QuAC": 0.392, "helm_classic/HellaSwag": 0.764, "helm_classic/OpenbookQA": 0.56, "helm_classic/TruthfulQA": 0.306, "helm_classic/MS MARCO (TREC)": 0.46, "helm_classic/CNN/DailyMail": 0.146, "helm_classic/XSUM": 0.152, "helm_classic/IMDB": 0.957, "helm_classic/CivilComments": 0.546, "helm_classic/RAFT": 0.679 } }, { "id": "ai21/J1-Jumbo-v1-178B", "name": "J1-Jumbo v1 178B", "developer": "ai21", "evaluator_relationship": null, "benchmark_scores": { "helm_classic/Mean win rate": 0.517, "helm_classic/MMLU": 0.259, "helm_classic/BoolQ": 0.776, "helm_classic/NarrativeQA": 0.695, "helm_classic/NaturalQuestions (open-book)": 0.595, "helm_classic/QuAC": 0.358, "helm_classic/HellaSwag": 0.765, "helm_classic/OpenbookQA": 0.534, "helm_classic/TruthfulQA": 0.175, "helm_classic/MS MARCO (TREC)": 0.363, "helm_classic/CNN/DailyMail": 0.144, "helm_classic/XSUM": 0.129, "helm_classic/IMDB": 0.943, "helm_classic/CivilComments": 0.553, "helm_classic/RAFT": 0.681 } }, { "id": "ai21/J1-Large-v1-7.5B", "name": "J1-Large v1 7.5B", "developer": "ai21", "evaluator_relationship": null, "benchmark_scores": { "helm_classic/Mean win rate": 0.285, "helm_classic/MMLU": 0.241, "helm_classic/BoolQ": 0.683, "helm_classic/NarrativeQA": 0.623, "helm_classic/NaturalQuestions (open-book)": 0.532, "helm_classic/QuAC": 0.328, "helm_classic/HellaSwag": 0.7, "helm_classic/OpenbookQA": 0.514, "helm_classic/TruthfulQA": 0.197, "helm_classic/MS MARCO (TREC)": 0.292, "helm_classic/CNN/DailyMail": 0.134, "helm_classic/XSUM": 0.102, "helm_classic/IMDB": 0.956, "helm_classic/CivilComments": 0.532, "helm_classic/RAFT": 0.545 } }, { "id": "ai21/j2-grande", "name": "Jurassic-2 Grande 17B", "developer": "ai21", "evaluator_relationship": null, "benchmark_scores": { "helm_lite/Mean win rate": 0.172, "helm_lite/NarrativeQA": 0.744, "helm_lite/NaturalQuestions (closed-book)": 0.35, "helm_lite/OpenbookQA": 0.614, "helm_lite/MMLU": 0.471, "helm_lite/MATH": 0.064, "helm_lite/GSM8K": 0.159, "helm_lite/LegalBench": 0.468, "helm_lite/MedQA": 0.39, "helm_lite/WMT 2014": 0.102 } }, { "id": "ai21/j2-jumbo", "name": "Jurassic-2 Jumbo 178B", "developer": "ai21", "evaluator_relationship": null, "benchmark_scores": { "helm_lite/Mean win rate": 0.215, "helm_lite/NarrativeQA": 0.728, "helm_lite/NaturalQuestions (closed-book)": 0.385, "helm_lite/OpenbookQA": 0.688, "helm_lite/MMLU": 0.483, "helm_lite/MATH": 0.103, "helm_lite/GSM8K": 0.239, "helm_lite/LegalBench": 0.533, "helm_lite/MedQA": 0.431, "helm_lite/WMT 2014": 0.114 } }, { "id": "ai21/jamba-1.5-large", "name": "Jamba 1.5 Large", "developer": "ai21", "evaluator_relationship": null, "benchmark_scores": { "helm_lite/Mean win rate": 0.637, "helm_lite/NarrativeQA": 0.664, "helm_lite/NaturalQuestions (closed-book)": 0.394, "helm_lite/OpenbookQA": 0.948, "helm_lite/MMLU": 0.683, "helm_lite/MATH": 0.692, "helm_lite/GSM8K": 0.846, "helm_lite/LegalBench": 0.675, "helm_lite/MedQA": 0.698, "helm_lite/WMT 2014": 0.203, "helm_mmlu/MMLU All Subjects": 0.782, "helm_mmlu/Abstract Algebra": 0.53, "helm_mmlu/Anatomy": 0.793, "helm_mmlu/College Physics": 0.51, "helm_mmlu/Computer Security": 0.8, "helm_mmlu/Econometrics": 0.614, "helm_mmlu/Global Facts": 0.54, "helm_mmlu/Jurisprudence": 0.87, "helm_mmlu/Philosophy": 0.849, "helm_mmlu/Professional Psychology": 0.842, "helm_mmlu/Us Foreign Policy": 0.92, "helm_mmlu/Astronomy": 0.882, "helm_mmlu/Business Ethics": 0.77, "helm_mmlu/Clinical Knowledge": 0.849, "helm_mmlu/Conceptual Physics": 0.779, "helm_mmlu/Electrical Engineering": 0.793, "helm_mmlu/Elementary Mathematics": 0.656, "helm_mmlu/Formal Logic": 0.619, "helm_mmlu/High School World History": 0.911, "helm_mmlu/Human Sexuality": 0.832, "helm_mmlu/International Law": 0.884, "helm_mmlu/Logical Fallacies": 0.859, "helm_mmlu/Machine Learning": 0.688, "helm_mmlu/Management": 0.864, "helm_mmlu/Marketing": 0.94, "helm_mmlu/Medical Genetics": 0.89, "helm_mmlu/Miscellaneous": 0.931, "helm_mmlu/Moral Scenarios": 0.686, "helm_mmlu/Nutrition": 0.869, "helm_mmlu/Prehistory": 0.892, "helm_mmlu/Public Relations": 0.755, "helm_mmlu/Security Studies": 0.771, "helm_mmlu/Sociology": 0.93, "helm_mmlu/Virology": 0.554, "helm_mmlu/World Religions": 0.865, "helm_mmlu/Mean win rate": 0.147 } }, { "id": "ai21/jamba-1.5-mini", "name": "Jamba 1.5 Mini", "developer": "ai21", "evaluator_relationship": null, "benchmark_scores": { "helm_lite/Mean win rate": 0.414, "helm_lite/NarrativeQA": 0.746, "helm_lite/NaturalQuestions (closed-book)": 0.388, "helm_lite/OpenbookQA": 0.89, "helm_lite/MMLU": 0.582, "helm_lite/MATH": 0.318, "helm_lite/GSM8K": 0.691, "helm_lite/LegalBench": 0.503, "helm_lite/MedQA": 0.632, "helm_lite/WMT 2014": 0.179, "helm_mmlu/MMLU All Subjects": 0.699, "helm_mmlu/Abstract Algebra": 0.33, "helm_mmlu/Anatomy": 0.711, "helm_mmlu/College Physics": 0.48, "helm_mmlu/Computer Security": 0.73, "helm_mmlu/Econometrics": 0.491, "helm_mmlu/Global Facts": 0.43, "helm_mmlu/Jurisprudence": 0.88, "helm_mmlu/Philosophy": 0.752, "helm_mmlu/Professional Psychology": 0.76, "helm_mmlu/Us Foreign Policy": 0.9, "helm_mmlu/Astronomy": 0.822, "helm_mmlu/Business Ethics": 0.76, "helm_mmlu/Clinical Knowledge": 0.74, "helm_mmlu/Conceptual Physics": 0.677, "helm_mmlu/Electrical Engineering": 0.683, "helm_mmlu/Elementary Mathematics": 0.553, "helm_mmlu/Formal Logic": 0.452, "helm_mmlu/High School World History": 0.84, "helm_mmlu/Human Sexuality": 0.809, "helm_mmlu/International Law": 0.893, "helm_mmlu/Logical Fallacies": 0.81, "helm_mmlu/Machine Learning": 0.509, "helm_mmlu/Management": 0.825, "helm_mmlu/Marketing": 0.915, "helm_mmlu/Medical Genetics": 0.69, "helm_mmlu/Miscellaneous": 0.902, "helm_mmlu/Moral Scenarios": 0.269, "helm_mmlu/Nutrition": 0.801, "helm_mmlu/Prehistory": 0.824, "helm_mmlu/Public Relations": 0.727, "helm_mmlu/Security Studies": 0.755, "helm_mmlu/Sociology": 0.876, "helm_mmlu/Virology": 0.578, "helm_mmlu/World Religions": 0.842, "helm_mmlu/Mean win rate": 0.206 } }, { "id": "ai21/jamba-instruct", "name": "Jamba Instruct", "developer": "ai21", "evaluator_relationship": null, "benchmark_scores": { "helm_lite/Mean win rate": 0.287, "helm_lite/NarrativeQA": 0.658, "helm_lite/NaturalQuestions (closed-book)": 0.384, "helm_lite/OpenbookQA": 0.796, "helm_lite/MMLU": 0.582, "helm_lite/MATH": 0.38, "helm_lite/GSM8K": 0.67, "helm_lite/LegalBench": 0.54, "helm_lite/MedQA": 0.519, "helm_lite/WMT 2014": 0.164, "helm_mmlu/MMLU All Subjects": 0.659, "helm_mmlu/Abstract Algebra": 0.36, "helm_mmlu/Anatomy": 0.615, "helm_mmlu/College Physics": 0.422, "helm_mmlu/Computer Security": 0.76, "helm_mmlu/Econometrics": 0.439, "helm_mmlu/Global Facts": 0.4, "helm_mmlu/Jurisprudence": 0.796, "helm_mmlu/Philosophy": 0.749, "helm_mmlu/Professional Psychology": 0.716, "helm_mmlu/Us Foreign Policy": 0.91, "helm_mmlu/Astronomy": 0.73, "helm_mmlu/Business Ethics": 0.6, "helm_mmlu/Clinical Knowledge": 0.702, "helm_mmlu/Conceptual Physics": 0.677, "helm_mmlu/Electrical Engineering": 0.621, "helm_mmlu/Elementary Mathematics": 0.497, "helm_mmlu/Formal Logic": 0.444, "helm_mmlu/High School World History": 0.797, "helm_mmlu/Human Sexuality": 0.794, "helm_mmlu/International Law": 0.835, "helm_mmlu/Logical Fallacies": 0.706, "helm_mmlu/Machine Learning": 0.536, "helm_mmlu/Management": 0.786, "helm_mmlu/Marketing": 0.885, "helm_mmlu/Medical Genetics": 0.67, "helm_mmlu/Miscellaneous": 0.865, "helm_mmlu/Moral Scenarios": 0.465, "helm_mmlu/Nutrition": 0.745, "helm_mmlu/Prehistory": 0.796, "helm_mmlu/Public Relations": 0.682, "helm_mmlu/Security Studies": 0.743, "helm_mmlu/Sociology": 0.891, "helm_mmlu/Virology": 0.53, "helm_mmlu/World Religions": 0.813, "helm_mmlu/Mean win rate": 0.887 } }, { "id": "ai21/Jurassic-2-Grande-17B", "name": "Jurassic-2 Grande 17B", "developer": "ai21", "evaluator_relationship": null, "benchmark_scores": { "helm_classic/Mean win rate": 0.743, "helm_classic/MMLU": 0.475, "helm_classic/BoolQ": 0.826, "helm_classic/NarrativeQA": 0.737, "helm_classic/NaturalQuestions (open-book)": 0.639, "helm_classic/QuAC": 0.418, "helm_classic/HellaSwag": 0.781, "helm_classic/OpenbookQA": 0.542, "helm_classic/TruthfulQA": 0.348, "helm_classic/MS MARCO (TREC)": 0.514, "helm_classic/CNN/DailyMail": 0.144, "helm_classic/XSUM": 0.167, "helm_classic/IMDB": 0.938, "helm_classic/CivilComments": 0.547, "helm_classic/RAFT": 0.712 } }, { "id": "ai21/Jurassic-2-Jumbo-178B", "name": "Jurassic-2 Jumbo 178B", "developer": "ai21", "evaluator_relationship": null, "benchmark_scores": { "helm_classic/Mean win rate": 0.824, "helm_classic/MMLU": 0.48, "helm_classic/BoolQ": 0.829, "helm_classic/NarrativeQA": 0.733, "helm_classic/NaturalQuestions (open-book)": 0.669, "helm_classic/QuAC": 0.435, "helm_classic/HellaSwag": 0.788, "helm_classic/OpenbookQA": 0.558, "helm_classic/TruthfulQA": 0.437, "helm_classic/MS MARCO (TREC)": 0.661, "helm_classic/CNN/DailyMail": 0.149, "helm_classic/XSUM": 0.182, "helm_classic/IMDB": 0.938, "helm_classic/CivilComments": 0.57, "helm_classic/RAFT": 0.746 } }, { "id": "ai21/Jurassic-2-Large-7.5B", "name": "Jurassic-2 Large 7.5B", "developer": "ai21", "evaluator_relationship": null, "benchmark_scores": { "helm_classic/Mean win rate": 0.553, "helm_classic/MMLU": 0.339, "helm_classic/BoolQ": 0.742, "helm_classic/NarrativeQA": -1.0, "helm_classic/NaturalQuestions (open-book)": 0.589, "helm_classic/QuAC": -1.0, "helm_classic/HellaSwag": 0.729, "helm_classic/OpenbookQA": 0.53, "helm_classic/TruthfulQA": 0.245, "helm_classic/MS MARCO (TREC)": 0.464, "helm_classic/CNN/DailyMail": 0.136, "helm_classic/XSUM": 0.142, "helm_classic/IMDB": 0.956, "helm_classic/CivilComments": 0.57, "helm_classic/RAFT": 0.622 } } ] }