GitHub Actions
chore: sync EEE pipeline output [2026-03-28 11:49 UTC]
d91b463
raw
history blame
13.3 kB
{
"developer": "ai21",
"models": [
{
"id": "ai21/J1-Grande-v1-17B",
"name": "J1-Grande v1 17B",
"developer": "ai21",
"evaluator_relationship": null,
"benchmark_scores": {
"helm_classic/Mean win rate": 0.433,
"helm_classic/MMLU": 0.27,
"helm_classic/BoolQ": 0.722,
"helm_classic/NarrativeQA": 0.672,
"helm_classic/NaturalQuestions (open-book)": 0.578,
"helm_classic/QuAC": 0.362,
"helm_classic/HellaSwag": 0.739,
"helm_classic/OpenbookQA": 0.52,
"helm_classic/TruthfulQA": 0.193,
"helm_classic/MS MARCO (TREC)": 0.341,
"helm_classic/CNN/DailyMail": 0.143,
"helm_classic/XSUM": 0.122,
"helm_classic/IMDB": 0.953,
"helm_classic/CivilComments": 0.529,
"helm_classic/RAFT": 0.658
}
},
{
"id": "ai21/J1-Grande-v2-beta-17B",
"name": "J1-Grande v2 beta 17B",
"developer": "ai21",
"evaluator_relationship": null,
"benchmark_scores": {
"helm_classic/Mean win rate": 0.706,
"helm_classic/MMLU": 0.445,
"helm_classic/BoolQ": 0.812,
"helm_classic/NarrativeQA": 0.725,
"helm_classic/NaturalQuestions (open-book)": 0.625,
"helm_classic/QuAC": 0.392,
"helm_classic/HellaSwag": 0.764,
"helm_classic/OpenbookQA": 0.56,
"helm_classic/TruthfulQA": 0.306,
"helm_classic/MS MARCO (TREC)": 0.46,
"helm_classic/CNN/DailyMail": 0.146,
"helm_classic/XSUM": 0.152,
"helm_classic/IMDB": 0.957,
"helm_classic/CivilComments": 0.546,
"helm_classic/RAFT": 0.679
}
},
{
"id": "ai21/J1-Jumbo-v1-178B",
"name": "J1-Jumbo v1 178B",
"developer": "ai21",
"evaluator_relationship": null,
"benchmark_scores": {
"helm_classic/Mean win rate": 0.517,
"helm_classic/MMLU": 0.259,
"helm_classic/BoolQ": 0.776,
"helm_classic/NarrativeQA": 0.695,
"helm_classic/NaturalQuestions (open-book)": 0.595,
"helm_classic/QuAC": 0.358,
"helm_classic/HellaSwag": 0.765,
"helm_classic/OpenbookQA": 0.534,
"helm_classic/TruthfulQA": 0.175,
"helm_classic/MS MARCO (TREC)": 0.363,
"helm_classic/CNN/DailyMail": 0.144,
"helm_classic/XSUM": 0.129,
"helm_classic/IMDB": 0.943,
"helm_classic/CivilComments": 0.553,
"helm_classic/RAFT": 0.681
}
},
{
"id": "ai21/J1-Large-v1-7.5B",
"name": "J1-Large v1 7.5B",
"developer": "ai21",
"evaluator_relationship": null,
"benchmark_scores": {
"helm_classic/Mean win rate": 0.285,
"helm_classic/MMLU": 0.241,
"helm_classic/BoolQ": 0.683,
"helm_classic/NarrativeQA": 0.623,
"helm_classic/NaturalQuestions (open-book)": 0.532,
"helm_classic/QuAC": 0.328,
"helm_classic/HellaSwag": 0.7,
"helm_classic/OpenbookQA": 0.514,
"helm_classic/TruthfulQA": 0.197,
"helm_classic/MS MARCO (TREC)": 0.292,
"helm_classic/CNN/DailyMail": 0.134,
"helm_classic/XSUM": 0.102,
"helm_classic/IMDB": 0.956,
"helm_classic/CivilComments": 0.532,
"helm_classic/RAFT": 0.545
}
},
{
"id": "ai21/j2-grande",
"name": "Jurassic-2 Grande 17B",
"developer": "ai21",
"evaluator_relationship": null,
"benchmark_scores": {
"helm_lite/Mean win rate": 0.172,
"helm_lite/NarrativeQA": 0.744,
"helm_lite/NaturalQuestions (closed-book)": 0.35,
"helm_lite/OpenbookQA": 0.614,
"helm_lite/MMLU": 0.471,
"helm_lite/MATH": 0.064,
"helm_lite/GSM8K": 0.159,
"helm_lite/LegalBench": 0.468,
"helm_lite/MedQA": 0.39,
"helm_lite/WMT 2014": 0.102
}
},
{
"id": "ai21/j2-jumbo",
"name": "Jurassic-2 Jumbo 178B",
"developer": "ai21",
"evaluator_relationship": null,
"benchmark_scores": {
"helm_lite/Mean win rate": 0.215,
"helm_lite/NarrativeQA": 0.728,
"helm_lite/NaturalQuestions (closed-book)": 0.385,
"helm_lite/OpenbookQA": 0.688,
"helm_lite/MMLU": 0.483,
"helm_lite/MATH": 0.103,
"helm_lite/GSM8K": 0.239,
"helm_lite/LegalBench": 0.533,
"helm_lite/MedQA": 0.431,
"helm_lite/WMT 2014": 0.114
}
},
{
"id": "ai21/jamba-1.5-large",
"name": "Jamba 1.5 Large",
"developer": "ai21",
"evaluator_relationship": null,
"benchmark_scores": {
"helm_lite/Mean win rate": 0.637,
"helm_lite/NarrativeQA": 0.664,
"helm_lite/NaturalQuestions (closed-book)": 0.394,
"helm_lite/OpenbookQA": 0.948,
"helm_lite/MMLU": 0.683,
"helm_lite/MATH": 0.692,
"helm_lite/GSM8K": 0.846,
"helm_lite/LegalBench": 0.675,
"helm_lite/MedQA": 0.698,
"helm_lite/WMT 2014": 0.203,
"helm_mmlu/MMLU All Subjects": 0.782,
"helm_mmlu/Abstract Algebra": 0.53,
"helm_mmlu/Anatomy": 0.793,
"helm_mmlu/College Physics": 0.51,
"helm_mmlu/Computer Security": 0.8,
"helm_mmlu/Econometrics": 0.614,
"helm_mmlu/Global Facts": 0.54,
"helm_mmlu/Jurisprudence": 0.87,
"helm_mmlu/Philosophy": 0.849,
"helm_mmlu/Professional Psychology": 0.842,
"helm_mmlu/Us Foreign Policy": 0.92,
"helm_mmlu/Astronomy": 0.882,
"helm_mmlu/Business Ethics": 0.77,
"helm_mmlu/Clinical Knowledge": 0.849,
"helm_mmlu/Conceptual Physics": 0.779,
"helm_mmlu/Electrical Engineering": 0.793,
"helm_mmlu/Elementary Mathematics": 0.656,
"helm_mmlu/Formal Logic": 0.619,
"helm_mmlu/High School World History": 0.911,
"helm_mmlu/Human Sexuality": 0.832,
"helm_mmlu/International Law": 0.884,
"helm_mmlu/Logical Fallacies": 0.859,
"helm_mmlu/Machine Learning": 0.688,
"helm_mmlu/Management": 0.864,
"helm_mmlu/Marketing": 0.94,
"helm_mmlu/Medical Genetics": 0.89,
"helm_mmlu/Miscellaneous": 0.931,
"helm_mmlu/Moral Scenarios": 0.686,
"helm_mmlu/Nutrition": 0.869,
"helm_mmlu/Prehistory": 0.892,
"helm_mmlu/Public Relations": 0.755,
"helm_mmlu/Security Studies": 0.771,
"helm_mmlu/Sociology": 0.93,
"helm_mmlu/Virology": 0.554,
"helm_mmlu/World Religions": 0.865,
"helm_mmlu/Mean win rate": 0.147
}
},
{
"id": "ai21/jamba-1.5-mini",
"name": "Jamba 1.5 Mini",
"developer": "ai21",
"evaluator_relationship": null,
"benchmark_scores": {
"helm_lite/Mean win rate": 0.414,
"helm_lite/NarrativeQA": 0.746,
"helm_lite/NaturalQuestions (closed-book)": 0.388,
"helm_lite/OpenbookQA": 0.89,
"helm_lite/MMLU": 0.582,
"helm_lite/MATH": 0.318,
"helm_lite/GSM8K": 0.691,
"helm_lite/LegalBench": 0.503,
"helm_lite/MedQA": 0.632,
"helm_lite/WMT 2014": 0.179,
"helm_mmlu/MMLU All Subjects": 0.699,
"helm_mmlu/Abstract Algebra": 0.33,
"helm_mmlu/Anatomy": 0.711,
"helm_mmlu/College Physics": 0.48,
"helm_mmlu/Computer Security": 0.73,
"helm_mmlu/Econometrics": 0.491,
"helm_mmlu/Global Facts": 0.43,
"helm_mmlu/Jurisprudence": 0.88,
"helm_mmlu/Philosophy": 0.752,
"helm_mmlu/Professional Psychology": 0.76,
"helm_mmlu/Us Foreign Policy": 0.9,
"helm_mmlu/Astronomy": 0.822,
"helm_mmlu/Business Ethics": 0.76,
"helm_mmlu/Clinical Knowledge": 0.74,
"helm_mmlu/Conceptual Physics": 0.677,
"helm_mmlu/Electrical Engineering": 0.683,
"helm_mmlu/Elementary Mathematics": 0.553,
"helm_mmlu/Formal Logic": 0.452,
"helm_mmlu/High School World History": 0.84,
"helm_mmlu/Human Sexuality": 0.809,
"helm_mmlu/International Law": 0.893,
"helm_mmlu/Logical Fallacies": 0.81,
"helm_mmlu/Machine Learning": 0.509,
"helm_mmlu/Management": 0.825,
"helm_mmlu/Marketing": 0.915,
"helm_mmlu/Medical Genetics": 0.69,
"helm_mmlu/Miscellaneous": 0.902,
"helm_mmlu/Moral Scenarios": 0.269,
"helm_mmlu/Nutrition": 0.801,
"helm_mmlu/Prehistory": 0.824,
"helm_mmlu/Public Relations": 0.727,
"helm_mmlu/Security Studies": 0.755,
"helm_mmlu/Sociology": 0.876,
"helm_mmlu/Virology": 0.578,
"helm_mmlu/World Religions": 0.842,
"helm_mmlu/Mean win rate": 0.206
}
},
{
"id": "ai21/jamba-instruct",
"name": "Jamba Instruct",
"developer": "ai21",
"evaluator_relationship": null,
"benchmark_scores": {
"helm_lite/Mean win rate": 0.287,
"helm_lite/NarrativeQA": 0.658,
"helm_lite/NaturalQuestions (closed-book)": 0.384,
"helm_lite/OpenbookQA": 0.796,
"helm_lite/MMLU": 0.582,
"helm_lite/MATH": 0.38,
"helm_lite/GSM8K": 0.67,
"helm_lite/LegalBench": 0.54,
"helm_lite/MedQA": 0.519,
"helm_lite/WMT 2014": 0.164,
"helm_mmlu/MMLU All Subjects": 0.659,
"helm_mmlu/Abstract Algebra": 0.36,
"helm_mmlu/Anatomy": 0.615,
"helm_mmlu/College Physics": 0.422,
"helm_mmlu/Computer Security": 0.76,
"helm_mmlu/Econometrics": 0.439,
"helm_mmlu/Global Facts": 0.4,
"helm_mmlu/Jurisprudence": 0.796,
"helm_mmlu/Philosophy": 0.749,
"helm_mmlu/Professional Psychology": 0.716,
"helm_mmlu/Us Foreign Policy": 0.91,
"helm_mmlu/Astronomy": 0.73,
"helm_mmlu/Business Ethics": 0.6,
"helm_mmlu/Clinical Knowledge": 0.702,
"helm_mmlu/Conceptual Physics": 0.677,
"helm_mmlu/Electrical Engineering": 0.621,
"helm_mmlu/Elementary Mathematics": 0.497,
"helm_mmlu/Formal Logic": 0.444,
"helm_mmlu/High School World History": 0.797,
"helm_mmlu/Human Sexuality": 0.794,
"helm_mmlu/International Law": 0.835,
"helm_mmlu/Logical Fallacies": 0.706,
"helm_mmlu/Machine Learning": 0.536,
"helm_mmlu/Management": 0.786,
"helm_mmlu/Marketing": 0.885,
"helm_mmlu/Medical Genetics": 0.67,
"helm_mmlu/Miscellaneous": 0.865,
"helm_mmlu/Moral Scenarios": 0.465,
"helm_mmlu/Nutrition": 0.745,
"helm_mmlu/Prehistory": 0.796,
"helm_mmlu/Public Relations": 0.682,
"helm_mmlu/Security Studies": 0.743,
"helm_mmlu/Sociology": 0.891,
"helm_mmlu/Virology": 0.53,
"helm_mmlu/World Religions": 0.813,
"helm_mmlu/Mean win rate": 0.887
}
},
{
"id": "ai21/Jurassic-2-Grande-17B",
"name": "Jurassic-2 Grande 17B",
"developer": "ai21",
"evaluator_relationship": null,
"benchmark_scores": {
"helm_classic/Mean win rate": 0.743,
"helm_classic/MMLU": 0.475,
"helm_classic/BoolQ": 0.826,
"helm_classic/NarrativeQA": 0.737,
"helm_classic/NaturalQuestions (open-book)": 0.639,
"helm_classic/QuAC": 0.418,
"helm_classic/HellaSwag": 0.781,
"helm_classic/OpenbookQA": 0.542,
"helm_classic/TruthfulQA": 0.348,
"helm_classic/MS MARCO (TREC)": 0.514,
"helm_classic/CNN/DailyMail": 0.144,
"helm_classic/XSUM": 0.167,
"helm_classic/IMDB": 0.938,
"helm_classic/CivilComments": 0.547,
"helm_classic/RAFT": 0.712
}
},
{
"id": "ai21/Jurassic-2-Jumbo-178B",
"name": "Jurassic-2 Jumbo 178B",
"developer": "ai21",
"evaluator_relationship": null,
"benchmark_scores": {
"helm_classic/Mean win rate": 0.824,
"helm_classic/MMLU": 0.48,
"helm_classic/BoolQ": 0.829,
"helm_classic/NarrativeQA": 0.733,
"helm_classic/NaturalQuestions (open-book)": 0.669,
"helm_classic/QuAC": 0.435,
"helm_classic/HellaSwag": 0.788,
"helm_classic/OpenbookQA": 0.558,
"helm_classic/TruthfulQA": 0.437,
"helm_classic/MS MARCO (TREC)": 0.661,
"helm_classic/CNN/DailyMail": 0.149,
"helm_classic/XSUM": 0.182,
"helm_classic/IMDB": 0.938,
"helm_classic/CivilComments": 0.57,
"helm_classic/RAFT": 0.746
}
},
{
"id": "ai21/Jurassic-2-Large-7.5B",
"name": "Jurassic-2 Large 7.5B",
"developer": "ai21",
"evaluator_relationship": null,
"benchmark_scores": {
"helm_classic/Mean win rate": 0.553,
"helm_classic/MMLU": 0.339,
"helm_classic/BoolQ": 0.742,
"helm_classic/NarrativeQA": -1.0,
"helm_classic/NaturalQuestions (open-book)": 0.589,
"helm_classic/QuAC": -1.0,
"helm_classic/HellaSwag": 0.729,
"helm_classic/OpenbookQA": 0.53,
"helm_classic/TruthfulQA": 0.245,
"helm_classic/MS MARCO (TREC)": 0.464,
"helm_classic/CNN/DailyMail": 0.136,
"helm_classic/XSUM": 0.142,
"helm_classic/IMDB": 0.956,
"helm_classic/CivilComments": 0.57,
"helm_classic/RAFT": 0.622
}
}
]
}