Spaces:

evaleval
/

general-eval-card

Running

general-eval-card / data /developers /ai21.json

GitHub Actions

chore: sync EEE pipeline output [2026-03-28 11:49 UTC]

d91b463 2 months ago

13.3 kB

	{
	"developer": "ai21",
	"models": [
	{
	"id": "ai21/J1-Grande-v1-17B",
	"name": "J1-Grande v1 17B",
	"developer": "ai21",
	"evaluator_relationship": null,
	"benchmark_scores": {
	"helm_classic/Mean win rate": 0.433,
	"helm_classic/MMLU": 0.27,
	"helm_classic/BoolQ": 0.722,
	"helm_classic/NarrativeQA": 0.672,
	"helm_classic/NaturalQuestions (open-book)": 0.578,
	"helm_classic/QuAC": 0.362,
	"helm_classic/HellaSwag": 0.739,
	"helm_classic/OpenbookQA": 0.52,
	"helm_classic/TruthfulQA": 0.193,
	"helm_classic/MS MARCO (TREC)": 0.341,
	"helm_classic/CNN/DailyMail": 0.143,
	"helm_classic/XSUM": 0.122,
	"helm_classic/IMDB": 0.953,
	"helm_classic/CivilComments": 0.529,
	"helm_classic/RAFT": 0.658
	}
	},
	{
	"id": "ai21/J1-Grande-v2-beta-17B",
	"name": "J1-Grande v2 beta 17B",
	"developer": "ai21",
	"evaluator_relationship": null,
	"benchmark_scores": {
	"helm_classic/Mean win rate": 0.706,
	"helm_classic/MMLU": 0.445,
	"helm_classic/BoolQ": 0.812,
	"helm_classic/NarrativeQA": 0.725,
	"helm_classic/NaturalQuestions (open-book)": 0.625,
	"helm_classic/QuAC": 0.392,
	"helm_classic/HellaSwag": 0.764,
	"helm_classic/OpenbookQA": 0.56,
	"helm_classic/TruthfulQA": 0.306,
	"helm_classic/MS MARCO (TREC)": 0.46,
	"helm_classic/CNN/DailyMail": 0.146,
	"helm_classic/XSUM": 0.152,
	"helm_classic/IMDB": 0.957,
	"helm_classic/CivilComments": 0.546,
	"helm_classic/RAFT": 0.679
	}
	},
	{
	"id": "ai21/J1-Jumbo-v1-178B",
	"name": "J1-Jumbo v1 178B",
	"developer": "ai21",
	"evaluator_relationship": null,
	"benchmark_scores": {
	"helm_classic/Mean win rate": 0.517,
	"helm_classic/MMLU": 0.259,
	"helm_classic/BoolQ": 0.776,
	"helm_classic/NarrativeQA": 0.695,
	"helm_classic/NaturalQuestions (open-book)": 0.595,
	"helm_classic/QuAC": 0.358,
	"helm_classic/HellaSwag": 0.765,
	"helm_classic/OpenbookQA": 0.534,
	"helm_classic/TruthfulQA": 0.175,
	"helm_classic/MS MARCO (TREC)": 0.363,
	"helm_classic/CNN/DailyMail": 0.144,
	"helm_classic/XSUM": 0.129,
	"helm_classic/IMDB": 0.943,
	"helm_classic/CivilComments": 0.553,
	"helm_classic/RAFT": 0.681
	}
	},
	{
	"id": "ai21/J1-Large-v1-7.5B",
	"name": "J1-Large v1 7.5B",
	"developer": "ai21",
	"evaluator_relationship": null,
	"benchmark_scores": {
	"helm_classic/Mean win rate": 0.285,
	"helm_classic/MMLU": 0.241,
	"helm_classic/BoolQ": 0.683,
	"helm_classic/NarrativeQA": 0.623,
	"helm_classic/NaturalQuestions (open-book)": 0.532,
	"helm_classic/QuAC": 0.328,
	"helm_classic/HellaSwag": 0.7,
	"helm_classic/OpenbookQA": 0.514,
	"helm_classic/TruthfulQA": 0.197,
	"helm_classic/MS MARCO (TREC)": 0.292,
	"helm_classic/CNN/DailyMail": 0.134,
	"helm_classic/XSUM": 0.102,
	"helm_classic/IMDB": 0.956,
	"helm_classic/CivilComments": 0.532,
	"helm_classic/RAFT": 0.545
	}
	},
	{
	"id": "ai21/j2-grande",
	"name": "Jurassic-2 Grande 17B",
	"developer": "ai21",
	"evaluator_relationship": null,
	"benchmark_scores": {
	"helm_lite/Mean win rate": 0.172,
	"helm_lite/NarrativeQA": 0.744,
	"helm_lite/NaturalQuestions (closed-book)": 0.35,
	"helm_lite/OpenbookQA": 0.614,
	"helm_lite/MMLU": 0.471,
	"helm_lite/MATH": 0.064,
	"helm_lite/GSM8K": 0.159,
	"helm_lite/LegalBench": 0.468,
	"helm_lite/MedQA": 0.39,
	"helm_lite/WMT 2014": 0.102
	}
	},
	{
	"id": "ai21/j2-jumbo",
	"name": "Jurassic-2 Jumbo 178B",
	"developer": "ai21",
	"evaluator_relationship": null,
	"benchmark_scores": {
	"helm_lite/Mean win rate": 0.215,
	"helm_lite/NarrativeQA": 0.728,
	"helm_lite/NaturalQuestions (closed-book)": 0.385,
	"helm_lite/OpenbookQA": 0.688,
	"helm_lite/MMLU": 0.483,
	"helm_lite/MATH": 0.103,
	"helm_lite/GSM8K": 0.239,
	"helm_lite/LegalBench": 0.533,
	"helm_lite/MedQA": 0.431,
	"helm_lite/WMT 2014": 0.114
	}
	},
	{
	"id": "ai21/jamba-1.5-large",
	"name": "Jamba 1.5 Large",
	"developer": "ai21",
	"evaluator_relationship": null,
	"benchmark_scores": {
	"helm_lite/Mean win rate": 0.637,
	"helm_lite/NarrativeQA": 0.664,
	"helm_lite/NaturalQuestions (closed-book)": 0.394,
	"helm_lite/OpenbookQA": 0.948,
	"helm_lite/MMLU": 0.683,
	"helm_lite/MATH": 0.692,
	"helm_lite/GSM8K": 0.846,
	"helm_lite/LegalBench": 0.675,
	"helm_lite/MedQA": 0.698,
	"helm_lite/WMT 2014": 0.203,
	"helm_mmlu/MMLU All Subjects": 0.782,
	"helm_mmlu/Abstract Algebra": 0.53,
	"helm_mmlu/Anatomy": 0.793,
	"helm_mmlu/College Physics": 0.51,
	"helm_mmlu/Computer Security": 0.8,
	"helm_mmlu/Econometrics": 0.614,
	"helm_mmlu/Global Facts": 0.54,
	"helm_mmlu/Jurisprudence": 0.87,
	"helm_mmlu/Philosophy": 0.849,
	"helm_mmlu/Professional Psychology": 0.842,
	"helm_mmlu/Us Foreign Policy": 0.92,
	"helm_mmlu/Astronomy": 0.882,
	"helm_mmlu/Business Ethics": 0.77,
	"helm_mmlu/Clinical Knowledge": 0.849,
	"helm_mmlu/Conceptual Physics": 0.779,
	"helm_mmlu/Electrical Engineering": 0.793,
	"helm_mmlu/Elementary Mathematics": 0.656,
	"helm_mmlu/Formal Logic": 0.619,
	"helm_mmlu/High School World History": 0.911,
	"helm_mmlu/Human Sexuality": 0.832,
	"helm_mmlu/International Law": 0.884,
	"helm_mmlu/Logical Fallacies": 0.859,
	"helm_mmlu/Machine Learning": 0.688,
	"helm_mmlu/Management": 0.864,
	"helm_mmlu/Marketing": 0.94,
	"helm_mmlu/Medical Genetics": 0.89,
	"helm_mmlu/Miscellaneous": 0.931,
	"helm_mmlu/Moral Scenarios": 0.686,
	"helm_mmlu/Nutrition": 0.869,
	"helm_mmlu/Prehistory": 0.892,
	"helm_mmlu/Public Relations": 0.755,
	"helm_mmlu/Security Studies": 0.771,
	"helm_mmlu/Sociology": 0.93,
	"helm_mmlu/Virology": 0.554,
	"helm_mmlu/World Religions": 0.865,
	"helm_mmlu/Mean win rate": 0.147
	}
	},
	{
	"id": "ai21/jamba-1.5-mini",
	"name": "Jamba 1.5 Mini",
	"developer": "ai21",
	"evaluator_relationship": null,
	"benchmark_scores": {
	"helm_lite/Mean win rate": 0.414,
	"helm_lite/NarrativeQA": 0.746,
	"helm_lite/NaturalQuestions (closed-book)": 0.388,
	"helm_lite/OpenbookQA": 0.89,
	"helm_lite/MMLU": 0.582,
	"helm_lite/MATH": 0.318,
	"helm_lite/GSM8K": 0.691,
	"helm_lite/LegalBench": 0.503,
	"helm_lite/MedQA": 0.632,
	"helm_lite/WMT 2014": 0.179,
	"helm_mmlu/MMLU All Subjects": 0.699,
	"helm_mmlu/Abstract Algebra": 0.33,
	"helm_mmlu/Anatomy": 0.711,
	"helm_mmlu/College Physics": 0.48,
	"helm_mmlu/Computer Security": 0.73,
	"helm_mmlu/Econometrics": 0.491,
	"helm_mmlu/Global Facts": 0.43,
	"helm_mmlu/Jurisprudence": 0.88,
	"helm_mmlu/Philosophy": 0.752,
	"helm_mmlu/Professional Psychology": 0.76,
	"helm_mmlu/Us Foreign Policy": 0.9,
	"helm_mmlu/Astronomy": 0.822,
	"helm_mmlu/Business Ethics": 0.76,
	"helm_mmlu/Clinical Knowledge": 0.74,
	"helm_mmlu/Conceptual Physics": 0.677,
	"helm_mmlu/Electrical Engineering": 0.683,
	"helm_mmlu/Elementary Mathematics": 0.553,
	"helm_mmlu/Formal Logic": 0.452,
	"helm_mmlu/High School World History": 0.84,
	"helm_mmlu/Human Sexuality": 0.809,
	"helm_mmlu/International Law": 0.893,
	"helm_mmlu/Logical Fallacies": 0.81,
	"helm_mmlu/Machine Learning": 0.509,
	"helm_mmlu/Management": 0.825,
	"helm_mmlu/Marketing": 0.915,
	"helm_mmlu/Medical Genetics": 0.69,
	"helm_mmlu/Miscellaneous": 0.902,
	"helm_mmlu/Moral Scenarios": 0.269,
	"helm_mmlu/Nutrition": 0.801,
	"helm_mmlu/Prehistory": 0.824,
	"helm_mmlu/Public Relations": 0.727,
	"helm_mmlu/Security Studies": 0.755,
	"helm_mmlu/Sociology": 0.876,
	"helm_mmlu/Virology": 0.578,
	"helm_mmlu/World Religions": 0.842,
	"helm_mmlu/Mean win rate": 0.206
	}
	},
	{
	"id": "ai21/jamba-instruct",
	"name": "Jamba Instruct",
	"developer": "ai21",
	"evaluator_relationship": null,
	"benchmark_scores": {
	"helm_lite/Mean win rate": 0.287,
	"helm_lite/NarrativeQA": 0.658,
	"helm_lite/NaturalQuestions (closed-book)": 0.384,
	"helm_lite/OpenbookQA": 0.796,
	"helm_lite/MMLU": 0.582,
	"helm_lite/MATH": 0.38,
	"helm_lite/GSM8K": 0.67,
	"helm_lite/LegalBench": 0.54,
	"helm_lite/MedQA": 0.519,
	"helm_lite/WMT 2014": 0.164,
	"helm_mmlu/MMLU All Subjects": 0.659,
	"helm_mmlu/Abstract Algebra": 0.36,
	"helm_mmlu/Anatomy": 0.615,
	"helm_mmlu/College Physics": 0.422,
	"helm_mmlu/Computer Security": 0.76,
	"helm_mmlu/Econometrics": 0.439,
	"helm_mmlu/Global Facts": 0.4,
	"helm_mmlu/Jurisprudence": 0.796,
	"helm_mmlu/Philosophy": 0.749,
	"helm_mmlu/Professional Psychology": 0.716,
	"helm_mmlu/Us Foreign Policy": 0.91,
	"helm_mmlu/Astronomy": 0.73,
	"helm_mmlu/Business Ethics": 0.6,
	"helm_mmlu/Clinical Knowledge": 0.702,
	"helm_mmlu/Conceptual Physics": 0.677,
	"helm_mmlu/Electrical Engineering": 0.621,
	"helm_mmlu/Elementary Mathematics": 0.497,
	"helm_mmlu/Formal Logic": 0.444,
	"helm_mmlu/High School World History": 0.797,
	"helm_mmlu/Human Sexuality": 0.794,
	"helm_mmlu/International Law": 0.835,
	"helm_mmlu/Logical Fallacies": 0.706,
	"helm_mmlu/Machine Learning": 0.536,
	"helm_mmlu/Management": 0.786,
	"helm_mmlu/Marketing": 0.885,
	"helm_mmlu/Medical Genetics": 0.67,
	"helm_mmlu/Miscellaneous": 0.865,
	"helm_mmlu/Moral Scenarios": 0.465,
	"helm_mmlu/Nutrition": 0.745,
	"helm_mmlu/Prehistory": 0.796,
	"helm_mmlu/Public Relations": 0.682,
	"helm_mmlu/Security Studies": 0.743,
	"helm_mmlu/Sociology": 0.891,
	"helm_mmlu/Virology": 0.53,
	"helm_mmlu/World Religions": 0.813,
	"helm_mmlu/Mean win rate": 0.887
	}
	},
	{
	"id": "ai21/Jurassic-2-Grande-17B",
	"name": "Jurassic-2 Grande 17B",
	"developer": "ai21",
	"evaluator_relationship": null,
	"benchmark_scores": {
	"helm_classic/Mean win rate": 0.743,
	"helm_classic/MMLU": 0.475,
	"helm_classic/BoolQ": 0.826,
	"helm_classic/NarrativeQA": 0.737,
	"helm_classic/NaturalQuestions (open-book)": 0.639,
	"helm_classic/QuAC": 0.418,
	"helm_classic/HellaSwag": 0.781,
	"helm_classic/OpenbookQA": 0.542,
	"helm_classic/TruthfulQA": 0.348,
	"helm_classic/MS MARCO (TREC)": 0.514,
	"helm_classic/CNN/DailyMail": 0.144,
	"helm_classic/XSUM": 0.167,
	"helm_classic/IMDB": 0.938,
	"helm_classic/CivilComments": 0.547,
	"helm_classic/RAFT": 0.712
	}
	},
	{
	"id": "ai21/Jurassic-2-Jumbo-178B",
	"name": "Jurassic-2 Jumbo 178B",
	"developer": "ai21",
	"evaluator_relationship": null,
	"benchmark_scores": {
	"helm_classic/Mean win rate": 0.824,
	"helm_classic/MMLU": 0.48,
	"helm_classic/BoolQ": 0.829,
	"helm_classic/NarrativeQA": 0.733,
	"helm_classic/NaturalQuestions (open-book)": 0.669,
	"helm_classic/QuAC": 0.435,
	"helm_classic/HellaSwag": 0.788,
	"helm_classic/OpenbookQA": 0.558,
	"helm_classic/TruthfulQA": 0.437,
	"helm_classic/MS MARCO (TREC)": 0.661,
	"helm_classic/CNN/DailyMail": 0.149,
	"helm_classic/XSUM": 0.182,
	"helm_classic/IMDB": 0.938,
	"helm_classic/CivilComments": 0.57,
	"helm_classic/RAFT": 0.746
	}
	},
	{
	"id": "ai21/Jurassic-2-Large-7.5B",
	"name": "Jurassic-2 Large 7.5B",
	"developer": "ai21",
	"evaluator_relationship": null,
	"benchmark_scores": {
	"helm_classic/Mean win rate": 0.553,
	"helm_classic/MMLU": 0.339,
	"helm_classic/BoolQ": 0.742,
	"helm_classic/NarrativeQA": -1.0,
	"helm_classic/NaturalQuestions (open-book)": 0.589,
	"helm_classic/QuAC": -1.0,
	"helm_classic/HellaSwag": 0.729,
	"helm_classic/OpenbookQA": 0.53,
	"helm_classic/TruthfulQA": 0.245,
	"helm_classic/MS MARCO (TREC)": 0.464,
	"helm_classic/CNN/DailyMail": 0.136,
	"helm_classic/XSUM": 0.142,
	"helm_classic/IMDB": 0.956,
	"helm_classic/CivilComments": 0.57,
	"helm_classic/RAFT": 0.622
	}
	}
	]
	}