Spaces:

evaleval
/

general-eval-card

Running

GitHub Actions

chore: sync EEE pipeline output [2026-03-28 11:37 UTC]

aa3daac about 2 months ago

1.81 kB

	{
	"developer": "AlephAlpha",
	"models": [
	{
	"id": "AlephAlpha/luminous-base",
	"name": "Luminous Base 13B",
	"developer": "AlephAlpha",
	"evaluator_relationship": null,
	"benchmark_scores": {
	"helm_lite/Mean win rate": 0.041,
	"helm_lite/NarrativeQA": 0.633,
	"helm_lite/NaturalQuestions (closed-book)": 0.197,
	"helm_lite/OpenbookQA": 0.286,
	"helm_lite/MMLU": 0.243,
	"helm_lite/MATH": 0.026,
	"helm_lite/GSM8K": 0.028,
	"helm_lite/LegalBench": 0.332,
	"helm_lite/MedQA": 0.26,
	"helm_lite/WMT 2014": 0.066
	}
	},
	{
	"id": "AlephAlpha/luminous-extended",
	"name": "Luminous Extended 30B",
	"developer": "AlephAlpha",
	"evaluator_relationship": null,
	"benchmark_scores": {
	"helm_lite/Mean win rate": 0.078,
	"helm_lite/NarrativeQA": 0.684,
	"helm_lite/NaturalQuestions (closed-book)": 0.253,
	"helm_lite/OpenbookQA": 0.272,
	"helm_lite/MMLU": 0.248,
	"helm_lite/MATH": 0.04,
	"helm_lite/GSM8K": 0.075,
	"helm_lite/LegalBench": 0.421,
	"helm_lite/MedQA": 0.276,
	"helm_lite/WMT 2014": 0.083
	}
	},
	{
	"id": "AlephAlpha/luminous-supreme",
	"name": "Luminous Supreme 70B",
	"developer": "AlephAlpha",
	"evaluator_relationship": null,
	"benchmark_scores": {
	"helm_lite/Mean win rate": 0.145,
	"helm_lite/NarrativeQA": 0.743,
	"helm_lite/NaturalQuestions (closed-book)": 0.299,
	"helm_lite/OpenbookQA": 0.284,
	"helm_lite/MMLU": 0.316,
	"helm_lite/MATH": 0.078,
	"helm_lite/GSM8K": 0.137,
	"helm_lite/LegalBench": 0.452,
	"helm_lite/MedQA": 0.276,
	"helm_lite/WMT 2014": 0.102
	}
	}
	]
	}