Spaces:
Build error
Build error
Clémentine
committed on
Commit
·
509661e
1
Parent(s):
270109b
hardcoded metadata
Browse files- leaderboards_metadata.py +107 -0
leaderboards_metadata.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from enum import Enum, auto
|
| 2 |
+
#from dataclasses import dataclass
|
| 3 |
+
|
| 4 |
+
class SubmissionType(Enum):
    """How entries get onto a leaderboard.

    Members (values auto-assigned 1..5, matching declaration order):
    fully automated pipeline, gated semi-automated submission,
    maintainer-run manual runs, a closed (no-submission) board, or
    head-to-head arena battles.
    """

    Automatic = auto()
    SemiAutomatic = auto()
    Manual = auto()
    Closed = auto()
    Arena = auto()
|
| 14 |
+
|
| 15 |
+
class Evaluators(Enum):
    """Who (or what) scores the submissions on a leaderboard."""

    Humans = auto()  # e.g. arena-style human voting
    Automatic = auto()
    Model = auto()  # an LLM acting as judge
|
| 23 |
+
|
| 24 |
+
# Visibility of the benchmark's test split. Built with the functional
# Enum API because "N/A" is not a valid Python identifier; that member
# must be accessed as TestSet["N/A"]. The whitespace-separated string
# yields the same members and auto values (1..5) as a list would.
TestSet = Enum("TestSet", "Private Public Mix Rolling N/A")
|
| 34 |
+
|
| 35 |
+
class Categories(Enum):
    """Topical tags a leaderboard can carry (modality, task, focus)."""

    Text = auto()
    Image = auto()
    Audio = auto()
    Video = auto()
    Multimodal = auto()
    Generation = auto()
    Math = auto()
    Code = auto()
    LanguageSpecific = auto()
    Performance = auto()
    Safety = auto()
    VibeCheck = auto()
    Tools = auto()
    Artefacts = auto()
|
| 54 |
+
|
| 55 |
+
class Languages(Enum):
    """Languages used to tag language-specific leaderboards."""

    Chinese = auto()
    Korean = auto()
    Dutch = auto()
    Portuguese = auto()
    Italian = auto()
    Malay = auto()
    Polish = auto()
    Turkish = auto()
|
| 69 |
+
|
| 70 |
+
# Hardcoded tag metadata for known leaderboard Spaces.
# Keys are Hugging Face Space ids ("org/space-name"); values are the tags
# describing each leaderboard, drawn from the SubmissionType, Evaluators,
# TestSet, Categories and Languages enums defined above. Tag order within
# a list carries no meaning.
# NOTE(review): a few tags are raw strings rather than enum members
# ("Embeddings", "Hallucinations", "OCR", "Models") — presumably ad-hoc
# tags with no enum counterpart yet. Consumers iterating these lists must
# tolerate both enum members and plain strings; confirm before normalizing.
leaderboard_to_tags = {
    "HuggingFaceH4/open_llm_leaderboard": [SubmissionType.Automatic, Evaluators.Automatic, TestSet.Public, Categories.Text, Categories.Math],
    "bigcode/bigcode-models-leaderboard": [SubmissionType.SemiAutomatic, Evaluators.Automatic, TestSet.Public, Categories.Code],
    "optimum/llm-perf-leaderboard": [SubmissionType.Manual, Evaluators.Automatic, Categories.Performance],
    "lmsys/chatbot-arena-leaderboard": [SubmissionType.Arena, Evaluators.Humans, Categories.Text, Categories.Generation],
    "llmonitor/benchmarks": [SubmissionType.Manual, Evaluators.Humans, Categories.Text, Categories.VibeCheck],
    "mteb/leaderboard": [SubmissionType.SemiAutomatic, Categories.Text, "Embeddings", Categories.Artefacts],
    "gaia-benchmark/leaderboard": [SubmissionType.Automatic, TestSet.Private, Evaluators.Automatic, Categories.Text, Categories.Tools, Categories.Multimodal],
    "opencompass/opencompass-llm-leaderboard": [SubmissionType.Manual, Categories.Text, Categories.LanguageSpecific, Languages.Chinese],
    "upstage/open-ko-llm-leaderboard": [SubmissionType.Automatic, Evaluators.Automatic, TestSet.Mix, Categories.Text, Languages.Korean],
    "BramVanroy/open_dutch_llm_leaderboard": [SubmissionType.Manual, Evaluators.Automatic, Categories.Text, Languages.Dutch],
    "vectara/leaderboard": [SubmissionType.SemiAutomatic, Evaluators.Model, Categories.Text, "Hallucinations"],
    "facebook/CyberSecEval": [SubmissionType.Closed, Categories.Code, Categories.Safety],
    "mlabonne/Yet_Another_LLM_Leaderboard": [SubmissionType.Manual, Categories.Text, Evaluators.Automatic],
    "AI-Secure/llm-trustworthy-leaderboard": [SubmissionType.Automatic, Categories.Safety, Categories.Text],
    "AILab-CVC/EvalCrafter": [SubmissionType.Closed, Categories.Video, Categories.Generation],
    "mike-ravkine/can-ai-code-results": [SubmissionType.Closed, Categories.Code],
    "echo840/ocrbench-leaderboard": [SubmissionType.Closed, Categories.Image, "OCR"],
    "NPHardEval/NPHardEval-leaderboard": [SubmissionType.Closed, Categories.Text, Categories.Math, TestSet.Rolling],
    "HaizeLabs/red-teaming-resistance-benchmark": [SubmissionType.Manual, Categories.Safety, Categories.Text],
    "devingulliver/subquadratic-llm-leaderboard": [SubmissionType.SemiAutomatic, Categories.Text, Categories.Math],
    "WildVision/vision-arena": [SubmissionType.Arena, Categories.Image, Categories.Multimodal],
    "Vchitect/VBench_Leaderboard": [SubmissionType.SemiAutomatic, Categories.Video, Categories.Generation],
    "eduagarcia/open_pt_llm_leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Portuguese],
    "FinancialSupport/open_ita_llm_leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Italian],
    "mesolitica/malay-llm-leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Malay],
    "TIGER-Lab/GenAI-Arena": [Categories.Image, Categories.Generation, Evaluators.Humans, SubmissionType.Arena],
    "q-future/Q-Bench-Leaderboard": [Categories.Image, Evaluators.Automatic, SubmissionType.Closed],
    "OpenGenAI/parti-prompts-leaderboard": [Categories.Image, Categories.Generation, SubmissionType.Arena, Evaluators.Humans],
    "speakleash/open_pl_llm_leaderboard": [Categories.LanguageSpecific, Categories.Text, Languages.Polish],
    "malhajar/OpenLLMTurkishLeaderboard": [Categories.LanguageSpecific, Categories.Text, Languages.Turkish],
    "allenai/WildBench": [Evaluators.Humans, SubmissionType.Arena, Evaluators.Model, Categories.Text, Categories.Generation],
    "hf-audio/open_asr_leaderboard": [Evaluators.Automatic, Categories.Audio],
    "opencompass/open_vlm_leaderboard": [Evaluators.Automatic, Categories.Generation, Categories.Image],
    "livecodebench/benchmarks": [Evaluators.Automatic, Categories.Code],
    "allenai/reward-bench": [Evaluators.Automatic, Categories.Artefacts, "Models", Categories.Text],
    "TTS-AGI/TTS-Arena": [Evaluators.Humans, Categories.Audio]
}
|