Spaces:
Sleeping
Sleeping
xeon27
committed on
Commit
·
15e5347
1
Parent(s):
2a314d2
Make task names clickable and link to inspect-evals repo
Browse files
- src/about.py +15 -14
- src/display/utils.py +1 -1
src/about.py
CHANGED
|
@@ -7,6 +7,7 @@ class Task:
|
|
| 7 |
metric: str
|
| 8 |
col_name: str
|
| 9 |
type: str
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
# Select your tasks here
|
|
@@ -15,22 +16,22 @@ class Tasks(Enum):
|
|
| 15 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
| 16 |
|
| 17 |
# base
|
| 18 |
-
task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base")
|
| 19 |
-
task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "base")
|
| 20 |
-
task2 = Task("drop", "mean", "DROP", "base")
|
| 21 |
-
task3 = Task("winogrande", "accuracy", "WinoGrande", "base")
|
| 22 |
-
task4 = Task("gsm8k", "accuracy", "GSM8K", "base")
|
| 23 |
-
task5 = Task("hellaswag", "accuracy", "HellaSwag", "base")
|
| 24 |
-
task6 = Task("humaneval", "mean", "HumanEval", "base")
|
| 25 |
-
task7 = Task("ifeval", "final_acc", "IFEval", "base")
|
| 26 |
-
task8 = Task("math", "accuracy", "MATH", "base")
|
| 27 |
-
task9 = Task("mmlu", "accuracy", "MMLU", "base")
|
| 28 |
-
task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "base")
|
| 29 |
-
task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "base")
|
| 30 |
|
| 31 |
# agentic
|
| 32 |
-
task12 = Task("gaia", "mean", "GAIA", "agentic")
|
| 33 |
-
task13 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic")
|
| 34 |
|
| 35 |
|
| 36 |
NUM_FEWSHOT = 0 # Change with your few shot
|
|
|
|
| 7 |
metric: str
|
| 8 |
col_name: str
|
| 9 |
type: str
|
| 10 |
+
source: str
|
| 11 |
|
| 12 |
|
| 13 |
# Select your tasks here
|
|
|
|
| 16 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
| 17 |
|
| 18 |
# base
|
| 19 |
+
task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
|
| 20 |
+
task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
|
| 21 |
+
task2 = Task("drop", "mean", "DROP", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop")
|
| 22 |
+
task3 = Task("winogrande", "accuracy", "WinoGrande", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande")
|
| 23 |
+
task4 = Task("gsm8k", "accuracy", "GSM8K", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k")
|
| 24 |
+
task5 = Task("hellaswag", "accuracy", "HellaSwag", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag")
|
| 25 |
+
task6 = Task("humaneval", "mean", "HumanEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval")
|
| 26 |
+
task7 = Task("ifeval", "final_acc", "IFEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval")
|
| 27 |
+
task8 = Task("math", "accuracy", "MATH", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics")
|
| 28 |
+
task9 = Task("mmlu", "accuracy", "MMLU", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu")
|
| 29 |
+
task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro")
|
| 30 |
+
task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa")
|
| 31 |
|
| 32 |
# agentic
|
| 33 |
+
task12 = Task("gaia", "mean", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
|
| 34 |
+
task13 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf")
|
| 35 |
|
| 36 |
|
| 37 |
NUM_FEWSHOT = 0 # Change with your few shot
|
src/display/utils.py
CHANGED
|
@@ -28,7 +28,7 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
|
|
| 28 |
#Scores
|
| 29 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 30 |
for task in Tasks:
|
| 31 |
-
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "markdown", True)])
|
| 32 |
# # Model information
|
| 33 |
# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 34 |
# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
|
|
|
| 28 |
#Scores
|
| 29 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 30 |
for task in Tasks:
|
| 31 |
+
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(f"[{task.value.col_name}]({task.value.source})", "markdown", True)])
|
| 32 |
# # Model information
|
| 33 |
# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 34 |
# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|