Spaces:
Sleeping
Sleeping
xeon27
committed on
Commit
·
15e5347
1
Parent(s):
2a314d2
Make task names clickable and link to inspect-evals repo
Browse files
- src/about.py +15 -14
- src/display/utils.py +1 -1
src/about.py
CHANGED
|
@@ -7,6 +7,7 @@ class Task:
|
|
| 7 |
metric: str
|
| 8 |
col_name: str
|
| 9 |
type: str
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
# Select your tasks here
|
|
@@ -15,22 +16,22 @@ class Tasks(Enum):
|
|
| 15 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
| 16 |
|
| 17 |
# base
|
| 18 |
-
task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base")
|
| 19 |
-
task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "base")
|
| 20 |
-
task2 = Task("drop", "mean", "DROP", "base")
|
| 21 |
-
task3 = Task("winogrande", "accuracy", "WinoGrande", "base")
|
| 22 |
-
task4 = Task("gsm8k", "accuracy", "GSM8K", "base")
|
| 23 |
-
task5 = Task("hellaswag", "accuracy", "HellaSwag", "base")
|
| 24 |
-
task6 = Task("humaneval", "mean", "HumanEval", "base")
|
| 25 |
-
task7 = Task("ifeval", "final_acc", "IFEval", "base")
|
| 26 |
-
task8 = Task("math", "accuracy", "MATH", "base")
|
| 27 |
-
task9 = Task("mmlu", "accuracy", "MMLU", "base")
|
| 28 |
-
task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "base")
|
| 29 |
-
task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "base")
|
| 30 |
|
| 31 |
# agentic
|
| 32 |
-
task12 = Task("gaia", "mean", "GAIA", "agentic")
|
| 33 |
-
task13 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic")
|
| 34 |
|
| 35 |
|
| 36 |
NUM_FEWSHOT = 0 # Change with your few shot
|
|
|
|
| 7 |
metric: str
|
| 8 |
col_name: str
|
| 9 |
type: str
|
| 10 |
+
source: str
|
| 11 |
|
| 12 |
|
| 13 |
# Select your tasks here
|
|
|
|
| 16 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
| 17 |
|
| 18 |
# base
|
| 19 |
+
task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
|
| 20 |
+
task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
|
| 21 |
+
task2 = Task("drop", "mean", "DROP", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop")
|
| 22 |
+
task3 = Task("winogrande", "accuracy", "WinoGrande", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande")
|
| 23 |
+
task4 = Task("gsm8k", "accuracy", "GSM8K", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k")
|
| 24 |
+
task5 = Task("hellaswag", "accuracy", "HellaSwag", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag")
|
| 25 |
+
task6 = Task("humaneval", "mean", "HumanEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval")
|
| 26 |
+
task7 = Task("ifeval", "final_acc", "IFEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval")
|
| 27 |
+
task8 = Task("math", "accuracy", "MATH", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics")
|
| 28 |
+
task9 = Task("mmlu", "accuracy", "MMLU", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu")
|
| 29 |
+
task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro")
|
| 30 |
+
task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa")
|
| 31 |
|
| 32 |
# agentic
|
| 33 |
+
task12 = Task("gaia", "mean", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
|
| 34 |
+
task13 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf")
|
| 35 |
|
| 36 |
|
| 37 |
NUM_FEWSHOT = 0 # Change with your few shot
|
src/display/utils.py
CHANGED
|
@@ -28,7 +28,7 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
|
|
| 28 |
#Scores
|
| 29 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 30 |
for task in Tasks:
|
| 31 |
-
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "markdown", True)])
|
| 32 |
# # Model information
|
| 33 |
# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 34 |
# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
|
|
|
| 28 |
#Scores
|
| 29 |
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 30 |
for task in Tasks:
|
| 31 |
+
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(f"[{task.value.col_name}]({task.value.source})", "markdown", True)])
|
| 32 |
# # Model information
|
| 33 |
# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 34 |
# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|