Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Create constants module
Browse files
- app.py +3 -79
- constants.py +74 -0
app.py
CHANGED
|
@@ -1,86 +1,10 @@
|
|
| 1 |
-
import io
|
| 2 |
import json
|
| 3 |
|
| 4 |
import gradio as gr
|
| 5 |
import pandas as pd
|
| 6 |
from huggingface_hub import HfFileSystem
|
| 7 |
|
| 8 |
-
|
| 9 |
-
RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"
|
| 10 |
-
EXCLUDED_KEYS = {
|
| 11 |
-
"pretty_env_info",
|
| 12 |
-
"chat_template",
|
| 13 |
-
"group_subtasks",
|
| 14 |
-
}
|
| 15 |
-
# EXCLUDED_RESULTS_KEYS = {
|
| 16 |
-
# "leaderboard",
|
| 17 |
-
# }
|
| 18 |
-
# EXCLUDED_RESULTS_LEADERBOARDS_KEYS = {
|
| 19 |
-
# "alias",
|
| 20 |
-
# }
|
| 21 |
-
|
| 22 |
-
DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
|
| 23 |
-
DETAILS_FILENAME = "samples_{subtask}_*.json"
|
| 24 |
-
|
| 25 |
-
TASKS = {
|
| 26 |
-
"leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
|
| 27 |
-
"leaderboard_bbh": ("BBH", "leaderboard_bbh"),
|
| 28 |
-
"leaderboard_gpqa": ("GPQA", "leaderboard_gpqa"),
|
| 29 |
-
"leaderboard_ifeval": ("IFEval", "leaderboard_ifeval"),
|
| 30 |
-
"leaderboard_math_hard": ("MATH", "leaderboard_math"),
|
| 31 |
-
"leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
|
| 32 |
-
"leaderboard_musr": ("MuSR", "leaderboard_musr"),
|
| 33 |
-
}
|
| 34 |
-
SUBTASKS = {
|
| 35 |
-
"leaderboard_arc_challenge": ["leaderboard_arc_challenge"],
|
| 36 |
-
"leaderboard_bbh": [
|
| 37 |
-
"leaderboard_bbh_boolean_expressions",
|
| 38 |
-
"leaderboard_bbh_causal_judgement",
|
| 39 |
-
"leaderboard_bbh_date_understanding",
|
| 40 |
-
"leaderboard_bbh_disambiguation_qa",
|
| 41 |
-
"leaderboard_bbh_formal_fallacies",
|
| 42 |
-
"leaderboard_bbh_geometric_shapes",
|
| 43 |
-
"leaderboard_bbh_hyperbaton",
|
| 44 |
-
"leaderboard_bbh_logical_deduction_five_objects",
|
| 45 |
-
"leaderboard_bbh_logical_deduction_seven_objects",
|
| 46 |
-
"leaderboard_bbh_logical_deduction_three_objects",
|
| 47 |
-
"leaderboard_bbh_movie_recommendation",
|
| 48 |
-
"leaderboard_bbh_navigate",
|
| 49 |
-
"leaderboard_bbh_object_counting",
|
| 50 |
-
"leaderboard_bbh_penguins_in_a_table",
|
| 51 |
-
"leaderboard_bbh_reasoning_about_colored_objects",
|
| 52 |
-
"leaderboard_bbh_ruin_names",
|
| 53 |
-
"leaderboard_bbh_salient_translation_error_detection",
|
| 54 |
-
"leaderboard_bbh_snarks", "leaderboard_bbh_sports_understanding",
|
| 55 |
-
"leaderboard_bbh_temporal_sequences",
|
| 56 |
-
"leaderboard_bbh_tracking_shuffled_objects_five_objects",
|
| 57 |
-
"leaderboard_bbh_tracking_shuffled_objects_seven_objects",
|
| 58 |
-
"leaderboard_bbh_tracking_shuffled_objects_three_objects",
|
| 59 |
-
"leaderboard_bbh_web_of_lies",
|
| 60 |
-
],
|
| 61 |
-
"leaderboard_gpqa": [
|
| 62 |
-
"leaderboard_gpqa_extended",
|
| 63 |
-
"leaderboard_gpqa_diamond",
|
| 64 |
-
"leaderboard_gpqa_main",
|
| 65 |
-
],
|
| 66 |
-
"leaderboard_ifeval": ["leaderboard_ifeval"],
|
| 67 |
-
# "leaderboard_math_hard": [
|
| 68 |
-
"leaderboard_math": [
|
| 69 |
-
"leaderboard_math_algebra_hard",
|
| 70 |
-
"leaderboard_math_counting_and_prob_hard",
|
| 71 |
-
"leaderboard_math_geometry_hard",
|
| 72 |
-
"leaderboard_math_intermediate_algebra_hard",
|
| 73 |
-
"leaderboard_math_num_theory_hard",
|
| 74 |
-
"leaderboard_math_prealgebra_hard",
|
| 75 |
-
"leaderboard_math_precalculus_hard",
|
| 76 |
-
],
|
| 77 |
-
"leaderboard_mmlu_pro": ["leaderboard_mmlu_pro"],
|
| 78 |
-
"leaderboard_musr": [
|
| 79 |
-
"leaderboard_musr_murder_mysteries",
|
| 80 |
-
"leaderboard_musr_object_placements",
|
| 81 |
-
"leaderboard_musr_team_allocation",
|
| 82 |
-
],
|
| 83 |
-
}
|
| 84 |
|
| 85 |
|
| 86 |
fs = HfFileSystem()
|
|
@@ -96,7 +20,7 @@ def filter_latest_result_path_per_model(paths):
|
|
| 96 |
|
| 97 |
d = defaultdict(list)
|
| 98 |
for path in paths:
|
| 99 |
-
model_id, _ = path[len(RESULTS_DATASET_ID) +1:].rsplit("/", 1)
|
| 100 |
d[model_id].append(path)
|
| 101 |
return {model_id: max(paths) for model_id, paths in d.items()}
|
| 102 |
|
|
@@ -121,7 +45,7 @@ def load_results_dataframe(model_id):
|
|
| 121 |
result_path = get_result_path_from_model(model_id, latest_result_path_per_model)
|
| 122 |
data = load_data(result_path)
|
| 123 |
model_name = data.get("model_name", "Model")
|
| 124 |
-
df = pd.json_normalize([{key: value for key, value in data.items()
|
| 125 |
# df.columns = df.columns.str.split(".") # .split return a list instead of a tuple
|
| 126 |
return df.set_index(pd.Index([model_name])).reset_index()
|
| 127 |
|
|
|
|
|
|
|
| 1 |
import json
|
| 2 |
|
| 3 |
import gradio as gr
|
| 4 |
import pandas as pd
|
| 5 |
from huggingface_hub import HfFileSystem
|
| 6 |
|
| 7 |
+
from constants import DETAILS_DATASET_ID, DETAILS_FILENAME, RESULTS_DATASET_ID, SUBTASKS, TASKS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
fs = HfFileSystem()
|
|
|
|
| 20 |
|
| 21 |
d = defaultdict(list)
|
| 22 |
for path in paths:
|
| 23 |
+
model_id, _ = path[len(RESULTS_DATASET_ID) + 1:].rsplit("/", 1)
|
| 24 |
d[model_id].append(path)
|
| 25 |
return {model_id: max(paths) for model_id, paths in d.items()}
|
| 26 |
|
|
|
|
| 45 |
result_path = get_result_path_from_model(model_id, latest_result_path_per_model)
|
| 46 |
data = load_data(result_path)
|
| 47 |
model_name = data.get("model_name", "Model")
|
| 48 |
+
df = pd.json_normalize([{key: value for key, value in data.items()}])
|
| 49 |
# df.columns = df.columns.str.split(".") # .split return a list instead of a tuple
|
| 50 |
return df.set_index(pd.Index([model_name])).reset_index()
|
| 51 |
|
constants.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Constants for the Open LLM Leaderboard results explorer.

Hugging Face Hub dataset paths, task display names, and the per-task
subtask lists used when loading results and details files.
"""

# HfFileSystem path to the dataset holding aggregated result JSON files.
RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"

# NOTE(review): these exclusion sets were commented out in the original
# commit (they were live in the previous app.py revision); kept verbatim
# for reference until it is confirmed they are no longer needed.
# EXCLUDED_KEYS = {
#     "pretty_env_info",
#     "chat_template",
#     "group_subtasks",
# }
# EXCLUDED_RESULTS_KEYS = {
#     "leaderboard",
# }
# EXCLUDED_RESULTS_LEADERBOARDS_KEYS = {
#     "alias",
# }

# Per-model details dataset path template; format with the sanitized model name.
DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
# Glob pattern for per-subtask sample files; format with the subtask name.
DETAILS_FILENAME = "samples_{subtask}_*.json"

# Maps a task key to (display name, key into SUBTASKS).
# NOTE: "leaderboard_math_hard" deliberately maps to the "leaderboard_math"
# SUBTASKS key — the subtask names below use the "leaderboard_math_*" prefix.
TASKS = {
    "leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
    "leaderboard_bbh": ("BBH", "leaderboard_bbh"),
    "leaderboard_gpqa": ("GPQA", "leaderboard_gpqa"),
    "leaderboard_ifeval": ("IFEval", "leaderboard_ifeval"),
    "leaderboard_math_hard": ("MATH", "leaderboard_math"),
    "leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
    "leaderboard_musr": ("MuSR", "leaderboard_musr"),
}

# Maps a task key to the list of subtask names whose sample files exist
# in the details dataset (see DETAILS_FILENAME).
SUBTASKS = {
    "leaderboard_arc_challenge": ["leaderboard_arc_challenge"],
    "leaderboard_bbh": [
        "leaderboard_bbh_boolean_expressions",
        "leaderboard_bbh_causal_judgement",
        "leaderboard_bbh_date_understanding",
        "leaderboard_bbh_disambiguation_qa",
        "leaderboard_bbh_formal_fallacies",
        "leaderboard_bbh_geometric_shapes",
        "leaderboard_bbh_hyperbaton",
        "leaderboard_bbh_logical_deduction_five_objects",
        "leaderboard_bbh_logical_deduction_seven_objects",
        "leaderboard_bbh_logical_deduction_three_objects",
        "leaderboard_bbh_movie_recommendation",
        "leaderboard_bbh_navigate",
        "leaderboard_bbh_object_counting",
        "leaderboard_bbh_penguins_in_a_table",
        "leaderboard_bbh_reasoning_about_colored_objects",
        "leaderboard_bbh_ruin_names",
        "leaderboard_bbh_salient_translation_error_detection",
        "leaderboard_bbh_snarks",
        "leaderboard_bbh_sports_understanding",
        "leaderboard_bbh_temporal_sequences",
        "leaderboard_bbh_tracking_shuffled_objects_five_objects",
        "leaderboard_bbh_tracking_shuffled_objects_seven_objects",
        "leaderboard_bbh_tracking_shuffled_objects_three_objects",
        "leaderboard_bbh_web_of_lies",
    ],
    "leaderboard_gpqa": [
        "leaderboard_gpqa_extended",
        "leaderboard_gpqa_diamond",
        "leaderboard_gpqa_main",
    ],
    "leaderboard_ifeval": ["leaderboard_ifeval"],
    # "leaderboard_math_hard": [
    "leaderboard_math": [
        "leaderboard_math_algebra_hard",
        "leaderboard_math_counting_and_prob_hard",
        "leaderboard_math_geometry_hard",
        "leaderboard_math_intermediate_algebra_hard",
        "leaderboard_math_num_theory_hard",
        "leaderboard_math_prealgebra_hard",
        "leaderboard_math_precalculus_hard",
    ],
    "leaderboard_mmlu_pro": ["leaderboard_mmlu_pro"],
    "leaderboard_musr": [
        "leaderboard_musr_murder_mysteries",
        "leaderboard_musr_object_placements",
        "leaderboard_musr_team_allocation",
    ],
}