select phenotypes and metrics
- app.py +50 -15
- src/about.py +36 -7
- src/display/utils.py +9 -5
- src/envs.py +4 -4
- src/leaderboard/read_evals.py +17 -17
- src/populate.py +2 -4
app.py CHANGED
@@ -1,4 +1,3 @@
-import subprocess
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -14,7 +13,6 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-    BENCHMARK_COLS,
     COLS,
     EVAL_COLS,
     EVAL_TYPES,
@@ -24,11 +22,16 @@ from src.display.utils import (
     ModelType,
     fields,
     WeightType,
-    Precision
+    Precision,
+    generate_column_name
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+from dotenv import load_dotenv
+
+
+load_dotenv()
 
 
 def restart_space():
@@ -50,7 +53,7 @@ except Exception:
 restart_space()
 
 
-raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS)
 leaderboard_df = original_df.copy()
 
 (
@@ -64,6 +67,8 @@ leaderboard_df = original_df.copy()
 def update_table(
     hidden_df: pd.DataFrame,
     columns: list,
+    phenotypes: list,
+    metrics: list,
     type_query: list,
     precision_query: str,
     size_query: list,
@@ -72,7 +77,7 @@ def update_table(
 ):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, filtered_df)
-    df = select_columns(filtered_df, columns)
+    df = select_columns(filtered_df, columns, phenotypes, metrics)
     return df
 
 
@@ -80,14 +85,19 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
     return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
 
 
-def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+def select_columns(df: pd.DataFrame, columns: list, phenotypes: list, metrics: list) -> pd.DataFrame:
     always_here_cols = [
         AutoEvalColumn.model_type_symbol.name,
         AutoEvalColumn.model.name,
     ]
-    # We use COLS to maintain sorting, even if the user changed them
+
+    task_cols = []
+    for phenotype in phenotypes:
+        for metric in metrics:
+            task_cols.append(generate_column_name(phenotype, metric))
+
     filtered_df = df[
-        always_here_cols + [c for c in COLS if c in df.columns and c in columns]
+        always_here_cols + [c for c in COLS if c in df.columns and c in columns] + sorted(task_cols)
     ]
     return filtered_df
 
@@ -147,12 +157,34 @@ with demo:
                             show_label=False,
                             elem_id="search-bar",
                         )
+                    with gr.Row():
+                        with gr.Column(min_width=320):
+                            shown_phenotypes = gr.CheckboxGroup(
+                                choices=sorted(set([
+                                    c.task.value.phenotype_name
+                                    for c in fields(AutoEvalColumn)
+                                    if not c.hidden and not c.never_hidden and c.is_task
+                                ])),
+                                label="Select phenotypes to show",
+                                elem_id="phenotype-select",
+                                interactive=True,
+                            )
+                            shown_metrics = gr.CheckboxGroup(
+                                choices=sorted(set([
+                                    c.task.value.metric_name
+                                    for c in fields(AutoEvalColumn)
+                                    if not c.hidden and not c.never_hidden and c.is_task
+                                ])),
+                                label="Select metrics to show",
+                                elem_id="metric-select",
+                                interactive=True,
+                            )
                     with gr.Row():
                         shown_columns = gr.CheckboxGroup(
                             choices=[
                                 c.name
                                 for c in fields(AutoEvalColumn)
-                                if not c.hidden and not c.never_hidden
+                                if not c.hidden and not c.never_hidden and not c.is_task
                             ],
                             value=[
                                 c.name
@@ -163,12 +195,7 @@ with demo:
                             elem_id="column-select",
                             interactive=True,
                         )
-                    with gr.Row():
-                        deleted_models_visibility = gr.Checkbox(
-                            value=False, label="Show gated/private/deleted models", interactive=True
-                        )
                 with gr.Column(min_width=320):
-                    #with gr.Box(elem_id="box-filter"):
                     filter_columns_type = gr.CheckboxGroup(
                         label="Model types",
                         choices=[t.to_str() for t in ModelType],
@@ -190,6 +217,10 @@ with demo:
                         interactive=True,
                         elem_id="filter-columns-size",
                     )
+            with gr.Row():
+                deleted_models_visibility = gr.Checkbox(
+                    value=True, label="Show gated/private/deleted models", interactive=True
+                )
 
             leaderboard_table = gr.components.Dataframe(
                 value=leaderboard_df[
@@ -215,6 +246,8 @@ with demo:
                 [
                     hidden_leaderboard_table_for_search,
                     shown_columns,
+                    shown_phenotypes,
+                    shown_metrics,
                     filter_columns_type,
                     filter_columns_precision,
                     filter_columns_size,
@@ -223,12 +256,14 @@ with demo:
                 ],
                 leaderboard_table,
             )
-            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
+            for selector in [shown_phenotypes, shown_metrics, shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
                 selector.change(
                     update_table,
                     [
                         hidden_leaderboard_table_for_search,
                         shown_columns,
+                        shown_phenotypes,
+                        shown_metrics,
                         filter_columns_type,
                         filter_columns_precision,
                         filter_columns_size,
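The phenotype and metric checkboxes reach select_columns as plain display names, which the nested loop turns into column labels of the form "<Phenotype> (<Metric>)". A minimal, self-contained sketch of that mapping (the toy DataFrame and scores are hypothetical; generate_column_name mirrors the helper added in src/display/utils.py below):

import pandas as pd

def generate_column_name(phenotype_name, metric_name):
    # Mirrors the helper added in src/display/utils.py.
    return f"{phenotype_name} ({metric_name})"

# Toy leaderboard frame: "T" and "Model" are the two always-shown columns.
toy = pd.DataFrame({
    "T": ["?"], "Model": ["org/model"],
    "Asthma (AUROC)": ["0.81 (0.79-0.83)"],  # hypothetical scores
    "Asthma (AUPRC)": ["0.42 (0.40-0.44)"],
    "Stroke (AUROC)": ["0.77 (0.74-0.80)"],
})

# select_columns builds exactly these labels from the checkbox values
# before slicing the DataFrame down to the requested columns.
phenotypes, metrics = ["Asthma"], ["AUROC"]
task_cols = [generate_column_name(p, m) for p in phenotypes for m in metrics]
print(toy[["T", "Model"] + sorted(task_cols)].columns.tolist())
# ['T', 'Model', 'Asthma (AUROC)']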
src/about.py CHANGED
@@ -3,17 +3,39 @@ from enum import Enum
 
 @dataclass
 class Task:
-    benchmark: str
-    metric: str
-    col_name: str
+    phenotype_key: str
+    phenotype_name: str
+    metric_key: str
+    metric_name: str
 
 
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task0 = Task("asthma", "Asthma", "auroc", "AUROC")
+    task1 = Task("cataract", "Cataract", "auroc", "AUROC")
+    task2 = Task("diabete", "Diabete", "auroc", "AUROC")
+    task3 = Task("GERD", "GERD", "auroc", "AUROC")
+    task4 = Task("hay-fever-eczema", "Hay-fever & Eczema", "auroc", "AUROC")
+    task5 = Task("hypertension", "Hypertension", "auroc", "AUROC")
+    task6 = Task("major-depression", "Major Depression", "auroc", "AUROC")
+    task7 = Task("migraine", "Migraine", "auroc", "AUROC")
+    task8 = Task("myocardial-infarction", "Myocardial Infarction", "auroc", "AUROC")
+    task9 = Task("osteoarthritis", "Osteoarthritis", "auroc", "AUROC")
+    task10 = Task("pneumonia", "Pneumonia", "auroc", "AUROC")
+    task11 = Task("stroke", "Stroke", "auroc", "AUROC")
+    task12 = Task("asthma", "Asthma", "auprc", "AUPRC")
+    task13 = Task("cataract", "Cataract", "auprc", "AUPRC")
+    task14 = Task("diabete", "Diabete", "auprc", "AUPRC")
+    task15 = Task("GERD", "GERD", "auprc", "AUPRC")
+    task16 = Task("hay-fever-eczema", "Hay-fever & Eczema", "auprc", "AUPRC")
+    task17 = Task("hypertension", "Hypertension", "auprc", "AUPRC")
+    task18 = Task("major-depression", "Major Depression", "auprc", "AUPRC")
+    task19 = Task("migraine", "Migraine", "auprc", "AUPRC")
+    task20 = Task("myocardial-infarction", "Myocardial Infarction", "auprc", "AUPRC")
+    task21 = Task("osteoarthritis", "Osteoarthritis", "auprc", "AUPRC")
+    task22 = Task("pneumonia", "Pneumonia", "auprc", "AUPRC")
+    task23 = Task("stroke", "Stroke", "auprc", "AUPRC")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -25,7 +47,14 @@ TITLE = """<h1 align="center" id="space-title">OpenHeLM Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+TODO:
+
+- Add a description of the leaderboard
+- Add class distribution for each phenotype
+- Potentially a warning when we should not rely on AUROC
+- Plot of AUROC and AUPRC for each phenotype
+- Edit about section
+- Edit submit section (AutoModelForCausalLM)
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
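Each Tasks member now carries both the key used to look up scores in a result file and the human-readable names used for display. A short sketch of that mapping, assuming the repository is on the Python path (the example strings follow directly from the definitions above and from how read_evals.py builds its keys):

from src.about import Tasks
from src.display.utils import generate_column_name

for t in list(Tasks)[:2]:
    result_key = "_".join([t.value.phenotype_key, t.value.metric_key])               # e.g. "asthma_auroc"
    column_name = generate_column_name(t.value.phenotype_name, t.value.metric_name)  # e.g. "Asthma (AUROC)"
    print(result_key, "->", column_name)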
src/display/utils.py CHANGED
@@ -2,12 +2,15 @@ from dataclasses import dataclass, make_dataclass
 from enum import Enum
 
 import pandas as pd
+from src.about import Task, Tasks
 
-from src.about import Tasks
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
+def generate_column_name(phenotype_name, metric_name):
+    return f"{phenotype_name} ({metric_name})"
+
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -19,6 +22,8 @@ class ColumnContent:
     displayed_by_default: bool
     hidden: bool = False
     never_hidden: bool = False
+    is_task: bool = False
+    task: Task = None
 
 ## Leaderboard columns
 auto_eval_column_dict = []
@@ -26,9 +31,10 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["average_auroc", ColumnContent, ColumnContent("Average AUROC ⬆️", "number", True)])
+auto_eval_column_dict.append(["average_auprc", ColumnContent, ColumnContent("Average AUPRC ⬆️", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(generate_column_name(task.value.phenotype_name, task.value.metric_name), "number", displayed_by_default=False, is_task=True, task=task)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -121,8 +127,6 @@
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
     "~1.5": pd.Interval(0, 2, closed="right"),
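The new is_task and task fields are what let app.py split the column pickers: per-task columns are hidden by default and surfaced only through the phenotype and metric checkboxes, while the remaining columns keep the plain column picker. A small sketch of that partition, assuming the repository is importable:

from src.display.utils import AutoEvalColumn, fields

# Columns flagged as tasks feed the phenotype/metric pickers...
task_cols  = [c.name for c in fields(AutoEvalColumn) if c.is_task]
# ...while the plain column picker only offers the non-task, non-fixed columns.
other_cols = [c.name for c in fields(AutoEvalColumn) if not c.is_task and not c.hidden and not c.never_hidden]

phenotypes = sorted({c.task.value.phenotype_name for c in fields(AutoEvalColumn) if c.is_task})
metrics    = sorted({c.task.value.metric_name for c in fields(AutoEvalColumn) if c.is_task})
print(len(task_cols), len(phenotypes), metrics)  # 24 task columns, 12 phenotypes, ['AUPRC', 'AUROC']
print(other_cols[:3])                            # e.g. ['Average AUROC ⬆️', 'Average AUPRC ⬆️', 'Type']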
src/envs.py CHANGED
@@ -6,12 +6,12 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("TOKEN") # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "TemryL" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+REPO_ID = f"{OWNER}/OpenHeLM-leaderboard"
+QUEUE_REPO = f"{OWNER}/OpenHeLM-requests"
+RESULTS_REPO = f"{OWNER}/OpenHeLM-results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
src/leaderboard/read_evals.py CHANGED
@@ -1,6 +1,5 @@
 import glob
 import json
-import math
 import os
 from dataclasses import dataclass
 
@@ -8,7 +7,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, generate_column_name
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -22,6 +21,7 @@ class EvalResult:
     model: str
     revision: str # commit hash, "" if main
     results: dict
+    raw_data: dict
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
@@ -58,7 +58,7 @@ class EvalResult:
         full_model = "/".join(org_and_model)
 
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False, token=os.environ.get("TOKEN")
         )
         architecture = "?"
         if model_config is not None:
@@ -70,14 +70,12 @@ class EvalResult:
         results = {}
         for task in Tasks:
            task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
+
+            mean = data["results"].get(task.phenotype_key, {}).get("_".join(["mean", task.metric_key]), None)
+            lower = data["results"].get(task.phenotype_key, {}).get("_".join(["lower", task.metric_key]), None)
+            upper = data["results"].get(task.phenotype_key, {}).get("_".join(["upper", task.metric_key]), None)
+            formated_score = f"{mean:.2f} ({lower:.2f}-{upper:.2f})" if mean is not None else None
+            results["_".join([task.phenotype_key, task.metric_key])] = formated_score
 
         return self(
             eval_name=result_key,
@@ -85,8 +83,9 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
+            raw_data=data,
+            precision=precision,
+            revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture
         )
@@ -109,7 +108,8 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        average_auroc = np.mean(np.array([d["mean_auroc"] for d in self.raw_data["results"].values() if "mean_auroc" in d.keys()]))
+        average_auprc = np.mean(np.array([d["mean_auprc"] for d in self.raw_data["results"].values() if "mean_auprc" in d.keys()]))
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -119,7 +119,8 @@ class EvalResult:
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
+            AutoEvalColumn.average_auroc.name: average_auroc,
+            AutoEvalColumn.average_auprc.name: average_auprc,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
@@ -127,8 +128,7 @@ class EvalResult:
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
+            data_dict[generate_column_name(task.value.phenotype_name, task.value.metric_name)] = self.results["_".join([task.value.phenotype_key, task.value.metric_key])]
         return data_dict
 
 
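The rewritten parsing loop expects each result file to group scores by phenotype key, with mean_*, lower_* and upper_* entries per metric, and the averages in to_dict are taken over the same mean_auroc/mean_auprc fields. An illustrative sketch of the "results" payload this assumes (the keys follow the phenotype_key/metric_key pairs from src/about.py; the numbers are made up):

# Hypothetical example of the "results" section of a result JSON.
results = {
    "asthma": {"mean_auroc": 0.81, "lower_auroc": 0.79, "upper_auroc": 0.83,
               "mean_auprc": 0.42, "lower_auprc": 0.40, "upper_auprc": 0.44},
    "stroke": {"mean_auroc": 0.77, "lower_auroc": 0.74, "upper_auroc": 0.80,
               "mean_auprc": 0.18, "lower_auprc": 0.15, "upper_auprc": 0.21},
}

# Same lookups and formatting as the parsing loop above.
mean = results.get("asthma", {}).get("mean_auroc", None)
lower = results.get("asthma", {}).get("lower_auroc", None)
upper = results.get("asthma", {}).get("upper_auroc", None)
print(f"{mean:.2f} ({lower:.2f}-{upper:.2f})")  # 0.81 (0.79-0.83)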
src/populate.py CHANGED
@@ -8,17 +8,15 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df.sort_values(by=[AutoEvalColumn.average_auroc.name], ascending=False)
     df = df[cols].round(decimals=2)
 
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
 
 
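With benchmark_cols gone from the signature and the NaN filter dropped, the module-level call in app.py reduces to the three remaining arguments and rows are ordered by the new average-AUROC column. A minimal usage sketch, assuming the Space's environment variables and local result files are in place:

from src.display.utils import COLS, AutoEvalColumn
from src.envs import EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH
from src.populate import get_leaderboard_df

raw_data, df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS)
# Highest average AUROC first, per the sort_values call above.
print(df[[AutoEvalColumn.model.name, AutoEvalColumn.average_auroc.name]].head())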