TemryL committed on
Commit b87a596 · 1 Parent(s): 35b35ef

select phenotypes and metrics

Files changed (6)
  1. app.py +50 -15
  2. src/about.py +36 -7
  3. src/display/utils.py +9 -5
  4. src/envs.py +4 -4
  5. src/leaderboard/read_evals.py +17 -17
  6. src/populate.py +2 -4
app.py CHANGED
@@ -1,4 +1,3 @@
-import subprocess
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -14,7 +13,6 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-    BENCHMARK_COLS,
     COLS,
     EVAL_COLS,
     EVAL_TYPES,
@@ -24,11 +22,16 @@ from src.display.utils import (
     ModelType,
     fields,
     WeightType,
-    Precision
+    Precision,
+    generate_column_name
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+from dotenv import load_dotenv
+
+
+load_dotenv()
 
 
 def restart_space():
@@ -50,7 +53,7 @@ except Exception:
     restart_space()
 
 
-raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS)
 leaderboard_df = original_df.copy()
 
 (
@@ -64,6 +67,8 @@ leaderboard_df = original_df.copy()
 def update_table(
     hidden_df: pd.DataFrame,
     columns: list,
+    phenotypes: list,
+    metrics: list,
     type_query: list,
     precision_query: str,
     size_query: list,
@@ -72,7 +77,7 @@ def update_table(
 ):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, filtered_df)
-    df = select_columns(filtered_df, columns)
+    df = select_columns(filtered_df, columns, phenotypes, metrics)
     return df


@@ -80,14 +85,19 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
     return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]


-def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+def select_columns(df: pd.DataFrame, columns: list, phenotypes: list, metrics: list) -> pd.DataFrame:
     always_here_cols = [
         AutoEvalColumn.model_type_symbol.name,
         AutoEvalColumn.model.name,
     ]
-    # We use COLS to maintain sorting
+
+    task_cols = []
+    for phenotype in phenotypes:
+        for metric in metrics:
+            task_cols.append(generate_column_name(phenotype, metric))
+
     filtered_df = df[
-        always_here_cols + [c for c in COLS if c in df.columns and c in columns]
+        always_here_cols + [c for c in COLS if c in df.columns and c in columns] + sorted(task_cols)
     ]
     return filtered_df

@@ -147,12 +157,34 @@ with demo:
                     show_label=False,
                     elem_id="search-bar",
                 )
+            with gr.Row():
+                with gr.Column(min_width=320):
+                    shown_phenotypes = gr.CheckboxGroup(
+                        choices=sorted(set([
+                            c.task.value.phenotype_name
+                            for c in fields(AutoEvalColumn)
+                            if not c.hidden and not c.never_hidden and c.is_task
+                        ])),
+                        label="Select phenotypes to show",
+                        elem_id="phenotype-select",
+                        interactive=True,
+                    )
+                    shown_metrics = gr.CheckboxGroup(
+                        choices=sorted(set([
+                            c.task.value.metric_name
+                            for c in fields(AutoEvalColumn)
+                            if not c.hidden and not c.never_hidden and c.is_task
+                        ])),
+                        label="Select metrics to show",
+                        elem_id="metric-select",
+                        interactive=True,
+                    )
             with gr.Row():
                 shown_columns = gr.CheckboxGroup(
                     choices=[
                         c.name
                         for c in fields(AutoEvalColumn)
-                        if not c.hidden and not c.never_hidden
+                        if not c.hidden and not c.never_hidden and not c.is_task
                     ],
                     value=[
                         c.name
@@ -163,12 +195,7 @@ with demo:
                     elem_id="column-select",
                     interactive=True,
                 )
-            with gr.Row():
-                deleted_models_visibility = gr.Checkbox(
-                    value=False, label="Show gated/private/deleted models", interactive=True
-                )
        with gr.Column(min_width=320):
-            #with gr.Box(elem_id="box-filter"):
            filter_columns_type = gr.CheckboxGroup(
                label="Model types",
                choices=[t.to_str() for t in ModelType],
@@ -190,6 +217,10 @@ with demo:
                interactive=True,
                elem_id="filter-columns-size",
            )
+            with gr.Row():
+                deleted_models_visibility = gr.Checkbox(
+                    value=True, label="Show gated/private/deleted models", interactive=True
+                )

    leaderboard_table = gr.components.Dataframe(
        value=leaderboard_df[
@@ -215,6 +246,8 @@ with demo:
        [
            hidden_leaderboard_table_for_search,
            shown_columns,
+            shown_phenotypes,
+            shown_metrics,
            filter_columns_type,
            filter_columns_precision,
            filter_columns_size,
@@ -223,12 +256,14 @@ with demo:
        ],
        leaderboard_table,
    )
-    for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
+    for selector in [shown_phenotypes, shown_metrics, shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
        selector.change(
            update_table,
            [
                hidden_leaderboard_table_for_search,
                shown_columns,
+                shown_phenotypes,
+                shown_metrics,
                filter_columns_type,
                filter_columns_precision,
                filter_columns_size,
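Note: a minimal standalone sketch of the column-selection logic introduced above, showing how the phenotype and metric checkbox values are crossed into task columns via generate_column_name. Sample data and values are illustrative only, not taken from the repository; the real select_columns also restricts the extra columns to COLS.

import pandas as pd

def generate_column_name(phenotype_name: str, metric_name: str) -> str:
    return f"{phenotype_name} ({metric_name})"

# Toy leaderboard frame with the always-shown columns plus a few task columns.
df = pd.DataFrame({
    "T": ["🟢"],
    "Model": ["org/some-model"],
    "Asthma (AUROC)": ["0.71 (0.69-0.73)"],
    "Asthma (AUPRC)": ["0.18 (0.16-0.20)"],
    "Stroke (AUROC)": ["0.66 (0.63-0.69)"],
})

def select_columns(df, columns, phenotypes, metrics):
    always_here_cols = ["T", "Model"]
    # Cross every selected phenotype with every selected metric, as in the diff above.
    task_cols = [generate_column_name(p, m) for p in phenotypes for m in metrics]
    return df[always_here_cols + columns + sorted(task_cols)]

# Selecting one phenotype and both metrics keeps only its AUROC and AUPRC columns.
print(select_columns(df, [], ["Asthma"], ["AUROC", "AUPRC"]))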
src/about.py CHANGED
@@ -3,17 +3,39 @@ from enum import Enum
 
 @dataclass
 class Task:
-    benchmark: str
-    metric: str
-    col_name: str
+    phenotype_key: str
+    phenotype_name: str
+    metric_key: str
+    metric_name: str
 
 
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task0 = Task("asthma", "Asthma", "auroc", "AUROC")
+    task1 = Task("cataract", "Cataract", "auroc", "AUROC")
+    task2 = Task("diabete", "Diabete", "auroc", "AUROC")
+    task3 = Task("GERD", "GERD", "auroc", "AUROC")
+    task4 = Task("hay-fever-eczema", "Hay-fever & Eczema", "auroc", "AUROC")
+    task5 = Task("hypertension", "Hypertension", "auroc", "AUROC")
+    task6 = Task("major-depression", "Major Depression", "auroc", "AUROC")
+    task7 = Task("migraine", "Migraine", "auroc", "AUROC")
+    task8 = Task("myocardial-infarction", "Myocardial Infarction", "auroc", "AUROC")
+    task9 = Task("osteoarthritis", "Osteoarthritis", "auroc", "AUROC")
+    task10 = Task("pneumonia", "Pneumonia", "auroc", "AUROC")
+    task11 = Task("stroke", "Stroke", "auroc", "AUROC")
+    task12 = Task("asthma", "Asthma", "auprc", "AUPRC")
+    task13 = Task("cataract", "Cataract", "auprc", "AUPRC")
+    task14 = Task("diabete", "Diabete", "auprc", "AUPRC")
+    task15 = Task("GERD", "GERD", "auprc", "AUPRC")
+    task16 = Task("hay-fever-eczema", "Hay-fever & Eczema", "auprc", "AUPRC")
+    task17 = Task("hypertension", "Hypertension", "auprc", "AUPRC")
+    task18 = Task("major-depression", "Major Depression", "auprc", "AUPRC")
+    task19 = Task("migraine", "Migraine", "auprc", "AUPRC")
+    task20 = Task("myocardial-infarction", "Myocardial Infarction", "auprc", "AUPRC")
+    task21 = Task("osteoarthritis", "Osteoarthritis", "auprc", "AUPRC")
+    task22 = Task("pneumonia", "Pneumonia", "auprc", "AUPRC")
+    task23 = Task("stroke", "Stroke", "auprc", "AUPRC")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -25,7 +47,14 @@ TITLE = """<h1 align="center" id="space-title">OpenHeLM Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+TODO:
+
+- Add a description of the leaderboard
+- Add class distribution for each phenotype
+- Potentially a warning when we should not rely on AUROC
+- Plot of AUROC and AUPRC for each phenotype
+- Edit about section
+- Edit submit section (AutoModelForCausalLM)
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
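Note: the 24 Tasks entries above are simply the cross product of 12 phenotypes and 2 metrics (AUROC, AUPRC). The commit writes them out explicitly; the sketch below is only an equivalent enumeration for clarity, using the same keys and display names.

phenotypes = {
    "asthma": "Asthma", "cataract": "Cataract", "diabete": "Diabete", "GERD": "GERD",
    "hay-fever-eczema": "Hay-fever & Eczema", "hypertension": "Hypertension",
    "major-depression": "Major Depression", "migraine": "Migraine",
    "myocardial-infarction": "Myocardial Infarction", "osteoarthritis": "Osteoarthritis",
    "pneumonia": "Pneumonia", "stroke": "Stroke",
}
metrics = {"auroc": "AUROC", "auprc": "AUPRC"}

# One (phenotype, metric) pair per Tasks member, AUROC entries first, then AUPRC.
pairs = [(pk, pn, mk, mn) for mk, mn in metrics.items() for pk, pn in phenotypes.items()]
assert len(pairs) == 24  # matches task0..task23 above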
src/display/utils.py CHANGED
@@ -2,12 +2,15 @@ from dataclasses import dataclass, make_dataclass
 from enum import Enum
 
 import pandas as pd
+from src.about import Task, Tasks
 
-from src.about import Tasks
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
+def generate_column_name(phenotype_name, metric_name):
+    return f"{phenotype_name} ({metric_name})"
+
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -19,6 +22,8 @@ class ColumnContent:
     displayed_by_default: bool
     hidden: bool = False
     never_hidden: bool = False
+    is_task: bool = False
+    task: Task = None
 
 ## Leaderboard columns
 auto_eval_column_dict = []
@@ -26,9 +31,10 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["average_auroc", ColumnContent, ColumnContent("Average AUROC ⬆️", "number", True)])
+auto_eval_column_dict.append(["average_auprc", ColumnContent, ColumnContent("Average AUPRC ⬆️", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(generate_column_name(task.value.phenotype_name, task.value.metric_name), "number", displayed_by_default=False, is_task=True, task=task)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
@@ -121,8 +127,6 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
     "~1.5": pd.Interval(0, 2, closed="right"),
src/envs.py CHANGED
@@ -6,12 +6,12 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("TOKEN") # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "TemryL" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+REPO_ID = f"{OWNER}/OpenHeLM-leaderboard"
+QUEUE_REPO = f"{OWNER}/OpenHeLM-requests"
+RESULTS_REPO = f"{OWNER}/OpenHeLM-results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
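Note: app.py now calls load_dotenv() before anything else, so the TOKEN that envs.py and read_evals.py look up can come from a local .env file during development. The snippet below is a hypothetical local setup, not part of the commit; it assumes the python-dotenv package is installed and the token value is a placeholder.

# .env (example placeholder only)
#   TOKEN=hf_xxxxxxxxxxxxxxxxx

from dotenv import load_dotenv
import os

load_dotenv()                      # same call added at the top of app.py
token = os.environ.get("TOKEN")    # same lookup as in src/envs.py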
src/leaderboard/read_evals.py CHANGED
@@ -1,6 +1,5 @@
 import glob
 import json
-import math
 import os
 from dataclasses import dataclass
 
@@ -8,7 +7,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, generate_column_name
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -22,6 +21,7 @@ class EvalResult:
     model: str
     revision: str # commit hash, "" if main
     results: dict
+    raw_data: dict
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
@@ -58,7 +58,7 @@ class EvalResult:
         full_model = "/".join(org_and_model)
 
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False, token=os.environ.get("TOKEN")
         )
         architecture = "?"
         if model_config is not None:
@@ -70,14 +70,12 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
-
-            # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
+
+            mean = data["results"].get(task.phenotype_key, {}).get("_".join(["mean", task.metric_key]), None)
+            lower = data["results"].get(task.phenotype_key, {}).get("_".join(["lower", task.metric_key]), None)
+            upper = data["results"].get(task.phenotype_key, {}).get("_".join(["upper", task.metric_key]), None)
+            formated_score = f"{mean:.2f} ({lower:.2f}-{upper:.2f})" if mean is not None else None
+            results["_".join([task.phenotype_key, task.metric_key])] = formated_score
 
         return self(
             eval_name=result_key,
@@ -85,8 +83,9 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
+            raw_data=data,
+            precision=precision,
+            revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture
         )
@@ -109,7 +108,8 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        average_auroc = np.mean(np.array([d["mean_auroc"] for d in self.raw_data["results"].values() if "mean_auroc" in d.keys()]))
+        average_auprc = np.mean(np.array([d["mean_auprc"] for d in self.raw_data["results"].values() if "mean_auprc" in d.keys()]))
         data_dict = {
            "eval_name": self.eval_name, # not a column, just a save name,
            AutoEvalColumn.precision.name: self.precision.value.name,
@@ -119,7 +119,8 @@ class EvalResult:
            AutoEvalColumn.architecture.name: self.architecture,
            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
            AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
+            AutoEvalColumn.average_auroc.name: average_auroc,
+            AutoEvalColumn.average_auprc.name: average_auprc,
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.likes.name: self.likes,
            AutoEvalColumn.params.name: self.num_params,
@@ -127,8 +128,7 @@ class EvalResult:
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
-
+            data_dict[generate_column_name(task.value.phenotype_name, task.value.metric_name)] = self.results["_".join([task.value.phenotype_key, task.value.metric_key])]
         return data_dict
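Note: after this change, each per-task cell is a formatted "mean (lower-upper)" string taken from the results JSON, while the two leaderboard averages are computed from the raw mean_auroc/mean_auprc values. A minimal sketch with made-up numbers, following the same key layout assumed by the diff above:

import numpy as np

data = {"results": {
    "asthma": {"mean_auroc": 0.71, "lower_auroc": 0.69, "upper_auroc": 0.73,
               "mean_auprc": 0.18, "lower_auprc": 0.16, "upper_auprc": 0.20},
    "stroke": {"mean_auroc": 0.66, "lower_auroc": 0.63, "upper_auroc": 0.69},
}}

# Per-task display string: point estimate plus its interval, e.g. "0.71 (0.69-0.73)".
entry = data["results"]["asthma"]
score = f"{entry['mean_auroc']:.2f} ({entry['lower_auroc']:.2f}-{entry['upper_auroc']:.2f})"

# Averages skip phenotypes that do not report the metric (stroke has no AUPRC here).
average_auroc = np.mean([d["mean_auroc"] for d in data["results"].values() if "mean_auroc" in d])
average_auprc = np.mean([d["mean_auprc"] for d in data["results"].values() if "mean_auprc" in d])
print(score, round(average_auroc, 2), round(average_auprc, 2))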
 
src/populate.py CHANGED
@@ -8,17 +8,15 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df.sort_values(by=[AutoEvalColumn.average_auroc.name], ascending=False)
     df = df[cols].round(decimals=2)
 
-    # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
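Note: get_leaderboard_df now ranks rows by the Average AUROC column and no longer drops models with missing benchmark values, which is consistent with per-task cells being formatted interval strings rather than numbers. A sketch of the resulting flow with stand-in records (real records come from EvalResult.to_dict()):

import pandas as pd

records = [
    {"Model": "org/model-a", "Average AUROC ⬆️": 0.712, "Average AUPRC ⬆️": 0.214},
    {"Model": "org/model-b", "Average AUROC ⬆️": 0.683, "Average AUPRC ⬆️": 0.241},
]
cols = ["Model", "Average AUROC ⬆️", "Average AUPRC ⬆️"]

df = pd.DataFrame.from_records(records)
df = df.sort_values(by=["Average AUROC ⬆️"], ascending=False)  # ranking column after this commit
df = df[cols].round(decimals=2)
print(df)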