TemryL committed on
Commit
8d8ba34
·
1 Parent(s): d2750fc

update leaderboard

Browse files
Files changed (6) hide show
  1. app.py +61 -45
  2. src/about.py +23 -29
  3. src/display/utils.py +1 -1
  4. src/envs.py +1 -2
  5. src/leaderboard/read_evals.py +7 -10
  6. src/populate.py +3 -5
app.py CHANGED
@@ -168,7 +168,7 @@ with demo:
168
  with gr.Column(min_width=320):
169
  shown_phenotypes = gr.CheckboxGroup(
170
  choices=sorted(set([
171
- c.task.value.phenotype_name
172
  for c in fields(AutoEvalColumn)
173
  if not c.hidden and not c.never_hidden and c.is_task
174
  ])),
@@ -178,12 +178,12 @@ with demo:
178
  )
179
  shown_metrics = gr.CheckboxGroup(
180
  choices=sorted(set([
181
- c.task.value.metric_name
182
  for c in fields(AutoEvalColumn)
183
  if not c.hidden and not c.never_hidden and c.is_task
184
  ])),
185
  value=sorted(set([
186
- c.task.value.metric_name
187
  for c in fields(AutoEvalColumn)
188
  if not c.hidden and not c.never_hidden and c.is_task
189
  ])),
@@ -212,42 +212,41 @@ with demo:
212
  value=True, label="Show gated/private/deleted models", interactive=True
213
  )
214
  with gr.Column(min_width=320):
215
- with gr.Column(min_width=320):
216
- filter_features = gr.CheckboxGroup(
217
- label="Features Set",
218
- choices=[("Baseline (age, sex, BMI)", "baseline"), ("Expanded (age, sex, BMI, HDL, LDL, total-cholesterol, triglycerides, diastolic-blood-pressure, smoking-status, snoring, insomnia, daytime-napping, sleep-duration, chronotype)", "expanded")],
219
- value=["baseline"],
220
- interactive=True,
221
- elem_id="filter-feature-set",
222
- )
223
- filter_nb_shots = gr.CheckboxGroup(
224
- label="Number of shots",
225
- choices=[("Zero-shot", 0), ("10-shot", 10), ("All", -1)],
226
- value=[0],
227
- interactive=True,
228
- elem_id="filter-nb-shots",
229
- )
230
- filter_columns_type = gr.CheckboxGroup(
231
- label="Model types",
232
- choices=[t.to_str() for t in ModelType],
233
- value=[t.to_str() for t in ModelType],
234
- interactive=True,
235
- elem_id="filter-columns-type",
236
- )
237
- filter_columns_precision = gr.CheckboxGroup(
238
- label="Precision",
239
- choices=[i.value.name for i in Precision],
240
- value=[i.value.name for i in Precision],
241
- interactive=True,
242
- elem_id="filter-columns-precision",
243
- )
244
- filter_columns_size = gr.CheckboxGroup(
245
- label="Model sizes (in billions of parameters)",
246
- choices=list(NUMERIC_INTERVALS.keys()),
247
- value=list(NUMERIC_INTERVALS.keys()),
248
- interactive=True,
249
- elem_id="filter-columns-size",
250
- )
251
 
252
  leaderboard_table = gr.components.Dataframe(
253
  value=leaderboard_df[
@@ -260,12 +259,6 @@ with demo:
260
  interactive=False,
261
  visible=True,
262
  )
263
-
264
- # Plotting the curves
265
- # gr.Plot(
266
- # plot_curves(),
267
- # elem_id="plot-curves"
268
- # )
269
 
270
  # Dummy leaderboard for handling the case when the user uses backspace key
271
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
@@ -309,6 +302,29 @@ with demo:
309
  queue=True,
310
  )
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
313
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
314
 
 
168
  with gr.Column(min_width=320):
169
  shown_phenotypes = gr.CheckboxGroup(
170
  choices=sorted(set([
171
+ c.task.value.phenotype
172
  for c in fields(AutoEvalColumn)
173
  if not c.hidden and not c.never_hidden and c.is_task
174
  ])),
 
178
  )
179
  shown_metrics = gr.CheckboxGroup(
180
  choices=sorted(set([
181
+ c.task.value.metric.upper()
182
  for c in fields(AutoEvalColumn)
183
  if not c.hidden and not c.never_hidden and c.is_task
184
  ])),
185
  value=sorted(set([
186
+ c.task.value.metric.upper()
187
  for c in fields(AutoEvalColumn)
188
  if not c.hidden and not c.never_hidden and c.is_task
189
  ])),
 
212
  value=True, label="Show gated/private/deleted models", interactive=True
213
  )
214
  with gr.Column(min_width=320):
215
+ filter_features = gr.CheckboxGroup(
216
+ label="Features Set",
217
+ choices=[("Baseline (Age, Sex, BMI)", "baseline"), ("Expanded (Age, Sex, BMI, HDL, LDL, Total cholesterol, Triglycerides, Diastolic blood pressure, Smoking status, Snoring, Insomnia, Daytime napping, Sleep duration, Chronotype)", "expanded")],
218
+ value=["baseline", "expanded"],
219
+ interactive=True,
220
+ elem_id="filter-feature-set",
221
+ )
222
+ filter_nb_shots = gr.CheckboxGroup(
223
+ label="Number of shots",
224
+ choices=[("Zero-Shot", 0), ("10-Shot", 10), ("All", -1)],
225
+ value=[0],
226
+ interactive=True,
227
+ elem_id="filter-nb-shots",
228
+ )
229
+ filter_columns_type = gr.CheckboxGroup(
230
+ label="Model types",
231
+ choices=[t.to_str() for t in ModelType],
232
+ value=[t.to_str() for t in ModelType],
233
+ interactive=True,
234
+ elem_id="filter-columns-type",
235
+ )
236
+ filter_columns_precision = gr.CheckboxGroup(
237
+ label="Precision",
238
+ choices=[i.value.name for i in Precision],
239
+ value=[i.value.name for i in Precision],
240
+ interactive=True,
241
+ elem_id="filter-columns-precision",
242
+ )
243
+ filter_columns_size = gr.CheckboxGroup(
244
+ label="Model sizes (in billions of parameters)",
245
+ choices=list(NUMERIC_INTERVALS.keys()),
246
+ value=list(NUMERIC_INTERVALS.keys()),
247
+ interactive=True,
248
+ elem_id="filter-columns-size",
249
+ )
 
250
 
251
  leaderboard_table = gr.components.Dataframe(
252
  value=leaderboard_df[
 
259
  interactive=False,
260
  visible=True,
261
  )
 
 
 
 
 
 
262
 
263
  # Dummy leaderboard for handling the case when the user uses backspace key
264
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
 
302
  queue=True,
303
  )
304
 
305
+ # with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
306
+ # with gr.Row():
307
+ # gr.Plot(
308
+ # plot_curves(),
309
+ # elem_id="plot-curves"
310
+ # )
311
+ # with gr.Column():
312
+ # plot_df = load_and_create_plots()
313
+ # chart = create_metric_plot_obj(
314
+ # plot_df,
315
+ # [AutoEvalColumn.average.name],
316
+ # title="Average of Top Scores and Human Baseline Over Time (from last update)",
317
+ # )
318
+ # gr.Plot(value=chart, min_width=500)
319
+ # with gr.Column():
320
+ # plot_df = load_and_create_plots()
321
+ # chart = create_metric_plot_obj(
322
+ # plot_df,
323
+ # BENCHMARK_COLS,
324
+ # title="Top Scores and Human Baseline Over Time (from last update)",
325
+ # )
326
+ # gr.Plot(value=chart, min_width=500)
327
+
328
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
329
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
330
 
src/about.py CHANGED
@@ -3,44 +3,38 @@ from enum import Enum
3
 
4
  @dataclass
5
  class Task:
6
- phenotype_key: str
7
- phenotype_name: str
8
- metric_key: str
9
- metric_name: str
10
 
11
 
12
  # Select your tasks here
13
  # ---------------------------------------------------
14
  class Tasks(Enum):
15
- task0 = Task("asthma", "Asthma", "auroc", "AUROC")
16
- task1 = Task("cataract", "Cataract", "auroc", "AUROC")
17
- task2 = Task("diabete", "Diabete", "auroc", "AUROC")
18
- task3 = Task("GERD", "GERD", "auroc", "AUROC")
19
- task4 = Task("hay-fever-eczema", "Hay-fever & Eczema", "auroc", "AUROC")
20
- task5 = Task("hypertension", "Hypertension", "auroc", "AUROC")
21
- task6 = Task("major-depression", "Major Depression", "auroc", "AUROC")
22
- task7 = Task("migraine", "Migraine", "auroc", "AUROC")
23
- task8 = Task("myocardial-infarction", "Myocardial Infarction", "auroc", "AUROC")
24
- task9 = Task("osteoarthritis", "Osteoarthritis", "auroc", "AUROC")
25
- task10 = Task("pneumonia", "Pneumonia", "auroc", "AUROC")
26
- task11 = Task("stroke", "Stroke", "auroc", "AUROC")
27
- task12 = Task("asthma", "Asthma", "auprc", "AUPRC")
28
- task13 = Task("cataract", "Cataract", "auprc", "AUPRC")
29
- task14 = Task("diabete", "Diabete", "auprc", "AUPRC")
30
- task15 = Task("GERD", "GERD", "auprc", "AUPRC")
31
- task16 = Task("hay-fever-eczema", "Hay-fever & Eczema", "auprc", "AUPRC")
32
- task17 = Task("hypertension", "Hypertension", "auprc", "AUPRC")
33
- task18 = Task("major-depression", "Major Depression", "auprc", "AUPRC")
34
- task19 = Task("migraine", "Migraine", "auprc", "AUPRC")
35
- task20 = Task("myocardial-infarction", "Myocardial Infarction", "auprc", "AUPRC")
36
- task21 = Task("osteoarthritis", "Osteoarthritis", "auprc", "AUPRC")
37
- task22 = Task("pneumonia", "Pneumonia", "auprc", "AUPRC")
38
- task23 = Task("stroke", "Stroke", "auprc", "AUPRC")
39
  # ---------------------------------------------------
40
 
41
 
42
  # Your leaderboard name
43
- TITLE = """<h1 align="center" id="space-title">LLM Disease Risk Prediction Leaderboard</h1>"""
44
 
45
  # What does your leaderboard evaluate?
46
  INTRODUCTION_TEXT = """
 
3
 
4
  @dataclass
5
  class Task:
6
+ phenotype: str
7
+ metric: str
 
 
8
 
9
 
10
  # Select your tasks here
11
  # ---------------------------------------------------
12
  class Tasks(Enum):
13
+ task0 = Task("Asthma", "auroc")
14
+ task1 = Task("Cataract", "auroc")
15
+ task2 = Task("Diabetes", "auroc")
16
+ task3 = Task("GERD", "auroc")
17
+ task4 = Task("Hay-fever & Eczema", "auroc")
18
+ task5 = Task("Major depression", "auroc")
19
+ task6 = Task("Myocardial infarction", "auroc")
20
+ task7 = Task("Osteoarthritis", "auroc")
21
+ task8 = Task("Pneumonia", "auroc")
22
+ task9 = Task("Stroke", "auroc")
23
+ task10 = Task("Asthma", "auprc")
24
+ task11 = Task("Cataract", "auprc")
25
+ task12 = Task("Diabetes", "auprc")
26
+ task13 = Task("GERD", "auprc")
27
+ task14 = Task("Hay-fever & Eczema", "auprc")
28
+ task15 = Task("Major depression", "auprc")
29
+ task16 = Task("Myocardial infarction", "auprc")
30
+ task17 = Task("Osteoarthritis", "auprc")
31
+ task18 = Task("Pneumonia", "auprc")
32
+ task19 = Task("Stroke", "auprc")
 
 
 
 
33
  # ---------------------------------------------------
34
 
35
 
36
  # Your leaderboard name
37
+ TITLE = """<h1 align="center" id="space-title">LLMs Disease Risk Prediction Leaderboard</h1>"""
38
 
39
  # What does your leaderboard evaluate?
40
  INTRODUCTION_TEXT = """
src/display/utils.py CHANGED
@@ -36,7 +36,7 @@ auto_eval_column_dict.append(["nb_shots", ColumnContent, ColumnContent("#Shots",
36
  auto_eval_column_dict.append(["average_auroc", ColumnContent, ColumnContent("Average AUROC ⬆️", "number", True)])
37
  auto_eval_column_dict.append(["average_auprc", ColumnContent, ColumnContent("Average AUPRC ⬆️", "number", True)])
38
  for task in Tasks:
39
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(generate_column_name(task.value.phenotype_name, task.value.metric_name), "number", displayed_by_default=False, is_task=True, task=task)])
40
  # Model information
41
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
42
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 
36
  auto_eval_column_dict.append(["average_auroc", ColumnContent, ColumnContent("Average AUROC ⬆️", "number", True)])
37
  auto_eval_column_dict.append(["average_auprc", ColumnContent, ColumnContent("Average AUPRC ⬆️", "number", True)])
38
  for task in Tasks:
39
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(generate_column_name(task.value.phenotype, task.value.metric.upper()), "number", displayed_by_default=False, is_task=True, task=task)])
40
  # Model information
41
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
42
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
src/envs.py CHANGED
@@ -1,11 +1,10 @@
1
  import os
2
-
3
  from huggingface_hub import HfApi
4
 
 
5
  # Info to change for your repository
6
  # ----------------------------------
7
  HF_TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
-
9
  OWNER = "TemryL" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
 
1
  import os
 
2
  from huggingface_hub import HfApi
3
 
4
+
5
  # Info to change for your repository
6
  # ----------------------------------
7
  HF_TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
8
  OWNER = "TemryL" # Change to your org - don't forget to create a results and request dataset, with the correct format!
9
  # ----------------------------------
10
 
src/leaderboard/read_evals.py CHANGED
@@ -1,11 +1,9 @@
 
1
  import glob
2
  import json
3
- import os
4
- from dataclasses import dataclass
5
-
6
  import dateutil
7
  import numpy as np
8
-
9
  from src.display.formatting import make_clickable_model
10
  from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, generate_column_name
11
  from src.submission.check_validity import is_model_on_hub
@@ -69,12 +67,11 @@ class EvalResult:
69
  results = {}
70
  for task in Tasks:
71
  task = task.value
72
-
73
- mean = data["results"].get(task.phenotype_key, {}).get("metrics", {}).get("_".join(["mean", task.metric_key]), None)
74
- lower = data["results"].get(task.phenotype_key, {}).get("metrics", {}).get("_".join(["lower", task.metric_key]), None)
75
- upper = data["results"].get(task.phenotype_key, {}).get("metrics", {}).get("_".join(["upper", task.metric_key]), None)
76
  formated_score = f"{mean:.2f} ({lower:.2f}-{upper:.2f})" if mean is not None else None
77
- results["_".join([task.phenotype_key, task.metric_key])] = formated_score
78
 
79
  return self(
80
  eval_name=f"{org}_{model}_{precision.value.name}_{feature_set}_{nb_shots}",
@@ -121,7 +118,7 @@ class EvalResult:
121
  }
122
 
123
  for task in Tasks:
124
- data_dict[generate_column_name(task.value.phenotype_name, task.value.metric_name)] = self.results["_".join([task.value.phenotype_key, task.value.metric_key])]
125
  return data_dict
126
 
127
 
 
1
+ import os
2
  import glob
3
  import json
 
 
 
4
  import dateutil
5
  import numpy as np
6
+ from dataclasses import dataclass
7
  from src.display.formatting import make_clickable_model
8
  from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, generate_column_name
9
  from src.submission.check_validity import is_model_on_hub
 
67
  results = {}
68
  for task in Tasks:
69
  task = task.value
70
+ mean = data["results"].get(task.phenotype, {}).get("metrics", {}).get("_".join(["mean", task.metric]), None)
71
+ lower = data["results"].get(task.phenotype, {}).get("metrics", {}).get("_".join(["lower", task.metric]), None)
72
+ upper = data["results"].get(task.phenotype, {}).get("metrics", {}).get("_".join(["upper", task.metric]), None)
 
73
  formated_score = f"{mean:.2f} ({lower:.2f}-{upper:.2f})" if mean is not None else None
74
+ results["_".join([task.phenotype, task.metric])] = formated_score
75
 
76
  return self(
77
  eval_name=f"{org}_{model}_{precision.value.name}_{feature_set}_{nb_shots}",
 
118
  }
119
 
120
  for task in Tasks:
121
+ data_dict[generate_column_name(task.value.phenotype, task.value.metric.upper())] = self.results["_".join([task.value.phenotype, task.value.metric])]
122
  return data_dict
123
 
124
 
src/populate.py CHANGED
@@ -1,11 +1,9 @@
1
- import json
2
  import os
3
-
4
  import pandas as pd
5
-
6
- from src.display.formatting import has_no_nan_values, make_clickable_model
7
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
9
 
10
 
11
  def get_leaderboard_df(results_path: str, cols: list) -> pd.DataFrame:
 
 
1
  import os
2
+ import json
3
  import pandas as pd
4
+ from src.display.formatting import make_clickable_model
 
5
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
6
+ from src.leaderboard.read_evals import get_raw_eval_results
7
 
8
 
9
  def get_leaderboard_df(results_path: str, cols: list) -> pd.DataFrame: