TemryL committed
Commit 4c7aa51 · 1 Parent(s): 225ae73

add nb_shots and make request work

app.py CHANGED
@@ -25,6 +25,7 @@ from src.display.utils import (
     Precision,
     generate_column_name
 )
+from src.display.plot_curves import plot_curves
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, HF_TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
@@ -53,7 +54,7 @@ except Exception:
     restart_space()
 
 
-raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS)
+results, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS)
 leaderboard_df = original_df.copy()
 
 (
@@ -89,6 +90,7 @@ def select_columns(df: pd.DataFrame, columns: list, phenotypes: list, metrics: list
     always_here_cols = [
         AutoEvalColumn.model_type_symbol.name,
         AutoEvalColumn.model.name,
+        AutoEvalColumn.nb_shots.name,
     ]
 
     task_cols = []
@@ -197,6 +199,13 @@ with demo:
                         elem_id="search-bar",
                     )
                 with gr.Column(min_width=320):
+                    filter_nb_shots = gr.CheckboxGroup(
+                        label="Number of shots",
+                        choices=["Zero-shot", "10-shot", "All"],
+                        value=["Zero-shot", "10-shot", "All"],
+                        interactive=True,
+                        elem_id="filter-nb-shots",
+                    )
                     filter_columns_type = gr.CheckboxGroup(
                         label="Model types",
                         choices=[t.to_str() for t in ModelType],
@@ -234,6 +243,12 @@ with demo:
                 interactive=False,
                 visible=True,
             )
+
+            # Plotting the curves
+            # gr.Plot(
+            #     plot_curves(),
+            #     elem_id="plot-curves"
+            # )
 
             # Dummy leaderboard for handling the case when the user uses backspace key
             hidden_leaderboard_table_for_search = gr.components.Dataframe(
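
Note: the commit adds the `filter_nb_shots` CheckboxGroup but this hunk does not show it being wired into the table refresh. A minimal sketch of how the selection could be applied, assuming the `#Shots` column added in src/display/utils.py below; the helper name and label-to-count mapping are assumptions, not code from this diff:

import pandas as pd

NB_SHOTS_LABELS = {"Zero-shot": 0, "10-shot": 10}  # "All" keeps every row

def filter_nb_shots_df(df: pd.DataFrame, selected: list[str]) -> pd.DataFrame:
    """Keep rows whose '#Shots' value matches one of the selected checkbox labels."""
    if "All" in selected:
        return df
    allowed = {NB_SHOTS_LABELS[s] for s in selected if s in NB_SHOTS_LABELS}
    return df[df["#Shots"].isin(allowed)]

In the Space this would presumably be attached like the other filters, via something along the lines of filter_nb_shots.change(update_table, [...], leaderboard_table).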
requirements.txt CHANGED
@@ -15,4 +15,6 @@ transformers==4.35.2
 tokenizers>=0.15.0
 git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
 accelerate==0.24.1
-sentencepiece
+sentencepiece
+python-dotenv==1.0.1
+plotly==5.22.0
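
The plotly pin backs the new `plot_curves` import in app.py (the gr.Plot call is still commented out there). src/display/plot_curves.py itself is not part of this commit, so the following is only a sketch of what such a module might look like, assuming it receives named (x, y) series:

import plotly.graph_objects as go

def plot_curves(curves: dict[str, tuple[list[float], list[float]]] | None = None) -> go.Figure:
    """Return a Figure with one line trace per named curve; callers pass {name: (x, y)}."""
    fig = go.Figure()
    for name, (x, y) in (curves or {}).items():
        fig.add_trace(go.Scatter(x=x, y=y, mode="lines", name=name))
    # Axis names are guesses: the leaderboard reports AUROC/AUPRC, so ROC or
    # precision-recall curves seem likely, but the real module may differ.
    fig.update_layout(xaxis_title="x", yaxis_title="y", legend_title="Model")
    return fig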
src/display/utils.py CHANGED
@@ -30,6 +30,7 @@ auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["nb_shots", ColumnContent, ColumnContent("#Shots", "number", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average_auroc", ColumnContent, ColumnContent("Average AUROC ⬆️", "number", True)])
 auto_eval_column_dict.append(["average_auprc", ColumnContent, ColumnContent("Average AUPRC ⬆️", "number", True)])
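
For context: `AutoEvalColumn.nb_shots.name` resolving to the header string "#Shots" relies on `auto_eval_column_dict` being folded into a frozen dataclass, as in the stock Hugging Face leaderboard template. A self-contained sketch of that mechanism, assuming this Space follows the template (ColumnContent is redefined here only to make the snippet runnable):

from dataclasses import dataclass, field, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = [
    ["nb_shots", ColumnContent, ColumnContent("#Shots", "number", True, never_hidden=True)],
]

AutoEvalColumn = make_dataclass(
    "AutoEvalColumn",
    [(name, typ, field(default=default)) for name, typ, default in auto_eval_column_dict],
    frozen=True,
)

assert AutoEvalColumn.nb_shots.name == "#Shots"  # the string used as the dataframe column header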
src/leaderboard/read_evals.py CHANGED
@@ -15,13 +15,14 @@ from src.submission.check_validity import is_model_on_hub
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
     """
-    eval_name: str # org_model_precision (uid)
+    eval_name: str # org_model_precision_feature-set_nb-shots (uid)
     full_model: str # org/model (path on hub)
     org: str
     model: str
     revision: str # commit hash, "" if main
     results: dict
     raw_data: dict
+    nb_shots: int = 0
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
@@ -37,28 +38,26 @@ class EvalResult:
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
-
+
+        # Get config
         config = data.get("config")
-
-        # Precision
-        precision = Precision.from_str(config.get("model_dtype"))
-
-        # Get model and org
-        org_and_model = config.get("model_name", config.get("model_args", None))
-        org_and_model = org_and_model.split("/", 1)
-
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}"
-        full_model = "/".join(org_and_model)
-
+        full_model = config.get("model")
+        org = full_model.split("/")[0]
+        model = full_model.split("/")[1]
+        precision = Precision.from_str(config.get("precision"))
+        revision = config.get("revision", "")
+        feature_set = config.get("feature_set", "Unknown")
+        nb_shots = config.get("nb_shots", None)
+        model_type = ModelType.from_str(config.get("model_type", ""))
+        weight_type = WeightType[config.get("weight_type", "Original")]
+        license = config.get("license", "?")
+        likes = config.get("likes", 0)
+        num_params = config.get("params", 0)
+        date = config.get("submitted_time", "")
+
+        # Check if model is still on hub
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False, token=os.environ.get("TOKEN")
+            full_model, revision, trust_remote_code=True, test_tokenizer=False, token=os.environ.get("TOKEN")
         )
         architecture = "?"
         if model_config is not None:
@@ -66,52 +65,43 @@ class EvalResult:
             if architectures:
                 architecture = ";".join(architectures)
 
-        # Extract results available in this file (some results are split in several files)
         results = {}
         for task in Tasks:
             task = task.value
 
-            mean = data["results"].get(task.phenotype_key, {}).get("_".join(["mean", task.metric_key]), None)
-            lower = data["results"].get(task.phenotype_key, {}).get("_".join(["lower", task.metric_key]), None)
-            upper = data["results"].get(task.phenotype_key, {}).get("_".join(["upper", task.metric_key]), None)
+            mean = data["results"].get(task.phenotype_key, {}).get("metrics", {}).get("_".join(["mean", task.metric_key]), None)
+            lower = data["results"].get(task.phenotype_key, {}).get("metrics", {}).get("_".join(["lower", task.metric_key]), None)
+            upper = data["results"].get(task.phenotype_key, {}).get("metrics", {}).get("_".join(["upper", task.metric_key]), None)
             formated_score = f"{mean:.2f} ({lower:.2f}-{upper:.2f})" if mean is not None else None
             results["_".join([task.phenotype_key, task.metric_key])] = formated_score
 
         return self(
-            eval_name=result_key,
+            eval_name=f"{org}_{model}_{precision.value.name}_{feature_set}_{nb_shots}",
             full_model=full_model,
-            org=org,
-            model=model,
+            org=full_model.split("/")[0],
+            model=full_model.split("/")[1],
            results=results,
             raw_data=data,
+            nb_shots=nb_shots,
             precision=precision,
-            revision=config.get("model_sha", ""),
+            revision=revision,
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
+            model_type=model_type,
+            weight_type=weight_type,
+            license=license,
+            likes=likes,
+            num_params=num_params,
+            date=date
         )
 
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
-
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average_auroc = np.mean(np.array([d["mean_auroc"] for d in self.raw_data["results"].values() if "mean_auroc" in d.keys()]))
-        average_auprc = np.mean(np.array([d["mean_auprc"] for d in self.raw_data["results"].values() if "mean_auprc" in d.keys()]))
+        average_auroc = np.mean(np.array([d["metrics"]["mean_auroc"] for d in self.raw_data["results"].values() if "mean_auroc" in d["metrics"].keys()]))
+        average_auprc = np.mean(np.array([d["metrics"]["mean_auprc"] for d in self.raw_data["results"].values() if "mean_auprc" in d["metrics"].keys()]))
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
+            AutoEvalColumn.nb_shots.name: self.nb_shots,
             AutoEvalColumn.precision.name: self.precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
@@ -154,7 +144,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     return request_file
 
 
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_raw_eval_results(results_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
 
@@ -176,7 +166,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
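
The rewritten `init_from_json_file` implies a new result-file layout: all evaluation metadata now lives under `config`, and per-phenotype scores are nested one level deeper under a `metrics` key. An illustration reconstructed from the accessors above; the phenotype key and every value are invented for the example:

import json

example_result = {
    "config": {
        "model": "org/model-name",            # config.get("model"), split into org/model
        "precision": "float16",
        "revision": "main",
        "feature_set": "Unknown",
        "nb_shots": 10,
        "model_type": "pretrained",           # whatever ModelType.from_str accepts
        "weight_type": "Original",
        "license": "?",
        "likes": 0,
        "params": 7,
        "submitted_time": "2024-01-01T00:00:00Z",
    },
    "results": {
        "some_phenotype": {                   # task.phenotype_key (hypothetical)
            "metrics": {                      # new nesting introduced in this commit
                "mean_auroc": 0.81, "lower_auroc": 0.78, "upper_auroc": 0.84,
                "mean_auprc": 0.42, "lower_auprc": 0.39, "upper_auprc": 0.45,
            }
        }
    },
}
print(json.dumps(example_result, indent=2))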
src/populate.py CHANGED
@@ -5,12 +5,12 @@ import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
+from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
+    raw_data = get_raw_eval_results(results_path)
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
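
Call sites now pass only the results path, matching the `results, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS)` line in app.py above (note the annotation still says `-> pd.DataFrame` even though two values are unpacked there). A hedged smoke test of the new shape; the literal path and column list are placeholders, not values from this repo:

from src.populate import get_leaderboard_df

# Placeholders: the real Space passes EVAL_RESULTS_PATH and COLS from src.envs / src.display.utils.
raw_data, leaderboard_df = get_leaderboard_df("eval-results", ["T", "Model", "#Shots"])
print(leaderboard_df.head())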