Taha Aksu committed
Commit 6039937 · 1 Parent(s): 68eb6a5

Update leaderboard structure

Files changed (5):
  1. app.py +34 -5
  2. src/about.py +6 -1
  3. src/display/utils.py +8 -2
  4. src/leaderboard/read_evals.py +12 -4
  5. src/utils.py +87 -2
app.py CHANGED
@@ -90,7 +90,7 @@ grouped_dfs = get_grouped_dfs()
 domain_df, freq_df, term_length_df, variate_type_df, overall_df = grouped_dfs['domain'], grouped_dfs['frequency'], grouped_dfs['term_length'], grouped_dfs['univariate'], grouped_dfs['overall']
 overall_df = rename_metrics(overall_df)
 overall_df = format_df(overall_df)
-overall_df = overall_df.sort_values(by=['Rank'])
+overall_df = overall_df.sort_values(by=['MASE_Rank'])
 domain_df = pivot_existed_df(domain_df, tab_name='domain')
 print(f'Domain dataframe is {domain_df}')
 freq_df = pivot_existed_df(freq_df, tab_name='frequency')
@@ -107,7 +107,7 @@ model_info_df = get_model_info_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
 # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
-def init_leaderboard(ori_dataframe, model_info_df, sort_val: str|None = None):
+def init_leaderboard(ori_dataframe, model_info_df, sort_val: str | list | None = None):
     if ori_dataframe is None or ori_dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     model_info_col_list = [c.name for c in fields(ModelInfoColumn) if c.displayed_by_default if c.name not in ['#Params (B)', 'available_on_hub', 'hub', 'Model sha','Hub License']]
@@ -120,11 +120,19 @@ def init_leaderboard(ori_dataframe, model_info_df, sort_val: str|None = None):
     new_cols = ['T'] + [col for col in merged_df.columns if col != 'T']
     merged_df = merged_df[new_cols]
     if sort_val:
-        if sort_val in merged_df.columns:
+        if isinstance(sort_val, list):
+            assert sort_val[0] == 'TestData Leakage'
+            # ipdb.set_trace()
+            leakage_order = pd.Categorical(merged_df[sort_val[0]], categories=['No', 'Yes', 'N/A'], ordered=True)
+            merged_df['leakage_order'] = leakage_order
+            merged_df = merged_df.sort_values(by=['leakage_order', sort_val[1]])
+            merged_df = merged_df.drop(columns=['leakage_order'])
+        elif sort_val in merged_df.columns:
             merged_df = merged_df.sort_values(by=[sort_val])
         else:
             print(f'Warning: cannot sort by {sort_val}')
     print('Merged df: ', merged_df)
+    # ipdb.set_trace()
     # get the data type
     datatype_list = [col2type_dict[col] if col in col2type_dict else 'number' for col in merged_df.columns]
     # print('datatype_list: ', datatype_list)
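For context on the new list-valued `sort_val` branch: an ordered `pd.Categorical` makes 'No' sort before 'Yes' and 'N/A', so non-leaking models float to the top before the numeric tie-break on `sort_val[1]`. A minimal standalone sketch (column values invented for illustration):

```python
import pandas as pd

df = pd.DataFrame({
    "TestData Leakage": ["Yes", "No", "N/A", "No"],
    "MASE_Rank": [1.0, 3.0, 2.0, 4.0],
})
# Ordered categorical: 'No' < 'Yes' < 'N/A', so rows sort by leakage status
# first, then by MASE_Rank within each status.
order = pd.Categorical(df["TestData Leakage"], categories=["No", "Yes", "N/A"], ordered=True)
df = (df.assign(leakage_order=order)
        .sort_values(["leakage_order", "MASE_Rank"])
        .drop(columns="leakage_order"))
print(df)  # rows now ordered: (No, 3.0), (No, 4.0), (Yes, 1.0), (N/A, 2.0)
```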
 
@@ -162,13 +170,13 @@ def init_leaderboard(ori_dataframe, model_info_df, sort_val: str|None = None):
         # ],
         filter_columns=[
             ColumnFilter(ModelInfoColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(ModelInfoColumn.testdata_leakage.name, type="checkboxgroup", label="TestData Leakage"),
         ],
         # bool_checkboxgroup_label="",
         column_widths=[40, 150] + [180 for _ in range(len(merged_df.columns)-2)],
         interactive=False,
     )
 
-
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -176,7 +184,8 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem('🏅 Overall', elem_id="llm-benchmark-tab-table", id=5):
-            leaderboard = init_leaderboard(overall_df, model_info_df, sort_val='Rank')
+            # leaderboard = init_leaderboard(overall_df, model_info_df, sort_val='Rank')
+            leaderboard = init_leaderboard(overall_df, model_info_df, sort_val=['TestData Leakage', 'MASE_Rank'])
             print(f'FINAL Overall LEADERBOARD {overall_df}')
         with gr.TabItem("🏅 By Domain", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(domain_df, model_info_df)
@@ -196,6 +205,26 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
+    # Trigger the column filters once on initial load so default selections take effect
+    demo.load(
+        js="""
+        () => {
+            // Make the JS fire one legitimate `input` event once the checkboxgroup
+            // component is ready. `querySelector` looks for the *wrapper* div Gradio
+            // puts around the checkbox-group.
+            const target = document.querySelector(
+                'div[data-testid="checkboxgroup-model types"]');
+
+            if (!target) { return []; } // safety guard
+
+            // Ask Gradio’s front-end to re-compute its filters:
+            target.dispatchEvent(new Event('input', { bubbles: true }));
+
+            return []; // load() must return something
+        }
+        """
+    )
+
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
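The `demo.load(js=...)` block above nudges Gradio's client-side filtering once at startup by dispatching a synthetic `input` event on the checkbox-group wrapper. A hedged alternative that stays in Python, assuming a Gradio version where `Blocks.load` accepts `fn`/`outputs` and `gr.update` is available (the component below is hypothetical, not the leaderboard's actual filter):

```python
import gradio as gr

with gr.Blocks() as sketch:
    leakage_filter = gr.CheckboxGroup(
        choices=["No", "Yes", "N/A"], value=["No"], label="TestData Leakage"
    )
    # Re-emitting the default value on load fires the component's listeners,
    # so any filtering wired to `leakage_filter` runs once at startup.
    sketch.load(fn=lambda: gr.update(value=["No"]), outputs=leakage_filter)
```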
src/about.py CHANGED
@@ -44,7 +44,12 @@ points, spanning seven domains, 10 frequencies, multivariate inputs, and predict
 LLM_BENCHMARKS_TEXT = f"""
 ## Update Log
 
-- 2025-07-24: Corrected the Naive and Seasonal Naive scores to match the latest GIFT-Eval notebooks. Most model rankings remain unchanged; only a few near the bottom shifted slightly (AutoETS and Timer each dropped two places, to 35th and 36th respectively, while NBEATS moved up one place to 27th).
+### 2025-07-24
+- Corrected the Naive and Seasonal Naive scores to match the latest GIFT-Eval notebooks. Most model rankings remain unchanged; only a few near the bottom shifted slightly (AutoETS and Timer each dropped two places, to 35th and 36th respectively, while NBEATS moved up one place to 27th).
+
+### 2025-08-05
+- Added new columns to the leaderboard: Organization, TestData Leakage, and MASE_Rank. TestData Leakage is a binary indicator specifying whether any test data was present in the training set. MASE_Rank reflects the model's ranking based on the MASE metric, aligned with the ranking scheme used for CRPS_Rank. These additions were made in response to multiple requests from independent groups seeking fairer comparisons. With these updates, the leaderboard now supports sorting by models that do not leak test data, and viewers can rank models by either MASE_Rank or CRPS_Rank, depending on their use case.
+- Added a new model type, Agentic, to indicate submissions that use an agentic system to generate forecasts.
 
 ## How It Works
 
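The MASE_Rank column described in the 2025-08-05 entry follows the same per-window scheme as CRPS_Rank (see the `Rank_MASE` computation in src/utils.py below): each model is ranked within every evaluation window by the metric, and the leaderboard value is the mean of those ranks. A toy sketch with invented numbers:

```python
import pandas as pd

df = pd.DataFrame({
    "model": ["A", "B", "A", "B"],
    "dataset": ["d1", "d1", "d2", "d2"],
    "MASE": [0.8, 1.1, 0.9, 0.7],
})
# Rank within each window (here just `dataset`), then average per model.
df["Rank_MASE"] = df.groupby("dataset")["MASE"].rank(method="first", ascending=True)
print(df.groupby("model")["Rank_MASE"].mean())  # A: 1.5, B: 1.5
```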
src/display/utils.py CHANGED
@@ -34,6 +34,8 @@ model_info_dict.append(["license", ColumnContent, ColumnContent("Hub License", "
 model_info_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False, True)])
 model_info_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False, True)])
 model_info_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+model_info_dict.append(["org", ColumnContent, ColumnContent("Organization", "str", True, hidden=False)])
+model_info_dict.append(["testdata_leakage", ColumnContent, ColumnContent("TestData Leakage", "str", True, hidden=False)])
 # model_info_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
@@ -59,9 +61,11 @@ class ModelDetails:
 
 class ModelType(Enum):
     PT = ModelDetails(name="🟢 pretrained", symbol="🟢")
-    FT = ModelDetails(name="🔶 fine-tuned", symbol="🔶")
+    FT = ModelDetails(name="🟣 fine-tuned", symbol="🟣")
+    AG = ModelDetails(name="🟡 agentic", symbol="🟡")
     DL = ModelDetails(name="🔷 deep-learning", symbol="🔷")
-    ST = ModelDetails(name="🟣 statistical", symbol="🟣")
+    ST = ModelDetails(name="🔶 statistical", symbol="🔶")
+
 
     Unknown = ModelDetails(name="", symbol="?")
 
@@ -74,6 +78,8 @@ class ModelType(Enum):
             return ModelType.FT
         if "pretrained" in type or "🟢" in type:
             return ModelType.PT
+        if "agentic" in type or "🟡" in type:
+            return ModelType.AG
         if "deep-learning" in type or "🟦" in type:
             return ModelType.DL
         if "statistical" in type or "🟣" in type:
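Note the symbol reshuffle above: 🟣 now marks fine-tuned models (previously statistical), 🔶 marks statistical ones, and 🟡 is reserved for the new agentic type. A self-contained usage sketch of the enum pattern from the diff (`ModelDetails` fields reconstructed from the calls shown):

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class ModelDetails:
    name: str
    symbol: str

class ModelType(Enum):
    AG = ModelDetails(name="🟡 agentic", symbol="🟡")

# Each enum member carries its display name and symbol via its value.
print(ModelType.AG.value.name)    # "🟡 agentic"
print(ModelType.AG.value.symbol)  # "🟡"
```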
src/leaderboard/read_evals.py CHANGED
@@ -21,9 +21,11 @@ class ModelConfig:
     model_type: ModelType = ModelType.Unknown
     code_link: str = ""
     precision: Precision = Precision.Unknown
+    org: str = ""
     license: str = "?"
     likes: int = 0
     num_params: int | str = 0
+    testdata_leakage: str = "NA"
 
     @classmethod
     def init_from_json_file(cls, json_filepath):
@@ -39,7 +41,10 @@ class ModelConfig:
         model = data.get("model", "")
         model_link = data.get("model_link", "")
         code_link = data.get("code_link", "")
-        return cls(model=model, model_link=model_link, model_type=model_type, code_link=code_link, precision=precision)
+        org = data.get("org", "")
+        testdata_leakage = data.get("testdata_leakage", "N/A")
+        return cls(model=model, model_link=model_link, model_type=model_type, code_link=code_link, org=org,
+                   precision=precision, testdata_leakage=testdata_leakage)
 
     def to_dict(self):
         """Converts the model info to a dict compatible with our dataframe display"""
@@ -53,6 +58,8 @@ class ModelConfig:
             ModelInfoColumn.license.name: self.license,
             ModelInfoColumn.likes.name: self.likes,
             ModelInfoColumn.params.name: self.num_params,
+            ModelInfoColumn.org.name: self.org,
+            ModelInfoColumn.testdata_leakage.name: self.testdata_leakage,
         }
 
         return data_dict
@@ -63,14 +70,14 @@ class EvalResult:
     """
     eval_name: str # org_model_precision (uid)
     full_model: str # org/model (path on hub)
-    org: str
+    org: str
     model: str
     revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
@@ -130,7 +137,7 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
+            precision=precision,
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture
@@ -237,6 +244,7 @@ def get_model_info(results_path: str, requests_path: str) -> list[ModelConfig]:
     for v in model_infos.values():
         try:
             v.to_dict() # we test if the dict version is complete
+            # ipdb.set_trace()
             results.append(v)
         except KeyError: # not all eval values present
             continue
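For reference, `init_from_json_file` now reads two extra keys from each request file. A hedged sketch of such a file (key names taken from the `data.get(...)` calls in the diff; all values invented):

```python
import json

example_request = {
    "model": "my-model",
    "model_link": "https://huggingface.co/example-org/my-model",
    "code_link": "https://github.com/example-org/my-model",
    "org": "Example Org",
    "testdata_leakage": "No",  # falls back to "N/A" via data.get(...) when absent
}
print(json.dumps(example_request, indent=2))
```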
src/utils.py CHANGED
@@ -48,7 +48,8 @@ def rename_metrics(df):
     df = df.rename(columns={
         'eval_metrics/MASE[0.5]': 'MASE',
         'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS',
-        'rank': 'Rank'
+        'rank': 'CRPS_Rank',
+        'Rank_MASE': 'MASE_Rank',
     })
     return df
 
@@ -105,6 +106,59 @@ def pivot_existed_df(df, tab_name):
     return df_pivot
 
 
+def get_all_res_dfs(root_dir='results', ds_properties='results/dataset_properties.csv'):
+    df_list = []
+
+    # Walk through all folders and subfolders in the root directory
+    for subdir, _, files in os.walk(root_dir):
+        for file in files:
+            if file == 'all_results.csv':
+                file_path = os.path.join(subdir, file)
+                df = pd.read_csv(file_path)
+                df_list.append(df)
+    # Concatenate all dataframes into one
+    all_results_df = pd.concat(df_list, ignore_index=True)
+    all_results_df = all_results_df.sort_values(by=['model', 'dataset']).reset_index(drop=True)
+    all_results_df[['dataset', 'frequency', 'term_length']] = all_results_df['dataset'].str.split('/', expand=True)
+
+    dataset_properties = pd.read_csv(ds_properties)
+    # Reformat the first element of each row after the header following these rules:
+    # 1. make all characters lowercase
+    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.lower())
+    # 2. replace all spaces with underscores
+    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.replace(' ', '_'))
+    # 3. replace all dashes with underscores
+    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.replace('-', '_'))
+    # 4. replace consecutive underscores with a single underscore (there may be more than 2 in a row)
+    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: re.sub('_+', '_', x))
+    # 5. remove all leading and trailing underscores
+    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.strip('_'))
+
+    df = all_results_df
+
+    # convert to a dictionary with dataset as the key; each value is an inner dict of column name -> value
+    dataset_properties_dict = dataset_properties.set_index('dataset').T.to_dict('dict')
+    dataset_properties_dict.keys()
+
+    # match the dataset name in dataset_properties_dict with the dataset name in df and add a new column for each key/value pair in the inner dictionary
+    for dataset in dataset_properties_dict.keys():
+        for key in dataset_properties_dict[dataset].keys():
+            # set the rows for this dataset to the property value:
+            # first select the rows whose dataset matches,
+            # then assign the value of this key to that column
+            if key == 'frequency':
+                # only set the frequency if the frequency column is empty (NaN) for all rows of the dataset
+                if all(df[df['dataset'] == dataset]['frequency'].isna()):
+                    df.loc[df['dataset'] == dataset, key] = dataset_properties_dict[dataset][key]
+            else:
+                df.loc[df['dataset'] == dataset, key] = dataset_properties_dict[dataset][key]
+
+    # unify the frequency
+    df = unify_freq(df)
+    # standardize by seasonal naive
+    df = standardize_df(df)
+    ipdb.set_trace()
+    return None
 def get_grouped_dfs(root_dir='results', ds_properties='results/dataset_properties.csv'):
     df_list = []
 
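A tiny worked example of the five dataset-name normalization rules in `get_all_res_dfs` (the input string is invented):

```python
import re

name = "LOOP--Seattle  Hourly-"
name = name.lower()             # 1. lowercase
name = name.replace(' ', '_')   # 2. spaces -> underscores
name = name.replace('-', '_')   # 3. dashes -> underscores
name = re.sub('_+', '_', name)  # 4. collapse repeated underscores
name = name.strip('_')          # 5. strip leading/trailing underscores
print(name)  # -> "loop_seattle_hourly"
```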
@@ -163,6 +217,24 @@ def get_grouped_dfs(root_dir='results', ds_properties='results/dataset_propertie
     RANKING_METRIC = "eval_metrics/mean_weighted_sum_quantile_loss"
     df['rank'] = df.groupby(['dataset', 'term_length', 'frequency'])[f'{RANKING_METRIC}'].rank(method='first',
                                                                                                ascending=True)
+    MASE_RANKING_METRICS = "eval_metrics/MASE[0.5]"
+    df['Rank_MASE'] = df.groupby(['dataset', 'term_length', 'frequency'])[f'{MASE_RANKING_METRICS}'].rank(method='first',
+                                                                                                          ascending=True)
+    # Add a new column to identify top-5 CRPS ranks
+    df['is_top5_CRPS'] = df['rank'] <= 5
+
+    # Count top-5 performances for each model
+    top5_counts = df.groupby('model')['is_top5_CRPS'].sum().reset_index()
+    top5_counts = top5_counts.rename(columns={'is_top5_CRPS': 'count_top5_CRPS'})
+
+    # Add a new column to identify top-5 MASE ranks
+    df['is_top5_MASE'] = df['Rank_MASE'] <= 5
+
+    # Count top-5 performances for each model
+    top5_counts_MASE = df.groupby('model')['is_top5_MASE'].sum().reset_index()
+    top5_counts_MASE = top5_counts_MASE.rename(columns={'is_top5_MASE': 'count_top5_MASE'})
+
+    # ipdb.set_trace()
     # create a new column called rank
     metric_columns.append('rank')
     # create a new column called univariate. Set it to true if column num_variates is 1, otherwise set it to false
@@ -173,7 +245,20 @@ def get_grouped_dfs(root_dir='results', ds_properties='results/dataset_propertie
     # ipdb.set_trace()
     grouped_results_overall = df.groupby(['model'])[METRIC_CHOICES].agg(stats.gmean)
     grouped_results_overall_rank = df.groupby(['model'])[['rank']].mean()
-    grouped_results_overall = pd.concat([grouped_results_overall, grouped_results_overall_rank], axis=1)
+    grouped_results_overall_rank_mase = df.groupby(['model'])[['Rank_MASE']].mean()
+    grouped_results_overall = pd.concat([grouped_results_overall, grouped_results_overall_rank, grouped_results_overall_rank_mase], axis=1)
+
+    # grouped_results_overall_avg_rank = pd.concat([grouped_results_overall_rank, grouped_results_overall_rank_mase], axis=1)
+    # grouped_results_overall_avg_rank['Avg_Rank'] = grouped_results_overall_avg_rank.mean(axis=1)
+    # grouped_results_overall_avg_rank = grouped_results_overall_avg_rank[['Avg_Rank']]
+    # grouped_results_overall = pd.concat([grouped_results_overall, grouped_results_overall_avg_rank], axis=1)
+    # switch the order to ['model', 'eval_metrics/MASE[0.5]', 'Rank_MASE', 'eval_metrics/mean_weighted_sum_quantile_loss', 'rank']
+    # grouped_results_overall = grouped_results_overall[['eval_metrics/MASE[0.5]', 'Rank_MASE', 'eval_metrics/mean_weighted_sum_quantile_loss', 'rank', 'Avg_Rank']]
+    grouped_results_overall = grouped_results_overall[['eval_metrics/MASE[0.5]', 'Rank_MASE', 'eval_metrics/mean_weighted_sum_quantile_loss', 'rank']]
+    # ipdb.set_trace()
+    # Add top-5 performance statistics to the overall results
+    # grouped_results_overall = pd.merge(grouped_results_overall, top5_counts, on='model')
+    # grouped_results_overall = pd.merge(grouped_results_overall, top5_counts_MASE, on='model')
 
     # grouped_results_overall = grouped_results_overall.rename(columns={'model':'Model'})
     # grouped_results.to_csv(f'artefacts/grouped_results_by_model.csv')
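The overall table above aggregates each metric with a geometric mean per model (`stats.gmean`) while the rank columns are aggregated with a plain mean. A minimal sketch of the gmean aggregation (numbers invented):

```python
import pandas as pd
from scipy import stats

df = pd.DataFrame({"model": ["A", "A", "B", "B"],
                   "CRPS": [0.5, 2.0, 1.0, 1.0]})
# Geometric mean per model: gmean([0.5, 2.0]) == 1.0, gmean([1.0, 1.0]) == 1.0.
print(df.groupby("model")["CRPS"].agg(stats.gmean))
```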