Spaces: Running
Commit 6039937
Taha Aksu committed
Parent(s): 68eb6a5

Update leaderboard structure

Browse files:
- app.py +34 -5
- src/about.py +6 -1
- src/display/utils.py +8 -2
- src/leaderboard/read_evals.py +12 -4
- src/utils.py +87 -2
app.py
CHANGED
@@ -90,7 +90,7 @@ grouped_dfs = get_grouped_dfs()
 domain_df, freq_df, term_length_df, variate_type_df, overall_df = grouped_dfs['domain'], grouped_dfs['frequency'], grouped_dfs['term_length'], grouped_dfs['univariate'], grouped_dfs['overall']
 overall_df = rename_metrics(overall_df)
 overall_df = format_df(overall_df)
-overall_df = overall_df.sort_values(by=['
+overall_df = overall_df.sort_values(by=['MASE_Rank'])
 domain_df = pivot_existed_df(domain_df, tab_name='domain')
 print(f'Domain dataframe is {domain_df}')
 freq_df = pivot_existed_df(freq_df, tab_name='frequency')
@@ -107,7 +107,7 @@ model_info_df = get_model_info_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
 # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
-def init_leaderboard(ori_dataframe, model_info_df, sort_val: str|None = None):
+def init_leaderboard(ori_dataframe, model_info_df, sort_val: str | list | None = None):
     if ori_dataframe is None or ori_dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     model_info_col_list = [c.name for c in fields(ModelInfoColumn) if c.displayed_by_default if c.name not in ['#Params (B)', 'available_on_hub', 'hub', 'Model sha','Hub License']]
@@ -120,11 +120,19 @@ def init_leaderboard(ori_dataframe, model_info_df, sort_val: str|None = None):
     new_cols = ['T'] + [col for col in merged_df.columns if col != 'T']
     merged_df = merged_df[new_cols]
     if sort_val:
-        if sort_val
+        if isinstance(sort_val, list):
+            assert sort_val[0] == 'TestData Leakage'
+            # ipdb.set_trace()
+            leakage_order = pd.Categorical(merged_df[sort_val[0]], categories=['No', 'Yes', 'N/A'], ordered=True)
+            merged_df['leakage_order'] = leakage_order
+            merged_df = merged_df.sort_values(by=['leakage_order', sort_val[1]])
+            merged_df = merged_df.drop(columns=['leakage_order'])
+        elif sort_val in merged_df.columns:
             merged_df = merged_df.sort_values(by=[sort_val])
         else:
             print(f'Warning: cannot sort by {sort_val}')
     print('Merged df: ', merged_df)
+    # ipdb.set_trace()
     # get the data type
     datatype_list = [col2type_dict[col] if col in col2type_dict else 'number' for col in merged_df.columns]
     # print('datatype_list: ', datatype_list)
@@ -162,13 +170,13 @@ def init_leaderboard(ori_dataframe, model_info_df, sort_val: str|None = None):
         # ],
         filter_columns=[
             ColumnFilter(ModelInfoColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(ModelInfoColumn.testdata_leakage.name, type="checkboxgroup", label="TestData Leakage"),
         ],
         # bool_checkboxgroup_label="",
         column_widths=[40, 150] + [180 for _ in range(len(merged_df.columns)-2)],
         interactive=False,
     )
 
-
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -176,7 +184,8 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem('🏅 Overall', elem_id="llm-benchmark-tab-table", id=5):
-            leaderboard = init_leaderboard(overall_df, model_info_df, sort_val='Rank')
+            # leaderboard = init_leaderboard(overall_df, model_info_df, sort_val='Rank')
+            leaderboard = init_leaderboard(overall_df, model_info_df, sort_val=['TestData Leakage', 'MASE_Rank'])
             print(f'FINAL Overall LEADERBOARD {overall_df}')
         with gr.TabItem("🏅 By Domain", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(domain_df, model_info_df)
@@ -196,6 +205,26 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
+    # Trigger the column filters once on initial load so default selections take effect
+    demo.load(
+        js="""
+        () => {
+            // Make the JS fire one legitimate `input` event once the checkboxgroup
+            // component is ready. `querySelector` looks for the *wrapper* div Gradio
+            // puts around the checkbox-group.
+            const target = document.querySelector(
+                'div[data-testid="checkboxgroup-model types"]');
+
+            if (!target) { return []; }  // safety guard
+
+            // Ask Gradio’s front-end to re-compute its filters:
+            target.dispatchEvent(new Event('input', { bubbles: true }));
+
+            return [];  // load() must return something
+        }
+        """
+    )
+
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
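The new list-valued sort path in init_leaderboard orders rows by a fixed 'No' < 'Yes' < 'N/A' order on the leakage flag before breaking ties on a numeric rank column. A minimal, self-contained sketch of the same idea (column names follow the diff; the data is made up):

import pandas as pd

# Toy leaderboard with the two columns the new sort relies on.
df = pd.DataFrame({
    "model": ["A", "B", "C", "D"],
    "TestData Leakage": ["Yes", "No", "N/A", "No"],
    "MASE_Rank": [3.0, 2.0, 1.0, 4.0],
})

# Encode the leakage flag as an ordered categorical so 'No' sorts first,
# then break ties with the numeric rank, mirroring the new branch above.
df["leakage_order"] = pd.Categorical(df["TestData Leakage"], categories=["No", "Yes", "N/A"], ordered=True)
df = df.sort_values(by=["leakage_order", "MASE_Rank"]).drop(columns=["leakage_order"])
print(df["model"].tolist())  # ['B', 'D', 'A', 'C']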
src/about.py
CHANGED
@@ -44,7 +44,12 @@ points, spanning seven domains, 10 frequencies, multivariate inputs, and predict
 LLM_BENCHMARKS_TEXT = f"""
 ## Update Log
 
-
+### 2025‑07‑24
+- Corrected the Naive and Seasonal Naive scores to match the latest GIFT‑Eval notebooks. Most model rankings remain unchanged; only a few near the bottom shifted slightly (AutoETS and Timer each dropped two places now at 35th and 36th places respectively, while NBEATS moved up one now at 27th place).
+
+### 2025-08-05
+- Added new columns to the leaderboard: Organization, TestData Leakage, and MASE_Rank. TestData Leakage is a binary indicator specifying whether any test data was present in the training set. MASE_Rank reflects the model's ranking based on the MASE metric, aligned with the ranking scheme used for CRPS_Rank. These additions were made in response to multiple requests from independent groups seeking fairer comparisons. With these updates, the leaderboard now supports sorting by models that do not leak test data, and viewers can choose to rank models based on either MASE_Rank or CRPS_Rank, depending on their use case.
+- Added new model type: Agentic to indicate submissions that use agentic system to generate the forecasts.
 
 ## How It Works
 
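The MASE_Rank described in the update log follows the same per-task ranking scheme as CRPS_Rank: models are ranked within each (dataset, term_length, frequency) task and the per-task ranks are averaged per model. A rough, self-contained illustration with made-up numbers (column names follow the diff):

import pandas as pd

# Made-up per-(dataset, term_length, frequency) MASE scores for two models.
df = pd.DataFrame({
    "model": ["A", "B", "A", "B"],
    "dataset": ["d1", "d1", "d2", "d2"],
    "term_length": ["short"] * 4,
    "frequency": ["H"] * 4,
    "eval_metrics/MASE[0.5]": [0.8, 1.1, 1.3, 1.5],
})

# Rank models within each task by MASE (lower is better), then average the
# per-task ranks per model to obtain the leaderboard's MASE_Rank column.
df["Rank_MASE"] = df.groupby(["dataset", "term_length", "frequency"])["eval_metrics/MASE[0.5]"].rank(method="first", ascending=True)
mase_rank = df.groupby("model")["Rank_MASE"].mean()
print(mase_rank.to_dict())  # {'A': 1.0, 'B': 2.0}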
src/display/utils.py
CHANGED
@@ -34,6 +34,8 @@ model_info_dict.append(["license", ColumnContent, ColumnContent("Hub License", "
 model_info_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False, True)])
 model_info_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False, True)])
 model_info_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+model_info_dict.append(["org", ColumnContent, ColumnContent("Organization", "str", True, hidden=False)])
+model_info_dict.append(["testdata_leakage", ColumnContent, ColumnContent("TestData Leakage", "str", True, hidden=False)])
 # model_info_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
@@ -59,9 +61,11 @@ class ModelDetails:
 
 class ModelType(Enum):
     PT = ModelDetails(name="🟢 pretrained", symbol="🟢")
-    FT = ModelDetails(name="
+    FT = ModelDetails(name="🟣 fine-tuned", symbol="🟣")
+    AG = ModelDetails(name="🟡 agentic", symbol="🟡")
     DL = ModelDetails(name="🔷 deep-learning", symbol="🔷")
-    ST = ModelDetails(name="
+    ST = ModelDetails(name="🔶 statistical", symbol="🔶")
+
 
     Unknown = ModelDetails(name="", symbol="?")
 
@@ -74,6 +78,8 @@ class ModelType(Enum):
             return ModelType.FT
         if "pretrained" in type or "🟢" in type:
             return ModelType.PT
+        if "agentic" in type or "🟡" in type:
+            return ModelType.AG
         if "deep-learning" in type or "🟦" in type:
             return ModelType.DL
         if "statistical" in type or "🟣" in type:
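The new AG member is resolved the same way as the other model types: by substring or emoji matching on the submitted type string. A condensed standalone sketch of that dispatch (the function name is hypothetical and the enum payloads are simplified to plain strings; the real enum wraps ModelDetails):

from enum import Enum

class ModelType(Enum):
    # Payloads simplified; the actual class stores ModelDetails(name, symbol).
    PT = "🟢 pretrained"
    AG = "🟡 agentic"
    DL = "🔷 deep-learning"
    ST = "🔶 statistical"
    Unknown = "?"

def model_type_from_str(type_str: str) -> ModelType:
    # Same substring/emoji checks as the classmethod shown in the diff.
    if "pretrained" in type_str or "🟢" in type_str:
        return ModelType.PT
    if "agentic" in type_str or "🟡" in type_str:
        return ModelType.AG
    if "deep-learning" in type_str or "🟦" in type_str:
        return ModelType.DL
    if "statistical" in type_str or "🟣" in type_str:
        return ModelType.ST
    return ModelType.Unknown

print(model_type_from_str("🟡 agentic").name)  # AG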
src/leaderboard/read_evals.py
CHANGED
@@ -21,9 +21,11 @@ class ModelConfig:
     model_type: ModelType = ModelType.Unknown
     code_link: str = ""
     precision: Precision = Precision.Unknown
+    org: str = ""
     license: str = "?"
     likes: int = 0
     num_params: int | str = 0
+    testdata_leakage: str = "NA"
 
     @classmethod
     def init_from_json_file(cls, json_filepath):
@@ -39,7 +41,10 @@ class ModelConfig:
         model = data.get("model", "")
         model_link = data.get("model_link", "")
         code_link = data.get("code_link", "")
-
+        org = data.get("org", "")
+        testdata_leakage = data.get("testdata_leakage", "N/A")
+        return cls(model=model, model_link=model_link, model_type=model_type, code_link=code_link, org=org,
+                   precision=precision, testdata_leakage=testdata_leakage)
 
     def to_dict(self):
         """Converts the model info to a dict compatible with our dataframe display"""
@@ -53,6 +58,8 @@ class ModelConfig:
             ModelInfoColumn.license.name: self.license,
             ModelInfoColumn.likes.name: self.likes,
             ModelInfoColumn.params.name: self.num_params,
+            ModelInfoColumn.org.name: self.org,
+            ModelInfoColumn.testdata_leakage.name: self.testdata_leakage,
         }
 
         return data_dict
@@ -63,14 +70,14 @@ class EvalResult:
     """
     eval_name: str # org_model_precision (uid)
     full_model: str # org/model (path on hub)
-    org: str
+    org: str
     model: str
    revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
@@ -130,7 +137,7 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
+            precision=precision,
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture
@@ -237,6 +244,7 @@ def get_model_info(results_path: str, requests_path: str) -> list[ModelConfig]:
     for v in model_infos.values():
         try:
             v.to_dict() # we test if the dict version is complete
+            # ipdb.set_trace()
             results.append(v)
         except KeyError: # not all eval values present
             continue
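Given the fields read in init_from_json_file, a request file for the new columns looks roughly like the dictionary below. Only the keys read via data.get(...) in the diff are certain; the values are placeholders, and missing keys fall back to the same defaults the diff uses:

import json

# Illustrative request payload (placeholder values).
request = {
    "model": "example-model",
    "model_link": "https://huggingface.co/example-org/example-model",
    "code_link": "https://github.com/example-org/example-model",
    "org": "Example Org",
    "testdata_leakage": "No",  # the leaderboard sorts 'No' before 'Yes' and 'N/A'
}
print(json.dumps(request, indent=2))

# Missing keys fall back to the same defaults used in init_from_json_file.
org = request.get("org", "")
testdata_leakage = request.get("testdata_leakage", "N/A")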
src/utils.py
CHANGED
@@ -48,7 +48,8 @@ def rename_metrics(df):
     df = df.rename(columns={
         'eval_metrics/MASE[0.5]': 'MASE',
         'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS',
-        'rank': '
+        'rank': 'CRPS_Rank',
+        'Rank_MASE': 'MASE_Rank',
     })
     return df
 
@@ -105,6 +106,59 @@ def pivot_existed_df(df, tab_name):
     return df_pivot
 
 
+def get_all_res_dfs(root_dir='results', ds_properties='results/dataset_properties.csv'):
+    df_list = []
+
+    # Walk through all folders and subfolders in the root directory
+    for subdir, _, files in os.walk(root_dir):
+        for file in files:
+            if file == 'all_results.csv':
+                file_path = os.path.join(subdir, file)
+                df = pd.read_csv(file_path)
+                df_list.append(df)
+    # Concatenate all dataframes into one
+    all_results_df = pd.concat(df_list, ignore_index=True)
+    all_results_df = all_results_df.sort_values(by=['model', 'dataset']).reset_index(drop=True)
+    all_results_df[['dataset', 'frequency', 'term_length']] = all_results_df['dataset'].str.split('/', expand=True)
+
+    dataset_properties = pd.read_csv(ds_properties)
+    # Reforemat the the first element of each row after the header following these rules:
+    # 1. make all characters lowercase
+    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.lower())
+    # 2. replace all spaces with underscores
+    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.replace(' ', '_'))
+    # 3. Replace all dashes with underscores
+    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.replace('-', '_'))
+    # 4. Replace consecutive underscores with a single underscore. There maybe more than 2 consecutive underscores
+    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: re.sub('_+', '_', x))
+    # 5. Remove all leading and trailing underscores
+    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.strip('_'))
+
+    df = all_results_df
+
+    # convert it to a dictionary, with dataset as the key, and the value as another dictionary. The inner dictionary has the column names as the key, and the value as the value.
+    dataset_properties_dict = dataset_properties.set_index('dataset').T.to_dict('dict')
+    dataset_properties_dict.keys()
+
+    # # match the dataset name in model_properties_dict with the dataset name in df and add a new column for each key value pair in the inner dictionary.
+    for dataset in dataset_properties_dict.keys():
+        for key in dataset_properties_dict[dataset].keys():
+            # set the row with the dataset name to the value of the key think step by step
+            # First, get the row with the dataset name
+            # Second, set the value of the key to the value of the key
+            if key == 'frequency':
+                # only set the frequency if the frequency column for all rows for the dataset is empty string
+                if all(df[df['dataset'] == dataset]['frequency'].isna()):
+                    df.loc[df['dataset'] == dataset, key] = dataset_properties_dict[dataset][key]
+            else:
+                df.loc[df['dataset'] == dataset, key] = dataset_properties_dict[dataset][key]
+
+    # unify the frequency
+    df = unify_freq(df)
+    # standardize by seasonal naive
+    df = standardize_df(df)
+    ipdb.set_trace()
+    return None
 def get_grouped_dfs(root_dir='results', ds_properties='results/dataset_properties.csv'):
     df_list = []
 
@@ -163,6 +217,24 @@ def get_grouped_dfs(root_dir='results', ds_properties='results/dataset_propertie
     RANKING_METRIC = "eval_metrics/mean_weighted_sum_quantile_loss"
     df['rank'] = df.groupby(['dataset', 'term_length', 'frequency'])[f'{RANKING_METRIC}'].rank(method='first',
                                                                                                ascending=True)
+    MASE_RANKING_METRICS = "eval_metrics/MASE[0.5]"
+    df['Rank_MASE'] = df.groupby(['dataset', 'term_length', 'frequency'])[f'{MASE_RANKING_METRICS}'].rank(method='first',
+                                                                                                           ascending=True)
+    # Add a new column to identify top 5 ranks
+    df['is_top5_CRPS'] = df['rank'] <= 5
+
+    # Count top 3 performances for each model
+    top5_counts = df.groupby('model')['is_top5_CRPS'].sum().reset_index()
+    top5_counts = top5_counts.rename(columns={'is_top5_CRPS': 'count_top5_CRPS'})
+
+    # Add a new column to identify top 5 ranks
+    df['is_top5_MASE'] = df['Rank_MASE'] <= 5
+
+    # Count top 3 performances for each model
+    top5_counts_MASE = df.groupby('model')['is_top5_MASE'].sum().reset_index()
+    top5_counts_MASE = top5_counts_MASE.rename(columns={'is_top5_MASE': 'count_top5_MASE'})
+
+    # ipdb.set_trace()
     # create a new column called rank
     metric_columns.append('rank')
     # create a new column called univariate. Set it to true if column num_variates is 1, otherwise set it to false
@@ -173,7 +245,20 @@ def get_grouped_dfs(root_dir='results', ds_properties='results/dataset_propertie
     # ipdb.set_trace()
     grouped_results_overall = df.groupby(['model'])[METRIC_CHOICES].agg(stats.gmean)
     grouped_results_overall_rank = df.groupby(['model'])[['rank']].mean()
-
+    grouped_results_overall_rank_mase = df.groupby(['model'])[['Rank_MASE']].mean()
+    grouped_results_overall = pd.concat([grouped_results_overall, grouped_results_overall_rank,grouped_results_overall_rank_mase], axis=1)
+
+    # grouped_results_overall_avg_rank = pd.concat([grouped_results_overall_rank, grouped_results_overall_rank_mase], axis=1)
+    # grouped_results_overall_avg_rank['Avg_Rank'] = grouped_results_overall_avg_rank.mean(axis=1)
+    # grouped_results_overall_avg_rank = grouped_results_overall_avg_rank[['Avg_Rank']]
+    # grouped_results_overall = pd.concat([grouped_results_overall, grouped_results_overall_avg_rank], axis=1)
+    # switch the order to ['model', 'eval_metrics/MASE[0.5]', 'Rank_MASE', 'eval_metrics/mean_weighted_sum_quantile_loss', 'rank']
+    # grouped_results_overall = grouped_results_overall[['eval_metrics/MASE[0.5]', 'Rank_MASE', 'eval_metrics/mean_weighted_sum_quantile_loss', 'rank', 'Avg_Rank']]
+    grouped_results_overall = grouped_results_overall[['eval_metrics/MASE[0.5]', 'Rank_MASE', 'eval_metrics/mean_weighted_sum_quantile_loss', 'rank']]
+    # ipdb.set_trace()
+    # Add top3 performance statistics to the overall results
+    # grouped_results_overall = pd.merge(grouped_results_overall, top5_counts, on='model')
+    # grouped_results_overall = pd.merge(grouped_results_overall, top5_counts_MASE, on='model')
 
     # grouped_results_overall = grouped_results_overall.rename(columns={'model':'Model'})
     # grouped_results.to_csv(f'artefacts/grouped_results_by_model.csv')
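For the overall table, get_grouped_dfs aggregates the raw metrics with a geometric mean and the per-task ranks with an arithmetic mean. A small self-contained sketch of that aggregation with made-up numbers (column names follow the diff):

import pandas as pd
from scipy import stats

# Made-up per-task results for two models.
df = pd.DataFrame({
    "model": ["A", "A", "B", "B"],
    "eval_metrics/MASE[0.5]": [0.8, 1.2, 1.0, 1.1],
    "eval_metrics/mean_weighted_sum_quantile_loss": [0.05, 0.07, 0.06, 0.08],
    "rank": [1.0, 2.0, 2.0, 1.0],
    "Rank_MASE": [1.0, 2.0, 2.0, 1.0],
})

metric_cols = ["eval_metrics/MASE[0.5]", "eval_metrics/mean_weighted_sum_quantile_loss"]
# Geometric mean over tasks for the raw metrics, arithmetic mean for the ranks.
overall = df.groupby("model")[metric_cols].agg(stats.gmean)
overall["rank"] = df.groupby("model")["rank"].mean()
overall["Rank_MASE"] = df.groupby("model")["Rank_MASE"].mean()
print(overall)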