Taha Aksu committed
Commit 6039937 · 1 Parent(s): 68eb6a5

Update leaderboard structure

Files changed (5):
  1. app.py +34 -5
  2. src/about.py +6 -1
  3. src/display/utils.py +8 -2
  4. src/leaderboard/read_evals.py +12 -4
  5. src/utils.py +87 -2
app.py CHANGED
@@ -90,7 +90,7 @@ grouped_dfs = get_grouped_dfs()
 domain_df, freq_df, term_length_df, variate_type_df, overall_df = grouped_dfs['domain'], grouped_dfs['frequency'], grouped_dfs['term_length'], grouped_dfs['univariate'], grouped_dfs['overall']
 overall_df = rename_metrics(overall_df)
 overall_df = format_df(overall_df)
-overall_df = overall_df.sort_values(by=['Rank'])
+overall_df = overall_df.sort_values(by=['MASE_Rank'])
 domain_df = pivot_existed_df(domain_df, tab_name='domain')
 print(f'Domain dataframe is {domain_df}')
 freq_df = pivot_existed_df(freq_df, tab_name='frequency')
@@ -107,7 +107,7 @@ model_info_df = get_model_info_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
 # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
-def init_leaderboard(ori_dataframe, model_info_df, sort_val: str|None = None):
+def init_leaderboard(ori_dataframe, model_info_df, sort_val: str | list | None = None):
     if ori_dataframe is None or ori_dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     model_info_col_list = [c.name for c in fields(ModelInfoColumn) if c.displayed_by_default if c.name not in ['#Params (B)', 'available_on_hub', 'hub', 'Model sha','Hub License']]
@@ -120,11 +120,19 @@ def init_leaderboard(ori_dataframe, model_info_df, sort_val: str|None = None):
     new_cols = ['T'] + [col for col in merged_df.columns if col != 'T']
     merged_df = merged_df[new_cols]
     if sort_val:
-        if sort_val in merged_df.columns:
+        if isinstance(sort_val, list):
+            assert sort_val[0] == 'TestData Leakage'
+            # ipdb.set_trace()
+            leakage_order = pd.Categorical(merged_df[sort_val[0]], categories=['No', 'Yes', 'N/A'], ordered=True)
+            merged_df['leakage_order'] = leakage_order
+            merged_df = merged_df.sort_values(by=['leakage_order', sort_val[1]])
+            merged_df = merged_df.drop(columns=['leakage_order'])
+        elif sort_val in merged_df.columns:
             merged_df = merged_df.sort_values(by=[sort_val])
         else:
             print(f'Warning: cannot sort by {sort_val}')
     print('Merged df: ', merged_df)
+    # ipdb.set_trace()
     # get the data type
     datatype_list = [col2type_dict[col] if col in col2type_dict else 'number' for col in merged_df.columns]
     # print('datatype_list: ', datatype_list)
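For context on the new list-valued `sort_val` branch: an ordered `pd.Categorical` makes 'No' sort before 'Yes' and 'N/A', so non-leaking models float to the top before the numeric tie-break on `sort_val[1]`. A minimal standalone sketch (column values invented for illustration):

```python
import pandas as pd

df = pd.DataFrame({
    "TestData Leakage": ["Yes", "No", "N/A", "No"],
    "MASE_Rank": [1.0, 3.0, 2.0, 4.0],
})
# Ordered categorical: 'No' < 'Yes' < 'N/A', so rows sort by leakage status
# first, then by MASE_Rank within each status.
order = pd.Categorical(df["TestData Leakage"], categories=["No", "Yes", "N/A"], ordered=True)
df = (df.assign(leakage_order=order)
        .sort_values(["leakage_order", "MASE_Rank"])
        .drop(columns="leakage_order"))
print(df)  # rows now ordered: (No, 3.0), (No, 4.0), (Yes, 1.0), (N/A, 2.0)
```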
 
@@ -162,13 +170,13 @@ def init_leaderboard(ori_dataframe, model_info_df, sort_val: str|None = None):
         # ],
         filter_columns=[
             ColumnFilter(ModelInfoColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(ModelInfoColumn.testdata_leakage.name, type="checkboxgroup", label="TestData Leakage"),
         ],
         # bool_checkboxgroup_label="",
         column_widths=[40, 150] + [180 for _ in range(len(merged_df.columns)-2)],
         interactive=False,
     )
 
-
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -176,7 +184,8 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem('🏅 Overall', elem_id="llm-benchmark-tab-table", id=5):
-            leaderboard = init_leaderboard(overall_df, model_info_df, sort_val='Rank')
+            # leaderboard = init_leaderboard(overall_df, model_info_df, sort_val='Rank')
+            leaderboard = init_leaderboard(overall_df, model_info_df, sort_val=['TestData Leakage', 'MASE_Rank'])
             print(f'FINAL Overall LEADERBOARD {overall_df}')
         with gr.TabItem("🏅 By Domain", elem_id="llm-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(domain_df, model_info_df)
@@ -196,6 +205,26 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
+    # Trigger the column filters once on initial load so default selections take effect
+    demo.load(
+        js="""
+        () => {
+            // Make the JS fire one legitimate `input` event once the checkboxgroup
+            // component is ready. `querySelector` looks for the *wrapper* div Gradio
+            // puts around the checkbox-group.
+            const target = document.querySelector(
+                'div[data-testid="checkboxgroup-model types"]');
+
+            if (!target) { return []; } // safety guard
+
+            // Ask Gradio’s front-end to re-compute its filters:
+            target.dispatchEvent(new Event('input', { bubbles: true }));
+
+            return []; // load() must return something
+        }
+        """
+    )
+
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
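The `demo.load(js=...)` block above nudges Gradio's client-side filtering once at startup by dispatching a synthetic `input` event on the checkbox-group wrapper. A hedged alternative that stays in Python, assuming a Gradio version where `Blocks.load` accepts `fn`/`outputs` and `gr.update` is available (the component below is hypothetical, not the leaderboard's actual filter):

```python
import gradio as gr

with gr.Blocks() as sketch:
    leakage_filter = gr.CheckboxGroup(
        choices=["No", "Yes", "N/A"], value=["No"], label="TestData Leakage"
    )
    # Re-emitting the default value on load fires the component's listeners,
    # so any filtering wired to `leakage_filter` runs once at startup.
    sketch.load(fn=lambda: gr.update(value=["No"]), outputs=leakage_filter)
```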
src/about.py CHANGED
@@ -44,7 +44,12 @@ points, spanning seven domains, 10 frequencies, multivariate inputs, and predict
 LLM_BENCHMARKS_TEXT = f"""
 ## Update Log
 
-- 2025-07-24: Corrected the Naive and Seasonal Naive scores to match the latest GIFT-Eval notebooks. Most model rankings remain unchanged; only a few near the bottom shifted slightly (AutoETS and Timer each dropped two places, to 35th and 36th respectively, while NBEATS moved up one place to 27th).
+### 2025-07-24
+- Corrected the Naive and Seasonal Naive scores to match the latest GIFT-Eval notebooks. Most model rankings remain unchanged; only a few near the bottom shifted slightly (AutoETS and Timer each dropped two places, to 35th and 36th respectively, while NBEATS moved up one place to 27th).
+
+### 2025-08-05
+- Added new columns to the leaderboard: Organization, TestData Leakage, and MASE_Rank. TestData Leakage is a binary indicator specifying whether any test data was present in the training set. MASE_Rank reflects the model's ranking based on the MASE metric, aligned with the ranking scheme used for CRPS_Rank. These additions were made in response to multiple requests from independent groups seeking fairer comparisons. With these updates, the leaderboard now supports sorting by models that do not leak test data, and viewers can rank models by either MASE_Rank or CRPS_Rank, depending on their use case.
+- Added a new model type, Agentic, to indicate submissions that use an agentic system to generate forecasts.
 
 ## How It Works
 
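The MASE_Rank column described in the 2025-08-05 entry follows the same per-window scheme as CRPS_Rank (see the `Rank_MASE` computation in src/utils.py below): each model is ranked within every evaluation window by the metric, and the leaderboard value is the mean of those ranks. A toy sketch with invented numbers:

```python
import pandas as pd

df = pd.DataFrame({
    "model": ["A", "B", "A", "B"],
    "dataset": ["d1", "d1", "d2", "d2"],
    "MASE": [0.8, 1.1, 0.9, 0.7],
})
# Rank within each window (here just `dataset`), then average per model.
df["Rank_MASE"] = df.groupby("dataset")["MASE"].rank(method="first", ascending=True)
print(df.groupby("model")["Rank_MASE"].mean())  # A: 1.5, B: 1.5
```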
src/display/utils.py CHANGED
@@ -34,6 +34,8 @@ model_info_dict.append(["license", ColumnContent, ColumnContent("Hub License", "
 model_info_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False, True)])
 model_info_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False, True)])
 model_info_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+model_info_dict.append(["org", ColumnContent, ColumnContent("Organization", "str", True, hidden=False)])
+model_info_dict.append(["testdata_leakage", ColumnContent, ColumnContent("TestData Leakage", "str", True, hidden=False)])
 # model_info_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
@@ -59,9 +61,11 @@ class ModelDetails:
 
 class ModelType(Enum):
     PT = ModelDetails(name="🟢 pretrained", symbol="🟢")
-    FT = ModelDetails(name="🔶 fine-tuned", symbol="🔶")
+    FT = ModelDetails(name="🟣 fine-tuned", symbol="🟣")
+    AG = ModelDetails(name="🟡 agentic", symbol="🟡")
     DL = ModelDetails(name="🔷 deep-learning", symbol="🔷")
-    ST = ModelDetails(name="🟣 statistical", symbol="🟣")
+    ST = ModelDetails(name="🔶 statistical", symbol="🔶")
+
 
     Unknown = ModelDetails(name="", symbol="?")
 
@@ -74,6 +78,8 @@ class ModelType(Enum):
             return ModelType.FT
         if "pretrained" in type or "🟢" in type:
             return ModelType.PT
+        if "agentic" in type or "🟡" in type:
+            return ModelType.AG
         if "deep-learning" in type or "🟦" in type:
             return ModelType.DL
         if "statistical" in type or "🟣" in type:
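Note the symbol reshuffle above: 🟣 now marks fine-tuned models (previously statistical), 🔶 marks statistical ones, and 🟡 is reserved for the new agentic type. A self-contained usage sketch of the enum pattern from the diff (`ModelDetails` fields reconstructed from the calls shown):

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class ModelDetails:
    name: str
    symbol: str

class ModelType(Enum):
    AG = ModelDetails(name="🟡 agentic", symbol="🟡")

# Each enum member carries its display name and symbol via its value.
print(ModelType.AG.value.name)    # "🟡 agentic"
print(ModelType.AG.value.symbol)  # "🟡"
```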
src/leaderboard/read_evals.py CHANGED
@@ -21,9 +21,11 @@ class ModelConfig:
     model_type: ModelType = ModelType.Unknown
     code_link: str = ""
     precision: Precision = Precision.Unknown
+    org: str = ""
     license: str = "?"
     likes: int = 0
     num_params: int | str = 0
+    testdata_leakage: str = "NA"
 
     @classmethod
     def init_from_json_file(cls, json_filepath):
@@ -39,7 +41,10 @@ class ModelConfig:
         model = data.get("model", "")
         model_link = data.get("model_link", "")
         code_link = data.get("code_link", "")
-        return cls(model=model, model_link=model_link, model_type=model_type, code_link=code_link, precision=precision)
+        org = data.get("org", "")
+        testdata_leakage = data.get("testdata_leakage", "N/A")
+        return cls(model=model, model_link=model_link, model_type=model_type, code_link=code_link, org=org,
+                   precision=precision, testdata_leakage=testdata_leakage)
 
     def to_dict(self):
         """Converts the model info to a dict compatible with our dataframe display"""
@@ -53,6 +58,8 @@ class ModelConfig:
             ModelInfoColumn.license.name: self.license,
             ModelInfoColumn.likes.name: self.likes,
             ModelInfoColumn.params.name: self.num_params,
+            ModelInfoColumn.org.name: self.org,
+            ModelInfoColumn.testdata_leakage.name: self.testdata_leakage,
         }
 
         return data_dict
@@ -63,14 +70,14 @@ class EvalResult:
     """
     eval_name: str # org_model_precision (uid)
     full_model: str # org/model (path on hub)
-    org: str
+    org: str
     model: str
     revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
@@ -130,7 +137,7 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
+            precision=precision,
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture
@@ -237,6 +244,7 @@ def get_model_info(results_path: str, requests_path: str) -> list[ModelConfig]:
     for v in model_infos.values():
         try:
             v.to_dict() # we test if the dict version is complete
+            # ipdb.set_trace()
             results.append(v)
         except KeyError: # not all eval values present
             continue
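For reference, `init_from_json_file` now reads two extra keys from each request file. A hedged sketch of such a file (key names taken from the `data.get(...)` calls in the diff; all values invented):

```python
import json

example_request = {
    "model": "my-model",
    "model_link": "https://huggingface.co/example-org/my-model",
    "code_link": "https://github.com/example-org/my-model",
    "org": "Example Org",
    "testdata_leakage": "No",  # falls back to "N/A" via data.get(...) when absent
}
print(json.dumps(example_request, indent=2))
```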
src/utils.py CHANGED
@@ -48,7 +48,8 @@ def rename_metrics(df):
     df = df.rename(columns={
         'eval_metrics/MASE[0.5]': 'MASE',
         'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS',
-        'rank': 'Rank'
+        'rank': 'CRPS_Rank',
+        'Rank_MASE': 'MASE_Rank',
     })
     return df
 
@@ -105,6 +106,59 @@ def pivot_existed_df(df, tab_name):
     return df_pivot
 
 
+def get_all_res_dfs(root_dir='results', ds_properties='results/dataset_properties.csv'):
+    df_list = []
+
+    # Walk through all folders and subfolders in the root directory
+    for subdir, _, files in os.walk(root_dir):
+        for file in files:
+            if file == 'all_results.csv':
+                file_path = os.path.join(subdir, file)
+                df = pd.read_csv(file_path)
+                df_list.append(df)
+    # Concatenate all dataframes into one
+    all_results_df = pd.concat(df_list, ignore_index=True)
+    all_results_df = all_results_df.sort_values(by=['model', 'dataset']).reset_index(drop=True)
+    all_results_df[['dataset', 'frequency', 'term_length']] = all_results_df['dataset'].str.split('/', expand=True)
+
+    dataset_properties = pd.read_csv(ds_properties)
+    # Reformat the first element of each row after the header following these rules:
+    # 1. make all characters lowercase
+    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.lower())
+    # 2. replace all spaces with underscores
+    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.replace(' ', '_'))
+    # 3. replace all dashes with underscores
+    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.replace('-', '_'))
+    # 4. replace consecutive underscores with a single underscore (there may be more than 2 in a row)
+    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: re.sub('_+', '_', x))
+    # 5. remove all leading and trailing underscores
+    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.strip('_'))
+
+    df = all_results_df
+
+    # convert to a dictionary with dataset as the key; each value is an inner dict of column name -> value
+    dataset_properties_dict = dataset_properties.set_index('dataset').T.to_dict('dict')
+    dataset_properties_dict.keys()
+
+    # match the dataset name in dataset_properties_dict with the dataset name in df and add a new column for each key/value pair in the inner dictionary
+    for dataset in dataset_properties_dict.keys():
+        for key in dataset_properties_dict[dataset].keys():
+            # set the rows for this dataset to the property value:
+            # first select the rows whose dataset matches,
+            # then assign the value of this key to that column
+            if key == 'frequency':
+                # only set the frequency if the frequency column is empty (NaN) for all rows of the dataset
+                if all(df[df['dataset'] == dataset]['frequency'].isna()):
+                    df.loc[df['dataset'] == dataset, key] = dataset_properties_dict[dataset][key]
+            else:
+                df.loc[df['dataset'] == dataset, key] = dataset_properties_dict[dataset][key]
+
+    # unify the frequency
+    df = unify_freq(df)
+    # standardize by seasonal naive
+    df = standardize_df(df)
+    ipdb.set_trace()
+    return None
 def get_grouped_dfs(root_dir='results', ds_properties='results/dataset_properties.csv'):
     df_list = []
 
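A tiny worked example of the five dataset-name normalization rules in `get_all_res_dfs` (the input string is invented):

```python
import re

name = "LOOP--Seattle  Hourly-"
name = name.lower()             # 1. lowercase
name = name.replace(' ', '_')   # 2. spaces -> underscores
name = name.replace('-', '_')   # 3. dashes -> underscores
name = re.sub('_+', '_', name)  # 4. collapse repeated underscores
name = name.strip('_')          # 5. strip leading/trailing underscores
print(name)  # -> "loop_seattle_hourly"
```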
@@ -163,6 +217,24 @@ def get_grouped_dfs(root_dir='results', ds_properties='results/dataset_propertie
     RANKING_METRIC = "eval_metrics/mean_weighted_sum_quantile_loss"
     df['rank'] = df.groupby(['dataset', 'term_length', 'frequency'])[f'{RANKING_METRIC}'].rank(method='first',
                                                                                                ascending=True)
+    MASE_RANKING_METRICS = "eval_metrics/MASE[0.5]"
+    df['Rank_MASE'] = df.groupby(['dataset', 'term_length', 'frequency'])[f'{MASE_RANKING_METRICS}'].rank(method='first',
+                                                                                                          ascending=True)
+    # Add a new column to identify top-5 CRPS ranks
+    df['is_top5_CRPS'] = df['rank'] <= 5
+
+    # Count top-5 performances for each model
+    top5_counts = df.groupby('model')['is_top5_CRPS'].sum().reset_index()
+    top5_counts = top5_counts.rename(columns={'is_top5_CRPS': 'count_top5_CRPS'})
+
+    # Add a new column to identify top-5 MASE ranks
+    df['is_top5_MASE'] = df['Rank_MASE'] <= 5
+
+    # Count top-5 performances for each model
+    top5_counts_MASE = df.groupby('model')['is_top5_MASE'].sum().reset_index()
+    top5_counts_MASE = top5_counts_MASE.rename(columns={'is_top5_MASE': 'count_top5_MASE'})
+
+    # ipdb.set_trace()
     # create a new column called rank
     metric_columns.append('rank')
     # create a new column called univariate. Set it to true if column num_variates is 1, otherwise set it to false
@@ -173,7 +245,20 @@ def get_grouped_dfs(root_dir='results', ds_properties='results/dataset_propertie
     # ipdb.set_trace()
     grouped_results_overall = df.groupby(['model'])[METRIC_CHOICES].agg(stats.gmean)
     grouped_results_overall_rank = df.groupby(['model'])[['rank']].mean()
-    grouped_results_overall = pd.concat([grouped_results_overall, grouped_results_overall_rank], axis=1)
+    grouped_results_overall_rank_mase = df.groupby(['model'])[['Rank_MASE']].mean()
+    grouped_results_overall = pd.concat([grouped_results_overall, grouped_results_overall_rank, grouped_results_overall_rank_mase], axis=1)
+
+    # grouped_results_overall_avg_rank = pd.concat([grouped_results_overall_rank, grouped_results_overall_rank_mase], axis=1)
+    # grouped_results_overall_avg_rank['Avg_Rank'] = grouped_results_overall_avg_rank.mean(axis=1)
+    # grouped_results_overall_avg_rank = grouped_results_overall_avg_rank[['Avg_Rank']]
+    # grouped_results_overall = pd.concat([grouped_results_overall, grouped_results_overall_avg_rank], axis=1)
+    # switch the order to ['model', 'eval_metrics/MASE[0.5]', 'Rank_MASE', 'eval_metrics/mean_weighted_sum_quantile_loss', 'rank']
+    # grouped_results_overall = grouped_results_overall[['eval_metrics/MASE[0.5]', 'Rank_MASE', 'eval_metrics/mean_weighted_sum_quantile_loss', 'rank', 'Avg_Rank']]
+    grouped_results_overall = grouped_results_overall[['eval_metrics/MASE[0.5]', 'Rank_MASE', 'eval_metrics/mean_weighted_sum_quantile_loss', 'rank']]
+    # ipdb.set_trace()
+    # Add top-5 performance statistics to the overall results
+    # grouped_results_overall = pd.merge(grouped_results_overall, top5_counts, on='model')
+    # grouped_results_overall = pd.merge(grouped_results_overall, top5_counts_MASE, on='model')
 
     # grouped_results_overall = grouped_results_overall.rename(columns={'model':'Model'})
     # grouped_results.to_csv(f'artefacts/grouped_results_by_model.csv')
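The overall table above aggregates each metric with a geometric mean per model (`stats.gmean`) while the rank columns are aggregated with a plain mean. A minimal sketch of the gmean aggregation (numbers invented):

```python
import pandas as pd
from scipy import stats

df = pd.DataFrame({"model": ["A", "A", "B", "B"],
                   "CRPS": [0.5, 2.0, 1.0, 1.0]})
# Geometric mean per model: gmean([0.5, 2.0]) == 1.0, gmean([1.0, 1.0]) == 1.0.
print(df.groupby("model")["CRPS"].agg(stats.gmean))
```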