Spaces:

TIGER-Lab
/

MMEB-Leaderboard

Running on CPU Upgrade

App Files Files Community

153

add visdoc sub task scores

#52

by MINGYISU - opened Jun 15, 2025

base: refs/heads/main

←

from: refs/pr/52

Discussion Files changed

+39

-40

Files changed (3) hide show

app.py +18 -7
utils.py +5 -22
utils_v2.py +16 -11

app.py CHANGED Viewed

@@ -11,12 +11,9 @@ def update_table(query, min_size, max_size, selected_tasks=None):
         filtered_df = filtered_df[selected_columns]
     return filtered_df
-def update_table_v2(query, min_size, max_size, selected_tasks=None):
     df = v2.get_df()
     filtered_df = v2.search_and_filter_models(df, query, min_size, max_size)
-    if selected_tasks and len(selected_tasks) > 0:
-        selected_columns = v2.BASE_COLS + selected_tasks
-        filtered_df = filtered_df[selected_columns]
     return filtered_df
 with gr.Blocks() as block:
@@ -42,6 +39,7 @@ with gr.Blocks() as block:
                     elem_id="search-bar"
                 )
             df2 = v2.get_df()
             min_size2, max_size2 = get_size_range(df2)
@@ -92,11 +90,25 @@ with gr.Blocks() as block:
             )
             refresh_button2.click(fn=v2.refresh_data, outputs=data_component2)
         # table 2, image scores only
         with gr.TabItem("🖼️ Image", elem_id="qa-tab-table1", id=2):
             gr.Markdown(v2.TABLE_INTRODUCTION_I)
             data_component3 = gr.components.Dataframe(
-                value=v2.rank_models(df2[v2.COLUMN_NAMES_I], 'Image-Overall'),
                 headers=v2.COLUMN_NAMES_I,
                 type="pandas",
                 datatype=v2.DATA_TITLE_TYPE_I,
@@ -122,7 +134,7 @@ with gr.Blocks() as block:
         with gr.TabItem("📑 Visual Doc", elem_id="qa-tab-table1", id=4):
             gr.Markdown(v2.TABLE_INTRODUCTION_D)
             data_component5 = gr.components.Dataframe(
-                value=v2.rank_models(df2[v2.COLUMN_NAMES_D], 'VisDoc'),
                 headers=v2.COLUMN_NAMES_D,
                 type="pandas",
                 datatype=v2.DATA_TITLE_TYPE_D,
@@ -160,7 +172,6 @@ with gr.Blocks() as block:
                     elem_id="search-bar"
                 )
-            df = get_df()
             min_size, max_size = get_size_range(df)
             with gr.Row():

         filtered_df = filtered_df[selected_columns]
     return filtered_df
+def update_table_v2(query, min_size, max_size):
     df = v2.get_df()
     filtered_df = v2.search_and_filter_models(df, query, min_size, max_size)
     return filtered_df
 with gr.Blocks() as block:
                     elem_id="search-bar"
                 )
+            df = get_df()
             df2 = v2.get_df()
             min_size2, max_size2 = get_size_range(df2)
             )
             refresh_button2.click(fn=v2.refresh_data, outputs=data_component2)
+        def get_special_processed_df2():
+            """Temporary special processing to merge v1 scores with v2 image scores.
+            Will be removed later after v2 is fully adopted."""
+            df2_i = df2[v2.COLUMN_NAMES_I]
+            df1 = df.rename(columns={'V1-Overall': 'Image-Overall'})
+            df1 = df1[v2.BASE_COLS + v2.SUB_TASKS_I + ['Image-Overall']]
+            combined_df = pd.concat([df1, df2_i], ignore_index=True)
+            for task in v2.TASKS_I:
+                combined_df[task] = combined_df[task].apply(lambda score: '-' if pd.isna(score) else score)
+            combined_df = v2.rank_models(combined_df, 'Image-Overall')
+            return combined_df[v2.COLUMN_NAMES_I]
         # table 2, image scores only
         with gr.TabItem("🖼️ Image", elem_id="qa-tab-table1", id=2):
             gr.Markdown(v2.TABLE_INTRODUCTION_I)
+            df2_i = get_special_processed_df2()
             data_component3 = gr.components.Dataframe(
+                value=df2_i,
                 headers=v2.COLUMN_NAMES_I,
                 type="pandas",
                 datatype=v2.DATA_TITLE_TYPE_I,
         with gr.TabItem("📑 Visual Doc", elem_id="qa-tab-table1", id=4):
             gr.Markdown(v2.TABLE_INTRODUCTION_D)
             data_component5 = gr.components.Dataframe(
+                value=v2.rank_models(df2[v2.COLUMN_NAMES_D], 'Visdoc-Overall'),
                 headers=v2.COLUMN_NAMES_D,
                 type="pandas",
                 datatype=v2.DATA_TITLE_TYPE_D,
                     elem_id="search-bar"
                 )
             min_size, max_size = get_size_range(df)
             with gr.Row():

utils.py CHANGED Viewed

@@ -38,8 +38,8 @@ This comprehensive suite enables robust evaluation of multimodal embedding model
 | [**🤗Hugging Face**](https://huggingface.co/datasets/TIGER-Lab/MMEB-V2) |
 """
-TABLE_INTRODUCTION = """***Important Notes:*** \n
-**We will be depreciating the MMEB-V1 leaderboard soon, and we will be releasing MMEB-V2 with more detailed scores and automatic evaluation.** \n"""
 LEADERBOARD_INFO = """
 ## Dataset Summary
@@ -57,9 +57,9 @@ SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction
 ## ⚠ Please note that you need to submit the JSON file with the following format:
-### ***Important Notes: We have released MMEB-V2 and will deprecate MMEB-V1 soon. All further submissions should be made using the V2 format (see following).***
 ### ***In V2, the detailed scores of each dataset will be included, and our code will automatically generate the results and calculate the overall scores. See the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for more information.***
-### **A V2 Submission would look like this:**
 ```json
 {
     "metadata": {
@@ -103,23 +103,6 @@ SUBMIT_INTRODUCTION = """# Submit on MMEB Leaderboard Introduction
     }
 }
 ```
-### **TO SUBMIT V1 ONLY (Depreciated, but we still accept this format until 2025-06-30)**
-```json
-[
-    {
-        "Model": "<Model Name>",
-        "URL": "<Model URL>" or null,
-        "Model Size(B)": 1000 or null,
-        "Data Source": "Self-Reported",
-        "V1-Overall": 50.0,
-        "I-CLS": 50.0,
-        "I-QA": 50.0,
-        "I-RET": 50.0,
-        "I-VG": 50.0
-    },
-]
-```
 Please refer to the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for detailed instructions about evaluating your model. \n
 To submit, create a pull request and upload the generated JSON file to the ***scores*** folder, then send us an email at m7su@uwaterloo.ca, including your model's information. \n We will review your submission and update the leaderboard accordingly. \n
 Please also share any feedback or suggestions you have for improving the leaderboard experience. We appreciate your contributions to the MMEB community!
@@ -195,7 +178,7 @@ def process_model_size(size):
         return 'unknown'
     try:
         val = float(size)
-        return val
     except (ValueError, TypeError):
         return 'unknown'

 | [**🤗Hugging Face**](https://huggingface.co/datasets/TIGER-Lab/MMEB-V2) |
 """
+TABLE_INTRODUCTION = """***Important Notes:***
+This is the MMEB-V1 leaderboard, which is now deprecated. MMEB-V1 is now the Image section of MMEB-V2, and the results on this leaderboard have been integrated into the MMEB-V2 Image tab. For researchers relying on MMEB-V1, we recommend transitioning to MMEB-V2 for more comprehensive evaluation metrics and support. Thank you for your cooperation and understanding! \n"""
 LEADERBOARD_INFO = """
 ## Dataset Summary
 ## ⚠ Please note that you need to submit the JSON file with the following format:
+### ***Important Notes: We have released MMEB-V2, and MMEB-V1 is now deprecated.*** \n
+### ***All further submissions should be made using the V2 format (see following).*** \n
 ### ***In V2, the detailed scores of each dataset will be included, and our code will automatically generate the results and calculate the overall scores. See the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for more information.***
 ```json
 {
     "metadata": {
     }
 }
 ```
 Please refer to the [**GitHub page**](https://github.com/TIGER-AI-Lab/VLM2Vec) for detailed instructions about evaluating your model. \n
 To submit, create a pull request and upload the generated JSON file to the ***scores*** folder, then send us an email at m7su@uwaterloo.ca, including your model's information. \n We will review your submission and update the leaderboard accordingly. \n
 Please also share any feedback or suggestions you have for improving the leaderboard experience. We appreciate your contributions to the MMEB community!
         return 'unknown'
     try:
         val = float(size)
+        return round(val, 3)
     except (ValueError, TypeError):
         return 'unknown'

utils_v2.py CHANGED Viewed

@@ -20,7 +20,10 @@ DATASETS = {
         "I-VG": ['MSCOCO', 'RefCOCO', 'RefCOCO-Matching', 'Visual7W']
         },
     "visdoc": {
-        "VisDoc": ['ViDoRe_arxivqa', 'ViDoRe_docvqa', 'ViDoRe_infovqa', 'ViDoRe_tabfquad', 'ViDoRe_tatdqa', 'ViDoRe_shiftproject', 'ViDoRe_syntheticDocQA_artificial_intelligence', 'ViDoRe_syntheticDocQA_energy', 'ViDoRe_syntheticDocQA_government_reports', 'ViDoRe_syntheticDocQA_healthcare_industry', 'VisRAG_ArxivQA', 'VisRAG_ChartQA', 'VisRAG_MP-DocVQA', 'VisRAG_SlideVQA', 'VisRAG_InfoVQA', 'VisRAG_PlotQA', 'ViDoSeek-page', 'ViDoSeek-doc', 'MMLongBench-page', 'MMLongBench-doc', "ViDoRe_esg_reports_human_labeled_v2", "ViDoRe_biomedical_lectures_v2", "ViDoRe_biomedical_lectures_v2_multilingual", "ViDoRe_economics_reports_v2", "ViDoRe_economics_reports_v2_multilingual", "ViDoRe_esg_reports_v2", "ViDoRe_esg_reports_v2_multilingual"]
         },
     "video": {
         "V-CLS": ['K700', 'UCF101', 'HMDB51', 'SmthSmthV2', 'Breakfast'],
@@ -37,27 +40,29 @@ SPECIAL_METRICS = {
 }
 BASE_COLS = ['Rank', 'Models', 'Model Size(B)']
-TASKS = ["Overall", "I-CLS", "I-QA", "I-RET", "I-VG", "VisDoc", "V-CLS", "V-QA", "V-RET", "V-MRET"]
 BASE_DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown']
-COLUMN_NAMES = BASE_COLS + ["Overall", 'Image-Overall', 'Video-Overall', 'VisDoc']
 DATA_TITLE_TYPE = BASE_DATA_TITLE_TYPE + \
                     ['number'] * 3
-TASKS_I = ['Image-Overall'] + TASKS[1:5] + ALL_DATASETS_SPLITS['image']
 COLUMN_NAMES_I = BASE_COLS + TASKS_I
 DATA_TITLE_TYPE_I = BASE_DATA_TITLE_TYPE + \
-                    ['number'] * (len(TASKS_I) + 4)
-TASKS_V = ['Video-Overall'] + TASKS[6:10] + ALL_DATASETS_SPLITS['video']
 COLUMN_NAMES_V = BASE_COLS + TASKS_V
 DATA_TITLE_TYPE_V = BASE_DATA_TITLE_TYPE + \
-                    ['number'] * (len(TASKS_V) + 4)
-TASKS_D = ['VisDoc'] + ALL_DATASETS_SPLITS['visdoc']
 COLUMN_NAMES_D = BASE_COLS + TASKS_D
 DATA_TITLE_TYPE_D = BASE_DATA_TITLE_TYPE + \
-                    ['number'] * len(TASKS_D)
 TABLE_INTRODUCTION = """**MMEB**: Massive MultiModal Embedding Benchmark \n
                         Models are ranked based on **Overall**"""
@@ -147,10 +152,10 @@ def generate_model_row(data):
     row.update(scores)
     return row
-def rank_models(df, column='Overall'):
     """Ranks the models based on the specific score."""
     df = df.sort_values(by=column, ascending=False).reset_index(drop=True)
-    df['Rank'] = range(1, len(df) + 1)
     return df
 def get_df():

         "I-VG": ['MSCOCO', 'RefCOCO', 'RefCOCO-Matching', 'Visual7W']
         },
     "visdoc": {
+        "ViDoRe-V1": ['ViDoRe_arxivqa', 'ViDoRe_docvqa', 'ViDoRe_infovqa', 'ViDoRe_tabfquad', 'ViDoRe_tatdqa', 'ViDoRe_shiftproject', 'ViDoRe_syntheticDocQA_artificial_intelligence', 'ViDoRe_syntheticDocQA_energy', 'ViDoRe_syntheticDocQA_government_reports', 'ViDoRe_syntheticDocQA_healthcare_industry'],
+        "ViDoRe-V2": ["ViDoRe_esg_reports_human_labeled_v2", "ViDoRe_biomedical_lectures_v2", "ViDoRe_economics_reports_v2", "ViDoRe_esg_reports_v2"],  # The following multilingual splits have been dropped: "ViDoRe_biomedical_lectures_v2_multilingual", "ViDoRe_economics_reports_v2_multilingual", "ViDoRe_esg_reports_v2_multilingual"
+        "VisRAG": ['VisRAG_ArxivQA', 'VisRAG_ChartQA', 'VisRAG_MP-DocVQA', 'VisRAG_SlideVQA', 'VisRAG_InfoVQA', 'VisRAG_PlotQA'],
+        "VisDoc-OOD": ['ViDoSeek-page', 'ViDoSeek-doc', 'MMLongBench-page', 'MMLongBench-doc']
         },
     "video": {
         "V-CLS": ['K700', 'UCF101', 'HMDB51', 'SmthSmthV2', 'Breakfast'],
 }
 BASE_COLS = ['Rank', 'Models', 'Model Size(B)']
 BASE_DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown']
+COLUMN_NAMES = BASE_COLS + ["Overall", 'Image-Overall', 'Video-Overall', 'Visdoc-Overall']
 DATA_TITLE_TYPE = BASE_DATA_TITLE_TYPE + \
                     ['number'] * 3
+SUB_TASKS_I = ["I-CLS", "I-QA", "I-RET", "I-VG"]
+TASKS_I = ['Image-Overall'] + SUB_TASKS_I + ALL_DATASETS_SPLITS['image']
 COLUMN_NAMES_I = BASE_COLS + TASKS_I
 DATA_TITLE_TYPE_I = BASE_DATA_TITLE_TYPE + \
+                    ['number'] * len(TASKS_I + SUB_TASKS_I)
+SUB_TASKS_V = ["V-CLS", "V-QA", "V-RET", "V-MRET"]
+TASKS_V = ['Video-Overall'] + SUB_TASKS_V + ALL_DATASETS_SPLITS['video']
 COLUMN_NAMES_V = BASE_COLS + TASKS_V
 DATA_TITLE_TYPE_V = BASE_DATA_TITLE_TYPE + \
+                    ['number'] * len(TASKS_V + SUB_TASKS_V)
+SUB_TASKS_D = ['ViDoRe-V1', 'ViDoRe-V2', 'VisRAG', 'VisDoc-OOD']
+TASKS_D = ['Visdoc-Overall'] + SUB_TASKS_D + ALL_DATASETS_SPLITS['visdoc']
 COLUMN_NAMES_D = BASE_COLS + TASKS_D
 DATA_TITLE_TYPE_D = BASE_DATA_TITLE_TYPE + \
+                    ['number'] * len(TASKS_D + SUB_TASKS_D)
 TABLE_INTRODUCTION = """**MMEB**: Massive MultiModal Embedding Benchmark \n
                         Models are ranked based on **Overall**"""
     row.update(scores)
     return row
+def rank_models(df, column='Overall', rank_name='Rank'):
     """Ranks the models based on the specific score."""
     df = df.sort_values(by=column, ascending=False).reset_index(drop=True)
+    df[rank_name] = range(1, len(df) + 1)
     return df
 def get_df():