PhysCodeBench-Leaderboard

Build error

App Files Files Community

Sealical committed on Mar 19, 2025

Commit

0c53877

1 Parent(s): 3f70be6

update space

Browse files

Files changed (1) hide show

app.py +101 -42

app.py CHANGED Viewed

@@ -3,6 +3,9 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 import os
 import json
 from huggingface_hub import snapshot_download
 # Constants for PhysicalCodeBench
@@ -72,7 +75,7 @@ SUBMISSION_TEXT = """
    ├── evaluation_results/   # Directory containing all result files
    └── PhysCodeEval_results.json  # Main evaluation results file
    ```
-5. Submit a pull request with your results
 Your submission will be verified and added to the leaderboard once approved.
 """
@@ -111,7 +114,7 @@ COLUMNS = [
     PhysCodeColumn("rank", "number", True, True, False),
     PhysCodeColumn("model", "str", True, True, False),
     PhysCodeColumn("model_type", "str", True, False, False),
-    #PhysCodeColumn("params", "number", True, False, False),
     PhysCodeColumn("text_score", "number", True, False, False),
     PhysCodeColumn("visual_score", "number", True, False, False),
     PhysCodeColumn("total_score", "number", True, False, False),
@@ -120,8 +123,7 @@ COLUMNS = [
     PhysCodeColumn("execution_success", "number", False, False, False),
     PhysCodeColumn("file_generation", "number", False, False, False),
     PhysCodeColumn("file_quality", "number", False, False, False),
-    PhysCodeColumn("submission_date", "date", False, False, False),
-    PhysCodeColumn("license", "str", False, False, False)
 ]
 # Enums for model metadata
@@ -144,7 +146,7 @@ def get_leaderboard_df():
             "rank": 1,
             "model": "GPT4o",
             "model_type": ModelType.CloseSource,
-            "params": 1000,
             "text_score": 16.0,
             "visual_score": 18.262,
             "total_score": 34.262,
@@ -153,14 +155,13 @@ def get_leaderboard_df():
             "execution_success": 10.0,
             "file_generation": 3.0,
             "file_quality": 3.0,
-            "submission_date": "2025-01-15",
-            "license": "Proprietary"
         },
         {
             "rank": 2,
             "model": "Gemini-2.0-flash",
             "model_type": ModelType.CloseSource,
-            "params": 450,
             "text_score": 15.0,
             "visual_score": 16.963,
             "total_score": 31.963,
@@ -169,14 +170,13 @@ def get_leaderboard_df():
             "execution_success": 9.0,
             "file_generation": 3.0,
             "file_quality": 3.0,
-            "submission_date": "2025-01-20",
-            "license": "Proprietary"
         },
         {
             "rank": 3,
             "model": "DS-R1",
             "model_type": ModelType.OpenSource,
-            "params": 32,
             "text_score": 14.0,
             "visual_score": 15.815,
             "total_score": 29.815,
@@ -185,14 +185,13 @@ def get_leaderboard_df():
             "execution_success": 8.5,
             "file_generation": 3.0,
             "file_quality": 2.5,
-            "submission_date": "2025-01-25",
-            "license": "Apache 2.0"
         },
         {
             "rank": 4,
             "model": "DeepSeek-R1-Distill-Qwen-32B",
             "model_type": ModelType.OpenSource,
-            "params": 32,
             "text_score": 12.2,
             "visual_score": 15.82,
             "total_score": 28.02,
@@ -201,14 +200,13 @@ def get_leaderboard_df():
             "execution_success": 7.2,
             "file_generation": 2.5,
             "file_quality": 2.5,
-            "submission_date": "2025-01-28",
-            "license": "Apache 2.0"
         },
         {
             "rank": 5,
             "model": "QwQ-32B",
             "model_type": ModelType.OpenSource,
-            "params": 32,
             "text_score": 7.1,
             "visual_score": 8.964,
             "total_score": 16.064,
@@ -217,14 +215,13 @@ def get_leaderboard_df():
             "execution_success": 4.1,
             "file_generation": 1.5,
             "file_quality": 1.5,
-            "submission_date": "2025-02-05",
-            "license": "Apache 2.0"
         },
         {
             "rank": 6,
             "model": "Qwen-2.5-32B",
             "model_type": ModelType.OpenSource,
-            "params": 32,
             "text_score": 0.7,
             "visual_score": 1.126,
             "total_score": 1.826,
@@ -233,8 +230,7 @@ def get_leaderboard_df():
             "execution_success": 0.5,
             "file_generation": 0.1,
             "file_quality": 0.1,
-            "submission_date": "2025-02-10",
-            "license": "Apache 2.0"
         }
     ]
@@ -261,25 +257,73 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in COLUMNS if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=["model", "license"],
         hide_columns=[c.name for c in COLUMNS if c.hidden],
         filter_columns=[
             ColumnFilter("model_type", type="checkboxgroup", label="Model types"),
-            # ColumnFilter(
-            #     "params",
-            #     type="slider",
-            #     min=0.01,
-            #     max=1500,
-            #     label="Select the number of parameters (B)",
-            # ),
         ],
         interactive=False,
     )
 # Submission form handling
-def process_submission(model_name, model_type, license_type, submission_link):
     # This would be implemented to handle actual submission processing
-    return f"Thank you for submitting {model_name}! Your submission will be reviewed and added to the leaderboard once verified."
 # Main application
 def create_demo():
@@ -322,27 +366,42 @@ def create_demo():
             with gr.TabItem("🚀 Submit", id=3):
                 gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
                 with gr.Row():
                     with gr.Column():
-                        model_name_input = gr.Textbox(label="Model Name")
                         model_type_input = gr.Dropdown(
-                            choices=["CloseSource", "Open Source", "API"],
-                            label="Model Type",
                             multiselect=False,
                         )
-                        #params_input = gr.Number(label="Parameters (billions)")
                     with gr.Column():
-                        license_input = gr.Textbox(label="License")
                         submission_link_input = gr.Textbox(label="GitHub Pull Request URL")
-                submit_button = gr.Button("Submit")
-                submission_result = gr.Markdown()
-                submit_button.click(
                     process_submission,
-                    [model_name_input, model_type_input, license_input, submission_link_input],
-                    submission_result,
                 )
         with gr.Row():

 import pandas as pd
 import os
 import json
+import tempfile
+import shutil
+import zipfile
 from huggingface_hub import snapshot_download
 # Constants for PhysicalCodeBench
    ├── evaluation_results/   # Directory containing all result files
    └── PhysCodeEval_results.json  # Main evaluation results file
    ```
+5. Submit your results by uploading a ZIP file below or via the form
 Your submission will be verified and added to the leaderboard once approved.
 """
     PhysCodeColumn("rank", "number", True, True, False),
     PhysCodeColumn("model", "str", True, True, False),
     PhysCodeColumn("model_type", "str", True, False, False),
+    PhysCodeColumn("organization", "str", True, False, False),
     PhysCodeColumn("text_score", "number", True, False, False),
     PhysCodeColumn("visual_score", "number", True, False, False),
     PhysCodeColumn("total_score", "number", True, False, False),
     PhysCodeColumn("execution_success", "number", False, False, False),
     PhysCodeColumn("file_generation", "number", False, False, False),
     PhysCodeColumn("file_quality", "number", False, False, False),
+    PhysCodeColumn("submission_date", "date", False, False, False)
 ]
 # Enums for model metadata
             "rank": 1,
             "model": "GPT4o",
             "model_type": ModelType.CloseSource,
+            "organization": "OpenAI",
             "text_score": 16.0,
             "visual_score": 18.262,
             "total_score": 34.262,
             "execution_success": 10.0,
             "file_generation": 3.0,
             "file_quality": 3.0,
+            "submission_date": "2025-01-15"
         },
         {
             "rank": 2,
             "model": "Gemini-2.0-flash",
             "model_type": ModelType.CloseSource,
+            "organization": "Google",
             "text_score": 15.0,
             "visual_score": 16.963,
             "total_score": 31.963,
             "execution_success": 9.0,
             "file_generation": 3.0,
             "file_quality": 3.0,
+            "submission_date": "2025-01-20"
         },
         {
             "rank": 3,
             "model": "DS-R1",
             "model_type": ModelType.OpenSource,
+            "organization": "DeepSeek",
             "text_score": 14.0,
             "visual_score": 15.815,
             "total_score": 29.815,
             "execution_success": 8.5,
             "file_generation": 3.0,
             "file_quality": 2.5,
+            "submission_date": "2025-01-25"
         },
         {
             "rank": 4,
             "model": "DeepSeek-R1-Distill-Qwen-32B",
             "model_type": ModelType.OpenSource,
+            "organization": "DeepSeek",
             "text_score": 12.2,
             "visual_score": 15.82,
             "total_score": 28.02,
             "execution_success": 7.2,
             "file_generation": 2.5,
             "file_quality": 2.5,
+            "submission_date": "2025-01-28"
         },
         {
             "rank": 5,
             "model": "QwQ-32B",
             "model_type": ModelType.OpenSource,
+            "organization": "QwQ Team",
             "text_score": 7.1,
             "visual_score": 8.964,
             "total_score": 16.064,
             "execution_success": 4.1,
             "file_generation": 1.5,
             "file_quality": 1.5,
+            "submission_date": "2025-02-05"
         },
         {
             "rank": 6,
             "model": "Qwen-2.5-32B",
             "model_type": ModelType.OpenSource,
+            "organization": "Alibaba",
             "text_score": 0.7,
             "visual_score": 1.126,
             "total_score": 1.826,
             "execution_success": 0.5,
             "file_generation": 0.1,
             "file_quality": 0.1,
+            "submission_date": "2025-02-10"
         }
     ]
             cant_deselect=[c.name for c in COLUMNS if c.never_hidden],
             label="Select Columns to Display:",
         ),
+        search_columns=["model", "organization"],
         hide_columns=[c.name for c in COLUMNS if c.hidden],
         filter_columns=[
             ColumnFilter("model_type", type="checkboxgroup", label="Model types"),
+            ColumnFilter("organization", type="checkboxgroup", label="Organizations"),
         ],
         interactive=False,
     )
+# Function to handle ZIP file upload and extraction
def process_zip_submission(zip_file):
    """Validate an uploaded submission archive and report the outcome.

    The archive is unpacked into a throwaway directory, checked for
    ``model_info.json`` and ``PhysCodeEval_results.json`` at its top level,
    and ``model_info.json`` is checked for the required metadata fields.
    Nothing is persisted; the directory is removed before returning.

    Args:
        zip_file: The upload to check. Either ``None`` (nothing uploaded), a
            filesystem path (``str`` / ``os.PathLike``), or an object whose
            ``name`` attribute holds the path of the uploaded file.

    Returns:
        A human-readable status string. Failures are described in the
        returned text rather than raised, so the result can be shown
        directly to the user.
    """
    if zip_file is None:
        return "No file uploaded. Please upload a ZIP file containing your submission."

    # Accept a bare path as well as a file-like wrapper exposing ``.name``.
    # The isinstance check (rather than getattr) keeps pathlib.Path working,
    # whose own ``.name`` is only the final path component.
    if isinstance(zip_file, (str, os.PathLike)):
        zip_path = zip_file
    else:
        zip_path = zip_file.name

    required_fields = ["model_name", "model_type", "organization"]

    try:
        # The context manager removes the directory on every exit path.
        with tempfile.TemporaryDirectory() as temp_dir:
            # NOTE(review): this unpacks a user-supplied archive in full.
            # zipfile guards against path traversal in member names, but it
            # does not bound the uncompressed size -- consider a size cap
            # before exposing this to untrusted uploads at scale.
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(temp_dir)

            # Both files must sit at the top level of the archive.
            model_info_path = os.path.join(temp_dir, "model_info.json")
            results_json_path = os.path.join(temp_dir, "PhysCodeEval_results.json")

            if not os.path.isfile(model_info_path):
                return "Error: model_info.json not found in the ZIP file."
            if not os.path.isfile(results_json_path):
                return "Error: PhysCodeEval_results.json not found in the ZIP file."

            # Decode explicitly (tolerating a UTF-8 BOM) instead of relying on
            # the platform default encoding.
            try:
                with open(model_info_path, "r", encoding="utf-8-sig") as f:
                    model_info = json.load(f)
            except ValueError:
                # Covers both malformed JSON and undecodable bytes.
                return "Error: model_info.json is not valid JSON."

            # A top-level list/string/number would make the field checks
            # below meaningless, so reject it up front.
            if not isinstance(model_info, dict):
                return "Error: model_info.json must contain a JSON object."

            # A field counts as missing when absent, null, or blank.
            missing_fields = []
            for field in required_fields:
                value = model_info.get(field)
                if value is None or (isinstance(value, str) and not value.strip()):
                    missing_fields.append(field)
            if missing_fields:
                return f"Error: Missing required fields in model_info.json: {', '.join(missing_fields)}"

            # TODO: Process the submission files (this would involve your validation logic)

            return f"Successfully processed submission for {model_info['model_name']} by {model_info['organization']}. Your submission will be reviewed and added to the leaderboard once approved."
    except zipfile.BadZipFile:
        return "Error: Invalid ZIP file."
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"Error processing submission: {str(e)}"
 # Submission form handling
def _clean_form_text(value):
    """Return *value* stripped of surrounding whitespace; ``None`` becomes ``""``."""
    if value is None:
        return ""
    return value.strip() if isinstance(value, str) else value


def process_submission(model_name, model_type, organization, team_name, email, submission_link):
    """Validate the submission form fields and build the confirmation message.

    Args:
        model_name: Name of the submitted model (required).
        model_type: Selected model type (required).
        organization: Submitting organization; optional.
        team_name: Submitting team, used when no organization is given;
            optional.
        email: Contact email address (required).
        submission_link: Pull-request URL; accepted but not yet used.

    Returns:
        An ``"Error: ..."`` string naming the first missing required field,
        otherwise a thank-you message for the submitter.
    """
    # Normalise first so whitespace-only input counts as empty.
    model_name = _clean_form_text(model_name)
    model_type = _clean_form_text(model_type)
    organization = _clean_form_text(organization)
    team_name = _clean_form_text(team_name)
    email = _clean_form_text(email)

    # Check for required fields
    if not model_name:
        return "Error: Model name is required."
    if not model_type:
        return "Error: Model type is required."
    if not email:
        return "Error: Contact email is required."

    # Both affiliation fields are optional; leave the "from ..." clause out
    # entirely rather than rendering "from !" or "from None".
    submitter = organization or team_name
    origin = f" from {submitter}" if submitter else ""

    # This would be implemented to handle actual submission processing
    return f"Thank you for submitting {model_name}{origin}! Your submission will be reviewed and added to the leaderboard once verified. We will contact you at {email} if we need additional information."
 # Main application
 def create_demo():
             with gr.TabItem("🚀 Submit", id=3):
                 gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
+                gr.Markdown("### Option 1: Upload Submission ZIP File")
+                with gr.Row():
+                    zip_file_input = gr.File(label="Upload submission ZIP file")
+                zip_submit_button = gr.Button("Submit ZIP File")
+                zip_submission_result = gr.Markdown()
+                zip_submit_button.click(
+                    process_zip_submission,
+                    [zip_file_input],
+                    zip_submission_result,
+                )
+                gr.Markdown("### Option 2: Submit Form")
                 with gr.Row():
                     with gr.Column():
+                        model_name_input = gr.Textbox(label="Model Name*")
                         model_type_input = gr.Dropdown(
+                            choices=["Open Source", "Close Source", "API", "Proprietary"],
+                            label="Model Type*",
                             multiselect=False,
                         )
+                        organization_input = gr.Textbox(label="Organization (if applicable)")
                     with gr.Column():
+                        team_name_input = gr.Textbox(label="Team Name (if applicable)")
+                        email_input = gr.Textbox(label="Contact Email*")
                         submission_link_input = gr.Textbox(label="GitHub Pull Request URL")
+                form_submit_button = gr.Button("Submit Form")
+                form_submission_result = gr.Markdown()
+                form_submit_button.click(
                     process_submission,
+                    [model_name_input, model_type_input, organization_input, team_name_input, email_input, submission_link_input],
+                    form_submission_result,
                 )
         with gr.Row():