CP-Bench-Leaderboard

Running

App Files Files Community

kostis-init committed on Jun 6

Commit

444cb2e

1 Parent(s): 0030c6c

Add base LLM and modelling framework to submission metadata; update leaderboard columns

Browse files

Files changed (3) hide show

src/config.py +1 -1
src/hf_utils.py +29 -7
src/ui.py +14 -7

src/config.py CHANGED Viewed

@@ -7,7 +7,7 @@ DS_RESULTS_PATH = "results"
 # leaderboard
 # LDB_COLS = ["Submission Name", "Solution Found (%)", "Consistency (%)", "Final Solution Accuracy (%)", "# of Models submitted"]
-LDB_COLS = ["Name", "Models Submitted (%)", "Accuracy (%)", "Runtime Errors (%)"]
 # modelling frameworks

 # leaderboard
 # LDB_COLS = ["Submission Name", "Solution Found (%)", "Consistency (%)", "Final Solution Accuracy (%)", "# of Models submitted"]
+LDB_COLS = ["Name", 'Modelling Framework', 'Base LLM', "Models Submitted (%)", "Accuracy (%)", "Runtime Errors (%)"]
 # modelling frameworks

src/hf_utils.py CHANGED Viewed

@@ -43,17 +43,38 @@ def load_leaderboard_data():
             if f.endswith("submission.jsonl") and f.startswith(DS_SUBMISSIONS_PATH + "/")
         ]
         # for file_path in summary_files:
         for file_path in submissions:
             dir_name = Path(file_path).parent.name
             if dir_name in processed_result_dirs:
                 continue
             processed_result_dirs.add(dir_name)
             entry = {LDB_COLS[0]: dir_name,
-                     LDB_COLS[1]: '*Calculating...*',
-                     LDB_COLS[2]: '*Calculating...*',
-                     LDB_COLS[3]: '*Calculating...*'}
             # check if summary file exists, otherwise skip
             if f"{DS_RESULTS_PATH}/{dir_name}/summary.txt" not in repo_files:
@@ -72,11 +93,11 @@ def load_leaderboard_data():
                 with open(local_summary_path, "r", encoding="utf-8") as f:
                     for line in f:
                         if 'Error perc' in line:
-                            entry[LDB_COLS[3]] = float(line.split(":")[1].strip().replace("%", ""))
                         if 'Final Solution Accuracy' in line:
-                            entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
                         if 'Submission coverage perc' in line:
-                            entry[LDB_COLS[1]] = float(line.split(":")[1].strip().replace("%", ""))
                 os.remove(local_summary_path)
             else:
                 print(f"Warning: Summary file {local_summary_path} does not exist or is empty.")
@@ -92,7 +113,7 @@ def load_leaderboard_data():
     return pd.DataFrame(leaderboard_entries)
-def upload_submission(uploaded_file, dir_name, report_file, model_framework):
     """Upload submission to Hugging Face Dataset."""
     if not HF_API:
         return False, "Hugging Face API not initialized"
@@ -119,6 +140,7 @@ def upload_submission(uploaded_file, dir_name, report_file, model_framework):
         metadata = {
             "submission_name": dir_name,
             "modelling_framework": model_framework,
         }
         HF_API.upload_file(
             path_or_fileobj=io.BytesIO(json.dumps(metadata, indent=4).encode('utf-8')),

             if f.endswith("submission.jsonl") and f.startswith(DS_SUBMISSIONS_PATH + "/")
         ]
+        metadata_files = [
+            f for f in repo_files
+            if f.endswith("metadata.json") and f.startswith(DS_SUBMISSIONS_PATH + "/")
+        ]
         # for file_path in summary_files:
         for file_path in submissions:
             dir_name = Path(file_path).parent.name
             if dir_name in processed_result_dirs:
                 continue
+            # download metadata file of this submission
+            metadata_file = next((f for f in metadata_files if f.startswith(f"{DS_SUBMISSIONS_PATH}/{dir_name}/")), None)
+            if metadata_file:
+                local_metadata_path = hf_hub_download(
+                    repo_id=DATASET_REPO_ID,
+                    filename=metadata_file,
+                    repo_type="dataset",
+                    local_dir=os.path.join("local_hf_downloads", dir_name),
+                )
+                with open(local_metadata_path, "r", encoding="utf-8") as f:
+                    metadata = json.load(f)
+                os.remove(local_metadata_path)
             processed_result_dirs.add(dir_name)
             entry = {LDB_COLS[0]: dir_name,
+                     LDB_COLS[1]: metadata.get("modelling_framework", "Unknown"),
+                     LDB_COLS[2]: metadata.get("base_llm", "Unknown"),
+                     LDB_COLS[3]: '*Calculating...*',
+                     LDB_COLS[4]: '*Calculating...*',
+                     LDB_COLS[5]: '*Calculating...*'}
             # check if summary file exists, otherwise skip
             if f"{DS_RESULTS_PATH}/{dir_name}/summary.txt" not in repo_files:
                 with open(local_summary_path, "r", encoding="utf-8") as f:
                     for line in f:
                         if 'Error perc' in line:
+                            entry[LDB_COLS[4]] = float(line.split(":")[1].strip().replace("%", ""))
                         if 'Final Solution Accuracy' in line:
+                            entry[LDB_COLS[3]] = float(line.split(":")[1].strip().replace("%", ""))
                         if 'Submission coverage perc' in line:
+                            entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
                 os.remove(local_summary_path)
             else:
                 print(f"Warning: Summary file {local_summary_path} does not exist or is empty.")
     return pd.DataFrame(leaderboard_entries)
+def upload_submission(uploaded_file, dir_name, report_file, model_framework, base_llm):
     """Upload submission to Hugging Face Dataset."""
     if not HF_API:
         return False, "Hugging Face API not initialized"
         metadata = {
             "submission_name": dir_name,
             "modelling_framework": model_framework,
+            "base_llm": base_llm,
         }
         HF_API.upload_file(
             path_or_fileobj=io.BytesIO(json.dumps(metadata, indent=4).encode('utf-8')),

src/ui.py CHANGED Viewed

@@ -8,7 +8,7 @@ from src.hf_utils import load_leaderboard_data, upload_submission, check_name_ex
 from src.eval import start_background_evaluation
-def handle_upload(submission_name, uploaded_file, report_file, model_framework, progress=gr.Progress()):
     """Handle file upload and start evaluation."""
     if model_framework not in SUPPORTED_FRAMEWORKS:
         return f"Unsupported modelling framework: {model_framework}. Supported frameworks are: {', '.join(SUPPORTED_FRAMEWORKS)}"
@@ -22,13 +22,14 @@ def handle_upload(submission_name, uploaded_file, report_file, model_framework,
     # normalize the submission name
     submission_name = submission_name.strip().replace(" ", "_").lower()
     # keep only alphanumeric characters and underscores, restrict to 30 characters
-    submission_name = "".join(
-        c for c in submission_name if c.isalnum() or c == "_"
-    )[:30]
     if not submission_name or submission_name.strip() == "":
         return "Submission name is required"
     if check_name_exists(submission_name):
         return f"Submission name '{submission_name}' already exists. Please choose a different name."
@@ -50,7 +51,7 @@ def handle_upload(submission_name, uploaded_file, report_file, model_framework,
             if not found_one:
                 return "Empty file. Please upload a valid JSONL file."
-        success, result = upload_submission(uploaded_file, submission_name, report_file, model_framework)
         if not success:
             return f"Upload failed: {result}"
@@ -116,7 +117,7 @@ def create_ui():
                     label="Submission Name (required)",
                     placeholder="Enter a unique name for your submission",
                     interactive=True,
-                    info="This name will appear on the leaderboard"
                 )
                 model_framework = gr.Dropdown(
                     label="Modelling Framework (required)",
@@ -128,6 +129,12 @@ def create_ui():
                     allow_custom_value=False,
                     filterable=False,
                 )
                 with gr.Row():
                     report_file = gr.File(
@@ -153,7 +160,7 @@ def create_ui():
         # Event handlers
         upload_button.click(
             fn=handle_upload,
-            inputs=[submission_name, submission_file, report_file, model_framework],
             outputs=[status_box],
             show_progress="full",
         )

 from src.eval import start_background_evaluation
+def handle_upload(submission_name, uploaded_file, report_file, model_framework, base_llm, progress=gr.Progress()):
     """Handle file upload and start evaluation."""
     if model_framework not in SUPPORTED_FRAMEWORKS:
         return f"Unsupported modelling framework: {model_framework}. Supported frameworks are: {', '.join(SUPPORTED_FRAMEWORKS)}"
     # normalize the submission name
     submission_name = submission_name.strip().replace(" ", "_").lower()
     # keep only alphanumeric characters and underscores, restrict to 30 characters
+    submission_name = "".join(c for c in submission_name if c.isalnum() or c == "_")[:30]
     if not submission_name or submission_name.strip() == "":
         return "Submission name is required"
+    if not base_llm or base_llm.strip() == "":
+        return "Base LLM is required. Please specify the base language model used for generating the models."
     if check_name_exists(submission_name):
         return f"Submission name '{submission_name}' already exists. Please choose a different name."
             if not found_one:
                 return "Empty file. Please upload a valid JSONL file."
+        success, result = upload_submission(uploaded_file, submission_name, report_file, model_framework, base_llm)
         if not success:
             return f"Upload failed: {result}"
                     label="Submission Name (required)",
                     placeholder="Enter a unique name for your submission",
                     interactive=True,
+                    info="This name will appear on the leaderboard. It is recommended that it represents the approach you used to generate the models (e.g. 'smart_prompting')",
                 )
                 model_framework = gr.Dropdown(
                     label="Modelling Framework (required)",
                     allow_custom_value=False,
                     filterable=False,
                 )
+                base_llm = gr.Textbox(
+                    label="Base LLM (required)",
+                    placeholder="Enter the base LLM used for generating the models (e.g., GPT-4, Llama-3.3)",
+                    interactive=True,
+                    info="The base LLM used for generating the models."
+                )
                 with gr.Row():
                     report_file = gr.File(
         # Event handlers
         upload_button.click(
             fn=handle_upload,
+            inputs=[submission_name, submission_file, report_file, model_framework, base_llm],
             outputs=[status_box],
             show_progress="full",
         )