Commit 15c92e9
Alex committed
1 Parent(s): b4d9db9
zalupa
Browse files
- app.py +55 -1
- src/populate.py +35 -0
- src/submission/submit.py +130 -1
app.py
CHANGED
@@ -28,7 +28,7 @@ from src.display.utils import (
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
+from src.submission.submit import add_new_eval, add_manual_results
 
 
 def restart_space():
@@ -190,6 +190,60 @@ with demo:
         submission_result,
     )
 
+    # ----------------------------------------------------
+    # Manual metrics submission form
+    # ----------------------------------------------------
+    with gr.Markdown("## 📝 Submit metrics manually (advanced)"):
+        pass
+
+    with gr.Row():
+        with gr.Column():
+            model_name_metrics = gr.Textbox(label="Model name", placeholder="org/model")
+            revision_metrics = gr.Textbox(label="Revision commit", placeholder="main", value="main")
+            bleu_input = gr.Number(label="BLEU", value=0.5)
+            pass1_input = gr.Number(label="Pass@1", value=0.5, minimum=0.0, maximum=1.0)
+            pass5_input = gr.Number(label="Pass@5", value=0.5, minimum=0.0, maximum=1.0)
+            pass10_input = gr.Number(label="Pass@10", value=0.5, minimum=0.0, maximum=1.0)
+
+        with gr.Column():
+            # Subjective metrics sliders (0-5)
+            readability_slider = gr.Slider(0, 5, step=1, value=3, label="Readability")
+            relevance_slider = gr.Slider(0, 5, step=1, value=3, label="Relevance")
+            explanation_slider = gr.Slider(0, 5, step=1, value=3, label="Explanation clarity")
+            problem_slider = gr.Slider(0, 5, step=1, value=3, label="Problem identification")
+            actionability_slider = gr.Slider(0, 5, step=1, value=3, label="Actionability")
+            completeness_slider = gr.Slider(0, 5, step=1, value=3, label="Completeness")
+            specificity_slider = gr.Slider(0, 5, step=1, value=3, label="Specificity")
+            contextual_slider = gr.Slider(0, 5, step=1, value=3, label="Contextual adequacy")
+            consistency_slider = gr.Slider(0, 5, step=1, value=3, label="Consistency")
+            brevity_slider = gr.Slider(0, 5, step=1, value=3, label="Brevity")
+
+    submit_metrics_button = gr.Button("Submit Metrics")
+    metrics_submission_result = gr.Markdown()
+
+    submit_metrics_button.click(
+        add_manual_results,
+        [
+            model_name_metrics,
+            revision_metrics,
+            bleu_input,
+            readability_slider,
+            relevance_slider,
+            explanation_slider,
+            problem_slider,
+            actionability_slider,
+            completeness_slider,
+            specificity_slider,
+            contextual_slider,
+            consistency_slider,
+            brevity_slider,
+            pass1_input,
+            pass5_input,
+            pass10_input,
+        ],
+        metrics_submission_result,
+    )
+
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
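Note: the .click() wiring above passes the sixteen components positionally, so their order must match the parameter order of add_manual_results as defined in src/submission/submit.py further down in this commit. A minimal sketch of an equivalent direct call, with purely illustrative values; running it outside the Space would still attempt a real upload to RESULTS_REPO and therefore needs valid credentials:

# Sketch only: calls the handler directly with made-up scores, in the
# same positional order as the Gradio component list above.
result_md = add_manual_results(
    "org/model",       # model (illustrative)
    "main",            # revision
    0.42,              # bleu
    4, 4, 3,           # readability, relevance, explanation_clarity
    3, 4, 3,           # problem_identification, actionability, completeness
    3, 4, 4, 2,        # specificity, contextual_adequacy, consistency, brevity
    0.30, 0.45, 0.55,  # pass_at_1, pass_at_5, pass_at_10
)
print(result_md)  # styled success or error markup returned by the handler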
src/populate.py
CHANGED
@@ -14,6 +14,35 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
+
+    # ------------------------------------------------------------------
+    # Fallback: if no evaluation results are found we populate the
+    # leaderboard with a single example model. This guarantees that a
+    # freshly deployed Space shows a non-empty leaderboard and it serves
+    # as a template for the expected columns/values.
+    # ------------------------------------------------------------------
+    if df.empty:
+        example_row = {}
+
+        # Populate benchmark metrics with the default value 0.5
+        for metric in benchmark_cols:
+            example_row[metric] = 0.5
+
+        # Minimal metadata so that the row displays nicely
+        example_row[AutoEvalColumn.model.name] = make_clickable_model("example/model")
+        example_row[AutoEvalColumn.average.name] = 0.5
+        example_row[AutoEvalColumn.model_type_symbol.name] = "🟢"
+        example_row[AutoEvalColumn.model_type.name] = "pretrained"
+        example_row[AutoEvalColumn.precision.name] = "float16"
+        example_row[AutoEvalColumn.weight_type.name] = "Original"
+        example_row[AutoEvalColumn.still_on_hub.name] = True
+        example_row[AutoEvalColumn.architecture.name] = "Transformer"
+        example_row[AutoEvalColumn.revision.name] = "main"
+        example_row[AutoEvalColumn.license.name] = "apache-2.0"
+
+        # Any missing columns will be created later in the function
+        df = pd.DataFrame([example_row])
+
     # Sort primarily by LLM exact-match Pass@1 metric; if not present, fall back to average
     preferred_cols = []
     if hasattr(AutoEvalColumn, "pass_at_1"):
@@ -24,6 +53,12 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
         if col in df.columns:
            df = df.sort_values(by=[col], ascending=False)
            break
+
+    # Ensure all expected columns exist, add missing ones with NaN so selection does not fail
+    for expected in cols:
+        if expected not in df.columns:
+            df[expected] = pd.NA
+
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
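Two defensive additions here: the fallback example row keeps a freshly deployed Space from rendering an empty leaderboard, and the column back-fill keeps the later df = df[cols] selection from raising a KeyError when a result file lacks a metric. A small self-contained sketch of the back-fill pattern, with column names made up for illustration:

import pandas as pd

cols = ["model", "bleu", "pass_at_1"]                          # expected columns (illustrative)
df = pd.DataFrame([{"model": "example/model", "bleu": 0.5}])   # "pass_at_1" is missing

for expected in cols:
    if expected not in df.columns:
        df[expected] = pd.NA      # back-fill so the selection below cannot fail

df = df[cols].round(decimals=2)   # missing metric now shows up as <NA>
print(df)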
src/submission/submit.py
CHANGED
@@ -3,7 +3,7 @@ import os
 from datetime import datetime, timezone
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
@@ -117,3 +117,132 @@ def add_new_eval(
     return styled_message(
         "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
     )
+
+# --------------------------------------------------------
+# Manual metrics submission (bypass evaluation queue)
+# --------------------------------------------------------
+
+ALL_SUBJECTIVE_FIELDS = [
+    "readability",
+    "relevance",
+    "explanation_clarity",
+    "problem_identification",
+    "actionability",
+    "completeness",
+    "specificity",
+    "contextual_adequacy",
+    "consistency",
+    "brevity",
+]
+
+def _compute_multimetric(payload: dict) -> float:
+    """Average of the 10 subjective metrics."""
+    total = sum(float(payload[f]) for f in ALL_SUBJECTIVE_FIELDS)
+    return total / len(ALL_SUBJECTIVE_FIELDS)
+
+def add_manual_results(
+    model: str,
+    revision: str,
+    bleu: float,
+    readability: int,
+    relevance: int,
+    explanation_clarity: int,
+    problem_identification: int,
+    actionability: int,
+    completeness: int,
+    specificity: int,
+    contextual_adequacy: int,
+    consistency: int,
+    brevity: int,
+    pass_at_1: float,
+    pass_at_5: float,
+    pass_at_10: float,
+):
+    """Directly submit evaluation metrics for a model and push them to the results dataset."""
+
+    # Basic validation
+    if model == "":
+        return styled_error("Please specify a model name.")
+
+    if revision == "":
+        revision = "main"
+
+    if pass_at_5 < pass_at_1:
+        return styled_error("pass@5 must be greater or equal to pass@1")
+    if pass_at_10 < pass_at_5:
+        return styled_error("pass@10 must be greater or equal to pass@5")
+
+    # Prepare dictionary in the same format used by read_evals.py
+    payload_dict = {
+        "model": model,
+        "revision": revision,
+        "bleu": bleu,
+        "readability": readability,
+        "relevance": relevance,
+        "explanation_clarity": explanation_clarity,
+        "problem_identification": problem_identification,
+        "actionability": actionability,
+        "completeness": completeness,
+        "specificity": specificity,
+        "contextual_adequacy": contextual_adequacy,
+        "consistency": consistency,
+        "brevity": brevity,
+        "pass_at_1": pass_at_1,
+        "pass_at_5": pass_at_5,
+        "pass_at_10": pass_at_10,
+    }
+
+    multimetric = _compute_multimetric(payload_dict)
+
+    # Compose final results file (same structure as api_submit_results)
+    result_json = {
+        "config": {
+            "model_dtype": "unknown",
+            "model_name": model,
+            "model_sha": revision,
+        },
+        "results": {
+            "bleu": {"score": bleu},
+            "multimetric": {"score": multimetric},
+            "pass_at_1": {"score": pass_at_1},
+            "pass_at_5": {"score": pass_at_5},
+            "pass_at_10": {"score": pass_at_10},
+        },
+    }
+
+    # Add subjective metrics
+    for field in ALL_SUBJECTIVE_FIELDS:
+        result_json["results"][field] = {"score": payload_dict[field]}
+
+    # Write file locally then upload
+    try:
+        os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
+    except Exception:
+        pass
+
+    from datetime import datetime, timezone
+    import uuid
+
+    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+    unique_id = uuid.uuid4().hex[:8]
+    filename = f"results_{model.replace('/', '_')}_{ts}_{unique_id}.json"
+    local_path = os.path.join(EVAL_RESULTS_PATH, filename)
+
+    try:
+        with open(local_path, "w") as fp:
+            json.dump(result_json, fp)
+
+        API.upload_file(
+            path_or_fileobj=local_path,
+            path_in_repo=filename,
+            repo_id=RESULTS_REPO,
+            repo_type="dataset",
+            commit_message=f"Add manual results for {model}",
+        )
+    except Exception as e:
+        return styled_error(f"Failed to upload results: {e}")
+    finally:
+        if os.path.exists(local_path):
+            os.remove(local_path)
+
+    return styled_message("Metrics successfully submitted! The leaderboard will refresh shortly.")
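For reference, _compute_multimetric is a plain arithmetic mean of the ten subjective scores, and the uploaded file follows the config/results layout referenced in the comments above (read_evals.py / api_submit_results). A standalone sketch with made-up scores shows the shape of the JSON that gets pushed to RESULTS_REPO:

import json

# Made-up subjective scores on the 0-5 scale used by the sliders.
subjective = {
    "readability": 4, "relevance": 4, "explanation_clarity": 3,
    "problem_identification": 3, "actionability": 4, "completeness": 3,
    "specificity": 3, "contextual_adequacy": 4, "consistency": 4, "brevity": 2,
}
multimetric = sum(subjective.values()) / len(subjective)  # arithmetic mean = 3.4

result_json = {
    "config": {"model_dtype": "unknown", "model_name": "org/model", "model_sha": "main"},
    "results": {
        "bleu": {"score": 0.42},
        "multimetric": {"score": multimetric},
        "pass_at_1": {"score": 0.30},
        "pass_at_5": {"score": 0.45},
        "pass_at_10": {"score": 0.55},
        **{name: {"score": score} for name, score in subjective.items()},
    },
}
print(json.dumps(result_json, indent=2))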