🎨 Clean up leaderboard
Browse files
- app.py +8 -20
- src/display/text_blocks.py +0 -4
- src/leaderboard.py +4 -4
app.py
CHANGED
|
@@ -51,8 +51,6 @@ from src.leaderboard import get_leaderboard_df, get_benchmark_run_df
|
|
| 51 |
from src.display.text_blocks import (
|
| 52 |
INTRODUCTION_TEXT,
|
| 53 |
LLM_BENCHMARKS_TEXT,
|
| 54 |
-
CITATION_BUTTON_LABEL,
|
| 55 |
-
CITATION_BUTTON_TEXT,
|
| 56 |
)
|
| 57 |
|
| 58 |
REPO_ID = "taagarwa/coding-agent-leaderboard"
|
|
@@ -103,12 +101,12 @@ def init_leaderboard(dataframe):
|
|
| 103 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 104 |
|
| 105 |
label_choices = [("π Fully FOSS", "π "), ("πΆ Proprietary", "πΆ")]
|
| 106 |
-
meta_columns = [" ", "Harness", "Model ID", "Harness License", "Model License", "Model Num Params (B)", "Precision"]
|
| 107 |
benchmark_columns = [col for col in dataframe.columns if col not in meta_columns]
|
| 108 |
-
model_choices = sorted({(extract_body(v), v) for v in dataframe["Model ID"]})
|
| 109 |
harness_choices = sorted({(extract_body(v), v) for v in dataframe["Harness"]})
|
| 110 |
|
| 111 |
-
default_columns = [" ", "Harness", "Model ID"] + benchmark_columns
|
| 112 |
return Leaderboard(
|
| 113 |
value=dataframe,
|
| 114 |
select_columns=SelectColumns(
|
|
@@ -116,10 +114,10 @@ def init_leaderboard(dataframe):
|
|
| 116 |
label="Select Columns to Display:",
|
| 117 |
),
|
| 118 |
datatype="markdown",
|
| 119 |
-
search_columns=["Harness", "Model ID"],
|
| 120 |
filter_columns=[
|
| 121 |
ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=label_choices),
|
| 122 |
-
ColumnFilter(label="Model", column="Model ID", type="checkboxgroup", choices=model_choices),
|
| 123 |
ColumnFilter(label="Harness", column="Harness", type="checkboxgroup", choices=harness_choices),
|
| 124 |
ColumnFilter(label="Number of Parameters (B)", column="Model Num Params (B)", type="slider"),
|
| 125 |
ColumnFilter(label="Precision", column="Precision", type="checkboxgroup"),
|
|
@@ -140,10 +138,10 @@ def init_benchmark_runs(dataframe):
|
|
| 140 |
select_columns=SelectColumns(
|
| 141 |
default_selection=[
|
| 142 |
" ",
|
| 143 |
-
"Benchmark",
|
| 144 |
-
"Harness",
|
| 145 |
"Model",
|
| 146 |
-
"
|
|
|
|
|
|
|
| 147 |
"Precision",
|
| 148 |
"Environment",
|
| 149 |
"Score",
|
|
@@ -181,16 +179,6 @@ with demo:
|
|
| 181 |
with gr.Tab("π About"):
|
| 182 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 183 |
|
| 184 |
-
with gr.Row():
|
| 185 |
-
with gr.Accordion("π Citation", open=False):
|
| 186 |
-
citation_button = gr.Textbox(
|
| 187 |
-
value=CITATION_BUTTON_TEXT,
|
| 188 |
-
label=CITATION_BUTTON_LABEL,
|
| 189 |
-
lines=20,
|
| 190 |
-
elem_id="citation-button",
|
| 191 |
-
show_copy_button=True,
|
| 192 |
-
)
|
| 193 |
-
|
| 194 |
scheduler = BackgroundScheduler()
|
| 195 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 196 |
scheduler.start()
|
|
|
|
| 51 |
from src.display.text_blocks import (
|
| 52 |
INTRODUCTION_TEXT,
|
| 53 |
LLM_BENCHMARKS_TEXT,
|
|
|
|
|
|
|
| 54 |
)
|
| 55 |
|
| 56 |
REPO_ID = "taagarwa/coding-agent-leaderboard"
|
|
|
|
| 101 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
| 102 |
|
| 103 |
label_choices = [("π Fully FOSS", "π "), ("πΆ Proprietary", "πΆ")]
|
| 104 |
+
meta_columns = [" ", "Harness", "Model", "Harness License", "Model License", "Model Num Params (B)", "Precision"]
|
| 105 |
benchmark_columns = [col for col in dataframe.columns if col not in meta_columns]
|
| 106 |
+
model_choices = sorted({(extract_body(v), v) for v in dataframe["Model"]})
|
| 107 |
harness_choices = sorted({(extract_body(v), v) for v in dataframe["Harness"]})
|
| 108 |
|
| 109 |
+
default_columns = [" ", "Harness", "Model"] + benchmark_columns
|
| 110 |
return Leaderboard(
|
| 111 |
value=dataframe,
|
| 112 |
select_columns=SelectColumns(
|
|
|
|
| 114 |
label="Select Columns to Display:",
|
| 115 |
),
|
| 116 |
datatype="markdown",
|
| 117 |
+
search_columns=["Harness", "Model"],
|
| 118 |
filter_columns=[
|
| 119 |
ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=label_choices),
|
| 120 |
+
ColumnFilter(label="Model", column="Model", type="checkboxgroup", choices=model_choices),
|
| 121 |
ColumnFilter(label="Harness", column="Harness", type="checkboxgroup", choices=harness_choices),
|
| 122 |
ColumnFilter(label="Number of Parameters (B)", column="Model Num Params (B)", type="slider"),
|
| 123 |
ColumnFilter(label="Precision", column="Precision", type="checkboxgroup"),
|
|
|
|
| 138 |
select_columns=SelectColumns(
|
| 139 |
default_selection=[
|
| 140 |
" ",
|
|
|
|
|
|
|
| 141 |
"Model",
|
| 142 |
+
"Harness",
|
| 143 |
+
"Benchmark",
|
| 144 |
+
"Base Model",
|
| 145 |
"Precision",
|
| 146 |
"Environment",
|
| 147 |
"Score",
|
|
|
|
| 179 |
with gr.Tab("π About"):
|
| 180 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
scheduler = BackgroundScheduler()
|
| 183 |
scheduler.add_job(restart_space, "interval", seconds=1800)
|
| 184 |
scheduler.start()
|
src/display/text_blocks.py
CHANGED
|
@@ -33,7 +33,3 @@ A coding agent is a system that autonomously solves software engineering tasks -
|
|
| 33 |
|
| 34 |
Visit the [GitHub repo](https://github.com/redhat-et/coding_agent_bench) for details about the project, methodology, and how to submit your own results.
|
| 35 |
"""
|
| 36 |
-
|
| 37 |
-
CITATION_BUTTON_TEXT = "TBD"
|
| 38 |
-
|
| 39 |
-
CITATION_BUTTON_LABEL = "Citation"
|
|
|
|
| 33 |
|
| 34 |
Visit the [GitHub repo](https://github.com/redhat-et/coding_agent_bench) for details about the project, methodology, and how to submit your own results.
|
| 35 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
src/leaderboard.py
CHANGED
|
@@ -46,8 +46,8 @@ def get_leaderboard_df():
|
|
| 46 |
avg_score = sum(benchmarks.values()) / len(benchmarks)
|
| 47 |
row = {
|
| 48 |
" ": "π " if model.is_oss and harness.is_oss else "πΆ",
|
|
|
|
| 49 |
"Harness": f'[{harness.name}]({harness.url})<sup>*</sup>' if result.harness.name == "internal" else f'[{harness.name}]({harness.url})',
|
| 50 |
-
"Model ID": f'[{model.repo}]({model.url})',
|
| 51 |
"Precision": model.precision,
|
| 52 |
"Model License": "FOSS" if model.is_oss else "Proprietary",
|
| 53 |
"Harness License": "FOSS" if harness.is_oss else "Proprietary",
|
|
@@ -75,10 +75,10 @@ def get_benchmark_run_df():
|
|
| 75 |
rows.append(
|
| 76 |
{
|
| 77 |
" ": "π " if result.model.is_oss and result.harness.is_oss else "πΆ",
|
| 78 |
-
"
|
| 79 |
"Harness": f'[{result.harness.name}]({result.harness.url})<sup>*</sup>' if result.harness.name == "internal" else f'[{result.harness.name}]({result.harness.url})',
|
| 80 |
-
"
|
| 81 |
-
"
|
| 82 |
"Precision": result.model.precision,
|
| 83 |
"Skills": str(result.harness.skills) if result.harness.skills else "None",
|
| 84 |
"Environment": f'[{result.environment.name}]({result.environment.url})<sup>*</sup>' if result.environment.name == "internal" else f'[{result.environment.name}]({result.environment.url})',
|
|
|
|
| 46 |
avg_score = sum(benchmarks.values()) / len(benchmarks)
|
| 47 |
row = {
|
| 48 |
" ": "π " if model.is_oss and harness.is_oss else "πΆ",
|
| 49 |
+
"Model": f'[{model.repo}]({model.url})',
|
| 50 |
"Harness": f'[{harness.name}]({harness.url})<sup>*</sup>' if result.harness.name == "internal" else f'[{harness.name}]({harness.url})',
|
|
|
|
| 51 |
"Precision": model.precision,
|
| 52 |
"Model License": "FOSS" if model.is_oss else "Proprietary",
|
| 53 |
"Harness License": "FOSS" if harness.is_oss else "Proprietary",
|
|
|
|
| 75 |
rows.append(
|
| 76 |
{
|
| 77 |
" ": "π " if result.model.is_oss and result.harness.is_oss else "πΆ",
|
| 78 |
+
"Model": f'[{result.model.repo}]({result.model.url})',
|
| 79 |
"Harness": f'[{result.harness.name}]({result.harness.url})<sup>*</sup>' if result.harness.name == "internal" else f'[{result.harness.name}]({result.harness.url})',
|
| 80 |
+
"Benchmark": f'[{result.benchmark.name}]({result.benchmark.url})',
|
| 81 |
+
"Base Model": result.model.name,
|
| 82 |
"Precision": result.model.precision,
|
| 83 |
"Skills": str(result.harness.skills) if result.harness.skills else "None",
|
| 84 |
"Environment": f'[{result.environment.name}]({result.environment.url})<sup>*</sup>' if result.environment.name == "internal" else f'[{result.environment.name}]({result.environment.url})',
|