taagarwa committed on
Commit
77a435c
·
1 Parent(s): 4b9a7ba

🎨 Clean up leaderboard

Browse files
Files changed (3) hide show
  1. app.py +8 -20
  2. src/display/text_blocks.py +0 -4
  3. src/leaderboard.py +4 -4
app.py CHANGED
@@ -51,8 +51,6 @@ from src.leaderboard import get_leaderboard_df, get_benchmark_run_df
51
  from src.display.text_blocks import (
52
  INTRODUCTION_TEXT,
53
  LLM_BENCHMARKS_TEXT,
54
- CITATION_BUTTON_LABEL,
55
- CITATION_BUTTON_TEXT,
56
  )
57
 
58
  REPO_ID = "taagarwa/coding-agent-leaderboard"
@@ -103,12 +101,12 @@ def init_leaderboard(dataframe):
103
  raise ValueError("Leaderboard DataFrame is empty or None.")
104
 
105
  label_choices = [("🟠 Fully FOSS", "🟠"), ("🔶 Proprietary", "🔶")]
106
- meta_columns = [" ", "Harness", "Model ID", "Harness License", "Model License", "Model Num Params (B)", "Precision"]
107
  benchmark_columns = [col for col in dataframe.columns if col not in meta_columns]
108
- model_choices = sorted({(extract_body(v), v) for v in dataframe["Model ID"]})
109
  harness_choices = sorted({(extract_body(v), v) for v in dataframe["Harness"]})
110
 
111
- default_columns = [" ", "Harness", "Model ID"] + benchmark_columns
112
  return Leaderboard(
113
  value=dataframe,
114
  select_columns=SelectColumns(
@@ -116,10 +114,10 @@ def init_leaderboard(dataframe):
116
  label="Select Columns to Display:",
117
  ),
118
  datatype="markdown",
119
- search_columns=["Harness", "Model ID"],
120
  filter_columns=[
121
  ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=label_choices),
122
- ColumnFilter(label="Model", column="Model ID", type="checkboxgroup", choices=model_choices),
123
  ColumnFilter(label="Harness", column="Harness", type="checkboxgroup", choices=harness_choices),
124
  ColumnFilter(label="Number of Parameters (B)", column="Model Num Params (B)", type="slider"),
125
  ColumnFilter(label="Precision", column="Precision", type="checkboxgroup"),
@@ -140,10 +138,10 @@ def init_benchmark_runs(dataframe):
140
  select_columns=SelectColumns(
141
  default_selection=[
142
  " ",
143
- "Benchmark",
144
- "Harness",
145
  "Model",
146
- "Model ID",
 
 
147
  "Precision",
148
  "Environment",
149
  "Score",
@@ -181,16 +179,6 @@ with demo:
181
  with gr.Tab("📝 About"):
182
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
183
 
184
- with gr.Row():
185
- with gr.Accordion("📙 Citation", open=False):
186
- citation_button = gr.Textbox(
187
- value=CITATION_BUTTON_TEXT,
188
- label=CITATION_BUTTON_LABEL,
189
- lines=20,
190
- elem_id="citation-button",
191
- show_copy_button=True,
192
- )
193
-
194
  scheduler = BackgroundScheduler()
195
  scheduler.add_job(restart_space, "interval", seconds=1800)
196
  scheduler.start()
 
51
  from src.display.text_blocks import (
52
  INTRODUCTION_TEXT,
53
  LLM_BENCHMARKS_TEXT,
 
 
54
  )
55
 
56
  REPO_ID = "taagarwa/coding-agent-leaderboard"
 
101
  raise ValueError("Leaderboard DataFrame is empty or None.")
102
 
103
  label_choices = [("🟠 Fully FOSS", "🟠"), ("🔶 Proprietary", "🔶")]
104
+ meta_columns = [" ", "Harness", "Model", "Harness License", "Model License", "Model Num Params (B)", "Precision"]
105
  benchmark_columns = [col for col in dataframe.columns if col not in meta_columns]
106
+ model_choices = sorted({(extract_body(v), v) for v in dataframe["Model"]})
107
  harness_choices = sorted({(extract_body(v), v) for v in dataframe["Harness"]})
108
 
109
+ default_columns = [" ", "Harness", "Model"] + benchmark_columns
110
  return Leaderboard(
111
  value=dataframe,
112
  select_columns=SelectColumns(
 
114
  label="Select Columns to Display:",
115
  ),
116
  datatype="markdown",
117
+ search_columns=["Harness", "Model"],
118
  filter_columns=[
119
  ColumnFilter(label="Category", column=" ", type="checkboxgroup", choices=label_choices),
120
+ ColumnFilter(label="Model", column="Model", type="checkboxgroup", choices=model_choices),
121
  ColumnFilter(label="Harness", column="Harness", type="checkboxgroup", choices=harness_choices),
122
  ColumnFilter(label="Number of Parameters (B)", column="Model Num Params (B)", type="slider"),
123
  ColumnFilter(label="Precision", column="Precision", type="checkboxgroup"),
 
138
  select_columns=SelectColumns(
139
  default_selection=[
140
  " ",
 
 
141
  "Model",
142
+ "Harness",
143
+ "Benchmark",
144
+ "Base Model",
145
  "Precision",
146
  "Environment",
147
  "Score",
 
179
  with gr.Tab("📝 About"):
180
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
181
 
 
 
 
 
 
 
 
 
 
 
182
  scheduler = BackgroundScheduler()
183
  scheduler.add_job(restart_space, "interval", seconds=1800)
184
  scheduler.start()
src/display/text_blocks.py CHANGED
@@ -33,7 +33,3 @@ A coding agent is a system that autonomously solves software engineering tasks -
33
 
34
  Visit the [GitHub repo](https://github.com/redhat-et/coding_agent_bench) for details about the project, methodology, and how to submit your own results.
35
  """
36
-
37
- CITATION_BUTTON_TEXT = "TBD"
38
-
39
- CITATION_BUTTON_LABEL = "Citation"
 
33
 
34
  Visit the [GitHub repo](https://github.com/redhat-et/coding_agent_bench) for details about the project, methodology, and how to submit your own results.
35
  """
 
 
 
 
src/leaderboard.py CHANGED
@@ -46,8 +46,8 @@ def get_leaderboard_df():
46
  avg_score = sum(benchmarks.values()) / len(benchmarks)
47
  row = {
48
  " ": "🟠" if model.is_oss and harness.is_oss else "🔶",
 
49
  "Harness": f'[{harness.name}]({harness.url})<sup>*</sup>' if result.harness.name == "internal" else f'[{harness.name}]({harness.url})',
50
- "Model ID": f'[{model.repo}]({model.url})',
51
  "Precision": model.precision,
52
  "Model License": "FOSS" if model.is_oss else "Proprietary",
53
  "Harness License": "FOSS" if harness.is_oss else "Proprietary",
@@ -75,10 +75,10 @@ def get_benchmark_run_df():
75
  rows.append(
76
  {
77
  " ": "🟠" if result.model.is_oss and result.harness.is_oss else "🔶",
78
- "Benchmark": f'[{result.benchmark.name}]({result.benchmark.url})',
79
  "Harness": f'[{result.harness.name}]({result.harness.url})<sup>*</sup>' if result.harness.name == "internal" else f'[{result.harness.name}]({result.harness.url})',
80
- "Model": result.model.name,
81
- "Model ID": f'[{result.model.repo}]({result.model.url})',
82
  "Precision": result.model.precision,
83
  "Skills": str(result.harness.skills) if result.harness.skills else "None",
84
  "Environment": f'[{result.environment.name}]({result.environment.url})<sup>*</sup>' if result.environment.name == "internal" else f'[{result.environment.name}]({result.environment.url})',
 
46
  avg_score = sum(benchmarks.values()) / len(benchmarks)
47
  row = {
48
  " ": "🟠" if model.is_oss and harness.is_oss else "🔶",
49
+ "Model": f'[{model.repo}]({model.url})',
50
  "Harness": f'[{harness.name}]({harness.url})<sup>*</sup>' if result.harness.name == "internal" else f'[{harness.name}]({harness.url})',
 
51
  "Precision": model.precision,
52
  "Model License": "FOSS" if model.is_oss else "Proprietary",
53
  "Harness License": "FOSS" if harness.is_oss else "Proprietary",
 
75
  rows.append(
76
  {
77
  " ": "🟠" if result.model.is_oss and result.harness.is_oss else "🔶",
78
+ "Model": f'[{result.model.repo}]({result.model.url})',
79
  "Harness": f'[{result.harness.name}]({result.harness.url})<sup>*</sup>' if result.harness.name == "internal" else f'[{result.harness.name}]({result.harness.url})',
80
+ "Benchmark": f'[{result.benchmark.name}]({result.benchmark.url})',
81
+ "Base Model": result.model.name,
82
  "Precision": result.model.precision,
83
  "Skills": str(result.harness.skills) if result.harness.skills else "None",
84
  "Environment": f'[{result.environment.name}]({result.environment.url})<sup>*</sup>' if result.environment.name == "internal" else f'[{result.environment.name}]({result.environment.url})',