taagarwa committed on
Commit
7a6725b
·
1 Parent(s): 456b71c

✨ Add visual formatting and theming

Browse files
Files changed (2) hide show
  1. app.py +31 -10
  2. src/display/text_blocks.py +27 -5
app.py CHANGED
@@ -18,14 +18,6 @@ REPO_ID = "taagarwa/coding-agent-leaderboard"
18
  TOKEN = os.environ.get("HF_TOKEN")
19
  API = HfApi(token=TOKEN)
20
 
21
- HEAD = """
22
- <head>
23
- <base target="_blank">
24
- </head>
25
- <h1>Coding Agent Leaderboard</h1>
26
- """
27
-
28
-
29
  def restart_space():
30
  API.restart_space(repo_id=REPO_ID)
31
 
@@ -36,6 +28,35 @@ def extract_body(s: str):
36
  return re.match(r'\[(.*?)\]', s).group(1)
37
 
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  def init_leaderboard(dataframe):
40
  if dataframe is None or dataframe.empty:
41
  raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -61,9 +82,9 @@ def init_leaderboard(dataframe):
61
  interactive=False,
62
  )
63
 
64
- demo = gr.Blocks()
65
  with demo:
66
- gr.HTML(HEAD)
67
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
68
 
69
  with gr.Tabs() as tabs:
 
18
  TOKEN = os.environ.get("HF_TOKEN")
19
  API = HfApi(token=TOKEN)
20
 
 
 
 
 
 
 
 
 
21
  def restart_space():
22
  API.restart_space(repo_id=REPO_ID)
23
 
 
28
  return re.match(r'\[(.*?)\]', s).group(1)
29
 
30
 
31
+ def build_header_html(df):
32
+ n_results = len(df)
33
+ n_models = df["model"].nunique()
34
+ n_harnesses = df["harness"].apply(lambda s: extract_body(s)).nunique()
35
+ n_datasets = df["dataset"].apply(lambda s: extract_body(s)).nunique()
36
+
37
+ return f"""
38
+ <base target="_blank">
39
+ <div style="padding: 1.5rem 0.5rem 1rem 0.5rem; text-align: left;">
40
+ <h1 style="margin: 0 0 0.5rem 0; font-size: 2rem;">
41
+ Coding Agent Leaderboard
42
+ </h1>
43
+ <div style="height: 4px; border-radius: 2px; background: linear-gradient(90deg, #84cc16, #f59e0b); margin-bottom: 0.75rem;"></div>
44
+ <p style="margin: 0 0 0.75rem 0; font-size: 1.1rem; opacity: 0.8;">
45
+ Compare coding agents across models, harnesses, and environments
46
+ </p>
47
+ <div style="display: flex; gap: 0.5rem; flex-wrap: wrap; font-size: 0.95rem; opacity: 0.7;">
48
+ <span style="font-weight: 600;">{n_results} Results</span>
49
+ <span>·</span>
50
+ <span style="font-weight: 600;">{n_models} Models</span>
51
+ <span>·</span>
52
+ <span style="font-weight: 600;">{n_harnesses} Harnesses</span>
53
+ <span>·</span>
54
+ <span style="font-weight: 600;">{n_datasets} Datasets</span>
55
+ </div>
56
+ </div>
57
+ """
58
+
59
+
60
  def init_leaderboard(dataframe):
61
  if dataframe is None or dataframe.empty:
62
  raise ValueError("Leaderboard DataFrame is empty or None.")
 
82
  interactive=False,
83
  )
84
 
85
+ demo = gr.Blocks(theme="citrus")
86
  with demo:
87
+ gr.HTML(build_header_html(LEADERBOARD_DF))
88
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
89
 
90
  with gr.Tabs() as tabs:
src/display/text_blocks.py CHANGED
@@ -1,15 +1,37 @@
1
  INTRODUCTION_TEXT = """
2
- Welcome to the Coding Agent Leaderboard!
 
3
  """
4
 
5
  LLM_BENCHMARKS_TEXT = """
6
- ## About
7
 
8
- Evaluate and compare Coding Agents.
9
 
10
- Coding Agent = Model + Harness + Skills.
 
 
11
 
12
- Visit our [GitHub repo](https://github.com/redhat-et/coding_agent_bench) for more details about the project.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  """
14
 
15
  CITATION_BUTTON_TEXT = "TBD"
 
1
  INTRODUCTION_TEXT = """
2
+ A **Coding Agent** is more than just a model - it's the combination of a **Model**, a **Harness** (the tool/framework driving the model), and **Skills** (the instructions that guide the agent's behavior).
3
+ This leaderboard tracks how these components work together, because the same model can perform very differently depending on the harness and skills it's paired with.
4
  """
5
 
6
  LLM_BENCHMARKS_TEXT = """
7
+ ## What is a Coding Agent?
8
 
9
+ A coding agent is a system that autonomously solves software engineering tasks - reading code, reasoning about bugs, and writing patches. Its performance depends on three components:
10
 
11
+ - **Model** - The underlying language model (e.g. Claude Opus 4.7, Qwen3.6-35B)
12
+ - **Harness** - The framework or tool that orchestrates the model's actions (e.g. Claude Code, OpenCode, Pi)
13
+ - **Skills** - The instructions guiding the agent's behavior
14
 
15
+ ## How to Read the Table
16
+
17
+ | Column | Description |
18
+ |--------|-------------|
19
+ | **Dataset** | The benchmark used for evaluation (e.g. SWE-bench Verified - 500 real GitHub issues) |
20
+ | **Harness** | The agent framework driving the model. Entries marked with `*` are **internal** - the provider ran the benchmark but did not publish the harness or environment |
21
+ | **Model** | The language model being evaluated |
22
+ | **Environment** | The benchmark runtime. Also marked `*` when internal |
23
+ | **Score** | Outcome of the benchmark, often the fraction of tasks solved correctly (higher is better) |
24
+ | **Precision** | Model weight format (e.g. bf16, fp4) - affects speed, memory footprint, and quality |
25
+
26
+ ## Key Concepts
27
+
28
+ - **FOSS vs Proprietary** - Filters let you compare fully open-source agents against proprietary ones. A FOSS model with a FOSS harness means anyone can reproduce the result
29
+ - **Skills** - Some harnesses augment the model with extra capabilities (tools, retrieval, etc.). Listed in the "skills" column when present
30
+ - **Internal results (`*`)** - Benchmarks run by the model provider where the harness and environment were not made public. These are useful reference points but are not independently reproducible
31
+
32
+ ## Learn More
33
+
34
+ Visit the [GitHub repo](https://github.com/redhat-et/coding_agent_bench) for details about the project, methodology, and how to submit your own results.
35
  """
36
 
37
  CITATION_BUTTON_TEXT = "TBD"