openhands openhands committed on
Commit
5778893
·
1 Parent(s): 8cdce51

Update DeepSeek logo, tooltip format, and category names

Browse files

1. Replace DeepSeek logo with official SVG from LobeHub
2. Update tooltip format to: {lm_name} (SDK {version})
- Average Score, Average Cost, Openness
3. Rename categories:
- Bug Fixing -> Issue Resolution
- App Creation -> Greenfield
- Frontend Development -> Frontend
- Test Generation -> Testing
- Information Gathering (unchanged)
4. Update intro paragraph to mention all category names

Co-authored-by: openhands <openhands@all-hands.dev>

app.py CHANGED
@@ -219,16 +219,16 @@ demo = gr.Blocks(
219
  with demo.route("Home", "/home"):
220
  build_main_page()
221
 
222
- with demo.route("Bug Fixing", "/bug-fixing"):
223
  build_bug_fixing_page()
224
 
225
- with demo.route("App Creation", "/app-creation"):
226
  build_app_creation_page()
227
 
228
- with demo.route("Frontend Development", "/frontend-development"):
229
  build_frontend_page()
230
 
231
- with demo.route("Test Generation", "/test-generation"):
232
  build_test_generation_page()
233
 
234
  with demo.route("Information Gathering", "/information-gathering"):
 
219
  with demo.route("Home", "/home"):
220
  build_main_page()
221
 
222
+ with demo.route("Issue Resolution", "/bug-fixing"):
223
  build_bug_fixing_page()
224
 
225
+ with demo.route("Greenfield", "/app-creation"):
226
  build_app_creation_page()
227
 
228
+ with demo.route("Frontend", "/frontend-development"):
229
  build_frontend_page()
230
 
231
+ with demo.route("Testing", "/test-generation"):
232
  build_test_generation_page()
233
 
234
  with demo.route("Information Gathering", "/information-gathering"):
app_creation.py CHANGED
@@ -2,7 +2,7 @@ from content import APP_CREATION_DESCRIPTION
2
  from category_page_builder import build_category_page
3
 
4
  # Define the category for this page
5
- CATEGORY_NAME = "App Creation"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, APP_CREATION_DESCRIPTION)
 
2
  from category_page_builder import build_category_page
3
 
4
  # Define the category for this page
5
+ CATEGORY_NAME = "Greenfield"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, APP_CREATION_DESCRIPTION)
assets/logo-deepseek.svg CHANGED
bug_fixing.py CHANGED
@@ -2,7 +2,7 @@ from content import BUG_FIXING_DESCRIPTION
2
  from category_page_builder import build_category_page
3
 
4
  # Define the category for this page
5
- CATEGORY_NAME = "Bug Fixing"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, BUG_FIXING_DESCRIPTION)
 
2
  from category_page_builder import build_category_page
3
 
4
  # Define the category for this page
5
+ CATEGORY_NAME = "Issue Resolution"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, BUG_FIXING_DESCRIPTION)
category_page_builder.py CHANGED
@@ -4,10 +4,10 @@ import pandas as pd
4
  # Import our UI factories and the data loader
5
  from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar
6
  CATEGORY_DIAGRAM_MAP = {
7
- "Bug Fixing": "assets/bug-fixing.svg",
8
- "App Creation": "assets/app-creation.svg",
9
- "Frontend Development": "assets/frontend-development.svg",
10
- "Test Generation": "assets/test-generation.svg",
11
  "Information Gathering": "assets/information-gathering.svg",
12
  }
13
 
 
4
  # Import our UI factories and the data loader
5
  from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar
6
  CATEGORY_DIAGRAM_MAP = {
7
+ "Issue Resolution": "assets/bug-fixing.svg",
8
+ "Greenfield": "assets/app-creation.svg",
9
+ "Frontend": "assets/frontend-development.svg",
10
+ "Testing": "assets/test-generation.svg",
11
  "Information Gathering": "assets/information-gathering.svg",
12
  }
13
 
content.py CHANGED
@@ -25,7 +25,7 @@ INTRO_PARAGRAPH = """
25
  </p>
26
 
27
  <p>
28
- Our index aggregates results from multiple benchmarks spanning five categories, providing a single view of both <strong>performance</strong> and <strong>cost efficiency</strong>. This enables fair comparisons between agents, helping developers and researchers choose the right tool for their needs.
29
  </p>
30
 
31
  <p>
@@ -39,7 +39,7 @@ PARETO_DISCLAIMER = """
39
  Agent names shown in green are Pareto optimal, meaning they achieve the best performance for their cost.
40
  """
41
  BUG_FIXING_DESCRIPTION = """
42
- The **Bug Fixing** category evaluates how well agents can diagnose and fix bugs in real-world codebases. This tests their ability to understand GitHub issues, navigate repositories, identify root causes, and implement correct fixes.
43
  <br><br>
44
  The scores shown below reflect performance aggregated across two distinct benchmarks: SWE-bench (text-based bug reports) and SWE-bench-multimodal (issues with visual context like screenshots or diagrams).
45
  <br><br>
@@ -47,7 +47,7 @@ For detailed results, use the links above to explore individual benchmarks.
47
  <br>
48
  """
49
  APP_CREATION_DESCRIPTION = """
50
- The **App Creation** category in OpenHands Index evaluates an agent's ability to build complete applications from scratch based on natural language specifications. This tests whether agents can understand requirements, design architecture, write modular code, and create working applications.
51
  <br><br>
52
  This category currently includes Commit0, which challenges agents to implement complete features and applications by generating the initial commit for a project.
53
  <br><br>
@@ -55,13 +55,13 @@ For detailed results, use the links above to explore individual benchmark pages.
55
  <br>
56
  """
57
  FRONTEND_DEVELOPMENT_DESCRIPTION = """
58
- The **Frontend Development** category evaluates agents on their ability to build user interfaces and web applications. This tests skills in HTML, CSS, JavaScript frameworks, responsive design, and creating interactive user experiences.
59
  <br><br>
60
  This category includes Multi-SWE-bench, which challenges agents to work across multiple repositories and coordinate changes in complex web application architectures.
61
  <br>
62
  """
63
  TEST_GENERATION_DESCRIPTION = """
64
- The **Test Generation** category evaluates agents on their ability to create comprehensive test suites for existing code. This tests their understanding of code behavior, edge cases, and the ability to write effective unit tests, integration tests, and end-to-end tests.
65
  <br><br>
66
  This category includes SWT-bench (Software Testing Benchmark), which challenges agents to generate high-quality test cases that achieve good coverage and catch real bugs.
67
  <br>
 
25
  </p>
26
 
27
  <p>
28
+ Our index aggregates results from multiple benchmarks spanning five categories: <strong>Issue Resolution</strong> (fixing bugs), <strong>Greenfield</strong> (building new apps), <strong>Frontend</strong> (UI development), <strong>Testing</strong> (test generation), and <strong>Information Gathering</strong>. This provides a single view of both <strong>performance</strong> and <strong>cost efficiency</strong>, enabling fair comparisons between agents.
29
  </p>
30
 
31
  <p>
 
39
  Agent names shown in green are Pareto optimal, meaning they achieve the best performance for their cost.
40
  """
41
  BUG_FIXING_DESCRIPTION = """
42
+ The **Issue Resolution** category evaluates how well agents can diagnose and fix bugs in real-world codebases. This tests their ability to understand GitHub issues, navigate repositories, identify root causes, and implement correct fixes.
43
  <br><br>
44
  The scores shown below reflect performance aggregated across two distinct benchmarks: SWE-bench (text-based bug reports) and SWE-bench-multimodal (issues with visual context like screenshots or diagrams).
45
  <br><br>
 
47
  <br>
48
  """
49
  APP_CREATION_DESCRIPTION = """
50
+ The **Greenfield** category in OpenHands Index evaluates an agent's ability to build complete applications from scratch based on natural language specifications. This tests whether agents can understand requirements, design architecture, write modular code, and create working applications.
51
  <br><br>
52
  This category currently includes Commit0, which challenges agents to implement complete features and applications by generating the initial commit for a project.
53
  <br><br>
 
55
  <br>
56
  """
57
  FRONTEND_DEVELOPMENT_DESCRIPTION = """
58
+ The **Frontend** category evaluates agents on their ability to build user interfaces and web applications. This tests skills in HTML, CSS, JavaScript frameworks, responsive design, and creating interactive user experiences.
59
  <br><br>
60
  This category includes Multi-SWE-bench, which challenges agents to work across multiple repositories and coordinate changes in complex web application architectures.
61
  <br>
62
  """
63
  TEST_GENERATION_DESCRIPTION = """
64
+ The **Testing** category evaluates agents on their ability to create comprehensive test suites for existing code. This tests their understanding of code behavior, edge cases, and the ability to write effective unit tests, integration tests, and end-to-end tests.
65
  <br><br>
66
  This category includes SWT-bench (Software Testing Benchmark), which challenges agents to generate high-quality test cases that achieve good coverage and catch real bugs.
67
  <br>
frontend_development.py CHANGED
@@ -2,7 +2,7 @@ from content import FRONTEND_DEVELOPMENT_DESCRIPTION
2
  from category_page_builder import build_category_page
3
 
4
  # Define the category for this page
5
- CATEGORY_NAME = "Frontend Development"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, FRONTEND_DEVELOPMENT_DESCRIPTION)
 
2
  from category_page_builder import build_category_page
3
 
4
  # Define the category for this page
5
+ CATEGORY_NAME = "Frontend"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, FRONTEND_DEVELOPMENT_DESCRIPTION)
leaderboard_transformer.py CHANGED
@@ -341,7 +341,7 @@ class DataTransformer:
341
  # Calculate and add the "Categories Attempted" column
342
  if tag is None or tag == "Overall":
343
  def calculate_attempted(row):
344
- main_categories = ['Bug Fixing', 'Frontend Development', 'App Creation', 'Test Generation', 'Information Gathering']
345
  count = 0
346
  for category in main_categories:
347
  value = row.get(f"{category} Score")
@@ -503,36 +503,42 @@ def _plot_scatter_plotly(
503
  def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x):
504
  """
505
  Builds the complete HTML string for the plot's hover tooltip.
506
- Formats the 'Language Model' column as a bulleted list if multiple.
 
 
 
507
  """
508
  h_pad = " "
509
  parts = ["<br>"]
510
- parts.append(f"{h_pad}<b>{row[agent_col]}</b>{h_pad}<br>")
511
- parts.append(f"{h_pad}Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
512
- if divider_line_x > 0 and row[x_col] >= divider_line_x:
513
- # If no cost, display "Missing" for the cost.
514
- parts.append(f"{h_pad}{x_axis_label}: <b>Missing</b>{h_pad}<br>")
515
- else:
516
- parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
517
- parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}")
518
-
519
- # Add extra vertical space (line spacing) before the next section
520
- parts.append("<br>")
521
- # Clean and format Language Model column
522
- llm_base_value = row['Language Model']
523
  llm_base_value = clean_llm_base_list(llm_base_value)
524
  if isinstance(llm_base_value, list) and llm_base_value:
525
- parts.append(f"{h_pad}Language Model:{h_pad}<br>")
526
- # Create a list of padded bullet points
527
- list_items = [f"{h_pad} • <b>{item}</b>{h_pad}" for item in llm_base_value]
528
- # Join them with line breaks
529
- parts.append('<br>'.join(list_items))
530
  else:
531
- # Handle the non-list case with padding
532
- parts.append(f"{h_pad}Language Model: <b>{llm_base_value}</b>{h_pad}")
533
- # Add a final line break for bottom "padding"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
534
  parts.append("<br>")
535
- # Join all the parts together into the final HTML string
536
  return ''.join(parts)
537
  # Pre-generate hover text and shapes for each point
538
  data_plot['hover_text'] = data_plot.apply(
 
341
  # Calculate and add the "Categories Attempted" column
342
  if tag is None or tag == "Overall":
343
  def calculate_attempted(row):
344
+ main_categories = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
345
  count = 0
346
  for category in main_categories:
347
  value = row.get(f"{category} Score")
 
503
  def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x):
504
  """
505
  Builds the complete HTML string for the plot's hover tooltip.
506
+ Format: {lm_name} (SDK {version})
507
+ Average Score: {score}
508
+ Average Cost: {cost}
509
+ Openness: {openness}
510
  """
511
  h_pad = " "
512
  parts = ["<br>"]
513
+
514
+ # Get and clean the language model name
515
+ llm_base_value = row.get('Language Model', '')
 
 
 
 
 
 
 
 
 
 
516
  llm_base_value = clean_llm_base_list(llm_base_value)
517
  if isinstance(llm_base_value, list) and llm_base_value:
518
+ lm_name = llm_base_value[0]
 
 
 
 
519
  else:
520
+ lm_name = str(llm_base_value) if llm_base_value else 'Unknown'
521
+
522
+ # Get SDK version
523
+ sdk_version = row.get('SDK Version', row.get(agent_col, 'Unknown'))
524
+
525
+ # Title line: {lm_name} (SDK {version})
526
+ parts.append(f"{h_pad}<b>{lm_name}</b> (SDK {sdk_version}){h_pad}<br>")
527
+
528
+ # Average Score
529
+ parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
530
+
531
+ # Average Cost
532
+ if divider_line_x > 0 and row[x_col] >= divider_line_x:
533
+ parts.append(f"{h_pad}Average Cost: <b>Missing</b>{h_pad}<br>")
534
+ else:
535
+ parts.append(f"{h_pad}Average Cost: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
536
+
537
+ # Openness
538
+ parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}")
539
+
540
+ # Add final line break for padding
541
  parts.append("<br>")
 
542
  return ''.join(parts)
543
  # Pre-generate hover text and shapes for each point
544
  data_plot['hover_text'] = data_plot.apply(
simple_data_loader.py CHANGED
@@ -144,11 +144,11 @@ class SimpleLeaderboardViewer:
144
  if not config_has_mappings:
145
  print("[DATA_LOADER] No agenteval.json found, using fallback category mappings")
146
  fallback_mappings = {
147
- 'swe-bench': ['Bug Fixing'],
148
- 'swe-bench-multimodal': ['Frontend Development'],
149
- 'commit0': ['App Creation'],
150
- 'multi-swe-bench': ['Bug Fixing'],
151
- 'swt-bench': ['Test Generation'],
152
  'gaia': ['Information Gathering'],
153
  }
154
  for benchmark, categories in fallback_mappings.items():
@@ -244,7 +244,7 @@ class SimpleLeaderboardViewer:
244
  normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
245
 
246
  # All 5 categories for the leaderboard
247
- ALL_CATEGORIES = ['Bug Fixing', 'Frontend Development', 'App Creation', 'Test Generation', 'Information Gathering']
248
 
249
  record = {
250
  # Core agent info - use final display names
 
144
  if not config_has_mappings:
145
  print("[DATA_LOADER] No agenteval.json found, using fallback category mappings")
146
  fallback_mappings = {
147
+ 'swe-bench': ['Issue Resolution'],
148
+ 'swe-bench-multimodal': ['Frontend'],
149
+ 'commit0': ['Greenfield'],
150
+ 'multi-swe-bench': ['Issue Resolution'],
151
+ 'swt-bench': ['Testing'],
152
  'gaia': ['Information Gathering'],
153
  }
154
  for benchmark, categories in fallback_mappings.items():
 
244
  normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
245
 
246
  # All 5 categories for the leaderboard
247
+ ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
248
 
249
  record = {
250
  # Core agent info - use final display names
test_generation.py CHANGED
@@ -2,7 +2,7 @@ from content import TEST_GENERATION_DESCRIPTION
2
  from category_page_builder import build_category_page
3
 
4
  # Define the category for this page
5
- CATEGORY_NAME = "Test Generation"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, TEST_GENERATION_DESCRIPTION)
 
2
  from category_page_builder import build_category_page
3
 
4
  # Define the category for this page
5
+ CATEGORY_NAME = "Testing"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, TEST_GENERATION_DESCRIPTION)
ui_components.py CHANGED
@@ -223,20 +223,20 @@ def build_descriptions_tooltip_content(table) -> str:
223
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
224
  <div class="tooltip-description-item"><b>Average Score:</b> Sum of category scores divided by 5. Missing categories count as 0.</div>
225
  <div class="tooltip-description-item"><b>Average Cost:</b> Average cost per instance across all submitted benchmarks, in USD.</div>
226
- <div class="tooltip-description-item"><b>Bug Fixing Score:</b> Macro-average score across Bug Fixing benchmarks.</div>
227
- <div class="tooltip-description-item"><b>Bug Fixing Cost:</b> Macro-average cost per instance (USD) across Bug Fixing benchmarks.</div>
228
- <div class="tooltip-description-item"><b>Frontend Development Score:</b> Macro-average score across Frontend Development benchmarks.</div>
229
- <div class="tooltip-description-item"><b>Frontend Development Cost:</b> Macro-average cost per instance (USD) across Frontend Development benchmarks.</div>
230
- <div class="tooltip-description-item"><b>App Creation Score:</b> Macro-average score across App Creation benchmarks.</div>
231
- <div class="tooltip-description-item"><b>App Creation Cost:</b> Macro-average cost per instance (USD) across App Creation benchmarks.</div>
232
- <div class="tooltip-description-item"><b>Test Generation Score:</b> Macro-average score across Test Generation benchmarks.</div>
233
- <div class="tooltip-description-item"><b>Test Generation Cost:</b> Macro-average cost per instance (USD) across Test Generation benchmarks.</div>
234
  <div class="tooltip-description-item"><b>Information Gathering Score:</b> Macro-average score across Information Gathering benchmarks.</div>
235
  <div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
236
  <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
237
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
238
  """
239
- elif table in ["Bug Fixing", "Frontend Development", "App Creation", "Test Generation", "Information Gathering"]:
240
  return f"""
241
  <div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands agent evaluated.</div>
242
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
 
223
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
224
  <div class="tooltip-description-item"><b>Average Score:</b> Sum of category scores divided by 5. Missing categories count as 0.</div>
225
  <div class="tooltip-description-item"><b>Average Cost:</b> Average cost per instance across all submitted benchmarks, in USD.</div>
226
+ <div class="tooltip-description-item"><b>Issue Resolution Score:</b> Macro-average score across Issue Resolution benchmarks.</div>
227
+ <div class="tooltip-description-item"><b>Issue Resolution Cost:</b> Macro-average cost per instance (USD) across Issue Resolution benchmarks.</div>
228
+ <div class="tooltip-description-item"><b>Frontend Score:</b> Macro-average score across Frontend benchmarks.</div>
229
+ <div class="tooltip-description-item"><b>Frontend Cost:</b> Macro-average cost per instance (USD) across Frontend benchmarks.</div>
230
+ <div class="tooltip-description-item"><b>Greenfield Score:</b> Macro-average score across Greenfield benchmarks.</div>
231
+ <div class="tooltip-description-item"><b>Greenfield Cost:</b> Macro-average cost per instance (USD) across Greenfield benchmarks.</div>
232
+ <div class="tooltip-description-item"><b>Testing Score:</b> Macro-average score across Testing benchmarks.</div>
233
+ <div class="tooltip-description-item"><b>Testing Cost:</b> Macro-average cost per instance (USD) across Testing benchmarks.</div>
234
  <div class="tooltip-description-item"><b>Information Gathering Score:</b> Macro-average score across Information Gathering benchmarks.</div>
235
  <div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
236
  <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
237
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
238
  """
239
+ elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
240
  return f"""
241
  <div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands agent evaluated.</div>
242
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>