openhands openhands committed on
Commit
5778893
·
1 Parent(s): 8cdce51

Update DeepSeek logo, tooltip format, and category names

Browse files

1. Replace DeepSeek logo with official SVG from LobeHub
2. Update tooltip format to: {lm_name} (SDK {version})
- Average Score, Average Cost, Openness
3. Rename categories:
- Bug Fixing -> Issue Resolution
- App Creation -> Greenfield
- Frontend Development -> Frontend
- Test Generation -> Testing
- Information Gathering (unchanged)
4. Update intro paragraph to mention all category names

Co-authored-by: openhands <openhands@all-hands.dev>

app.py CHANGED
@@ -219,16 +219,16 @@ demo = gr.Blocks(
219
  with demo.route("Home", "/home"):
220
  build_main_page()
221
 
222
- with demo.route("Bug Fixing", "/bug-fixing"):
223
  build_bug_fixing_page()
224
 
225
- with demo.route("App Creation", "/app-creation"):
226
  build_app_creation_page()
227
 
228
- with demo.route("Frontend Development", "/frontend-development"):
229
  build_frontend_page()
230
 
231
- with demo.route("Test Generation", "/test-generation"):
232
  build_test_generation_page()
233
 
234
  with demo.route("Information Gathering", "/information-gathering"):
 
219
  with demo.route("Home", "/home"):
220
  build_main_page()
221
 
222
+ with demo.route("Issue Resolution", "/bug-fixing"):
223
  build_bug_fixing_page()
224
 
225
+ with demo.route("Greenfield", "/app-creation"):
226
  build_app_creation_page()
227
 
228
+ with demo.route("Frontend", "/frontend-development"):
229
  build_frontend_page()
230
 
231
+ with demo.route("Testing", "/test-generation"):
232
  build_test_generation_page()
233
 
234
  with demo.route("Information Gathering", "/information-gathering"):
app_creation.py CHANGED
@@ -2,7 +2,7 @@ from content import APP_CREATION_DESCRIPTION
2
  from category_page_builder import build_category_page
3
 
4
  # Define the category for this page
5
- CATEGORY_NAME = "App Creation"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, APP_CREATION_DESCRIPTION)
 
2
  from category_page_builder import build_category_page
3
 
4
  # Define the category for this page
5
+ CATEGORY_NAME = "Greenfield"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, APP_CREATION_DESCRIPTION)
assets/logo-deepseek.svg CHANGED
bug_fixing.py CHANGED
@@ -2,7 +2,7 @@ from content import BUG_FIXING_DESCRIPTION
2
  from category_page_builder import build_category_page
3
 
4
  # Define the category for this page
5
- CATEGORY_NAME = "Bug Fixing"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, BUG_FIXING_DESCRIPTION)
 
2
  from category_page_builder import build_category_page
3
 
4
  # Define the category for this page
5
+ CATEGORY_NAME = "Issue Resolution"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, BUG_FIXING_DESCRIPTION)
category_page_builder.py CHANGED
@@ -4,10 +4,10 @@ import pandas as pd
4
  # Import our UI factories and the data loader
5
  from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar
6
  CATEGORY_DIAGRAM_MAP = {
7
- "Bug Fixing": "assets/bug-fixing.svg",
8
- "App Creation": "assets/app-creation.svg",
9
- "Frontend Development": "assets/frontend-development.svg",
10
- "Test Generation": "assets/test-generation.svg",
11
  "Information Gathering": "assets/information-gathering.svg",
12
  }
13
 
 
4
  # Import our UI factories and the data loader
5
  from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar
6
  CATEGORY_DIAGRAM_MAP = {
7
+ "Issue Resolution": "assets/bug-fixing.svg",
8
+ "Greenfield": "assets/app-creation.svg",
9
+ "Frontend": "assets/frontend-development.svg",
10
+ "Testing": "assets/test-generation.svg",
11
  "Information Gathering": "assets/information-gathering.svg",
12
  }
13
 
content.py CHANGED
@@ -25,7 +25,7 @@ INTRO_PARAGRAPH = """
25
  </p>
26
 
27
  <p>
28
- Our index aggregates results from multiple benchmarks spanning five categories, providing a single view of both <strong>performance</strong> and <strong>cost efficiency</strong>. This enables fair comparisons between agents, helping developers and researchers choose the right tool for their needs.
29
  </p>
30
 
31
  <p>
@@ -39,7 +39,7 @@ PARETO_DISCLAIMER = """
39
  Agent names shown in green are Pareto optimal, meaning they achieve the best performance for their cost.
40
  """
41
  BUG_FIXING_DESCRIPTION = """
42
- The **Bug Fixing** category evaluates how well agents can diagnose and fix bugs in real-world codebases. This tests their ability to understand GitHub issues, navigate repositories, identify root causes, and implement correct fixes.
43
  <br><br>
44
  The scores shown below reflect performance aggregated across two distinct benchmarks: SWE-bench (text-based bug reports) and SWE-bench-multimodal (issues with visual context like screenshots or diagrams).
45
  <br><br>
@@ -47,7 +47,7 @@ For detailed results, use the links above to explore individual benchmarks.
47
  <br>
48
  """
49
  APP_CREATION_DESCRIPTION = """
50
- The **App Creation** category in OpenHands Index evaluates an agent's ability to build complete applications from scratch based on natural language specifications. This tests whether agents can understand requirements, design architecture, write modular code, and create working applications.
51
  <br><br>
52
  This category currently includes Commit0, which challenges agents to implement complete features and applications by generating the initial commit for a project.
53
  <br><br>
@@ -55,13 +55,13 @@ For detailed results, use the links above to explore individual benchmark pages.
55
  <br>
56
  """
57
  FRONTEND_DEVELOPMENT_DESCRIPTION = """
58
- The **Frontend Development** category evaluates agents on their ability to build user interfaces and web applications. This tests skills in HTML, CSS, JavaScript frameworks, responsive design, and creating interactive user experiences.
59
  <br><br>
60
  This category includes Multi-SWE-bench, which challenges agents to work across multiple repositories and coordinate changes in complex web application architectures.
61
  <br>
62
  """
63
  TEST_GENERATION_DESCRIPTION = """
64
- The **Test Generation** category evaluates agents on their ability to create comprehensive test suites for existing code. This tests their understanding of code behavior, edge cases, and the ability to write effective unit tests, integration tests, and end-to-end tests.
65
  <br><br>
66
  This category includes SWT-bench (Software Testing Benchmark), which challenges agents to generate high-quality test cases that achieve good coverage and catch real bugs.
67
  <br>
 
25
  </p>
26
 
27
  <p>
28
+ Our index aggregates results from multiple benchmarks spanning five categories: <strong>Issue Resolution</strong> (fixing bugs), <strong>Greenfield</strong> (building new apps), <strong>Frontend</strong> (UI development), <strong>Testing</strong> (test generation), and <strong>Information Gathering</strong>. This provides a single view of both <strong>performance</strong> and <strong>cost efficiency</strong>, enabling fair comparisons between agents.
29
  </p>
30
 
31
  <p>
 
39
  Agent names shown in green are Pareto optimal, meaning they achieve the best performance for their cost.
40
  """
41
  BUG_FIXING_DESCRIPTION = """
42
+ The **Issue Resolution** category evaluates how well agents can diagnose and fix bugs in real-world codebases. This tests their ability to understand GitHub issues, navigate repositories, identify root causes, and implement correct fixes.
43
  <br><br>
44
  The scores shown below reflect performance aggregated across two distinct benchmarks: SWE-bench (text-based bug reports) and SWE-bench-multimodal (issues with visual context like screenshots or diagrams).
45
  <br><br>
 
47
  <br>
48
  """
49
  APP_CREATION_DESCRIPTION = """
50
+ The **Greenfield** category in OpenHands Index evaluates an agent's ability to build complete applications from scratch based on natural language specifications. This tests whether agents can understand requirements, design architecture, write modular code, and create working applications.
51
  <br><br>
52
  This category currently includes Commit0, which challenges agents to implement complete features and applications by generating the initial commit for a project.
53
  <br><br>
 
55
  <br>
56
  """
57
  FRONTEND_DEVELOPMENT_DESCRIPTION = """
58
+ The **Frontend** category evaluates agents on their ability to build user interfaces and web applications. This tests skills in HTML, CSS, JavaScript frameworks, responsive design, and creating interactive user experiences.
59
  <br><br>
60
  This category includes Multi-SWE-bench, which challenges agents to work across multiple repositories and coordinate changes in complex web application architectures.
61
  <br>
62
  """
63
  TEST_GENERATION_DESCRIPTION = """
64
+ The **Testing** category evaluates agents on their ability to create comprehensive test suites for existing code. This tests their understanding of code behavior, edge cases, and the ability to write effective unit tests, integration tests, and end-to-end tests.
65
  <br><br>
66
  This category includes SWT-bench (Software Testing Benchmark), which challenges agents to generate high-quality test cases that achieve good coverage and catch real bugs.
67
  <br>
frontend_development.py CHANGED
@@ -2,7 +2,7 @@ from content import FRONTEND_DEVELOPMENT_DESCRIPTION
2
  from category_page_builder import build_category_page
3
 
4
  # Define the category for this page
5
- CATEGORY_NAME = "Frontend Development"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, FRONTEND_DEVELOPMENT_DESCRIPTION)
 
2
  from category_page_builder import build_category_page
3
 
4
  # Define the category for this page
5
+ CATEGORY_NAME = "Frontend"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, FRONTEND_DEVELOPMENT_DESCRIPTION)
leaderboard_transformer.py CHANGED
@@ -341,7 +341,7 @@ class DataTransformer:
341
  # Calculate and add the "Categories Attempted" column
342
  if tag is None or tag == "Overall":
343
  def calculate_attempted(row):
344
- main_categories = ['Bug Fixing', 'Frontend Development', 'App Creation', 'Test Generation', 'Information Gathering']
345
  count = 0
346
  for category in main_categories:
347
  value = row.get(f"{category} Score")
@@ -503,36 +503,42 @@ def _plot_scatter_plotly(
503
  def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x):
504
  """
505
  Builds the complete HTML string for the plot's hover tooltip.
506
- Formats the 'Language Model' column as a bulleted list if multiple.
 
 
 
507
  """
508
  h_pad = " "
509
  parts = ["<br>"]
510
- parts.append(f"{h_pad}<b>{row[agent_col]}</b>{h_pad}<br>")
511
- parts.append(f"{h_pad}Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
512
- if divider_line_x > 0 and row[x_col] >= divider_line_x:
513
- # If no cost, display "Missing" for the cost.
514
- parts.append(f"{h_pad}{x_axis_label}: <b>Missing</b>{h_pad}<br>")
515
- else:
516
- parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
517
- parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}")
518
-
519
- # Add extra vertical space (line spacing) before the next section
520
- parts.append("<br>")
521
- # Clean and format Language Model column
522
- llm_base_value = row['Language Model']
523
  llm_base_value = clean_llm_base_list(llm_base_value)
524
  if isinstance(llm_base_value, list) and llm_base_value:
525
- parts.append(f"{h_pad}Language Model:{h_pad}<br>")
526
- # Create a list of padded bullet points
527
- list_items = [f"{h_pad} • <b>{item}</b>{h_pad}" for item in llm_base_value]
528
- # Join them with line breaks
529
- parts.append('<br>'.join(list_items))
530
  else:
531
- # Handle the non-list case with padding
532
- parts.append(f"{h_pad}Language Model: <b>{llm_base_value}</b>{h_pad}")
533
- # Add a final line break for bottom "padding"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
534
  parts.append("<br>")
535
- # Join all the parts together into the final HTML string
536
  return ''.join(parts)
537
  # Pre-generate hover text and shapes for each point
538
  data_plot['hover_text'] = data_plot.apply(
 
341
  # Calculate and add the "Categories Attempted" column
342
  if tag is None or tag == "Overall":
343
  def calculate_attempted(row):
344
+ main_categories = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
345
  count = 0
346
  for category in main_categories:
347
  value = row.get(f"{category} Score")
 
503
  def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x):
504
  """
505
  Builds the complete HTML string for the plot's hover tooltip.
506
+ Format: {lm_name} (SDK {version})
507
+ Average Score: {score}
508
+ Average Cost: {cost}
509
+ Openness: {openness}
510
  """
511
  h_pad = " "
512
  parts = ["<br>"]
513
+
514
+ # Get and clean the language model name
515
+ llm_base_value = row.get('Language Model', '')
 
 
 
 
 
 
 
 
 
 
516
  llm_base_value = clean_llm_base_list(llm_base_value)
517
  if isinstance(llm_base_value, list) and llm_base_value:
518
+ lm_name = llm_base_value[0]
 
 
 
 
519
  else:
520
+ lm_name = str(llm_base_value) if llm_base_value else 'Unknown'
521
+
522
+ # Get SDK version
523
+ sdk_version = row.get('SDK Version', row.get(agent_col, 'Unknown'))
524
+
525
+ # Title line: {lm_name} (SDK {version})
526
+ parts.append(f"{h_pad}<b>{lm_name}</b> (SDK {sdk_version}){h_pad}<br>")
527
+
528
+ # Average Score
529
+ parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
530
+
531
+ # Average Cost
532
+ if divider_line_x > 0 and row[x_col] >= divider_line_x:
533
+ parts.append(f"{h_pad}Average Cost: <b>Missing</b>{h_pad}<br>")
534
+ else:
535
+ parts.append(f"{h_pad}Average Cost: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
536
+
537
+ # Openness
538
+ parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}")
539
+
540
+ # Add final line break for padding
541
  parts.append("<br>")
 
542
  return ''.join(parts)
543
  # Pre-generate hover text and shapes for each point
544
  data_plot['hover_text'] = data_plot.apply(
simple_data_loader.py CHANGED
@@ -144,11 +144,11 @@ class SimpleLeaderboardViewer:
144
  if not config_has_mappings:
145
  print("[DATA_LOADER] No agenteval.json found, using fallback category mappings")
146
  fallback_mappings = {
147
- 'swe-bench': ['Bug Fixing'],
148
- 'swe-bench-multimodal': ['Frontend Development'],
149
- 'commit0': ['App Creation'],
150
- 'multi-swe-bench': ['Bug Fixing'],
151
- 'swt-bench': ['Test Generation'],
152
  'gaia': ['Information Gathering'],
153
  }
154
  for benchmark, categories in fallback_mappings.items():
@@ -244,7 +244,7 @@ class SimpleLeaderboardViewer:
244
  normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
245
 
246
  # All 5 categories for the leaderboard
247
- ALL_CATEGORIES = ['Bug Fixing', 'Frontend Development', 'App Creation', 'Test Generation', 'Information Gathering']
248
 
249
  record = {
250
  # Core agent info - use final display names
 
144
  if not config_has_mappings:
145
  print("[DATA_LOADER] No agenteval.json found, using fallback category mappings")
146
  fallback_mappings = {
147
+ 'swe-bench': ['Issue Resolution'],
148
+ 'swe-bench-multimodal': ['Frontend'],
149
+ 'commit0': ['Greenfield'],
150
+ 'multi-swe-bench': ['Issue Resolution'],
151
+ 'swt-bench': ['Testing'],
152
  'gaia': ['Information Gathering'],
153
  }
154
  for benchmark, categories in fallback_mappings.items():
 
244
  normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
245
 
246
  # All 5 categories for the leaderboard
247
+ ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
248
 
249
  record = {
250
  # Core agent info - use final display names
test_generation.py CHANGED
@@ -2,7 +2,7 @@ from content import TEST_GENERATION_DESCRIPTION
2
  from category_page_builder import build_category_page
3
 
4
  # Define the category for this page
5
- CATEGORY_NAME = "Test Generation"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, TEST_GENERATION_DESCRIPTION)
 
2
  from category_page_builder import build_category_page
3
 
4
  # Define the category for this page
5
+ CATEGORY_NAME = "Testing"
6
 
7
  def build_page():
8
  build_category_page(CATEGORY_NAME, TEST_GENERATION_DESCRIPTION)
ui_components.py CHANGED
@@ -223,20 +223,20 @@ def build_descriptions_tooltip_content(table) -> str:
223
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
224
  <div class="tooltip-description-item"><b>Average Score:</b> Sum of category scores divided by 5. Missing categories count as 0.</div>
225
  <div class="tooltip-description-item"><b>Average Cost:</b> Average cost per instance across all submitted benchmarks, in USD.</div>
226
- <div class="tooltip-description-item"><b>Bug Fixing Score:</b> Macro-average score across Bug Fixing benchmarks.</div>
227
- <div class="tooltip-description-item"><b>Bug Fixing Cost:</b> Macro-average cost per instance (USD) across Bug Fixing benchmarks.</div>
228
- <div class="tooltip-description-item"><b>Frontend Development Score:</b> Macro-average score across Frontend Development benchmarks.</div>
229
- <div class="tooltip-description-item"><b>Frontend Development Cost:</b> Macro-average cost per instance (USD) across Frontend Development benchmarks.</div>
230
- <div class="tooltip-description-item"><b>App Creation Score:</b> Macro-average score across App Creation benchmarks.</div>
231
- <div class="tooltip-description-item"><b>App Creation Cost:</b> Macro-average cost per instance (USD) across App Creation benchmarks.</div>
232
- <div class="tooltip-description-item"><b>Test Generation Score:</b> Macro-average score across Test Generation benchmarks.</div>
233
- <div class="tooltip-description-item"><b>Test Generation Cost:</b> Macro-average cost per instance (USD) across Test Generation benchmarks.</div>
234
  <div class="tooltip-description-item"><b>Information Gathering Score:</b> Macro-average score across Information Gathering benchmarks.</div>
235
  <div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
236
  <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
237
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
238
  """
239
- elif table in ["Bug Fixing", "Frontend Development", "App Creation", "Test Generation", "Information Gathering"]:
240
  return f"""
241
  <div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands agent evaluated.</div>
242
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
 
223
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
224
  <div class="tooltip-description-item"><b>Average Score:</b> Sum of category scores divided by 5. Missing categories count as 0.</div>
225
  <div class="tooltip-description-item"><b>Average Cost:</b> Average cost per instance across all submitted benchmarks, in USD.</div>
226
+ <div class="tooltip-description-item"><b>Issue Resolution Score:</b> Macro-average score across Issue Resolution benchmarks.</div>
227
+ <div class="tooltip-description-item"><b>Issue Resolution Cost:</b> Macro-average cost per instance (USD) across Issue Resolution benchmarks.</div>
228
+ <div class="tooltip-description-item"><b>Frontend Score:</b> Macro-average score across Frontend benchmarks.</div>
229
+ <div class="tooltip-description-item"><b>Frontend Cost:</b> Macro-average cost per instance (USD) across Frontend benchmarks.</div>
230
+ <div class="tooltip-description-item"><b>Greenfield Score:</b> Macro-average score across Greenfield benchmarks.</div>
231
+ <div class="tooltip-description-item"><b>Greenfield Cost:</b> Macro-average cost per instance (USD) across Greenfield benchmarks.</div>
232
+ <div class="tooltip-description-item"><b>Testing Score:</b> Macro-average score across Testing benchmarks.</div>
233
+ <div class="tooltip-description-item"><b>Testing Cost:</b> Macro-average cost per instance (USD) across Testing benchmarks.</div>
234
  <div class="tooltip-description-item"><b>Information Gathering Score:</b> Macro-average score across Information Gathering benchmarks.</div>
235
  <div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
236
  <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
237
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
238
  """
239
+ elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
240
  return f"""
241
  <div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands agent evaluated.</div>
242
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>