Spaces:

OpenHands
/

openhands-index

Running

openhands committed on Nov 25, 2025

Commit

6a0d1cb

1 Parent(s): 737a3f2

Remove unused AstaBench category files and update UI to OpenHands categories

Cleanup actions:
- Deleted 4 unused AstaBench category files: c_and_e.py, data_analysis.py, e2e.py, literature_understanding.py
- Updated ui_components.py tooltips to show OpenHands categories (Bug Fixing, Frontend Development, etc.) instead of AstaBench categories
- Changed the Overall Score tooltip from 'four category-level average scores' to 'five category-level average scores'
- Updated the _pretty_column_name docstring in leaderboard_transformer.py to reflect actual OpenHands field mappings
- Category list now: Bug Fixing, Frontend Development, App Creation, Test Generation, Information Gathering

This removes ~1KB of dead code and fixes user-facing tooltips to match actual categories.

Files changed (6)

c_and_e.py +0 -9
data_analysis.py +0 -8
e2e.py +0 -8
leaderboard_transformer.py +2 -2
literature_understanding.py +0 -8
ui_components.py +12 -10

c_and_e.py DELETED Viewed

@@ -1,9 +0,0 @@
-import gradio as gr
-from content import CODE_EXECUTION_DESCRIPTION
-from category_page_builder import build_category_page
-# Define the category for this page
-CATEGORY_NAME = "Code & Execution"
-def build_page():
-    build_category_page(CATEGORY_NAME, CODE_EXECUTION_DESCRIPTION)

data_analysis.py DELETED Viewed

@@ -1,8 +0,0 @@
-import gradio as gr
-from content import DATA_ANALYSIS_DESCRIPTION
-from category_page_builder import build_category_page
-# Define the category for this page
-CATEGORY_NAME = "Data Analysis"
-def build_page():
-    build_category_page(CATEGORY_NAME, DATA_ANALYSIS_DESCRIPTION)

e2e.py DELETED Viewed

@@ -1,8 +0,0 @@
-import gradio as gr
-from content import DISCOVERY_DESCRIPTION
-from category_page_builder import build_category_page
-# Define the category for this page
-CATEGORY_NAME = "End-to-End Discovery"
-def build_page():
-    build_category_page(CATEGORY_NAME, DISCOVERY_DESCRIPTION)

leaderboard_transformer.py CHANGED Viewed

@@ -96,8 +96,8 @@ def _pretty_column_name(raw_col: str) -> str:
     """
     Takes a raw column name from the DataFrame and returns a "pretty" version.
     Handles three cases:
-    1. Fixed names (e.g., 'User/organization' -> 'Submitter').
-    2. Dynamic names (e.g., 'ds1000_validation score' -> 'DS1000 Validation Score').
     3. Fallback for any other names.
     """
     # Case 1: Handle fixed, special-case mappings first.

     """
     Takes a raw column name from the DataFrame and returns a "pretty" version.
     Handles three cases:
+    1. Fixed names (e.g., 'Openhands version' -> 'OpenHands Version', 'Language model' -> 'Language Model').
+    2. Dynamic names (e.g., 'swe_bench_lite score' -> 'SWE-bench Lite Score').
     3. Fallback for any other names.
     """
     # Case 1: Handle fixed, special-case mappings first.

literature_understanding.py DELETED Viewed

@@ -1,8 +0,0 @@
-from content import LIT_DESCRIPTION
-from category_page_builder import build_category_page
-# Define the category for this page
-CATEGORY_NAME = "Literature Understanding"
-def build_page():
-    build_category_page(CATEGORY_NAME, LIT_DESCRIPTION)

ui_components.py CHANGED Viewed

@@ -149,20 +149,22 @@ def build_descriptions_tooltip_content(table) -> str:
         return """
             <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
             <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
-            <div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the four category-level average scores. Each category contributes equally.</div>
             <div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</div>
-            <div class="tooltip-description-item"><b>Literature Understanding Score:</b> Macro-average score across Literature Understanding benchmarks.</div>
-            <div class="tooltip-description-item"><b>Literature Understanding Cost:</b> Macro-average cost per problem (USD) across Literature Understanding benchmarks.</div>
-            <div class="tooltip-description-item"><b>Code Execution Score:</b> Macro-average score across Code & Execution benchmarks.</div>
-            <div class="tooltip-description-item"><b>Code Execution Cost:</b> Macro-average cost per problem (USD) across Code & Execution benchmarks.</div>
-            <div class="tooltip-description-item"><b>Data Analysis Score:</b> Macro-average score across Data Analysis benchmarks.</div>
-            <div class="tooltip-description-item"><b>Data Analysis Cost:</b> Macro-average cost per problem (USD) across Data Analysis benchmarks.</div>
-            <div class="tooltip-description-item"><b>End-to-End Discovery Score:</b> Macro-average score across End-to-End Discovery benchmarks.</div>
-            <div class="tooltip-description-item"><b>End-to-End Discovery Cost:</b> Macro-average cost per problem (USD)across End-to-End Discovery benchmarks.</div>
             <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
             <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
         """
-    elif table in ["Literature Understanding", "Code & Execution", "Data Analysis", "End-to-End Discovery"]:
         return f"""
             <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
             <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>

         return """
             <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
             <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
+            <div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the five category-level average scores. Each category contributes equally.</div>
             <div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</div>
+            <div class="tooltip-description-item"><b>Bug Fixing Score:</b> Macro-average score across Bug Fixing benchmarks.</div>
+            <div class="tooltip-description-item"><b>Bug Fixing Cost:</b> Macro-average cost per problem (USD) across Bug Fixing benchmarks.</div>
+            <div class="tooltip-description-item"><b>Frontend Development Score:</b> Macro-average score across Frontend Development benchmarks.</div>
+            <div class="tooltip-description-item"><b>Frontend Development Cost:</b> Macro-average cost per problem (USD) across Frontend Development benchmarks.</div>
+            <div class="tooltip-description-item"><b>App Creation Score:</b> Macro-average score across App Creation benchmarks.</div>
+            <div class="tooltip-description-item"><b>App Creation Cost:</b> Macro-average cost per problem (USD) across App Creation benchmarks.</div>
+            <div class="tooltip-description-item"><b>Test Generation Score:</b> Macro-average score across Test Generation benchmarks.</div>
+            <div class="tooltip-description-item"><b>Test Generation Cost:</b> Macro-average cost per problem (USD) across Test Generation benchmarks.</div>
+            <div class="tooltip-description-item"><b>Information Gathering Score:</b> Macro-average score across Information Gathering benchmarks.</div>
+            <div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per problem (USD) across Information Gathering benchmarks.</div>
             <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
             <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
         """
+    elif table in ["Bug Fixing", "Frontend Development", "App Creation", "Test Generation", "Information Gathering"]:
         return f"""
             <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
             <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>