Spaces:
Running
Running
openhands
committed on
Commit
·
6a0d1cb
1
Parent(s):
737a3f2
Remove unused AstaBench category files and update UI to OpenHands categories
Browse files
Cleanup actions:
- Deleted 4 unused AstaBench category files: c_and_e.py, data_analysis.py, e2e.py, literature_understanding.py
- Updated ui_components.py tooltips to show OpenHands categories (Bug Fixing, Frontend Development, etc.) instead of AstaBench categories
- Changed tooltip from 'four categories' to 'five categories'
- Updated leaderboard_transformer.py comment to reflect actual OpenHands field mappings
- Category list now: Bug Fixing, Frontend Development, App Creation, Test Generation, Information Gathering
This removes ~1KB of dead code and fixes user-facing tooltips to match actual categories.
- c_and_e.py +0 -9
- data_analysis.py +0 -8
- e2e.py +0 -8
- leaderboard_transformer.py +2 -2
- literature_understanding.py +0 -8
- ui_components.py +12 -10
c_and_e.py
DELETED
|
@@ -1,9 +0,0 @@
|
|
| 1 |
-
import gradio as gr
|
| 2 |
-
from content import CODE_EXECUTION_DESCRIPTION
|
| 3 |
-
from category_page_builder import build_category_page
|
| 4 |
-
|
| 5 |
-
# Define the category for this page
|
| 6 |
-
CATEGORY_NAME = "Code & Execution"
|
| 7 |
-
|
| 8 |
-
def build_page():
|
| 9 |
-
build_category_page(CATEGORY_NAME, CODE_EXECUTION_DESCRIPTION)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data_analysis.py
DELETED
|
@@ -1,8 +0,0 @@
|
|
| 1 |
-
import gradio as gr
|
| 2 |
-
from content import DATA_ANALYSIS_DESCRIPTION
|
| 3 |
-
from category_page_builder import build_category_page
|
| 4 |
-
# Define the category for this page
|
| 5 |
-
CATEGORY_NAME = "Data Analysis"
|
| 6 |
-
|
| 7 |
-
def build_page():
|
| 8 |
-
build_category_page(CATEGORY_NAME, DATA_ANALYSIS_DESCRIPTION)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
e2e.py
DELETED
|
@@ -1,8 +0,0 @@
|
|
| 1 |
-
import gradio as gr
|
| 2 |
-
from content import DISCOVERY_DESCRIPTION
|
| 3 |
-
from category_page_builder import build_category_page
|
| 4 |
-
# Define the category for this page
|
| 5 |
-
CATEGORY_NAME = "End-to-End Discovery"
|
| 6 |
-
|
| 7 |
-
def build_page():
|
| 8 |
-
build_category_page(CATEGORY_NAME, DISCOVERY_DESCRIPTION)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
leaderboard_transformer.py
CHANGED
|
@@ -96,8 +96,8 @@ def _pretty_column_name(raw_col: str) -> str:
|
|
| 96 |
"""
|
| 97 |
Takes a raw column name from the DataFrame and returns a "pretty" version.
|
| 98 |
Handles three cases:
|
| 99 |
-
1. Fixed names (e.g., '
|
| 100 |
-
2. Dynamic names (e.g., '
|
| 101 |
3. Fallback for any other names.
|
| 102 |
"""
|
| 103 |
# Case 1: Handle fixed, special-case mappings first.
|
|
|
|
| 96 |
"""
|
| 97 |
Takes a raw column name from the DataFrame and returns a "pretty" version.
|
| 98 |
Handles three cases:
|
| 99 |
+
1. Fixed names (e.g., 'Openhands version' -> 'OpenHands Version', 'Language model' -> 'Language Model').
|
| 100 |
+
2. Dynamic names (e.g., 'swe_bench_lite score' -> 'SWE-bench Lite Score').
|
| 101 |
3. Fallback for any other names.
|
| 102 |
"""
|
| 103 |
# Case 1: Handle fixed, special-case mappings first.
|
literature_understanding.py
DELETED
|
@@ -1,8 +0,0 @@
|
|
| 1 |
-
from content import LIT_DESCRIPTION
|
| 2 |
-
from category_page_builder import build_category_page
|
| 3 |
-
|
| 4 |
-
# Define the category for this page
|
| 5 |
-
CATEGORY_NAME = "Literature Understanding"
|
| 6 |
-
|
| 7 |
-
def build_page():
|
| 8 |
-
build_category_page(CATEGORY_NAME, LIT_DESCRIPTION)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ui_components.py
CHANGED
|
@@ -149,20 +149,22 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 149 |
return """
|
| 150 |
<div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
|
| 151 |
<div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
|
| 152 |
-
<div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the
|
| 153 |
<div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</div>
|
| 154 |
-
<div class="tooltip-description-item"><b>
|
| 155 |
-
<div class="tooltip-description-item"><b>
|
| 156 |
-
<div class="tooltip-description-item"><b>
|
| 157 |
-
<div class="tooltip-description-item"><b>
|
| 158 |
-
<div class="tooltip-description-item"><b>
|
| 159 |
-
<div class="tooltip-description-item"><b>
|
| 160 |
-
<div class="tooltip-description-item"><b>
|
| 161 |
-
<div class="tooltip-description-item"><b>
|
|
|
|
|
|
|
| 162 |
<div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
|
| 163 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 164 |
"""
|
| 165 |
-
elif table in ["
|
| 166 |
return f"""
|
| 167 |
<div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
|
| 168 |
<div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
|
|
|
|
| 149 |
return """
|
| 150 |
<div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
|
| 151 |
<div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
|
| 152 |
+
<div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the five category-level average scores. Each category contributes equally.</div>
|
| 153 |
<div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</div>
|
| 154 |
+
<div class="tooltip-description-item"><b>Bug Fixing Score:</b> Macro-average score across Bug Fixing benchmarks.</div>
|
| 155 |
+
<div class="tooltip-description-item"><b>Bug Fixing Cost:</b> Macro-average cost per problem (USD) across Bug Fixing benchmarks.</div>
|
| 156 |
+
<div class="tooltip-description-item"><b>Frontend Development Score:</b> Macro-average score across Frontend Development benchmarks.</div>
|
| 157 |
+
<div class="tooltip-description-item"><b>Frontend Development Cost:</b> Macro-average cost per problem (USD) across Frontend Development benchmarks.</div>
|
| 158 |
+
<div class="tooltip-description-item"><b>App Creation Score:</b> Macro-average score across App Creation benchmarks.</div>
|
| 159 |
+
<div class="tooltip-description-item"><b>App Creation Cost:</b> Macro-average cost per problem (USD) across App Creation benchmarks.</div>
|
| 160 |
+
<div class="tooltip-description-item"><b>Test Generation Score:</b> Macro-average score across Test Generation benchmarks.</div>
|
| 161 |
+
<div class="tooltip-description-item"><b>Test Generation Cost:</b> Macro-average cost per problem (USD) across Test Generation benchmarks.</div>
|
| 162 |
+
<div class="tooltip-description-item"><b>Information Gathering Score:</b> Macro-average score across Information Gathering benchmarks.</div>
|
| 163 |
+
<div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per problem (USD) across Information Gathering benchmarks.</div>
|
| 164 |
<div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
|
| 165 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 166 |
"""
|
| 167 |
+
elif table in ["Bug Fixing", "Frontend Development", "App Creation", "Test Generation", "Information Gathering"]:
|
| 168 |
return f"""
|
| 169 |
<div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
|
| 170 |
<div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
|