openhands committed on
Commit
6a0d1cb
·
1 Parent(s): 737a3f2

Remove unused AstaBench category files and update UI to OpenHands categories

Browse files

Cleanup actions:
- Deleted 4 unused AstaBench category files: c_and_e.py, data_analysis.py, e2e.py, literature_understanding.py
- Updated ui_components.py tooltips to show OpenHands categories (Bug Fixing, Frontend Development, etc.) instead of AstaBench categories
- Changed the Overall Score tooltip from 'four category-level average scores' to 'five category-level average scores'
- Updated leaderboard_transformer.py comment to reflect actual OpenHands field mappings
- Category list now: Bug Fixing, Frontend Development, App Creation, Test Generation, Information Gathering

This removes ~1KB of dead code and fixes user-facing tooltips to match actual categories.

c_and_e.py DELETED
@@ -1,9 +0,0 @@
1
- import gradio as gr
2
- from content import CODE_EXECUTION_DESCRIPTION
3
- from category_page_builder import build_category_page
4
-
5
- # Define the category for this page
6
- CATEGORY_NAME = "Code & Execution"
7
-
8
- def build_page():
9
- build_category_page(CATEGORY_NAME, CODE_EXECUTION_DESCRIPTION)
 
 
 
 
 
 
 
 
 
 
data_analysis.py DELETED
@@ -1,8 +0,0 @@
1
- import gradio as gr
2
- from content import DATA_ANALYSIS_DESCRIPTION
3
- from category_page_builder import build_category_page
4
- # Define the category for this page
5
- CATEGORY_NAME = "Data Analysis"
6
-
7
- def build_page():
8
- build_category_page(CATEGORY_NAME, DATA_ANALYSIS_DESCRIPTION)
 
 
 
 
 
 
 
 
 
e2e.py DELETED
@@ -1,8 +0,0 @@
1
- import gradio as gr
2
- from content import DISCOVERY_DESCRIPTION
3
- from category_page_builder import build_category_page
4
- # Define the category for this page
5
- CATEGORY_NAME = "End-to-End Discovery"
6
-
7
- def build_page():
8
- build_category_page(CATEGORY_NAME, DISCOVERY_DESCRIPTION)
 
 
 
 
 
 
 
 
 
leaderboard_transformer.py CHANGED
@@ -96,8 +96,8 @@ def _pretty_column_name(raw_col: str) -> str:
96
  """
97
  Takes a raw column name from the DataFrame and returns a "pretty" version.
98
  Handles three cases:
99
- 1. Fixed names (e.g., 'User/organization' -> 'Submitter').
100
- 2. Dynamic names (e.g., 'ds1000_validation score' -> 'DS1000 Validation Score').
101
  3. Fallback for any other names.
102
  """
103
  # Case 1: Handle fixed, special-case mappings first.
 
96
  """
97
  Takes a raw column name from the DataFrame and returns a "pretty" version.
98
  Handles three cases:
99
+ 1. Fixed names (e.g., 'Openhands version' -> 'OpenHands Version', 'Language model' -> 'Language Model').
100
+ 2. Dynamic names (e.g., 'swe_bench_lite score' -> 'SWE-bench Lite Score').
101
  3. Fallback for any other names.
102
  """
103
  # Case 1: Handle fixed, special-case mappings first.
literature_understanding.py DELETED
@@ -1,8 +0,0 @@
1
- from content import LIT_DESCRIPTION
2
- from category_page_builder import build_category_page
3
-
4
- # Define the category for this page
5
- CATEGORY_NAME = "Literature Understanding"
6
-
7
- def build_page():
8
- build_category_page(CATEGORY_NAME, LIT_DESCRIPTION)
 
 
 
 
 
 
 
 
 
ui_components.py CHANGED
@@ -149,20 +149,22 @@ def build_descriptions_tooltip_content(table) -> str:
149
  return """
150
  <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
151
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
152
- <div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the four category-level average scores. Each category contributes equally.</div>
153
  <div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</div>
154
- <div class="tooltip-description-item"><b>Literature Understanding Score:</b> Macro-average score across Literature Understanding benchmarks.</div>
155
- <div class="tooltip-description-item"><b>Literature Understanding Cost:</b> Macro-average cost per problem (USD) across Literature Understanding benchmarks.</div>
156
- <div class="tooltip-description-item"><b>Code Execution Score:</b> Macro-average score across Code & Execution benchmarks.</div>
157
- <div class="tooltip-description-item"><b>Code Execution Cost:</b> Macro-average cost per problem (USD) across Code & Execution benchmarks.</div>
158
- <div class="tooltip-description-item"><b>Data Analysis Score:</b> Macro-average score across Data Analysis benchmarks.</div>
159
- <div class="tooltip-description-item"><b>Data Analysis Cost:</b> Macro-average cost per problem (USD) across Data Analysis benchmarks.</div>
160
- <div class="tooltip-description-item"><b>End-to-End Discovery Score:</b> Macro-average score across End-to-End Discovery benchmarks.</div>
161
- <div class="tooltip-description-item"><b>End-to-End Discovery Cost:</b> Macro-average cost per problem (USD)across End-to-End Discovery benchmarks.</div>
 
 
162
  <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
163
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
164
  """
165
- elif table in ["Literature Understanding", "Code & Execution", "Data Analysis", "End-to-End Discovery"]:
166
  return f"""
167
  <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
168
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
 
149
  return """
150
  <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
151
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
152
+ <div class="tooltip-description-item"><b>Overall Score:</b> Macro-average of the five category-level average scores. Each category contributes equally.</div>
153
  <div class="tooltip-description-item"><b>Overall Cost:</b> Macro-average cost per problem across all categories, in USD. Each category contributes equally.</div>
154
+ <div class="tooltip-description-item"><b>Bug Fixing Score:</b> Macro-average score across Bug Fixing benchmarks.</div>
155
+ <div class="tooltip-description-item"><b>Bug Fixing Cost:</b> Macro-average cost per problem (USD) across Bug Fixing benchmarks.</div>
156
+ <div class="tooltip-description-item"><b>Frontend Development Score:</b> Macro-average score across Frontend Development benchmarks.</div>
157
+ <div class="tooltip-description-item"><b>Frontend Development Cost:</b> Macro-average cost per problem (USD) across Frontend Development benchmarks.</div>
158
+ <div class="tooltip-description-item"><b>App Creation Score:</b> Macro-average score across App Creation benchmarks.</div>
159
+ <div class="tooltip-description-item"><b>App Creation Cost:</b> Macro-average cost per problem (USD) across App Creation benchmarks.</div>
160
+ <div class="tooltip-description-item"><b>Test Generation Score:</b> Macro-average score across Test Generation benchmarks.</div>
161
+ <div class="tooltip-description-item"><b>Test Generation Cost:</b> Macro-average cost per problem (USD) across Test Generation benchmarks.</div>
162
+ <div class="tooltip-description-item"><b>Information Gathering Score:</b> Macro-average score across Information Gathering benchmarks.</div>
163
+ <div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per problem (USD) across Information Gathering benchmarks.</div>
164
  <div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
165
  <div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
166
  """
167
+ elif table in ["Bug Fixing", "Frontend Development", "App Creation", "Test Generation", "Information Gathering"]:
168
  return f"""
169
  <div class="tooltip-description-item"><b>OpenHands Version:</b> Version of the OpenHands agent evaluated.</div>
170
  <div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>