Spaces:
Running
Running
openhands
openhands
committed on
Commit
·
5778893
1
Parent(s):
8cdce51
Update DeepSeek logo, tooltip format, and category names
Browse files
1. Replace DeepSeek logo with official SVG from LobeHub
2. Update tooltip format to: {lm_name} (SDK {version})
- Average Score, Average Cost, Openness
3. Rename categories:
- Bug Fixing -> Issue Resolution
- App Creation -> Greenfield
- Frontend Development -> Frontend
- Test Generation -> Testing
- Information Gathering (unchanged)
4. Update intro paragraph to mention all category names
Co-authored-by: openhands <openhands@all-hands.dev>
- app.py +4 -4
- app_creation.py +1 -1
- assets/logo-deepseek.svg +1 -4
- bug_fixing.py +1 -1
- category_page_builder.py +4 -4
- content.py +5 -5
- frontend_development.py +1 -1
- leaderboard_transformer.py +30 -24
- simple_data_loader.py +6 -6
- test_generation.py +1 -1
- ui_components.py +9 -9
app.py
CHANGED
|
@@ -219,16 +219,16 @@ demo = gr.Blocks(
|
|
| 219 |
with demo.route("Home", "/home"):
|
| 220 |
build_main_page()
|
| 221 |
|
| 222 |
-
with demo.route("Bug Fixing", "/bug-fixing"):
|
| 223 |
build_bug_fixing_page()
|
| 224 |
|
| 225 |
-
with demo.route("App Creation", "/app-creation"):
|
| 226 |
build_app_creation_page()
|
| 227 |
|
| 228 |
-
with demo.route("Frontend Development", "/frontend-development"):
|
| 229 |
build_frontend_page()
|
| 230 |
|
| 231 |
-
with demo.route("Test Generation", "/test-generation"):
|
| 232 |
build_test_generation_page()
|
| 233 |
|
| 234 |
with demo.route("Information Gathering", "/information-gathering"):
|
|
|
|
| 219 |
with demo.route("Home", "/home"):
|
| 220 |
build_main_page()
|
| 221 |
|
| 222 |
+
with demo.route("Issue Resolution", "/bug-fixing"):
|
| 223 |
build_bug_fixing_page()
|
| 224 |
|
| 225 |
+
with demo.route("Greenfield", "/app-creation"):
|
| 226 |
build_app_creation_page()
|
| 227 |
|
| 228 |
+
with demo.route("Frontend", "/frontend-development"):
|
| 229 |
build_frontend_page()
|
| 230 |
|
| 231 |
+
with demo.route("Testing", "/test-generation"):
|
| 232 |
build_test_generation_page()
|
| 233 |
|
| 234 |
with demo.route("Information Gathering", "/information-gathering"):
|
app_creation.py
CHANGED
|
@@ -2,7 +2,7 @@ from content import APP_CREATION_DESCRIPTION
|
|
| 2 |
from category_page_builder import build_category_page
|
| 3 |
|
| 4 |
# Define the category for this page
|
| 5 |
-
CATEGORY_NAME = "App Creation"
|
| 6 |
|
| 7 |
def build_page():
|
| 8 |
build_category_page(CATEGORY_NAME, APP_CREATION_DESCRIPTION)
|
|
|
|
| 2 |
from category_page_builder import build_category_page
|
| 3 |
|
| 4 |
# Define the category for this page
|
| 5 |
+
CATEGORY_NAME = "Greenfield"
|
| 6 |
|
| 7 |
def build_page():
|
| 8 |
build_category_page(CATEGORY_NAME, APP_CREATION_DESCRIPTION)
|
assets/logo-deepseek.svg
CHANGED
|
|
|
|
bug_fixing.py
CHANGED
|
@@ -2,7 +2,7 @@ from content import BUG_FIXING_DESCRIPTION
|
|
| 2 |
from category_page_builder import build_category_page
|
| 3 |
|
| 4 |
# Define the category for this page
|
| 5 |
-
CATEGORY_NAME = "Bug Fixing"
|
| 6 |
|
| 7 |
def build_page():
|
| 8 |
build_category_page(CATEGORY_NAME, BUG_FIXING_DESCRIPTION)
|
|
|
|
| 2 |
from category_page_builder import build_category_page
|
| 3 |
|
| 4 |
# Define the category for this page
|
| 5 |
+
CATEGORY_NAME = "Issue Resolution"
|
| 6 |
|
| 7 |
def build_page():
|
| 8 |
build_category_page(CATEGORY_NAME, BUG_FIXING_DESCRIPTION)
|
category_page_builder.py
CHANGED
|
@@ -4,10 +4,10 @@ import pandas as pd
|
|
| 4 |
# Import our UI factories and the data loader
|
| 5 |
from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar
|
| 6 |
CATEGORY_DIAGRAM_MAP = {
|
| 7 |
-
"Bug Fixing": "assets/bug-fixing.svg",
|
| 8 |
-
"App Creation": "assets/app-creation.svg",
|
| 9 |
-
"Frontend Development": "assets/frontend-development.svg",
|
| 10 |
-
"Test Generation": "assets/test-generation.svg",
|
| 11 |
"Information Gathering": "assets/information-gathering.svg",
|
| 12 |
}
|
| 13 |
|
|
|
|
| 4 |
# Import our UI factories and the data loader
|
| 5 |
from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar
|
| 6 |
CATEGORY_DIAGRAM_MAP = {
|
| 7 |
+
"Issue Resolution": "assets/bug-fixing.svg",
|
| 8 |
+
"Greenfield": "assets/app-creation.svg",
|
| 9 |
+
"Frontend": "assets/frontend-development.svg",
|
| 10 |
+
"Testing": "assets/test-generation.svg",
|
| 11 |
"Information Gathering": "assets/information-gathering.svg",
|
| 12 |
}
|
| 13 |
|
content.py
CHANGED
|
@@ -25,7 +25,7 @@ INTRO_PARAGRAPH = """
|
|
| 25 |
</p>
|
| 26 |
|
| 27 |
<p>
|
| 28 |
-
Our index aggregates results from multiple benchmarks spanning five categories,
|
| 29 |
</p>
|
| 30 |
|
| 31 |
<p>
|
|
@@ -39,7 +39,7 @@ PARETO_DISCLAIMER = """
|
|
| 39 |
Agents names that are green are Pareto optimal, meaning they achieve the best performance for their cost.
|
| 40 |
"""
|
| 41 |
BUG_FIXING_DESCRIPTION = """
|
| 42 |
-
The **
|
| 43 |
<br><br>
|
| 44 |
The scores shown below reflect performance aggregated across two distinct benchmarks: SWE-bench (text-based bug reports) and SWE-bench-multimodal (issues with visual context like screenshots or diagrams).
|
| 45 |
<br><br>
|
|
@@ -47,7 +47,7 @@ For detailed results, use the links above to explore individual benchmarks.
|
|
| 47 |
<br>
|
| 48 |
"""
|
| 49 |
APP_CREATION_DESCRIPTION = """
|
| 50 |
-
The **
|
| 51 |
<br><br>
|
| 52 |
This category currently includes Commit0, which challenges agents to implement complete features and applications by generating the initial commit for a project.
|
| 53 |
<br><br>
|
|
@@ -55,13 +55,13 @@ For detailed results, use the links above to explore individual benchmark pages.
|
|
| 55 |
<br>
|
| 56 |
"""
|
| 57 |
FRONTEND_DEVELOPMENT_DESCRIPTION = """
|
| 58 |
-
The **Frontend
|
| 59 |
<br><br>
|
| 60 |
This category includes Multi-SWE-bench, which challenges agents to work across multiple repositories and coordinate changes in complex web application architectures.
|
| 61 |
<br>
|
| 62 |
"""
|
| 63 |
TEST_GENERATION_DESCRIPTION = """
|
| 64 |
-
The **
|
| 65 |
<br><br>
|
| 66 |
This category includes SWT-bench (Software Testing Benchmark), which challenges agents to generate high-quality test cases that achieve good coverage and catch real bugs.
|
| 67 |
<br>
|
|
|
|
| 25 |
</p>
|
| 26 |
|
| 27 |
<p>
|
| 28 |
+
Our index aggregates results from multiple benchmarks spanning five categories: <strong>Issue Resolution</strong> (fixing bugs), <strong>Greenfield</strong> (building new apps), <strong>Frontend</strong> (UI development), <strong>Testing</strong> (test generation), and <strong>Information Gathering</strong>. This provides a single view of both <strong>performance</strong> and <strong>cost efficiency</strong>, enabling fair comparisons between agents.
|
| 29 |
</p>
|
| 30 |
|
| 31 |
<p>
|
|
|
|
| 39 |
Agents names that are green are Pareto optimal, meaning they achieve the best performance for their cost.
|
| 40 |
"""
|
| 41 |
BUG_FIXING_DESCRIPTION = """
|
| 42 |
+
The **Issue Resolution** category evaluates how well agents can diagnose and fix bugs in real-world codebases. This tests their ability to understand GitHub issues, navigate repositories, identify root causes, and implement correct fixes.
|
| 43 |
<br><br>
|
| 44 |
The scores shown below reflect performance aggregated across two distinct benchmarks: SWE-bench (text-based bug reports) and SWE-bench-multimodal (issues with visual context like screenshots or diagrams).
|
| 45 |
<br><br>
|
|
|
|
| 47 |
<br>
|
| 48 |
"""
|
| 49 |
APP_CREATION_DESCRIPTION = """
|
| 50 |
+
The **Greenfield** category in OpenHands Index evaluates an agent's ability to build complete applications from scratch based on natural language specifications. This tests whether agents can understand requirements, design architecture, write modular code, and create working applications.
|
| 51 |
<br><br>
|
| 52 |
This category currently includes Commit0, which challenges agents to implement complete features and applications by generating the initial commit for a project.
|
| 53 |
<br><br>
|
|
|
|
| 55 |
<br>
|
| 56 |
"""
|
| 57 |
FRONTEND_DEVELOPMENT_DESCRIPTION = """
|
| 58 |
+
The **Frontend** category evaluates agents on their ability to build user interfaces and web applications. This tests skills in HTML, CSS, JavaScript frameworks, responsive design, and creating interactive user experiences.
|
| 59 |
<br><br>
|
| 60 |
This category includes Multi-SWE-bench, which challenges agents to work across multiple repositories and coordinate changes in complex web application architectures.
|
| 61 |
<br>
|
| 62 |
"""
|
| 63 |
TEST_GENERATION_DESCRIPTION = """
|
| 64 |
+
The **Testing** category evaluates agents on their ability to create comprehensive test suites for existing code. This tests their understanding of code behavior, edge cases, and the ability to write effective unit tests, integration tests, and end-to-end tests.
|
| 65 |
<br><br>
|
| 66 |
This category includes SWT-bench (Software Testing Benchmark), which challenges agents to generate high-quality test cases that achieve good coverage and catch real bugs.
|
| 67 |
<br>
|
frontend_development.py
CHANGED
|
@@ -2,7 +2,7 @@ from content import FRONTEND_DEVELOPMENT_DESCRIPTION
|
|
| 2 |
from category_page_builder import build_category_page
|
| 3 |
|
| 4 |
# Define the category for this page
|
| 5 |
-
CATEGORY_NAME = "Frontend Development"
|
| 6 |
|
| 7 |
def build_page():
|
| 8 |
build_category_page(CATEGORY_NAME, FRONTEND_DEVELOPMENT_DESCRIPTION)
|
|
|
|
| 2 |
from category_page_builder import build_category_page
|
| 3 |
|
| 4 |
# Define the category for this page
|
| 5 |
+
CATEGORY_NAME = "Frontend"
|
| 6 |
|
| 7 |
def build_page():
|
| 8 |
build_category_page(CATEGORY_NAME, FRONTEND_DEVELOPMENT_DESCRIPTION)
|
leaderboard_transformer.py
CHANGED
|
@@ -341,7 +341,7 @@ class DataTransformer:
|
|
| 341 |
# Calculated and add "Categories Attempted" column
|
| 342 |
if tag is None or tag == "Overall":
|
| 343 |
def calculate_attempted(row):
|
| 344 |
-
main_categories = ['Bug Fixing', 'Frontend Development', 'App Creation', 'Test Generation', 'Information Gathering']
|
| 345 |
count = 0
|
| 346 |
for category in main_categories:
|
| 347 |
value = row.get(f"{category} Score")
|
|
@@ -503,36 +503,42 @@ def _plot_scatter_plotly(
|
|
| 503 |
def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x):
|
| 504 |
"""
|
| 505 |
Builds the complete HTML string for the plot's hover tooltip.
|
| 506 |
-
|
|
|
|
|
|
|
|
|
|
| 507 |
"""
|
| 508 |
h_pad = " "
|
| 509 |
parts = ["<br>"]
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
# If no cost, display "Missing" for the cost.
|
| 514 |
-
parts.append(f"{h_pad}{x_axis_label}: <b>Missing</b>{h_pad}<br>")
|
| 515 |
-
else:
|
| 516 |
-
parts.append(f"{h_pad}{x_axis_label}: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
|
| 517 |
-
parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}")
|
| 518 |
-
|
| 519 |
-
# Add extra vertical space (line spacing) before the next section
|
| 520 |
-
parts.append("<br>")
|
| 521 |
-
# Clean and format Language Model column
|
| 522 |
-
llm_base_value = row['Language Model']
|
| 523 |
llm_base_value = clean_llm_base_list(llm_base_value)
|
| 524 |
if isinstance(llm_base_value, list) and llm_base_value:
|
| 525 |
-
|
| 526 |
-
# Create a list of padded bullet points
|
| 527 |
-
list_items = [f"{h_pad} • <b>{item}</b>{h_pad}" for item in llm_base_value]
|
| 528 |
-
# Join them with line breaks
|
| 529 |
-
parts.append('<br>'.join(list_items))
|
| 530 |
else:
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 534 |
parts.append("<br>")
|
| 535 |
-
# Join all the parts together into the final HTML string
|
| 536 |
return ''.join(parts)
|
| 537 |
# Pre-generate hover text and shapes for each point
|
| 538 |
data_plot['hover_text'] = data_plot.apply(
|
|
|
|
| 341 |
# Calculated and add "Categories Attempted" column
|
| 342 |
if tag is None or tag == "Overall":
|
| 343 |
def calculate_attempted(row):
|
| 344 |
+
main_categories = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
|
| 345 |
count = 0
|
| 346 |
for category in main_categories:
|
| 347 |
value = row.get(f"{category} Score")
|
|
|
|
| 503 |
def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x):
|
| 504 |
"""
|
| 505 |
Builds the complete HTML string for the plot's hover tooltip.
|
| 506 |
+
Format: {lm_name} (SDK {version})
|
| 507 |
+
Average Score: {score}
|
| 508 |
+
Average Cost: {cost}
|
| 509 |
+
Openness: {openness}
|
| 510 |
"""
|
| 511 |
h_pad = " "
|
| 512 |
parts = ["<br>"]
|
| 513 |
+
|
| 514 |
+
# Get and clean the language model name
|
| 515 |
+
llm_base_value = row.get('Language Model', '')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
llm_base_value = clean_llm_base_list(llm_base_value)
|
| 517 |
if isinstance(llm_base_value, list) and llm_base_value:
|
| 518 |
+
lm_name = llm_base_value[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 519 |
else:
|
| 520 |
+
lm_name = str(llm_base_value) if llm_base_value else 'Unknown'
|
| 521 |
+
|
| 522 |
+
# Get SDK version
|
| 523 |
+
sdk_version = row.get('SDK Version', row.get(agent_col, 'Unknown'))
|
| 524 |
+
|
| 525 |
+
# Title line: {lm_name} (SDK {version})
|
| 526 |
+
parts.append(f"{h_pad}<b>{lm_name}</b> (SDK {sdk_version}){h_pad}<br>")
|
| 527 |
+
|
| 528 |
+
# Average Score
|
| 529 |
+
parts.append(f"{h_pad}Average Score: <b>{row[y_col]:.3f}</b>{h_pad}<br>")
|
| 530 |
+
|
| 531 |
+
# Average Cost
|
| 532 |
+
if divider_line_x > 0 and row[x_col] >= divider_line_x:
|
| 533 |
+
parts.append(f"{h_pad}Average Cost: <b>Missing</b>{h_pad}<br>")
|
| 534 |
+
else:
|
| 535 |
+
parts.append(f"{h_pad}Average Cost: <b>${row[x_col]:.2f}</b>{h_pad}<br>")
|
| 536 |
+
|
| 537 |
+
# Openness
|
| 538 |
+
parts.append(f"{h_pad}Openness: <b>{row['Openness']}</b>{h_pad}")
|
| 539 |
+
|
| 540 |
+
# Add final line break for padding
|
| 541 |
parts.append("<br>")
|
|
|
|
| 542 |
return ''.join(parts)
|
| 543 |
# Pre-generate hover text and shapes for each point
|
| 544 |
data_plot['hover_text'] = data_plot.apply(
|
simple_data_loader.py
CHANGED
|
@@ -144,11 +144,11 @@ class SimpleLeaderboardViewer:
|
|
| 144 |
if not config_has_mappings:
|
| 145 |
print("[DATA_LOADER] No agenteval.json found, using fallback category mappings")
|
| 146 |
fallback_mappings = {
|
| 147 |
-
'swe-bench': ['Bug Fixing'],
|
| 148 |
-
'swe-bench-multimodal': ['Frontend Development'],
|
| 149 |
-
'commit0': ['App Creation'],
|
| 150 |
-
'multi-swe-bench': ['Bug Fixing'],
|
| 151 |
-
'swt-bench': ['Test Generation'],
|
| 152 |
'gaia': ['Information Gathering'],
|
| 153 |
}
|
| 154 |
for benchmark, categories in fallback_mappings.items():
|
|
@@ -244,7 +244,7 @@ class SimpleLeaderboardViewer:
|
|
| 244 |
normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
|
| 245 |
|
| 246 |
# All 5 categories for the leaderboard
|
| 247 |
-
ALL_CATEGORIES = ['Bug Fixing', 'Frontend Development', 'App Creation', 'Test Generation', 'Information Gathering']
|
| 248 |
|
| 249 |
record = {
|
| 250 |
# Core agent info - use final display names
|
|
|
|
| 144 |
if not config_has_mappings:
|
| 145 |
print("[DATA_LOADER] No agenteval.json found, using fallback category mappings")
|
| 146 |
fallback_mappings = {
|
| 147 |
+
'swe-bench': ['Issue Resolution'],
|
| 148 |
+
'swe-bench-multimodal': ['Frontend'],
|
| 149 |
+
'commit0': ['Greenfield'],
|
| 150 |
+
'multi-swe-bench': ['Issue Resolution'],
|
| 151 |
+
'swt-bench': ['Testing'],
|
| 152 |
'gaia': ['Information Gathering'],
|
| 153 |
}
|
| 154 |
for benchmark, categories in fallback_mappings.items():
|
|
|
|
| 244 |
normalized_openness = OPENNESS_MAPPING.get(raw_openness, raw_openness)
|
| 245 |
|
| 246 |
# All 5 categories for the leaderboard
|
| 247 |
+
ALL_CATEGORIES = ['Issue Resolution', 'Frontend', 'Greenfield', 'Testing', 'Information Gathering']
|
| 248 |
|
| 249 |
record = {
|
| 250 |
# Core agent info - use final display names
|
test_generation.py
CHANGED
|
@@ -2,7 +2,7 @@ from content import TEST_GENERATION_DESCRIPTION
|
|
| 2 |
from category_page_builder import build_category_page
|
| 3 |
|
| 4 |
# Define the category for this page
|
| 5 |
-
CATEGORY_NAME = "Test Generation"
|
| 6 |
|
| 7 |
def build_page():
|
| 8 |
build_category_page(CATEGORY_NAME, TEST_GENERATION_DESCRIPTION)
|
|
|
|
| 2 |
from category_page_builder import build_category_page
|
| 3 |
|
| 4 |
# Define the category for this page
|
| 5 |
+
CATEGORY_NAME = "Testing"
|
| 6 |
|
| 7 |
def build_page():
|
| 8 |
build_category_page(CATEGORY_NAME, TEST_GENERATION_DESCRIPTION)
|
ui_components.py
CHANGED
|
@@ -223,20 +223,20 @@ def build_descriptions_tooltip_content(table) -> str:
|
|
| 223 |
<div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
|
| 224 |
<div class="tooltip-description-item"><b>Average Score:</b> Sum of category scores divided by 5. Missing categories count as 0.</div>
|
| 225 |
<div class="tooltip-description-item"><b>Average Cost:</b> Average cost per instance across all submitted benchmarks, in USD.</div>
|
| 226 |
-
<div class="tooltip-description-item"><b>Bug Fixing Score:</b> Macro-average score across Bug Fixing benchmarks.</div>
|
| 227 |
-
<div class="tooltip-description-item"><b>Bug Fixing Cost:</b> Macro-average cost per instance (USD) across Bug Fixing benchmarks.</div>
|
| 228 |
-
<div class="tooltip-description-item"><b>Frontend Development Score:</b> Macro-average score across Frontend Development benchmarks.</div>
|
| 229 |
-
<div class="tooltip-description-item"><b>Frontend Development Cost:</b> Macro-average cost per instance (USD) across Frontend Development benchmarks.</div>
|
| 230 |
-
<div class="tooltip-description-item"><b>App Creation Score:</b> Macro-average score across App Creation benchmarks.</div>
|
| 231 |
-
<div class="tooltip-description-item"><b>App Creation Cost:</b> Macro-average cost per instance (USD) across App Creation benchmarks.</div>
|
| 232 |
-
<div class="tooltip-description-item"><b>Test Generation Score:</b> Macro-average score across Test Generation benchmarks.</div>
|
| 233 |
-
<div class="tooltip-description-item"><b>Test Generation Cost:</b> Macro-average cost per instance (USD) across Test Generation benchmarks.</div>
|
| 234 |
<div class="tooltip-description-item"><b>Information Gathering Score:</b> Macro-average score across Information Gathering benchmarks.</div>
|
| 235 |
<div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
|
| 236 |
<div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
|
| 237 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 238 |
"""
|
| 239 |
-
elif table in ["Bug Fixing", "Frontend Development", "App Creation", "Test Generation", "Information Gathering"]:
|
| 240 |
return f"""
|
| 241 |
<div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands agent evaluated.</div>
|
| 242 |
<div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
|
|
|
|
| 223 |
<div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
|
| 224 |
<div class="tooltip-description-item"><b>Average Score:</b> Sum of category scores divided by 5. Missing categories count as 0.</div>
|
| 225 |
<div class="tooltip-description-item"><b>Average Cost:</b> Average cost per instance across all submitted benchmarks, in USD.</div>
|
| 226 |
+
<div class="tooltip-description-item"><b>Issue Resolution Score:</b> Macro-average score across Issue Resolution benchmarks.</div>
|
| 227 |
+
<div class="tooltip-description-item"><b>Issue Resolution Cost:</b> Macro-average cost per instance (USD) across Issue Resolution benchmarks.</div>
|
| 228 |
+
<div class="tooltip-description-item"><b>Frontend Score:</b> Macro-average score across Frontend benchmarks.</div>
|
| 229 |
+
<div class="tooltip-description-item"><b>Frontend Cost:</b> Macro-average cost per instance (USD) across Frontend benchmarks.</div>
|
| 230 |
+
<div class="tooltip-description-item"><b>Greenfield Score:</b> Macro-average score across Greenfield benchmarks.</div>
|
| 231 |
+
<div class="tooltip-description-item"><b>Greenfield Cost:</b> Macro-average cost per instance (USD) across Greenfield benchmarks.</div>
|
| 232 |
+
<div class="tooltip-description-item"><b>Testing Score:</b> Macro-average score across Testing benchmarks.</div>
|
| 233 |
+
<div class="tooltip-description-item"><b>Testing Cost:</b> Macro-average cost per instance (USD) across Testing benchmarks.</div>
|
| 234 |
<div class="tooltip-description-item"><b>Information Gathering Score:</b> Macro-average score across Information Gathering benchmarks.</div>
|
| 235 |
<div class="tooltip-description-item"><b>Information Gathering Cost:</b> Macro-average cost per instance (USD) across Information Gathering benchmarks.</div>
|
| 236 |
<div class="tooltip-description-item"><b>Categories Attempted:</b> Number of core categories with at least one benchmark attempted (out of 5).</div>
|
| 237 |
<div class="tooltip-description-item"><b>Logs:</b> View evaluation run logs (e.g., outputs, traces).</div>
|
| 238 |
"""
|
| 239 |
+
elif table in ["Issue Resolution", "Frontend", "Greenfield", "Testing", "Information Gathering"]:
|
| 240 |
return f"""
|
| 241 |
<div class="tooltip-description-item"><b>SDK Version:</b> Version of the OpenHands agent evaluated.</div>
|
| 242 |
<div class="tooltip-description-item"><b>Language Model:</b> Language model(s) used by the agent. Hover over ⓘ to view all.</div>
|