diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ + diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml new file mode 100644 index 0000000000000000000000000000000000000000..46d9a15b36f98427deecace6b3c26563ab3c2a7c --- /dev/null +++ b/.github/workflows/integration-tests.yml @@ -0,0 +1,44 @@ +name: Integration Tests + +on: + pull_request: + branches: [ main ] + +jobs: + integration-test: + runs-on: ubuntu-latest + + environment: + name: testing + + steps: + - uses: actions/checkout@v4 + with: + lfs: true + + - name: Set up Python 3.11 + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Cache pip dependencies + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements*.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install -r requirements-dev.txt + + - name: Run integration tests + run: | + pytest tests/integration/ -v --tb=short + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_CONFIG: continuous-integration + IS_INTERNAL: true diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..ceb3dca0e8fd117379ca9ce86969862c0599dae2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,184 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
+# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# PyPI configuration file +.pypirc + +# Vim files +*.swp +*.swo +*.un~ + +# Misc +.DS_Store +.mise.toml +.vscode/ +.gradio/ + +.claude diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..1271152271d9b12ba65e5bffe18284f89afda7f9 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,44 @@ +FROM python:3.10-slim + + +# (0) Install SSH client tools (and git, if you're pulling via SSH) +RUN apt-get update && \ + apt-get install -y --no-install-recommends openssh-client git && \ + rm -rf /var/lib/apt/lists/* + +# The two following lines are requirements for the Dev Mode to be functional +# Learn more about the Dev Mode at https://huggingface.co/dev-mode-explorers +RUN useradd -m -u 1000 user +WORKDIR /app + + +# (2) Copy dependencies manifest +COPY --chown=user requirements.txt requirements.txt + +# (3) Install dependencies, mounting SSH keys and optional HTTPS creds +RUN --mount=type=secret,id=AGENTEVAL_DEPLOY_KEY,mode=0400,required=true \ + --mount=type=secret,id=ASTABENCH_DEPLOY_KEY,mode=0400,required=true \ + mkdir -p /root/.ssh && chmod 700 /root/.ssh && \ + cat /run/secrets/AGENTEVAL_DEPLOY_KEY > /root/.ssh/id_ed25519 && chmod 600 /root/.ssh/id_ed25519 && \ + cat /run/secrets/ASTABENCH_DEPLOY_KEY > /root/.ssh/id_astabench && chmod 600 /root/.ssh/id_astabench && \ + ssh-keyscan github.com >> /root/.ssh/known_hosts && \ + printf 'Host github.com\n User git\n IdentityFile /root/.ssh/id_ed25519\n IdentityFile /root/.ssh/id_astabench\n StrictHostKeyChecking no\n' >> /root/.ssh/config && \ + # rewrite all GitHub HTTPS URLs to SSH so nested deps install via SSH + git config --global url."ssh://git@github.com/".insteadOf "https://github.com/" && \ + pip install --no-cache-dir --upgrade -r requirements.txt + +# (4) Copy in your Gradio app code +COPY . . 
+RUN mkdir -p /home/user/data && chown -R user:user /home/user/data + +# Make the app treat this as non‑debug (so DATA_DIR=/home/user/data) +ENV system=spaces + +# (5) Switch to a non-root user +USER user + +# (6) Expose Gradio’s default port +EXPOSE 7860 + +# (7) Launch your app +CMD ["python", "app.py"] diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ad315e6277e62b5f88765b88e82ede41143f9e06 --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +--- +title: OpenHands Index +emoji: 🤖 +colorFrom: blue +colorTo: purple +sdk: docker +app_file: app.py +pinned: true +license: apache-2.0 +hf_oauth: true +app_port: 7860 +failure_strategy: none +tags: + - leaderboard +--- + +## OpenHands Index Leaderboard + +This leaderboard tracks agent performance across multiple software engineering and AI benchmarks. + +## Development +The leaderboard is built using the [HuggingFace Datasets](https://huggingface.co/docs/datasets/index) library, which provides a convenient way to manage and query datasets. +Results are sourced from the [OpenHands Index Results](https://github.com/OpenHands/openhands-index-results) repository. + +To run the leaderboard locally first make sure to set this env variable: +```bash +export IS_INTERNAL=true +``` +You can then start it up with the following command: +```bash +python app.py +``` +This will start a local server that you can access in your web browser at `http://localhost:7860`. + +## Hugging Face Integration +The repo backs two Hugging Face leaderboard spaces: +- https://huggingface.co/spaces/allenai/asta-bench-internal-leaderboard +- https://huggingface.co/spaces/allenai/asta-bench-leaderboard + +Please follow the steps below to push changes to the leaderboards on Hugging Face. + +Before pushing, make sure to merge your changes to the `main` branch of this repository. (following the standard GitHub workflow of creating a branch, making changes, and then merging it back to `main`). + +Before pushing for the first time, you'll need to add the Hugging Face remote repositories if you haven't done so already. You can do this by running the following commands: + +```bash +git remote add huggingface https://huggingface.co/spaces/allenai/asta-bench-internal-leaderboard +git remote add huggingface-public https://huggingface.co/spaces/allenai/asta-bench-leaderboard +``` +You can verify that the remotes have been added by running: + +```bash +git remote -v +``` +Then, to push the changes to the Hugging Face leaderboards, you can use the following commands: + +```bash +git push huggingface main:main +git push huggingface-public main:main +``` diff --git a/about.py b/about.py new file mode 100644 index 0000000000000000000000000000000000000000..d34b415cc625a2dc4e151f27d91bc80ffdf11fbb --- /dev/null +++ b/about.py @@ -0,0 +1,144 @@ +import gradio as gr + + +def build_page(): + with gr.Column(elem_id="about-page-content-wrapper"): + # --- Section 1: About AstaBench --- + gr.HTML( + """ +

About AstaBench

+

+ AstaBench is a novel evaluation framework that poses a challenging new test for AI agents: the first benchmark challenge to evaluate agents’ scientific abilities across a broad spectrum of research skills, including literature understanding, data analysis, planning, tool use, coding, and search. Asta’s set of standard tools makes it easy to build general-purpose science agents and to compare their performance in an apples-to-apples manner.

+ """ + ) + gr.Markdown("---", elem_classes="divider-line") + + # --- Section 2: Why AstaBench? --- + gr.HTML( + """ +

Why AstaBench?

+

+ Most current benchmarks test agentic AI or isolated aspects of scientific reasoning, but rarely evaluate agentic behavior rigorously or capture the full skill set that scientific research requires. Agents can appear effective despite inconsistent results and high compute use, often outperforming others simply by consuming more resources. Advancing scientific AI requires evaluations that emphasize reproducibility, efficiency, and the real complexity of research.

+
+

+ AstaBench fills this gap: it is an agent evaluation framework and a suite of open benchmarks for evaluating scientific AI assistants on core scientific tasks that require novel reasoning. AstaBench helps scientists identify which agents best support their needs through task-relevant leaderboards, while giving AI developers a standard execution environment and tools to test their agents’ scientific reasoning capabilities against well-known baselines from the literature, including both open and closed LLM foundation models.

+ """ + ) + gr.Markdown("---", elem_classes="divider-line") + + # --- Section 3: What Does AstaBench Include? --- + gr.HTML( + """ +

What Does AstaBench Include?

+

+ AstaBench includes a rigorous agent evaluation framework and a suite of 11 benchmarks comprising over 2,400 problems, organized into four core categories:

+ +

+ Plus: a large suite of integrated agents and leaderboards with results from extensive evaluation of agents and models. +

+

+ 🔍 Learn more in the AstaBench technical blog post +

+ """ + ) + gr.Markdown("---", elem_classes="divider-line") + + # --- Section 4: Understanding the Leaderboards --- + gr.HTML( + """ +

Understanding the Leaderboards

+

+ The AstaBench Overall Leaderboard provides a high-level view of overall agent performance and efficiency: +

+ +

+ Each category leaderboard provides: +

+ + """ + ) + gr.Markdown("---", elem_classes="divider-line") + + # --- Section 5: Scoring & Aggregation --- + gr.HTML( + """ +

Scoring & Aggregation

+

+ AstaBench encourages careful, transparent evaluation. Here's how we handle scoring, cost, and partial results: +

+ +

Scores

+ + +

Cost

+ + +

+ Note: Cost values reflect pricing and infrastructure conditions at a fixed point in time. We recognize that compute costs may change over time and vary by provider, and are actively working on methods to keep costs up-to-date and normalized for fair comparisons. +

+ +

Coverage

+ + +

+ These design choices ensure fair comparison while penalizing cherry-picking and omissions. +
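+ To make the aggregation idea above concrete, here is a minimal, purely illustrative sketch. It is not the leaderboard's actual implementation (which is not part of this diff); the function names, the equal-weight averaging, and the zero-fill for missing benchmark results are assumptions used only to show how category and overall scores could be combined while penalizing missing coverage.

```python
# Purely illustrative sketch of score aggregation with a coverage penalty.
# The leaderboard's real aggregation logic is not shown in this diff; the
# equal-weight averaging and the zero-fill for missing results are assumptions.
from statistics import mean


def category_score(benchmark_scores: dict[str, float | None]) -> float:
    """Macro-average one category; a missing benchmark result counts as 0.0,
    so omitting hard benchmarks cannot improve the aggregate."""
    return mean(s if s is not None else 0.0 for s in benchmark_scores.values())


def overall_score(categories: dict[str, dict[str, float | None]]) -> float:
    """Average the per-category scores so each category carries equal weight."""
    return mean(category_score(scores) for scores in categories.values())


# Hypothetical results for two categories; None marks a benchmark the agent skipped.
results = {
    "Literature Understanding": {"ScholarQA-CS2": 0.62, "LitQA2-FullText": 0.55},
    "Code & Execution": {"DS-1000": 0.48, "CORE-Bench-Hard": None},
}
print(f"{overall_score(results):.4f}")  # 0.4125
```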

+ """ + ) + gr.Markdown("---", elem_classes="divider-line") + + # --- Section 6: Learn More --- + gr.HTML( + """ +
+

Learn More

+ +
+ """ + ) + # Floating feedback button + floating_feedback_button_html = """ +
+ Have feedback? +
+ """ + gr.HTML(floating_feedback_button_html) \ No newline at end of file diff --git a/aliases.py b/aliases.py new file mode 100644 index 0000000000000000000000000000000000000000..554dc9de02927a898ce0b0b5f8d466dbbffab0a8 --- /dev/null +++ b/aliases.py @@ -0,0 +1,23 @@ +from agenteval.config import ( + OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS as CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS, + OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS as CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS, + OPENNESS_CLOSED_API_AVAILABLE as CANONICAL_OPENNESS_CLOSED_API_AVAILABLE, + OPENNESS_CLOSED_UI_ONLY as CANONICAL_OPENNESS_CLOSED_UI_ONLY, + TOOL_USAGE_STANDARD as CANONICAL_TOOL_USAGE_STANDARD, + TOOL_USAGE_CUSTOM_INTERFACE as CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE, + TOOL_USAGE_FULLY_CUSTOM as CANONICAL_TOOL_USAGE_FULLY_CUSTOM, +) + + +OPENNESS_ALIASES = { + CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: {"Open Source + Open Weights"}, + CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: {"Open Source"}, + CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: {"API Available"}, + CANONICAL_OPENNESS_CLOSED_UI_ONLY: {"Closed"} +} + +TOOL_USAGE_ALIASES = { + CANONICAL_TOOL_USAGE_STANDARD: {}, + CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: {"Custom with Standard Search"}, + CANONICAL_TOOL_USAGE_FULLY_CUSTOM: {"Fully Custom"} +} diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..25f13c1687d2e1fb59e2a08b7560523da16eb36b --- /dev/null +++ b/app.py @@ -0,0 +1,282 @@ +# app.py +import logging + +logging.basicConfig(level=logging.WARNING) + +import gradio as gr +import urllib.parse + +from apscheduler.schedulers.background import BackgroundScheduler +from huggingface_hub import HfApi + +from config import LEADERBOARD_PATH, LOCAL_DEBUG +from content import css +from main_page import build_page as build_main_page +from literature_understanding import build_page as build_lit_page +from c_and_e import build_page as build_c_and_e_page +from data_analysis import build_page as build_data_analysis_page +from e2e import build_page as build_e2e_page +from submission import build_page as build_submission_page +from about import build_page as build_about_page + +api = HfApi() +LOGO_PATH = "assets/logo.svg" +# JavaScripts +scroll_script = """ + +""" +redirect_script = """ + +""" +tooltip_script = """ + +""" +redirect_submission_on_close_script = """ + +""" +# --- Theme Definition --- +theme = gr.themes.Base( + primary_hue=gr.themes.Color(c100="#CFF5E8", c200="#B7EFDD", c300="#9FEAD1", c400="#87E5C5", c50="#E7FAF3", c500="#6FE0BA", c600="#57DBAF", c700="#3FD5A3", c800="#27D09C", c900="#0FCB8C", c950="#0fcb8c"), + secondary_hue=gr.themes.Color(c100="#FCDCEB", c200="#FBCBE1", c300="#F9BAD7", c400="#F7A8CD", c50="#FDEEF5", c500="#F697C4", c600="#F586BA", c700="#F375B0", c800="#F263A6", c900="#F0529C", c950="#F0529C"), + neutral_hue=gr.themes.Color(c100="#FDF9F4", c200="#C9C9C3", c300="#B0B5AF", c400="#97A09C", c50="#FAF2E9", c500="#7F8C89", c600="#667876", c700="#344F4F", c800="#1C3A3C", c900="#032629", c950="032629"), + font=[gr.themes.GoogleFont('Manrope'), 'ui-sans-serif', 'sans-serif', 'sans-serif'], + font_mono=[gr.themes.GoogleFont('Roboto Mono'), 'ui-monospace', 'monospace', 'monospace'], +).set( + body_text_color='*neutral_950', + body_text_color_subdued='*neutral_950', + body_text_color_subdued_dark='*neutral_50', + body_text_color_dark='*neutral_50', + background_fill_primary='*neutral_50', + background_fill_primary_dark='*neutral_900', + background_fill_secondary='*neutral_100', + 
background_fill_secondary_dark='*neutral_800', + border_color_accent='*secondary_900', + border_color_accent_subdued='*neutral_400', + border_color_accent_subdued_dark='*neutral_400', + color_accent='*primary_900', + color_accent_soft='*neutral_200', + color_accent_soft_dark='*neutral_800', + link_text_color='*secondary_900', + link_text_color_dark='*primary_900', + link_text_color_active_dark='*primary_600', + link_text_color_hover_dark='*primary_700', + link_text_color_visited_dark='*primary_600', + table_even_background_fill='*neutral_100', + table_even_background_fill_dark='*neutral_800', + button_primary_background_fill='*secondary_900', + button_primary_background_fill_dark='*primary_900', + button_primary_background_fill_hover='*secondary_600', + button_primary_background_fill_hover_dark='*primary_600', + button_secondary_background_fill="#9FEAD1", + button_secondary_background_fill_dark="#9FEAD1", + button_secondary_text_color="*neutral_900", + button_secondary_text_color_dark="*neutral_900", + block_title_text_color="*neutral_900", + button_primary_text_color='*neutral_900', + block_title_text_color_dark="#ffffff", + button_primary_text_color_dark='*neutral_900', + block_border_color="#032629", + block_border_color_dark="#9fead1", + block_background_fill_dark="#032629", + block_background_fill="#FAF2E9", + checkbox_label_text_color="#032629", + checkbox_label_background_fill="#D8D6CF", + checkbox_label_background_fill_dark="#254243", + checkbox_background_color_selected="#F0529C", + checkbox_background_color_selected_dark="#0FCB8C", +) +try: + with open(LOGO_PATH, "r") as f: + svg_content = f.read() + encoded_svg = urllib.parse.quote(svg_content) + home_icon_data_uri = f"data:image/svg+xml,{encoded_svg}" +except FileNotFoundError: + print(f"Warning: Home icon file not found at {LOGO_PATH}.") + home_icon_data_uri = "none" + +# --- This is the final CSS --- +final_css = css + f""" +/* --- Find the "Home" button and replace its text with an icon --- */ +.nav-holder nav a[href$="/"] {{ + display: none !important; +}} +.nav-holder nav a[href*="/home"] {{ + grid-row: 1 !important; + grid-column: 1 !important; + justify-self: start !important; + display: flex !important; + align-items: center !important; + justify-content: center !important; + + /* 2. Hide the original "Home" text */ + font-size: 0 !important; + text-indent: -9999px; + + /* 3. 
Apply the icon as the background */ + background-image: url("{home_icon_data_uri}") !important; + background-size: contain !important; + background-repeat: no-repeat !important; + background-position: center !important; + + width: 240px !important; + height: 50px !important; + padding: 0 !important; + border: none !important; + outline: none !important; +}} +""" +# --- Gradio App Definition --- +demo = gr.Blocks( + theme=theme, + css=final_css, + head=scroll_script + redirect_script + tooltip_script + redirect_submission_on_close_script, + title="OpenHands Index", +) +with demo.route("Home", "/home"): + build_main_page() + +with demo.route("Literature Understanding", "/literature-understanding"): + build_lit_page() + +with demo.route("Code & Execution", "/code-execution"): + build_c_and_e_page() + +with demo.route("Data Analysis", "/data-analysis"): + build_data_analysis_page() + +with demo.route("End-to-End Discovery", "/discovery"): + build_e2e_page() + +with demo.route("About", "/about"): + build_about_page() + +with demo.route("🚀 Submit an Agent", "/submit"): + build_submission_page() +# --- Scheduler and Launch +def restart_space_job(): + print("Scheduler: Attempting to restart space.") + try: + api.restart_space(repo_id=LEADERBOARD_PATH) + print("Scheduler: Space restart request sent.") + except Exception as e: + print(f"Scheduler: Error restarting space: {e}") + scheduler = BackgroundScheduler(timezone="UTC") + scheduler.add_job(restart_space_job, "interval", hours=1) + scheduler.start() + + +# Launch the Gradio app +if __name__ == "__main__": + if LOCAL_DEBUG: + print("Launching in LOCAL_DEBUG mode.") + demo.launch(debug=True, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico") + else: + print("Launching in Space mode.") + # For Spaces, share=False is typical unless specific tunneling is needed. + # debug=True can be set to False for a "production" Space. 
+ demo.launch(server_name="0.0.0.0", server_port=7860, debug=True, share=False, allowed_paths=["assets"], favicon_path="assets/favicon/favicon.ico") + diff --git a/assets/api-custom.svg b/assets/api-custom.svg new file mode 100644 index 0000000000000000000000000000000000000000..38d6c3c8ee040d47eef46c60ffca092cae69506e --- /dev/null +++ b/assets/api-custom.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/api-equivalent.svg b/assets/api-equivalent.svg new file mode 100644 index 0000000000000000000000000000000000000000..9661ffd30b6aec8cfaf6413d437d2c887e4a5231 --- /dev/null +++ b/assets/api-equivalent.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/api-legend.svg b/assets/api-legend.svg new file mode 100644 index 0000000000000000000000000000000000000000..b1ab7e14c02ce058efa3d17f338a1955b4b2c2e0 --- /dev/null +++ b/assets/api-legend.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/api-standard.svg b/assets/api-standard.svg new file mode 100644 index 0000000000000000000000000000000000000000..af6caf448193041b4851f4bd87ca0f5c90e1d3a5 --- /dev/null +++ b/assets/api-standard.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/c-custom.svg b/assets/c-custom.svg new file mode 100644 index 0000000000000000000000000000000000000000..89584c94add354c5afa629703db573a03732abe0 --- /dev/null +++ b/assets/c-custom.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/c-equivalent.svg b/assets/c-equivalent.svg new file mode 100644 index 0000000000000000000000000000000000000000..cd19eb27b4e15dca998abee37558b4509b512e3b --- /dev/null +++ b/assets/c-equivalent.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/c-legend.svg b/assets/c-legend.svg new file mode 100644 index 0000000000000000000000000000000000000000..44b266792733c505dbb8c90beac6ed3b53ee01e5 --- /dev/null +++ b/assets/c-legend.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/c-standard.svg b/assets/c-standard.svg new file mode 100644 index 0000000000000000000000000000000000000000..ae73b44d77998d246ba37f715b5db1eddbd928c6 --- /dev/null +++ b/assets/c-standard.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/code-execution.svg b/assets/code-execution.svg new file mode 100644 index 0000000000000000000000000000000000000000..b0be5a8a325be1d51701c1ac310add27d98cbf68 --- /dev/null +++ b/assets/code-execution.svg @@ -0,0 +1,265 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/assets/custom-legend.svg b/assets/custom-legend.svg new file mode 100644 index 0000000000000000000000000000000000000000..93e5262297b2b3aa85b99a41de5746fe86558e10 --- /dev/null +++ b/assets/custom-legend.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/data-analysis.svg b/assets/data-analysis.svg new file mode 100644 index 0000000000000000000000000000000000000000..226dd6719b453af2db546b8476f7841b9ac121ab --- /dev/null +++ b/assets/data-analysis.svg @@ -0,0 +1,265 @@ + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/assets/ellipse-coral.svg b/assets/ellipse-coral.svg new file mode 100644 index 0000000000000000000000000000000000000000..712c4ca565cb6da74cf40c0919caabb25510f431 --- /dev/null +++ b/assets/ellipse-coral.svg @@ -0,0 +1,3 @@ + + + diff --git a/assets/ellipse-pink.svg b/assets/ellipse-pink.svg new file mode 100644 index 0000000000000000000000000000000000000000..bc52aa3f0cc2b3f5a766fd2dcbd080921caab0ed --- /dev/null +++ b/assets/ellipse-pink.svg @@ -0,0 +1,3 @@ + + + diff --git a/assets/ellipse-white.svg b/assets/ellipse-white.svg new file mode 100644 index 0000000000000000000000000000000000000000..268dd9ab710c15bacfd98aba83963e881f561ef4 --- /dev/null +++ b/assets/ellipse-white.svg @@ -0,0 +1,3 @@ + + + diff --git a/assets/ellipse-yellow.svg b/assets/ellipse-yellow.svg new file mode 100644 index 0000000000000000000000000000000000000000..46243656b3f197672662d7fdeed6ff98886224b0 --- /dev/null +++ b/assets/ellipse-yellow.svg @@ -0,0 +1,3 @@ + + + diff --git a/assets/end-to-end-discovery.svg b/assets/end-to-end-discovery.svg new file mode 100644 index 0000000000000000000000000000000000000000..20f740857e53066f237923557f8fb3df07bc23af --- /dev/null +++ b/assets/end-to-end-discovery.svg @@ -0,0 +1,265 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/assets/equivalent-legend.svg b/assets/equivalent-legend.svg new file mode 100644 index 0000000000000000000000000000000000000000..5b702bb34e9a3914a134d868c1df0b3cae6830bb --- /dev/null +++ b/assets/equivalent-legend.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/favicon/favicon.ico b/assets/favicon/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..2250685982f2cf9f7c3807b92ec1a925306e185c Binary files /dev/null and b/assets/favicon/favicon.ico differ diff --git a/assets/five-point-star.svg b/assets/five-point-star.svg new file mode 100644 index 0000000000000000000000000000000000000000..b30c535c95aa23f4395d0b4b27e21e8ce04eab9d --- /dev/null +++ b/assets/five-point-star.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/four-point-star.svg b/assets/four-point-star.svg new file mode 100644 index 0000000000000000000000000000000000000000..79d860e5c93f30e21e3939c7d0c4636f84014071 --- /dev/null +++ b/assets/four-point-star.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/just-icon.svg b/assets/just-icon.svg new file mode 100644 index 0000000000000000000000000000000000000000..86edb1a81eed854f30f6f4c4881f2b96fa698da9 --- /dev/null +++ b/assets/just-icon.svg @@ -0,0 +1,3 @@ + + + \ No newline at end 
of file diff --git a/assets/literature-understanding.svg b/assets/literature-understanding.svg new file mode 100644 index 0000000000000000000000000000000000000000..b9ef6f80eec122f14466d65a68a2c5cc747f63a9 --- /dev/null +++ b/assets/literature-understanding.svg @@ -0,0 +1,265 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/assets/logo.svg b/assets/logo.svg new file mode 100644 index 0000000000000000000000000000000000000000..d47d4dab17cf7a933029feb30338dd5758de8a81 --- /dev/null +++ b/assets/logo.svg @@ -0,0 +1,12 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/assets/openhands-logo.svg b/assets/openhands-logo.svg new file mode 100644 index 0000000000000000000000000000000000000000..1becba2bb0a80e7b9b0a7715f15e56512a7e5f7e --- /dev/null +++ b/assets/openhands-logo.svg @@ -0,0 +1 @@ +404: Not Found \ No newline at end of file diff --git a/assets/os-custom.svg b/assets/os-custom.svg new file mode 100644 index 0000000000000000000000000000000000000000..b82f0830d1c0287f506afc6be3d5199ad47fc8f3 --- /dev/null +++ b/assets/os-custom.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/os-equivalent.svg b/assets/os-equivalent.svg new file mode 100644 index 0000000000000000000000000000000000000000..3b832d2706e3da5a3af9c9464c276c8418332405 --- /dev/null +++ b/assets/os-equivalent.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/os-legend.svg b/assets/os-legend.svg new file mode 100644 index 0000000000000000000000000000000000000000..64901cb23db76f5378b06064e4e4d7a70a629a1b --- /dev/null +++ b/assets/os-legend.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/os-ow-custom.svg b/assets/os-ow-custom.svg new file mode 100644 index 0000000000000000000000000000000000000000..b88476e6cb065c5f4b62efc8d455dceec34e7780 --- /dev/null +++ b/assets/os-ow-custom.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/os-ow-equivalent.svg b/assets/os-ow-equivalent.svg new file mode 100644 index 0000000000000000000000000000000000000000..3b74754ad376d5b22df54139f72dfa025f16836a --- /dev/null +++ b/assets/os-ow-equivalent.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/os-ow-legend.svg b/assets/os-ow-legend.svg new file mode 100644 index 0000000000000000000000000000000000000000..f0c61a15d7e76b56fd67dd6f674fae43a9c501b6 --- /dev/null +++ b/assets/os-ow-legend.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/os-ow-standard.svg b/assets/os-ow-standard.svg new file mode 100644 index 0000000000000000000000000000000000000000..680b774b6df7e598bb1439e879ef016856f92238 --- /dev/null +++ b/assets/os-ow-standard.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/os-standard.svg b/assets/os-standard.svg new file mode 100644 index 0000000000000000000000000000000000000000..0af793fcd8c93ef6190386e1a6fcc04ef14ff331 --- /dev/null +++ b/assets/os-standard.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/overall.svg b/assets/overall.svg new file mode 100644 
index 0000000000000000000000000000000000000000..d5953aac408ca232dc0163b7b1e7390c33c0edae --- /dev/null +++ b/assets/overall.svg @@ -0,0 +1,261 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/assets/pareto.svg b/assets/pareto.svg new file mode 100644 index 0000000000000000000000000000000000000000..324bbc6375a3ca40fc0b22469c52437cb1e3d09b --- /dev/null +++ b/assets/pareto.svg @@ -0,0 +1,3 @@ + + + diff --git a/assets/standard-legend.svg b/assets/standard-legend.svg new file mode 100644 index 0000000000000000000000000000000000000000..5b136aa0796bcc985bddf997aa1f8d5d26636c61 --- /dev/null +++ b/assets/standard-legend.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/three-point-star.svg b/assets/three-point-star.svg new file mode 100644 index 0000000000000000000000000000000000000000..ecceed5a1951a4eb50a1daf3be68405f522816ef --- /dev/null +++ b/assets/three-point-star.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/trophy.svg b/assets/trophy.svg new file mode 100644 index 0000000000000000000000000000000000000000..e6f66024f092c468a213eceaf996a302c654e3f9 --- /dev/null +++ b/assets/trophy.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/assets/up-arrow.svg b/assets/up-arrow.svg new file mode 100644 index 0000000000000000000000000000000000000000..ceb903e2aad29a0d8ab513d5740a1cebc4560f27 --- /dev/null +++ b/assets/up-arrow.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/c_and_e.py b/c_and_e.py new file mode 100644 index 0000000000000000000000000000000000000000..3bd75baef88eebdfb0f83d17ae7a8aff089410d0 --- /dev/null +++ b/c_and_e.py @@ -0,0 +1,9 @@ +import gradio as gr +from content import CODE_EXECUTION_DESCRIPTION +from category_page_builder import build_category_page + +# Define the category for this page +CATEGORY_NAME = "Code & Execution" + +def build_page(): + build_category_page(CATEGORY_NAME, CODE_EXECUTION_DESCRIPTION) \ No newline at end of file diff --git a/category_page_builder.py b/category_page_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..070a62b9b78084c8d14c29a5d992c7dd2d3e1432 --- /dev/null +++ b/category_page_builder.py @@ -0,0 +1,105 @@ +import gradio as gr +import pandas as pd + +# Import our UI factories and the data loader +from ui_components import create_leaderboard_display, create_benchmark_details_display, get_full_leaderboard_data, create_sub_navigation_bar +CATEGORY_DIAGRAM_MAP = { + "Literature Understanding": "assets/literature-understanding.svg", + "Code & Execution": "assets/code-execution.svg", + "Data Analysis": "assets/data-analysis.svg", + "End-to-End Discovery": "assets/end-to-end-discovery.svg", +} + +def build_category_page(CATEGORY_NAME, PAGE_DESCRIPTION): + with gr.Column(elem_id="page-content-wrapper"): + validation_df, validation_tag_map = get_full_leaderboard_data("validation") + test_df, test_tag_map = get_full_leaderboard_data("test") + with gr.Row(elem_id="intro-row"): + + with gr.Column(scale=1): + gr.HTML(f'

AstaBench {CATEGORY_NAME} Leaderboard (Aggregate)

', elem_id="main-header") + with gr.Column(elem_id="validation_nav_container", visible=False) as validation_nav_container: + create_sub_navigation_bar(validation_tag_map, CATEGORY_NAME, validation=True) + + with gr.Column(elem_id="test_nav_container", visible=True) as test_nav_container: + create_sub_navigation_bar(test_tag_map, CATEGORY_NAME) + + gr.Markdown(PAGE_DESCRIPTION, elem_id="intro-category-paragraph") + + # --- The Right Column --- + with gr.Column(scale=1): + image_path = CATEGORY_DIAGRAM_MAP.get(CATEGORY_NAME) + if image_path: + gr.Image( + value=image_path, + show_label=False, + show_download_button=False, + show_fullscreen_button=False, + show_share_button=False, + interactive=False, + elem_id="diagram-image" + ) + # --- This page now has two main sections: Validation and Test --- + with gr.Tabs(): + with gr.Tab("Results: Test Set") as test_tab: + # Repeat the process for the "test" split + if not test_df.empty: + gr.Markdown("**Test Set** results are reserved for final assessment. This helps ensure that the agent generalizes well to unseen problems.") + create_leaderboard_display( + full_df=test_df, + tag_map=test_tag_map, + category_name=CATEGORY_NAME, + split_name="test" + ) + create_benchmark_details_display( + full_df=test_df, + tag_map=test_tag_map, + category_name=CATEGORY_NAME, + validation=False, + ) + else: + gr.Markdown("No data available for test split.") + with gr.Tab("Results: Validation Set") as validation_tab: + # 1. Load all necessary data for the "validation" split ONCE. + if not validation_df.empty: + gr.Markdown("**Validation Set** results are used during development to tune and compare agents before final testing.") + # 2. Render the main category display using the loaded data. + create_leaderboard_display( + full_df=validation_df, + tag_map=validation_tag_map, + category_name=CATEGORY_NAME, + split_name="validation" + ) + + # 3. Render the detailed breakdown for each benchmark in the category. + create_benchmark_details_display( + full_df=validation_df, + tag_map=validation_tag_map, + category_name=CATEGORY_NAME, + validation=True, + ) + else: + gr.Markdown("No data available for validation split.") + + + show_validation_js = """ + () => { + document.getElementById('validation_nav_container').style.display = 'block'; + document.getElementById('test_nav_container').style.display = 'none'; + setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0); + } + """ + + # JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots. + show_test_js = """ + () => { + document.getElementById('validation_nav_container').style.display = 'none'; + document.getElementById('test_nav_container').style.display = 'block'; + } + """ + + # Assign the pure JS functions to the select events. No Python `fn` is needed. 
+ validation_tab.select(fn=None, inputs=None, outputs=None, js=show_validation_js) + test_tab.select(fn=None, inputs=None, outputs=None, js=show_test_js) + + return validation_nav_container, test_nav_container \ No newline at end of file diff --git a/config.py b/config.py new file mode 100644 index 0000000000000000000000000000000000000000..969b151697b80c6bc8e1de02eadfccb9d9a64f42 --- /dev/null +++ b/config.py @@ -0,0 +1,22 @@ +import os + +LOCAL_DEBUG = not (os.environ.get("system") == "spaces") +CONFIG_NAME = os.getenv("HF_CONFIG", "1.0.0-dev1") # This corresponds to 'config' in LeaderboardViewer +IS_INTERNAL = os.environ.get("IS_INTERNAL", "false").lower() == "true" + +# OpenHands Index datasets +CONTACT_DATASET = f"OpenHands/openhands-index-contact-info" + +if IS_INTERNAL: + # datasets backing the internal leaderboard + SUBMISSION_DATASET = f"OpenHands/openhands-index-internal-submissions" + RESULTS_DATASET = f"OpenHands/openhands-index-internal-results" + LEADERBOARD_PATH = f"OpenHands/openhands-index-internal-leaderboard" +else: + # datasets backing the public leaderboard + SUBMISSION_DATASET = f"OpenHands/openhands-index-submissions" + RESULTS_DATASET = f"OpenHands/openhands-index-results" + LEADERBOARD_PATH = f"OpenHands/openhands-index" + +DATA_DIR = "/tmp/oh_index/data/" + CONFIG_NAME +EXTRACTED_DATA_DIR = os.path.join(DATA_DIR, "extracted") diff --git a/content.py b/content.py new file mode 100644 index 0000000000000000000000000000000000000000..d3aa3dff7cfbaa2ee09b4903c749c3c551c04dc5 --- /dev/null +++ b/content.py @@ -0,0 +1,934 @@ +import re + +def create_gradio_anchor_id(text: str, validation) -> str: + """ + Replicates the ID format created by gr.Markdown(header_links=True). + Example: "Paper Finder Validation" -> "h-paper-finder-validation" + """ + text = text.lower() + text = re.sub(r'\s+', '-', text) # Replace spaces with hyphens + text = re.sub(r'[^\w-]', '', text) # Remove non-word characters + if validation: + return f"h-{text}-leaderboard-1" + return f"h-{text}-leaderboard" + + +TITLE = """

AstaBench Leaderboard

""" + +INTRO_PARAGRAPH = """ +

+ AstaBench provides an aggregated view of agent performance and efficiency across all benchmarks in all four categories. We report: +

+ + + +

+ This view is designed for quick comparison of general-purpose scientific agents. For more details on how we calculate scores and cost, please see the About Page. +

+
"""
+SCATTER_DISCLAIMER = """
+**Note:** Agents without cost data are displayed to the right of the vertical divider line.
+"""
+PARETO_DISCLAIMER = """
+Agent names shown in green are Pareto optimal, meaning they achieve the best performance for their cost.
+"""
+LIT_DESCRIPTION = """
+The **Literature Understanding** category evaluates how well agents comprehend and interact with scientific literature—testing their ability to find research papers, assess citation quality, extract information from text, and more.
+
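+ The Pareto wording in PARETO_DISCLAIMER above can be made concrete: an agent is Pareto optimal when no other agent achieves at least its score at the same or lower cost. Below is a small, self-contained illustration; it is not part of the app code, and the tuple layout and example numbers are hypothetical.

```python
# Illustration of the Pareto-frontier idea referenced in PARETO_DISCLAIMER above.
# Not part of the leaderboard code; the data layout and values are hypothetical.
def pareto_optimal(agents: list[tuple[str, float, float]]) -> set[str]:
    """agents: (name, cost, score). An agent is dominated if another agent has
    cost <= its cost and score >= its score, with at least one strict inequality."""
    frontier = set()
    for name, cost, score in agents:
        dominated = any(
            c <= cost and s >= score and (c < cost or s > score)
            for other, c, s in agents
            if other != name
        )
        if not dominated:
            frontier.add(name)
    return frontier


print(pareto_optimal([("A", 1.0, 0.40), ("B", 2.0, 0.35), ("C", 3.0, 0.55)]))
# B is dominated by A (cheaper and higher score), so only A and C are on the frontier.
```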

+The scores shown below reflect performance aggregated across five distinct benchmarks, each targeting a different aspect of literature-based reasoning. +

+For detailed results, use the links above to explore individual benchmarks. +
+""" +CODE_EXECUTION_DESCRIPTION = """ +The **Code & Execution** category in AstaBench includes tasks that evaluate an agent’s ability to write, modify, and run code in realistic research scenarios. Unlike literature tasks—which only require read-only tools and can sometimes even be solved by a language model alone—these problems often require the agent to manipulate a machine environment with tools: reading input files, executing code, and writing outputs to specific files in the required format. +

+The scores in this category are aggregated from three distinct benchmarks, each targeting different facets of scientific coding and execution. Together, these benchmarks evaluate whether an agent can function as a hands-on scientific assistant—not just by reasoning about code, but by running it in real-world contexts. +

+For detailed results, use the links above to explore individual benchmark pages. +
+""" +DATA_ANALYSIS_DESCRIPTION = """ +The **Data Analysis** category evaluates agents on their ability to analyze structured datasets and generate meaningful scientific hypotheses. It currently includes a single benchmark, DiscoveryBench, so the category-level scores are the same as the benchmark-level results. +

+As additional benchmarks are added in the future, this category will expand to cover a broader range of data-driven reasoning tasks across scientific domains. +
+""" +DISCOVERY_DESCRIPTION = """ +The **End-to-End Discovery** category tests whether agents can carry out a complete scientific workflow, from task description to experiment design, code execution, results analysis, and report writing. These tasks require agents to integrate multiple capabilities, producing not just answers but full research artifacts. +

+Scores in this category are aggregated from two benchmarks, providing the first standardized way to evaluate automated scientific discovery (ASD) agents across all stages of the research process. Use the links above to explore individual benchmark pages. +
+""" +SUBMISSION_CONFIRMATION = """ +**Your agent has been submitted to AstaBench for evaluation.** +

+🙏 Thanks for contributing! +

+You'll receive a confirmation email from our team within 2 business days with next steps. We will reach out to you directly if further information is needed. +

+We appreciate your support in advancing scientific AI. +""" + +# External URLs for benchmark descriptions +SCHOLAR_QA_CS_URL = "https://www.semanticscholar.org/paper/OpenScholar%3A-Synthesizing-Scientific-Literature-LMs-Asai-He/b40df4b273f255b3cb5639e220c8ab7b1bdb313e" +LITQA2_URL = "https://www.semanticscholar.org/paper/Language-agents-achieve-superhuman-synthesis-of-Skarlinski-Cox/fa5f9aa1cb6f97654ca8e6d279ceee1427a87e68" +ARXIV_DIGESTABLES_URL = "https://www.semanticscholar.org/paper/ArxivDIGESTables%3A-Synthesizing-Scientific-into-Newman-Lee/c7face35e84f2cb04fb1600d54298799aa0ed189" +SUPER_URL = "https://www.semanticscholar.org/paper/SUPER%3A-Evaluating-Agents-on-Setting-Up-and-Tasks-Bogin-Yang/053ef8299988680d47df36224bfccffc817472f1" +CORE_BENCH_URL = "https://www.semanticscholar.org/paper/CORE-Bench%3A-Fostering-the-Credibility-of-Published-Siegel-Kapoor/4c913d59d150fe7581386b87dfd9f90448a9adee" +DS1000_URL = "https://arxiv.org/abs/2211.11501" +DISCOVERY_BENCH_URL = "https://www.semanticscholar.org/paper/DiscoveryBench%3A-Towards-Data-Driven-Discovery-with-Majumder-Surana/48c83799530dc523ee01e6c1c40ad577d5c10a16" + +# Helper function to create external links +def external_link(url, text, is_s2_url=False): + url = f"{url}?utm_source=asta_leaderboard" if is_s2_url else url + return f"{text}" + +def internal_leaderboard_link(text, validation): + anchor_id = create_gradio_anchor_id(text, validation) + return f"{text}" + +# Function to get benchmark descriptions with validation flag +def get_benchmark_description(benchmark_name, validation): + descriptions = { + 'PaperFindingBench': ( + "PaperFindingBench assesses an agent's ability to locate sets of papers based on a natural language " + "description that may involve both the papers' content and metadata, such as the author or publication year." + ), + 'LitQA2-FullText-Search': ( + f"A version of {internal_leaderboard_link('LitQA2-FullText', validation)} that isolates the retrieval aspect of the task. " + f"This benchmark features the same multi-choice questions as {internal_leaderboard_link('LitQA2-FullText', validation)}, but the agent is not evaluated on answering the actual question " + "but rather on providing a ranked list of papers in which the answer is likely to be found." + ), + 'ScholarQA-CS2': ( + "ScholarQA-CS2 assesses long-form model responses to literature review questions in the domain of computer science. " + "Answers are expected to be comprehensive reports, such as those produced by deep research systems. " + f"This benchmark advances on the previously released {external_link(SCHOLAR_QA_CS_URL, 'ScholarQA-CS', is_s2_url=True)} " + "by using queries from real-world usage, and introducing new evaluation methods for coverage and precision " + "of both the report text and its citations." + ), + 'LitQA2-FullText': ( + f"{external_link(LITQA2_URL, 'LitQA2', is_s2_url=True)}, a benchmark introduced by FutureHouse, gauges a model's ability to answer questions that require document retrieval from the scientific literature. " + "It consists of multiple-choice questions that necessitate finding a unique paper and analyzing its detailed full text to spot precise information; these questions cannot be answered from a paper’s abstract. " + "While the original version of the benchmark provided for each question the title of the paper in which the answer can be found, it did not specify the overall collection to search over. In our version, " + "we search over the index we provide as part of the Asta standard toolset. 
The “-FullText” suffix indicates we consider only the subset of LitQA2 questions for which " + "the full-text version of the answering paper is open source and available in our index." + ), + 'ArxivDIGESTables-Clean': ( + f"{external_link(ARXIV_DIGESTABLES_URL, 'ArxivDIGESTables', is_s2_url=True)} assesses the ability of models to construct literature review tables, i.e., tables whose rows are papers and whose columns constitute a set of " + "aspects used to compare and contrast the papers. The goal is to construct such tables given a set of related papers and a table caption describing the user's goal. Generated tables are evaluated by " + "comparing them to actual tables published in ArXiv papers. The “-Clean” suffix indicates a curated subset of ArxivDIGESTables which drops tables that are either trivial or impossible to reconstruct from full-texts." + ), + 'SUPER-Expert': ( + "SUPER-Expert evaluates the capability of models in setting up and executing tasks from low-resource " + "research repositories—centralized databases containing research data and related materials. " + f"The \"-Expert\" split indicates the name of the most challenging split in the {external_link(SUPER_URL, 'original SUPER benchmark', is_s2_url=True)} " + "that involves solving reproduction tasks from scratch and without any intermediate hints or details " + "about the important landmarks involved in each task." + ), + 'CORE-Bench-Hard': ( + "Core-Bench-Hard tests computational reproducibility, a task involving reproducing the results of a study " + "using provided code and data. It consists of both language-only and vision-language challenges across " + "multiple difficulty levels. " + f"The \"-Hard\" split refers to the name of the most challenging split in the original {external_link(CORE_BENCH_URL, 'Core-bench benchmark', is_s2_url=True)} " + "where only a README file is provided with no instructions or an auxiliary Dockerfile." + ), + 'DS-1000': ( + "DS-1000 is an established code generation benchmark containing Python data science coding questions " + "originally sourced from StackOverflow. It's designed to reflect an array of diverse, realistic, and " + "practical use cases and directly involves many of the Python libraries commonly used in data science " + f"and machine learning research. We split the original {external_link(DS1000_URL, 'dataset')} " + "into 100 validation and 900 test problems." + ), + 'DiscoveryBench': ( + "DiscoveryBench is the first comprehensive benchmark to formalize the multi-step process of data-driven " + "analysis and discovery (i.e., data loading, transformation, statistical analysis, and modeling). " + f"Originally introduced {external_link(DISCOVERY_BENCH_URL, 'here', is_s2_url=True)}, it is designed to systematically " + "evaluate how well current LLMs can replicate or reproduce published scientific findings across diverse " + "domains, including social science, biology, history, and more." + ), + 'E2E-Bench': ( + "E2E-Bench is the \"decathlon\" of AI-assisted research. It measures whether a system can run the entire " + "research pipeline, starting with an initial task description, to designing and performing (software) " + "experiments, to analyzing and writing up the results." + ), + 'E2E-Bench-Hard': ( + f"E2E-Bench-Hard is a more challenging variant of {internal_leaderboard_link('E2E-Bench', validation)}. Tasks are generated using the HypER system, " + "which identifies research trends and proposes new, underexplored problems. 
Unlike the regular version, " + "these tasks are not simplified or curated for accessibility; they are reviewed only for feasibility. " + "This version is intended to test whether systems can handle more complex and less-structured research " + f"scenarios, following the same end-to-end process as {internal_leaderboard_link('E2E-Bench', validation)}." + ) + } + + return descriptions.get(benchmark_name, "") + +CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" +CITATION_BUTTON_TEXT = r"""@article{asta-bench, + title={AstaBench}, + author={AstaBench folks}, + year={2025}, + eprint={TBD.TBD}, + archivePrefix={arXiv}, + primaryClass={cs.AI}, + secondaryClass={cs.CL} +}""" + +LEGAL_DISCLAIMER_TEXT = """ +

Terms and Conditions

+

+ The Allen Institute for Artificial Intelligence (Ai2) maintains this repository for agent evaluation submissions to AstaBench. To keep AstaBench fair and auditable, all evaluation logs and associated submission files will be made publicly available. This includes your benchmark inputs, model output responses, and other data and information related to your submission as needed to verify the results. +

+
+

+ Your submissions to AstaBench will be posted, scored, and ranked on the leaderboard at https://huggingface.co/spaces/allenai/asta-bench-leaderboard. You agree you have the rights to the materials you submit and that you will not share any personal, sensitive, proprietary, or confidential information. +

+""" + +def format_error(msg): + return f"

{msg}

" + + +def format_warning(msg): + return f"

{msg}

" + + +def format_log(msg): + return f"

{msg}

" + + +def hyperlink(link_url: str, text: str = "🔗") -> str: + if not link_url or not isinstance(link_url, str): + return str(text) # Or simply "" if link_url is bad + return f'{text}' + + +def hf_uri_to_web_url(uri: str) -> str: + """ + Convert a Hugging Face-style URI like: + hf://datasets/{namespace}/{repo}/{path...} + into a public web URL: + https://huggingface.co/datasets/{namespace}/{repo}/tree/main/{path...} + """ + prefix = "hf://datasets/" + if not uri.startswith(prefix): + raise ValueError("URI must start with 'hf://datasets/'") + + parts = uri[len(prefix) :].split("/", 2) + if len(parts) < 3: + raise ValueError("Expected format: hf://datasets/{namespace}/{repo}/{path...}") + + namespace, repo, path = parts + return f"https://huggingface.co/datasets/{namespace}/{repo}/tree/main/{path}" + + +css = """ +/* CSS Color Variables using Gradio theme */ +:root { + --color-primary-green: var(--primary-900); /* #0FCB8C */ + --color-primary-pink: var(--secondary-900); /* #f0529c */ + --color-neutral-light: var(--neutral-200); /* #C9C9C3 */ + --color-background-light: var(--neutral-50); /* #FAF2E9 */ + --color-background-dark: var(--neutral-900); /* #032629 */ + --color-text-light: var(--neutral-50); /* #FAF2E9 */ +} + +/* This makes space for the huggingface header bar which must shown on HF spaces. */ +/* FIXME Media queries don't seem to survive rendering. */ +/* @media (min-width: 768px) { ... } */ +gradio-app { + padding-top: 65px; +} + +/* Global Styles */ +h2 { + overflow: hidden; +} + +#intro-paragraph { + font-size: 18px; + max-width: 90%; + padding-left: 35px; + margin-top: 20px; +} + +#intro-paragraph p, +#intro-paragraph li { + font-size: 16px; + line-height: 1.8; +} + +#intro-paragraph ul { + margin-top: 20px; + margin-bottom: 20px; +} + +#diagram-image { + height: 100%; +} + +#diagram-image img { + width: 100%; + height: 100%; + object-fit: cover; +} +#intro-category-paragraph { + font-size: 18px; + max-width: 90%; + margin-top: 20px; +} + +#intro-category-paragraph p, +#intro-category-paragraph li { + font-size: 16px; + line-height: 1.8; +} + +#intro-category-paragraph ul { + margin-top: 20px; + margin-bottom: 20px; +} + +#about-content { + font-size: 18px; + max-width: 60%; + padding-left: 25px; +} +#category-intro { + font-size: 18px; + max-width: 60%; +} +#logo-image { + margin: 0; + margin-bottom: 30px; + justify-content: flex-start; + max-width: 250px; + height: auto; +} +#page-content-wrapper{ + padding-left: 25px; +} +.table-component{ + height: auto !important; + max-height: none !important; +} +.table-wrap { + max-height: none !important; + height: auto !important; + overflow-y: visible !important; +} +/* --- New Rules for Table Density --- */ +table.gr-table th, table.gr-table td { + padding: 4px 4px !important; + width: 1%; + white-space: nowrap; +} +table.svelte-1e98i6s td { + vertical-align: top !important; +} +table.gr-table { + font-size: 14px !important; +} +.html-container { + padding-top: 0 !important; +} +#scatter-disclaimer { + overflow: visible !important; +} +#pareto-disclaimer { + color: #f0529c !important; +} +thead.svelte-1e98i6s th { + background: white !important; +} +.dark thead.svelte-1e98i6s th { + background: #091a1a !important; +} +.cell-wrap.svelte-v1pjjd { + font-family: 'Manrope'; + } +nav.svelte-ti537g.svelte-ti537g { + justify-content: flex-start; +} +.nav-holder { + padding-left: 20px !important; +} +#legend-markdown span { + margin-right: 15px !important; +} +#leaderboard-accordion .label-wrap { + font-size: 1.4rem !important; + z-index: 
10 !important; + position: relative !important; +} +.dark #leaderboard-accordion .label-wrap { + color: #0FCB8C !important; +} +.dark block.svelte-1svsvh2 { + background: #032629 !important; +} +.padding.svelte-phx28p { + padding: 0 !important; +} +.sub-nav-bar-container { + display: flex !important; + flex-wrap: wrap !important; + align-items: center !important; + gap: 10px !important; +} +.dark .primary-link-button { + color: var(--color-primary-green); +} +.primary-link-button { + background: none; + border: none; + padding: 0; + margin: 0; + font-family: inherit; + font-size: 16px; + color: var(--color-primary-pink); + text-decoration: none; + cursor: pointer; + white-space: nowrap; +} +.primary-link-button:hover { + text-decoration: underline; +} +.sub-nav-label { + font-weight: bold; + font-size: 16px; + display: flex; + align-items: center; +} +.wrap-header-df th span{ + white-space: normal !important; + word-break: normal !important; + overflow-wrap: break-word !important; + line-height: 1.2 !important; + vertical-align: top !important; + font-size: 12px !important; + font-family: 'Manrope'; +} +.wrap-header-df th { + height: auto !important; +} +.wrap-header-df .cell-wrap img { + width: 16px; + height: 16px; + vertical-align: middle; +} +#legend-markdown img { + width: 16px; + height: 16px; + vertical-align: middle; +} +/*------ Global tooltip styles ------*/ +.tooltip-icon { + display: inline-block; + cursor: help; + position: relative; +} +.tooltip-icon::after { + content: attr(data-tooltip); + position: absolute; + bottom: 125%; + background-color: #105257; + color: #fff; + padding: 10px; + border-radius: 4px; + font-size: 12px; + opacity: 0; + transition: opacity 0.2s; + white-space: pre-line; + width: max-content; + text-align: left; + pointer-events: none; + max-width: 300px; + left: 50%; + transform: translateX(-50%); + z-index: 1000; +} +@media (max-width: 768px) { + .tooltip-icon::after { + max-width: 250px; + } +} +.tooltip-icon:hover::after { + opacity: 1; +} +/*------ Openness label tooltip styles ------*/ +.styler, +#openness-label-html, +#agent-tooling-label-html { + overflow: visible !important; +} +/*------ Table cell tooltip styles ------*/ +.wrap.default.full, +span.wrap[tabindex="0"][role="button"][data-editable="false"] { + overflow: visible !important; +} + +.cell-tooltip-icon::after { + height: fit-content; + top: 125%; +} +/*------ Table column description tooltip styles ------*/ +#legend-markdown, +#leaderboard-accordion { + overflow: visible !important; +} + +/* --- inside table tooltips --- */ +.native-tooltip-icon { + cursor: help; + text-decoration: underline dotted 1px; +} +/* Main Nav bar styling */ +.nav-holder nav { + display: grid !important; + grid-template-columns: auto auto auto auto auto 1fr auto auto !important; + gap: 10px 20px !important; /* Vertical and horizontal spacing */ + width: 100% !important; + align-items: center; +} +.nav-holder nav a[href*="about"] { + grid-row: 1 !important; + grid-column: 7 !important; +} +.nav-holder nav a[href*="submit"] { + grid-row: 1 !important; + grid-column: 8 !important; + white-space: nowrap !important; +} +/* Divider line between header and category nav */ +.nav-holder nav::after { + content: ''; /* Required for pseudo-elements to appear */ + background-color: #C9C9C3; + height: 1px; + grid-row: 2 !important; + grid-column: 1 / -1 !important; +} + +/* Horizontal scrolling for navigation */ +.nav-holder nav { + overflow-x: auto; + scrollbar-width: none; + -ms-overflow-style: none; +} +.nav-holder 
nav::-webkit-scrollbar { + display: none; +} + +/* Category navigation buttons in row 3 */ +.nav-holder nav a[href*="literature-understanding"], +.nav-holder nav a[href*="code-execution"], +.nav-holder nav a[href*="data-analysis"], +.nav-holder nav a[href*="discovery"] { + grid-row: 3 !important; + justify-self: center !important; + width: fit-content !important; + white-space: nowrap; + flex-shrink: 0; +} + +.nav-holder nav a[href*="literature-understanding"] { grid-column: 1 !important; } +.nav-holder nav a[href*="code-execution"] { grid-column: 2 !important; } +.nav-holder nav a[href*="data-analysis"] { grid-column: 3 !important; } +.nav-holder nav a[href*="discovery"] { grid-column: 4 !important; } + +/* Navigation hover styles */ +.nav-holder nav a[href*="about"]:hover, +.nav-holder nav a[href*="submit"]:hover, +.nav-holder nav a[href*="literature-understanding"]:hover, +.nav-holder nav a[href*="code-execution"]:hover, +.nav-holder nav a[href*="data-analysis"]:hover, +.nav-holder nav a[href*="discovery"]:hover { + background-color: #FDF9F4; +} + +.dark .nav-holder nav a[href*="about"]:hover, +.dark .nav-holder nav a[href*="submit"]:hover, +.dark .nav-holder nav a[href*="literature-understanding"]:hover, +.dark .nav-holder nav a[href*="code-execution"]:hover, +.dark .nav-holder nav a[href*="data-analysis"]:hover, +.dark .nav-holder nav a[href*="discovery"]:hover { + background-color: #1C3A3C; +} +.benchmark-main-subtitle{ + color: var(--color-primary-green); + overflow: hidden; + padding-top: 120px; +} +.benchmark-title{ + color: var(--color-primary-pink); + margin-top: 50px; + font-size: 20px; +} +.dark .benchmark-title{ + color: var(--color-primary-green); +} +.benchmark-description { + margin: 20px 0; + max-width: 800px; +} +/*------ Submission Page CSS ------*/ +#submission-modal .modal-container, +#success-modal .modal-container { + height: auto; + max-width: 600px; +} + +#submission-modal-content, +#success-modal .submission-modal-content { + padding: 20px; + background-color: inherit; + border-radius: 8px; + text-align: center; +} + +#submission-modal-content p, +#success-modal .submission-modal-content p { + font-size: 16px; +} + +#legal-modal-content { + padding: 30px; + background-color: inherit; + border-radius: 8px; + text-align: left; + font-size: 14px; +} + +#legal-modal-content h2 { + text-align: center; +} +#legal-modal-content button { + width: fit-content; +} +.spinner-container { + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + padding: 30px; +} + +.spinner { + width: 50px; + height: 50px; + border: 5px solid #dee2e6; + border-top: 5px solid #007bff; + border-radius: 50%; + animation: spin 1s linear infinite; + margin-bottom: 20px; +} + +@keyframes spin { + 0% { transform: rotate(0deg); } + 100% { transform: rotate(360deg); } +} + +#submission-page-container { + max-width: 800px; + margin: 0 auto; +} + +#submission-file-label { + padding: 10px; +} + +#submission-button { + max-width: fit-content; + font-size: 14px; +} + +.custom-form-group { + border: 1px solid #000 !important; + border-radius: 4px !important; + padding: 24px !important; + overflow: visible !important; +} + +#openness-label-html, +#agent-tooling-label-html, +#agent-info-label-html, +#submitter-info-label-html, +#username-label-html, +#email-label-html, +#role-label-html { + padding-left: 12px; +} + +.form-label { + margin: 4px 0px 0px 6px; +} + +.form-label-fieldset { + padding-top: 10px !important; +} + +#agent-tooling-label-html { + padding-top: 6px; +} 
+ +.custom-form-group, +.styler { + background: none; +} + +#feedback-button { + display: inline-block; + background-color: #345d60; + color: white; + border: none; + border-radius: 4px; + padding: 15px 20px; + font-size: 16px; + cursor: pointer; + transition: all 0.3s ease; + text-decoration: none; +} + +#feedback-button:hover { + background-color: #5d888b; + transform: translateY(-2px); + box-shadow: 0 6px 12px rgba(0,0,0,0.3); +} +.dark #main-header h2 { + color: #0fcb8c; +} +#main-header h2 { + color: #f0529c; +} + +/* --- New HTML-Based Tooltip Styles --- */ +.tooltip-icon-legend { + position: relative; + cursor: help; + display: inline-block; +} + +/* The HTML pop-up card tooltips.*/ +.tooltip-card { + /* Hiding mechanism */ + opacity: 0; + visibility: hidden; + transition: opacity 0.2s; + pointer-events: none; + /* Card appearance */ + position: fixed; + z-index: 1000; + background-color: #083c40; + color: #e5e7eb; + border-radius: 12px; + padding: 15px; + width: max-content; + max-width: 400px; + text-align: left; +} +.tooltip-card.visible { + opacity: 1; + visibility: visible; +} +.tooltip-card h3 { + font-size: 18px; + color: #fff; + margin-top: 0; + margin-bottom: 12px; +} +.tooltip-card .tooltip-description { + margin-bottom: 20px; + line-height: 1.3; +} +.tooltip-card .tooltip-items-container { + display: flex; + flex-direction: column; + gap: 10px; +} +.tooltip-card .tooltip-legend-item { + display: flex; + align-items: + flex-start; + gap: 10px; +} +.tooltip-card .tooltip-legend-item img { + width: 20px; + height: 20px; + margin-top: 2px; +} +.tooltip-card .tooltip-legend-item div { + display: flex; + flex-direction: column; +} +.tooltip-card .tooltip-legend-item strong { + font-weight: 600; + color: #fff; +} +.tooltip-card .tooltip-legend-item span { + font-size: 13px; + line-height: 1.3; +} +.tooltip-sub-list { + list-style-type: '• '; + padding-left: 18px; + font-size: 13px; + line-height: 1.3; + display: flex; + flex-direction: column; +} +.table-legend-item { + display: flex; + align-items: center; + white-space: nowrap; + margin-top: 8px; + flex-wrap: wrap; +} + +/* About Page CSS */ +#about-page-content-wrapper { + margin-left: auto; + margin-right: auto; + max-width: 800px; + padding: 0 24px; + display: flex; + flex-direction: column; + gap: 40px; + margin-top: 40px; + opacity: 85%; + margin-bottom: 60px; +} +.link-buttons-container { + display: flex; + flex-wrap: wrap; /* Allows buttons to stack on very narrow screens */ + gap: 16px; + margin-top: 16px; +} +.link-button { + display: flex; + justify-content: space-between; + align-items: center; + flex-grow: 1; + background-color: #083c40; + padding: 16px 20px; + font-weight: 600; + border-radius: 12px; + text-decoration: none; + transition: background-color 0.2s ease-in-out; +} +.link-button:hover { + background-color: #0a4c52; +} +.external-link-icon { + font-size: 20px; + line-height: 1; + margin-left: 12px; +} + +#leaderboard-accordion table { + width: auto !important; + margin-right: auto !important; +} +.info-list { + padding-left: 20px; +} + +/* Smooth scrolling for the entire page */ +html { + scroll-behavior: smooth; +} +/* Home Page Styling */ +.diagram-placeholder { + width: 100%; + height: 100%; + min-height: 250px; + display: flex; + align-items: center; + justify-content: center; + background-color: #FAF2E9; + color: #F0529C; + border-radius: 8px; + font-size: 14px; + text-align: center; +} +/* 2. 
Responsive behavior for smaller screens */ +@media (max-width: 900px) { + #intro-row { + flex-direction: column; + } +} +/* Plot legend styles */ +.plot-legend-container { + min-height: 572px; + background-color: #fff; + padding: 24px 32px; + border: 1px solid black; + border-radius: 4px; +} + +.dark .plot-legend-container { + background: rgba(250, 242, 233, 0.1); + border-color: rgb(159, 234, 209); +} + +#plot-legend-logo { + margin-bottom: 24px; +} + +#plot-legend-logo img { + height: 19px; +} + +.plot-legend-category-heading { + font-size: 16px; + font-weight: 700; +} + +.plot-legend-item { + display: flex; + margin-top: 8px; +} + + +.plot-legend-item-text .description { + color: #888; + font-size: 12px; +} + +.plot-legend-item-svg { + margin-top: 3px; + width: 14px; + height: 14px; + margin-right: 8px; +} + +.plot-legend-tooling-svg { + height: 16px; + width: 16px; + margin-top: 2px; +} + +#plot-legend-item-pareto-svg { + width: 18px; + height: 18px; + margin-right: 2px; +} +h3 .header-link-icon { + font-size: 12px; + vertical-align: text-top; + margin-left: 6px; + text-decoration: none; +} + +/* Targets all "overall stats" columns in the main leaderboard for each category */ +#main-leaderboard td:nth-child(6) .prose, +#main-leaderboard td:nth-child(7) .prose { + font-weight: 700 !important; +} +""" diff --git a/data/1.0.0-dev1/agenteval.json b/data/1.0.0-dev1/agenteval.json new file mode 100644 index 0000000000000000000000000000000000000000..0ff142457afc401b065933c50dc0c20b2d65a7c8 --- /dev/null +++ b/data/1.0.0-dev1/agenteval.json @@ -0,0 +1,90 @@ +{ + "suite_config": { + "name": "openhands-index", + "version": "1.0.0-dev1", + "splits": [ + { + "name": "validation", + "tasks": [ + { + "name": "swe-bench", + "path": "openhands/swe-bench", + "primary_metric": "resolved/mean", + "tags": ["swe-bench"] + }, + { + "name": "multi-swe-bench", + "path": "openhands/multi-swe-bench", + "primary_metric": "resolved/mean", + "tags": ["multi-swe-bench"] + }, + { + "name": "swe-bench-multimodal", + "path": "openhands/swe-bench-multimodal", + "primary_metric": "resolved/mean", + "tags": ["swe-bench-multimodal"] + }, + { + "name": "swt-bench", + "path": "openhands/swt-bench", + "primary_metric": "generated/mean", + "tags": ["swt-bench"] + }, + { + "name": "commit0", + "path": "openhands/commit0", + "primary_metric": "tests_passed/mean", + "tags": ["commit0"] + }, + { + "name": "gaia", + "path": "openhands/gaia", + "primary_metric": "correct/mean", + "tags": ["gaia"] + } + ] + }, + { + "name": "test", + "tasks": [ + { + "name": "swe-bench", + "path": "openhands/swe-bench", + "primary_metric": "resolved/mean", + "tags": ["swe-bench"] + }, + { + "name": "multi-swe-bench", + "path": "openhands/multi-swe-bench", + "primary_metric": "resolved/mean", + "tags": ["multi-swe-bench"] + }, + { + "name": "swe-bench-multimodal", + "path": "openhands/swe-bench-multimodal", + "primary_metric": "resolved/mean", + "tags": ["swe-bench-multimodal"] + }, + { + "name": "swt-bench", + "path": "openhands/swt-bench", + "primary_metric": "generated/mean", + "tags": ["swt-bench"] + }, + { + "name": "commit0", + "path": "openhands/commit0", + "primary_metric": "tests_passed/mean", + "tags": ["commit0"] + }, + { + "name": "gaia", + "path": "openhands/gaia", + "primary_metric": "correct/mean", + "tags": ["gaia"] + } + ] + } + ] + } +} diff --git a/data/1.0.0-dev1/agenteval_backup.json b/data/1.0.0-dev1/agenteval_backup.json new file mode 100644 index 
0000000000000000000000000000000000000000..505854a46788c30d7414ad67203f62dc83b45140 --- /dev/null +++ b/data/1.0.0-dev1/agenteval_backup.json @@ -0,0 +1,308 @@ +{ + "suite_config": { + "name": "openhands-index", + "version": "1.0.0-dev1", + "splits": [ + { + "name": "validation", + "tasks": [ + { + "name": "swe-bench", + "path": "openhands/swe-bench", + "primary_metric": "resolved/mean", + "tags": [ + "swe-bench" + ] + }, + { + "name": "multi-swe-bench", + "path": "openhands/multi-swe-bench", + "primary_metric": "resolved/mean", + "tags": [ + "multi-swe-bench" + ] + }, + { + "name": "swe-bench-multimodal", + "path": "openhands/swe-bench-multimodal", + "primary_metric": "resolved/mean", + "tags": [ + "swe-bench-multimodal" + ] + }, + { + "name": "swt-bench", + "path": "openhands/swt-bench", + "primary_metric": "generated/mean", + "tags": [ + "swt-bench" + ] + }, + { + "name": "commit0", + "path": "openhands/commit0", + "primary_metric": "tests_passed/mean", + "tags": [ + "commit0" + ] + }, + { + "name": "gaia", + "path": "openhands/gaia", + "primary_metric": "correct/mean", + "tags": [ + "gaia" + ] + } + ] + }, + { + "name": "test", + "tasks": [ + { + "name": "swe-bench", + "path": "openhands/swe-bench", + "primary_metric": "resolved/mean", + "tags": [ + "swe-bench" + ] + }, + { + "name": "multi-swe-bench", + "path": "openhands/multi-swe-bench", + "primary_metric": "resolved/mean", + "tags": [ + "multi-swe-bench" + ] + }, + { + "name": "arxivdigestables_test", + "path": "astabench/arxivdigestables_test", + "primary_metric": "score_tables/mean", + "tags": [ + "lit" + ] + }, + { + "name": "litqa2_test", + "path": "astabench/litqa2_test", + "primary_metric": "is_correct/accuracy", + "tags": [ + "lit" + ] + }, + { + "name": "discoverybench_test", + "path": "astabench/discoverybench_test", + "primary_metric": "score_discoverybench/mean", + "tags": [ + "data" + ] + }, + { + "name": "core_bench_test", + "path": "astabench/core_bench_test", + "primary_metric": "evaluate_task_questions/accuracy", + "tags": [ + "code" + ] + }, + { + "name": "ds1000_test", + "path": "astabench/ds1000_test", + "primary_metric": "ds1000_scorer/accuracy", + "tags": [ + "code" + ] + }, + { + "name": "e2e_discovery_test", + "path": "astabench/e2e_discovery_test", + "primary_metric": "score_rubric/accuracy", + "tags": [ + "discovery" + ] + }, + { + "name": "super_test", + "path": "astabench/super_test", + "primary_metric": "check_super_execution/entrypoints", + "tags": [ + "code" + ] + } + ] + } + ] + }, + "split": "validation", + "results": [ + { + "task_name": "sqa_dev", + "metrics": [ + { + "name": "global_avg/mean", + "value": 0.6215245045241414 + }, + { + "name": "global_avg/stderr", + "value": 0.02088486499225903 + }, + { + "name": "ingredient_recall/mean", + "value": 0.6029178145087237 + }, + { + "name": "ingredient_recall/stderr", + "value": 0.026215888361291618 + }, + { + "name": "answer_precision/mean", + "value": 0.7960436785436785 + }, + { + "name": "answer_precision/stderr", + "value": 0.027692773517249983 + }, + { + "name": "citation_precision/mean", + "value": 0.697849041353826 + }, + { + "name": "citation_precision/stderr", + "value": 0.026784164936602798 + }, + { + "name": "citation_recall/mean", + "value": 0.3892874836903378 + }, + { + "name": "citation_recall/stderr", + "value": 0.015094770200171756 + } + ], + "model_costs": [ + 1.3829150000000001, + 0.9759700000000001, + 2.2324650000000004, + 0.76631, + 0.9277900000000001, + 2.6388600000000006, + 0.8114100000000002, + 2.3263174999999996, + 2.5423725, + 
1.2398675000000001, + 1.7387300000000003, + 1.2176599999999997, + 0.564655, + 0.9726750000000001, + 0.7675700000000001, + 1.5198850000000002, + 1.4726625000000002, + 2.1937650000000004, + 0.6907700000000001, + 1.39835, + 1.2598175, + 2.5373550000000002, + 2.19239, + 1.2508875000000006, + 2.2650550000000007, + 1.6047725, + 0.6525125000000003, + 1.4262200000000003, + 1.0533299999999999, + 1.7252375, + 1.407145, + 1.5408700000000004, + 2.8073224999999993, + 1.0448125000000006, + 1.7037300000000004, + 0.8650500000000001, + 1.0171225000000002, + 0.5697925000000001, + 2.7851025, + 1.0551425, + 2.9213775, + 1.7772975000000004, + 1.2753225000000001, + 0.8108325000000001, + 0.6958375000000001, + 0.8840950000000003, + 1.2028724999999998, + 1.2490475000000003, + 2.4272, + 1.95026, + 1.5352475, + 2.11181, + 2.3612249999999997, + 1.8619225000000004, + 0.7431075000000001, + 1.5189675000000002, + 1.089575, + 1.6103700000000003, + 1.4201450000000002, + 2.397835, + 1.469175, + 1.0723550000000004, + 0.7964050000000003, + 3.3733175, + 4.197085, + 4.2637675, + 1.2982124999999998, + 0.66146, + 1.1130475000000002, + 2.4393974999999997, + 2.582, + 1.7381725000000001, + 0.415025, + 1.6777325, + 1.0507825000000002, + 2.4627125000000003, + 1.017005, + 1.9210250000000002, + 1.5009025000000003, + 0.8283125000000001, + 2.9854425, + 0.4633375000000001, + 0.397685, + 1.2803425, + 3.0388200000000003, + 1.2610875000000004, + 1.798365, + 3.427287500000001, + 0.29307750000000005, + 0.37101249999999997, + 2.8046925000000003, + 0.35557000000000005, + 3.5481700000000007, + 1.1073975, + 1.5280825, + 1.1714900000000001, + 3.1791275000000003, + 3.8214725000000005, + 1.8440275, + 1.730515, + 1.9350675000000002, + 1.6592125000000002, + 1.9227124999999998, + 1.202885, + 1.2688150000000002, + 0.8819875000000001, + 0.6989325, + 1.965635, + 1.7467800000000002, + 1.6940625000000002 + ] + } + ], + "submission": { + "submit_time": "2025-06-09T20:55:35.869831Z", + "username": "miked-ai", + "agent_name": "Basic ReAct", + "agent_description": null, + "agent_url": null, + "logs_url": "hf://datasets/allenai/asta-bench-internal-submissions/1.0.0-dev1/validation/miked-ai_Basic_ReAct__task_tools__report_editor__2025-06-09T20-55-35", + "logs_url_public": null, + "summary_url": null + } +} diff --git a/data_analysis.py b/data_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..a484744b38755477627372f351ab9fbe2b3b5aef --- /dev/null +++ b/data_analysis.py @@ -0,0 +1,8 @@ +import gradio as gr +from content import DATA_ANALYSIS_DESCRIPTION +from category_page_builder import build_category_page +# Define the category for this page +CATEGORY_NAME = "Data Analysis" + +def build_page(): + build_category_page(CATEGORY_NAME, DATA_ANALYSIS_DESCRIPTION) diff --git a/e2e.py b/e2e.py new file mode 100644 index 0000000000000000000000000000000000000000..dbf5572b4fb0ff4e6314e944355a8a27ce366a3e --- /dev/null +++ b/e2e.py @@ -0,0 +1,8 @@ +import gradio as gr +from content import DISCOVERY_DESCRIPTION +from category_page_builder import build_category_page +# Define the category for this page +CATEGORY_NAME = "End-to-End Discovery" + +def build_page(): + build_category_page(CATEGORY_NAME, DISCOVERY_DESCRIPTION) \ No newline at end of file diff --git a/generate_mock_data.py b/generate_mock_data.py new file mode 100644 index 0000000000000000000000000000000000000000..e712a7a9e1bee6975ce785c084be79a22ae33e4b --- /dev/null +++ b/generate_mock_data.py @@ -0,0 +1,102 @@ +"""Generate mock results data in agenteval format for OpenHands Index.""" 
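+# Illustrative sketch only (agent names and metric values below are placeholders):
+# each parquet row written by this script follows the agenteval result layout, roughly:
+#   {
+#     "suite_config": {...},   # copied from data/1.0.0-dev1/agenteval.json
+#     "split": "validation",
+#     "results": [{"task_name": "swe-bench",
+#                  "eval_spec": {"model": "gpt-4", "solver": "openhands/swe-bench"},
+#                  "metrics": [{"name": "resolved/mean", "value": 0.42}],
+#                  "model_usages": []}],
+#     "submission": {"agent_name": "OpenHands CodeAct v2.1",
+#                    "openness": "open-source/open-weights",
+#                    "tool_usage": "standard"}
+#   }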
+import json +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +from pathlib import Path + +# Load the suite config +with open("data/1.0.0-dev1/agenteval.json") as f: + suite_config_data = json.load(f) + +suite_config = suite_config_data["suite_config"] + +# Mock agents +agents = [ + { + "name": "OpenHands CodeAct v2.1", + "source_url": "https://github.com/OpenHands/OpenHands" + }, + { + "name": "Aider", + "source_url": "https://github.com/paul-gauthier/aider" + }, + { + "name": "SWE-agent", + "source_url": "https://github.com/princeton-nlp/SWE-agent" + } +] + +def create_mock_results(split_name): + """Create mock results for a split.""" + split_config = next(s for s in suite_config["splits"] if s["name"] == split_name) + + rows = [] + for agent in agents: + # Create results for each task + results = [] + for task in split_config["tasks"]: + task_name = task["name"] + primary_metric = task["primary_metric"] + + # Generate mock score (different for each agent) + base_score = 0.3 + (hash(agent["name"]) % 50) / 100 + score = base_score + (hash(task_name) % 30) / 100 + score = min(score, 1.0) + + task_result = { + "task_name": task_name, + "eval_spec": { + "model": "gpt-4", + "solver": f"openhands/{task_name}", + }, + "metrics": [ + { + "name": primary_metric, + "value": score + } + ], + "model_usages": [] + } + results.append(task_result) + + # Create row + row = { + "suite_config": suite_config, + "split": split_name, + "results": results, + "submission": { + "agent_name": agent["name"], + "source_url": agent["source_url"], + "openness": "open-source/open-weights", + "tool_usage": "standard" + } + } + rows.append(row) + + return rows + +# Create mock data for both splits +all_rows = [] +for split in ["validation", "test"]: + all_rows.extend(create_mock_results(split)) + +# Convert to DataFrame +df = pd.DataFrame(all_rows) + +# Save as parquet +output_dir = Path("mock_results/1.0.0-dev1") +output_dir.mkdir(parents=True, exist_ok=True) + +# Save validation split +validation_df = df[df["split"] == "validation"] +validation_df.to_parquet(output_dir / "validation.parquet", index=False) + +# Save test split +test_df = df[df["split"] == "test"] +test_df.to_parquet(output_dir / "test.parquet", index=False) + +print(f"Created mock data:") +print(f" - Validation: {len(validation_df)} rows") +print(f" - Test: {len(test_df)} rows") +print(f" - Output: {output_dir}") diff --git a/github_data_loader.py b/github_data_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..ce968d56850c9a720fa43318ed79300b16eb608b --- /dev/null +++ b/github_data_loader.py @@ -0,0 +1,71 @@ +""" +Custom data loader for OpenHands Index that fetches from GitHub instead of HF datasets. +Mimics the interface of LeaderboardViewer from agent-eval. 
+""" +import pandas as pd +import requests +from typing import Dict, List, Tuple + + +class GitHubDataLoader: + """Loads leaderboard data from GitHub repository.""" + + def __init__(self, base_url: str, split: str): + self.base_url = base_url + self.split = split + self.tag_map = self._build_tag_map() + + def _build_tag_map(self) -> Dict[str, List[str]]: + """Build tag map for the OpenHands datasets.""" + # Map datasets to their respective tags + return { + "swe-bench": ["swe-bench"], + "multi-swe-bench": ["multi-swe-bench"], + "swe-bench-multimodal": ["swe-bench-multimodal"], + "swt-bench": ["swt-bench"], + "commit0": ["commit0"], + "gaia": ["gaia"], + } + + def _load(self) -> Tuple[pd.DataFrame, Dict]: + """Load and combine data from all GitHub JSON files.""" + all_results = [] + + datasets = ["swe-bench", "multi-swe-bench", "swe-bench-multimodal", + "swt-bench", "commit0", "gaia"] + + for dataset in datasets: + url = f"{self.base_url}/{dataset}.json" + try: + response = requests.get(url, timeout=10) + if response.status_code == 200: + data = response.json() + # Transform GitHub data to match agenteval format + for entry in data: + all_results.append({ + "agent_name": entry.get("agent_name", "Unknown"), + "score": entry.get("score", 0.0), + "dataset": dataset, + "split": self.split, + # Add other fields as needed + }) + except Exception as e: + print(f"Warning: Could not load data from {url}: {e}") + continue + + if all_results: + df = pd.DataFrame(all_results) + return df, self.tag_map + else: + return pd.DataFrame(), self.tag_map + + +class DummyViewer: + """Fallback viewer when data loading fails.""" + + def __init__(self, df: pd.DataFrame): + self._df = df + self.tag_map = {"Overall": []} + + def _load(self) -> Tuple[pd.DataFrame, Dict]: + return self._df, self.tag_map diff --git a/leaderboard_transformer.py b/leaderboard_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..2685614ce55d13f3f5288daff02fdeeb3819605c --- /dev/null +++ b/leaderboard_transformer.py @@ -0,0 +1,666 @@ +import plotly.graph_objects as go +import numpy as np +import pandas as pd +import logging +from typing import Optional +import base64 +import html + +import aliases + +logger = logging.getLogger(__name__) + +INFORMAL_TO_FORMAL_NAME_MAP = { + # Short Names + "lit": "Literature Understanding", + "code": "Code & Execution", + "data": "Data Analysis", + "discovery": "End-to-End Discovery", + + # Validation Names + "arxivdigestables_validation": "ArxivDIGESTables-Clean", + "ArxivDIGESTables_Clean_validation": "ArxivDIGESTables-Clean", + "sqa_dev": "ScholarQA-CS2", + "ScholarQA_CS2_validation": "ScholarQA-CS2", + "litqa2_validation": "LitQA2-FullText", + "LitQA2_FullText_validation": "LitQA2-FullText", + "paper_finder_validation": "PaperFindingBench", + "PaperFindingBench_validation": "PaperFindingBench", + "paper_finder_litqa2_validation": "LitQA2-FullText-Search", + "LitQA2_FullText_Search_validation": "LitQA2-FullText-Search", + "discoverybench_validation": "DiscoveryBench", + "DiscoveryBench_validation": "DiscoveryBench", + "core_bench_validation": "CORE-Bench-Hard", + "CORE_Bench_Hard_validation": "CORE-Bench-Hard", + "ds1000_validation": "DS-1000", + "DS_1000_validation": "DS-1000", + "e2e_discovery_validation": "E2E-Bench", + "E2E_Bench_validation": "E2E-Bench", + "e2e_discovery_hard_validation": "E2E-Bench-Hard", + "E2E_Bench_Hard_validation": "E2E-Bench-Hard", + "super_validation": "SUPER-Expert", + "SUPER_Expert_validation": "SUPER-Expert", + # Test Names + 
"paper_finder_test": "PaperFindingBench", + "PaperFindingBench_test": "PaperFindingBench", + "paper_finder_litqa2_test": "LitQA2-FullText-Search", + "LitQA2_FullText_Search_test": "LitQA2-FullText-Search", + "sqa_test": "ScholarQA-CS2", + "ScholarQA_CS2_test": "ScholarQA-CS2", + "arxivdigestables_test": "ArxivDIGESTables-Clean", + "ArxivDIGESTables_Clean_test": "ArxivDIGESTables-Clean", + "litqa2_test": "LitQA2-FullText", + "LitQA2_FullText_test": "LitQA2-FullText", + "discoverybench_test": "DiscoveryBench", + "DiscoveryBench_test": "DiscoveryBench", + "core_bench_test": "CORE-Bench-Hard", + "CORE_Bench_Hard_test": "CORE-Bench-Hard", + "ds1000_test": "DS-1000", + "DS_1000_test": "DS-1000", + "e2e_discovery_test": "E2E-Bench", + "E2E_Bench_test": "E2E-Bench", + "e2e_discovery_hard_test": "E2E-Bench-Hard", + "E2E_Bench_Hard_test": "E2E-Bench-Hard", + "super_test": "SUPER-Expert", + "SUPER_Expert_test": "SUPER-Expert", +} +ORDER_MAP = { + 'Overall_keys': [ + 'lit', + 'code', + 'data', + 'discovery', + ], + 'Literature Understanding': [ + 'PaperFindingBench', + 'LitQA2-FullText-Search', + 'ScholarQA-CS2', + 'LitQA2-FullText', + 'ArxivDIGESTables-Clean' + ], + 'Code & Execution': [ + 'SUPER-Expert', + 'CORE-Bench-Hard', + 'DS-1000' + ], + # Add other keys for 'Data Analysis' and 'Discovery' when/if we add more benchmarks in those categories +} + + +def _safe_round(value, digits=3): + """Rounds a number if it's a valid float/int, otherwise returns it as is.""" + return round(value, digits) if isinstance(value, (float, int)) and pd.notna(value) else value + + +def _pretty_column_name(raw_col: str) -> str: + """ + Takes a raw column name from the DataFrame and returns a "pretty" version. + Handles three cases: + 1. Fixed names (e.g., 'User/organization' -> 'Submitter'). + 2. Dynamic names (e.g., 'ds1000_validation score' -> 'DS1000 Validation Score'). + 3. Fallback for any other names. + """ + # Case 1: Handle fixed, special-case mappings first. + fixed_mappings = { + 'id': 'id', + 'Agent': 'Agent', + 'Agent description': 'Agent Description', + 'User/organization': 'Submitter', + 'Submission date': 'Date', + 'Overall': 'Overall Score', + 'Overall cost': 'Overall Cost', + 'Logs': 'Logs', + 'Openness': 'Openness', + 'Agent tooling': 'Agent Tooling', + 'LLM base': 'Models Used', + 'Source': 'Source', + } + + if raw_col in fixed_mappings: + return fixed_mappings[raw_col] + + # Case 2: Handle dynamic names by finding the longest matching base name. + # We sort by length (desc) to match 'core_bench_validation' before 'core_bench'. + sorted_base_names = sorted(INFORMAL_TO_FORMAL_NAME_MAP.keys(), key=len, reverse=True) + + for base_name in sorted_base_names: + if raw_col.startswith(base_name): + formal_name = INFORMAL_TO_FORMAL_NAME_MAP[base_name] + + # Get the metric part (e.g., ' score' or ' cost 95% CI') + metric_part = raw_col[len(base_name):].strip() + + # Capitalize the metric part correctly (e.g., 'score' -> 'Score') + pretty_metric = metric_part.capitalize() + return f"{formal_name} {pretty_metric}" + + # Case 3: If no specific rule applies, just make it title case. + return raw_col.title() + + +def create_pretty_tag_map(raw_tag_map: dict, name_map: dict) -> dict: + """ + Converts a tag map with raw names into a tag map with pretty, formal names, + applying a specific, non-alphabetic sort order to the values. 
+ """ + pretty_map = {} + # Helper to get pretty name with a fallback + def get_pretty(raw_name): + return name_map.get(raw_name, raw_name.replace("_", " ")) + + key_order = ORDER_MAP.get('Overall_keys', []) + sorted_keys = sorted(raw_tag_map.keys(), key=lambda x: key_order.index(x) if x in key_order else len(key_order)) + for raw_key in sorted_keys: + raw_value_list = raw_tag_map[raw_key] + pretty_key = get_pretty(raw_key) + pretty_value_list = [get_pretty(raw_val) for raw_val in raw_value_list] + + # Get the unique values first + unique_values = list(set(pretty_value_list)) + # Get the custom order for the current key. Fall back to an empty list. + custom_order = ORDER_MAP.get(pretty_key, []) + def sort_key(value): + if value in custom_order: + return 0, custom_order.index(value) + else: + return 1, value + pretty_map[pretty_key] = sorted(unique_values, key=sort_key) + + print(f"Created pretty tag map: {pretty_map}") + return pretty_map + + +def transform_raw_dataframe(raw_df: pd.DataFrame) -> pd.DataFrame: + """ + Transforms a raw leaderboard DataFrame into a presentation-ready format. + + This function performs two main actions: + 1. Rounds all numeric metric values (columns containing 'score' or 'cost'). + 2. Renames all columns to a "pretty", human-readable format. + Args: + raw_df (pd.DataFrame): The DataFrame with raw data and column names + like 'agent_name', 'overall/score', 'tag/code/cost'. + Returns: + pd.DataFrame: A new DataFrame ready for display. + """ + if not isinstance(raw_df, pd.DataFrame): + raise TypeError("Input 'raw_df' must be a pandas DataFrame.") + + df = raw_df.copy() + # Create the mapping for pretty column names + pretty_cols_map = {col: _pretty_column_name(col) for col in df.columns} + + # Rename the columns and return the new DataFrame + transformed_df = df.rename(columns=pretty_cols_map) + # Apply safe rounding to all metric columns + for col in transformed_df.columns: + if 'Score' in col or 'Cost' in col: + transformed_df[col] = transformed_df[col].apply(_safe_round) + + logger.info("Raw DataFrame transformed: numbers rounded and columns renamed.") + return transformed_df + + +class DataTransformer: + """ + Visualizes a pre-processed leaderboard DataFrame. + + This class takes a "pretty" DataFrame and a tag map, and provides + methods to view filtered versions of the data and generate plots. + """ + def __init__(self, dataframe: pd.DataFrame, tag_map: dict[str, list[str]]): + """ + Initializes the viewer. + Args: + dataframe (pd.DataFrame): The presentation-ready leaderboard data. + tag_map (dict): A map of formal tag names to formal task names. + """ + if not isinstance(dataframe, pd.DataFrame): + raise TypeError("Input 'dataframe' must be a pandas DataFrame.") + if not isinstance(tag_map, dict): + raise TypeError("Input 'tag_map' must be a dictionary.") + + self.data = dataframe + self.tag_map = tag_map + logger.info(f"DataTransformer initialized with a DataFrame of shape {self.data.shape}.") + + + def view( + self, + tag: Optional[str] = "Overall", # Default to "Overall" for clarity + use_plotly: bool = False, + ) -> tuple[pd.DataFrame, dict[str, go.Figure]]: + """ + Generates a filtered view of the DataFrame and a corresponding scatter plot. + """ + if self.data.empty: + logger.warning("No data available to view.") + return self.data, {} + + # --- 1. 
Determine Primary and Group Metrics Based on the Tag --- + if tag is None or tag == "Overall": + primary_metric = "Overall" + group_metrics = list(self.tag_map.keys()) + else: + primary_metric = tag + # For a specific tag, the group is its list of sub-tasks. + group_metrics = self.tag_map.get(tag, []) + + # --- 2. Sort the DataFrame by the Primary Score --- + primary_score_col = f"{primary_metric} Score" + df_sorted = self.data + if primary_score_col in self.data.columns: + df_sorted = self.data.sort_values(primary_score_col, ascending=False, na_position='last') + + df_view = df_sorted.copy() + + # --- 3. Add Columns for Agent Openness and Tooling --- + base_cols = ["id","Agent","Submitter","Models Used","Source"] + new_cols = ["Openness", "Agent Tooling"] + ending_cols = ["Date", "Logs"] + + metrics_to_display = [primary_score_col, f"{primary_metric} Cost"] + for item in group_metrics: + metrics_to_display.append(f"{item} Score") + metrics_to_display.append(f"{item} Cost") + + final_cols_ordered = new_cols + base_cols + list(dict.fromkeys(metrics_to_display)) + ending_cols + + for col in final_cols_ordered: + if col not in df_view.columns: + df_view[col] = pd.NA + + # The final selection will now use the new column structure + df_view = df_view[final_cols_ordered].reset_index(drop=True) + cols = len(final_cols_ordered) + + # Calculated and add "Categories Attempted" column + if primary_metric == "Overall": + def calculate_attempted(row): + main_categories = ['Literature Understanding', 'Code & Execution', 'Data Analysis', 'End-to-End Discovery'] + count = sum(1 for category in main_categories if row.get(f"{category} Score") != 0.0) + return f"{count}/4" + + # Apply the function row-wise to create the new column + attempted_column = df_view.apply(calculate_attempted, axis=1) + # Insert the new column at a nice position (e.g., after "Date") + df_view.insert((cols - 2), "Categories Attempted", attempted_column) + else: + total_benchmarks = len(group_metrics) + def calculate_benchmarks_attempted(row): + # Count how many benchmarks in this category have COST data reported + count = sum(1 for benchmark in group_metrics if pd.notna(row.get(f"{benchmark} Score"))) + return f"{count}/{total_benchmarks}" + # Insert the new column, for example, after "Date" + df_view.insert((cols - 2), "Benchmarks Attempted", df_view.apply(calculate_benchmarks_attempted, axis=1)) + + # --- 4. Generate the Scatter Plot for the Primary Metric --- + plots: dict[str, go.Figure] = {} + if use_plotly: + primary_cost_col = f"{primary_metric} Cost" + # Check if the primary score and cost columns exist in the FINAL view + if primary_score_col in df_view.columns and primary_cost_col in df_view.columns: + fig = _plot_scatter_plotly( + data=df_view, + x=primary_cost_col, + y=primary_score_col, + agent_col="Agent", + name=primary_metric + ) + # Use a consistent key for easy retrieval later + plots['scatter_plot'] = fig + else: + logger.warning( + f"Skipping plot for '{primary_metric}': score column '{primary_score_col}' " + f"or cost column '{primary_cost_col}' not found." 
+ ) + # Add an empty figure to avoid downstream errors + plots['scatter_plot'] = go.Figure() + return df_view, plots + +DEFAULT_Y_COLUMN = "Overall Score" +DUMMY_X_VALUE_FOR_MISSING_COSTS = 0 + +def _plot_scatter_plotly( + data: pd.DataFrame, + x: Optional[str], + y: str, + agent_col: str = 'Agent', + name: Optional[str] = None +) -> go.Figure: + + # --- Section 1: Define Mappings --- + # These include aliases for openness categories, + # so multiple names might correspond to the same color. + color_map = { + aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: "deeppink", + aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: "coral", + aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: "yellow", + aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: "white", + } + for canonical_openness, openness_aliases in aliases.OPENNESS_ALIASES.items(): + for openness_alias in openness_aliases: + color_map[openness_alias] = color_map[canonical_openness] + # Only keep one name per color for the legend. + colors_for_legend = set(aliases.OPENNESS_ALIASES.keys()) + category_order = list(color_map.keys()) + + # These include aliases for tool usage categories, + # so multiple names might correspond to the same shape. + shape_map = { + aliases.CANONICAL_TOOL_USAGE_STANDARD: "star", + aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "star-diamond", + aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "star-triangle-up", + } + for canonical_tool_usage, tool_usages_aliases in aliases.TOOL_USAGE_ALIASES.items(): + for tool_usage_alias in tool_usages_aliases: + shape_map[tool_usage_alias] = shape_map[canonical_tool_usage] + default_shape = 'square' + # Only keep one name per shape for the legend. + shapes_for_legend = set(aliases.TOOL_USAGE_ALIASES.keys()) + + x_col_to_use = x + y_col_to_use = y + llm_base = data["Models Used"] if "Models Used" in data.columns else "Models Used" + + # --- Section 2: Data Preparation--- + required_cols = [y_col_to_use, agent_col, "Openness", "Agent Tooling"] + if not all(col in data.columns for col in required_cols): + logger.error(f"Missing one or more required columns for plotting: {required_cols}") + return go.Figure() + + data_plot = data.copy() + data_plot[y_col_to_use] = pd.to_numeric(data_plot[y_col_to_use], errors='coerce') + + x_axis_label = f"Average (mean) cost per problem (USD)" if x else "Cost (Data N/A)" + max_reported_cost = 0 + divider_line_x = 0 + + if x and x in data_plot.columns: + data_plot[x_col_to_use] = pd.to_numeric(data_plot[x_col_to_use], errors='coerce') + + # --- Separate data into two groups --- + valid_cost_data = data_plot[data_plot[x_col_to_use].notna()].copy() + missing_cost_data = data_plot[data_plot[x_col_to_use].isna()].copy() + + # Hardcode for all missing costs for now, but ideally try to fallback + # to the max cost in the same figure in another split, if that one has data... 
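+ # Illustrative numbers (not from real data): if the largest reported cost is $10,
+ # the dashed divider lands at $11 (max + max/10) and rows with missing costs are
+ # plotted just past it at $12 (max + max/5).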
+ max_reported_cost = valid_cost_data[x_col_to_use].max() if not valid_cost_data.empty else 10 + + # ---Calculate where to place the missing data and the divider line --- + divider_line_x = max_reported_cost + (max_reported_cost/10) + new_x_for_missing = max_reported_cost + (max_reported_cost/5) + if not missing_cost_data.empty: + missing_cost_data[x_col_to_use] = new_x_for_missing + + if not valid_cost_data.empty: + if not missing_cost_data.empty: + # --- Combine the two groups back together --- + data_plot = pd.concat([valid_cost_data, missing_cost_data]) + else: + data_plot = valid_cost_data # No missing data, just use the valid set + else: + # ---Handle the case where ALL costs are missing --- + if not missing_cost_data.empty: + data_plot = missing_cost_data + else: + data_plot = pd.DataFrame() + else: + # Handle case where x column is not provided at all + data_plot[x_col_to_use] = 0 + + # Clean data based on all necessary columns + data_plot.dropna(subset=[y_col_to_use, x_col_to_use, "Openness", "Agent Tooling"], inplace=True) + + # --- Section 3: Initialize Figure --- + fig = go.Figure() + if data_plot.empty: + logger.warning(f"No valid data to plot after cleaning.") + return fig + + # --- Section 4: Calculate and Draw Pareto Frontier --- + if x_col_to_use and y_col_to_use: + sorted_data = data_plot.sort_values(by=[x_col_to_use, y_col_to_use], ascending=[True, False]) + frontier_points = [] + max_score_so_far = float('-inf') + + for _, row in sorted_data.iterrows(): + score = row[y_col_to_use] + if score >= max_score_so_far: + frontier_points.append({'x': row[x_col_to_use], 'y': score}) + max_score_so_far = score + + if frontier_points: + frontier_df = pd.DataFrame(frontier_points) + fig.add_trace(go.Scatter( + x=frontier_df['x'], + y=frontier_df['y'], + mode='lines', + name='Efficiency Frontier', + showlegend=False, + line=dict(color='#0FCB8C', width=2, dash='dash'), + hoverinfo='skip' + )) + + # --- Section 5: Prepare for Marker Plotting --- + def format_hover_text(row, agent_col, x_axis_label, x_col, y_col, divider_line_x): + """ + Builds the complete HTML string for the plot's hover tooltip. + Formats the 'Models Used' column as a bulleted list if multiple. + """ + h_pad = " " + parts = ["<br>"] + parts.append(f"{h_pad}{row[agent_col]}{h_pad}<br>") + parts.append(f"{h_pad}Score: {row[y_col]:.3f}{h_pad}<br>") + if divider_line_x > 0 and row[x_col] >= divider_line_x: + # If no cost, display "Missing" for the cost. + parts.append(f"{h_pad}{x_axis_label}: Missing{h_pad}<br>") + else: + parts.append(f"{h_pad}{x_axis_label}: ${row[x_col]:.2f}{h_pad}<br>") + parts.append(f"{h_pad}Openness: {row['Openness']}{h_pad}<br>") + parts.append(f"{h_pad}Tooling: {row['Agent Tooling']}{h_pad}") + + # Add extra vertical space (line spacing) before the next section + parts.append("<br>") + # Clean and format Models Used column + llm_base_value = row['Models Used'] + llm_base_value = clean_llm_base_list(llm_base_value) + if isinstance(llm_base_value, list) and llm_base_value: + parts.append(f"{h_pad}Models Used:{h_pad}<br>") + # Create a list of padded bullet points + list_items = [f"{h_pad} • {item}{h_pad}" for item in llm_base_value] + # Join them with line breaks + parts.append('<br>'.join(list_items)) + else: + # Handle the non-list case with padding + parts.append(f"{h_pad}Models Used: {llm_base_value}{h_pad}") + # Add a final line break for bottom "padding" + parts.append("<br>
") + # Join all the parts together into the final HTML string + return ''.join(parts) + # Pre-generate hover text and shapes for each point + data_plot['hover_text'] = data_plot.apply( + lambda row: format_hover_text( + row, + agent_col=agent_col, + x_axis_label=x_axis_label, + x_col=x_col_to_use, + y_col=y_col_to_use, + divider_line_x=divider_line_x + ), + axis=1 + ) + data_plot['shape_symbol'] = data_plot['Agent Tooling'].map(shape_map).fillna(default_shape) + + # --- Section 6: Plot Markers by "Openness" Category --- + for category in category_order: + group = data_plot[data_plot['Openness'] == category] + if group.empty: + continue + + fig.add_trace(go.Scatter( + x=group[x_col_to_use], + y=group[y_col_to_use], + mode='markers', + name=category, + showlegend=False, + text=group['hover_text'], + hoverinfo='text', + marker=dict( + color=color_map.get(category, 'black'), + symbol=group['shape_symbol'], + size=15, + opacity=0.8, + line=dict(width=1, color='deeppink') + ) + )) + + # --- Section 8: Configure Layout --- + xaxis_config = dict(title=x_axis_label, rangemode="tozero") + if divider_line_x > 0: + fig.add_vline( + x=divider_line_x, + line_width=2, + line_dash="dash", + line_color="grey", + annotation_text="Missing Cost Data", + annotation_position="top right" + ) + + # ---Adjust x-axis range to make room for the new points --- + xaxis_config['range'] = [-0.2, (max_reported_cost + (max_reported_cost / 4))] + + + fig.update_layout( + template="plotly_white", + title=f"AstaBench {name} Leaderboard", + xaxis=xaxis_config, # Use the updated config + yaxis=dict(title="Average (mean) score", range=[-0.2, None]), + legend=dict( + bgcolor='#FAF2E9', + ), + height=572, + hoverlabel=dict( + bgcolor="#105257", + font_size=12, + font_family="Manrope", + font_color="#d3dedc", + ), + ) + + return fig + + +def format_cost_column(df: pd.DataFrame, cost_col_name: str) -> pd.DataFrame: + """ + Applies custom formatting to a cost column based on its corresponding score column. + - If cost is not null, it remains unchanged. + - If cost is null but score is not, it becomes "Missing Cost". + - If both cost and score are null, it becomes "Not Attempted". + Args: + df: The DataFrame to modify. + cost_col_name: The name of the cost column to format (e.g., "Overall Cost"). + Returns: + The DataFrame with the formatted cost column. + """ + # Find the corresponding score column by replacing "Cost" with "Score" + score_col_name = cost_col_name.replace("Cost", "Score") + + # Ensure the score column actually exists to avoid errors + if score_col_name not in df.columns: + return df # Return the DataFrame unmodified if there's no matching score + + def apply_formatting_logic(row): + cost_value = row[cost_col_name] + score_value = row[score_col_name] + status_color = "#ec4899" + + if pd.notna(cost_value) and isinstance(cost_value, (int, float)): + return f"${cost_value:.2f}" + elif pd.notna(score_value): + return f'Missing' # Score exists, but cost is missing + else: + return f'Not Submitted' # Neither score nor cost exists + + # Apply the logic to the specified cost column and update the DataFrame + df[cost_col_name] = df.apply(apply_formatting_logic, axis=1) + + return df + +def format_score_column(df: pd.DataFrame, score_col_name: str) -> pd.DataFrame: + """ + Applies custom formatting to a score column for display. + - If a score is 0 or NaN, it's displayed as a colored "0". + - Other scores are formatted to two decimal places. 
+ """ + status_color = "#ec4899" # The same color as your other status text + + # First, fill any NaN values with 0 so we only have one case to handle. + # We must use reassignment to avoid the SettingWithCopyWarning. + df[score_col_name] = df[score_col_name].fillna(0) + + def apply_formatting(score_value): + # Now, we just check if the value is 0. + if score_value == 0: + return f'0.0' + + # For all other numbers, format them for consistency. + if isinstance(score_value, (int, float)): + return f"{score_value:.3f}" + + # Fallback for any unexpected non-numeric data + return score_value + + # Apply the formatting and return the updated DataFrame + return df.assign(**{score_col_name: df[score_col_name].apply(apply_formatting)}) + + +def get_pareto_df(data): + cost_cols = [c for c in data.columns if 'Cost' in c] + score_cols = [c for c in data.columns if 'Score' in c] + if not cost_cols or not score_cols: + return pd.DataFrame() + + x_col, y_col = cost_cols[0], score_cols[0] + + frontier_data = data.dropna(subset=[x_col, y_col]).copy() + frontier_data[y_col] = pd.to_numeric(frontier_data[y_col], errors='coerce') + frontier_data[x_col] = pd.to_numeric(frontier_data[x_col], errors='coerce') + frontier_data.dropna(subset=[x_col, y_col], inplace=True) + if frontier_data.empty: + return pd.DataFrame() + + frontier_data = frontier_data.sort_values(by=[x_col, y_col], ascending=[True, False]) + + pareto_points = [] + max_score_at_cost = -np.inf + + for _, row in frontier_data.iterrows(): + if row[y_col] >= max_score_at_cost: + pareto_points.append(row) + max_score_at_cost = row[y_col] + + return pd.DataFrame(pareto_points) + + +def svg_to_data_uri(path: str) -> str: + """Reads an SVG file and encodes it as a Data URI for Plotly.""" + try: + with open(path, "rb") as f: + encoded_string = base64.b64encode(f.read()).decode() + return f"data:image/svg+xml;base64,{encoded_string}" + except FileNotFoundError: + logger.warning(f"SVG file not found at: {path}") + return None + +def clean_llm_base_list(model_list): + """ + Cleans a list of model strings by keeping only the text after the last '/'. + For example: "models/gemini-2.5-flash-preview-05-20" becomes "gemini-2.5-flash-preview-05-20". + """ + # Return the original value if it's not a list, to avoid errors. + if not isinstance(model_list, list): + return model_list + + # Use a list comprehension for a clean and efficient transformation. 
+ return [str(item).split('/')[-1] for item in model_list] diff --git a/literature_understanding.py b/literature_understanding.py new file mode 100644 index 0000000000000000000000000000000000000000..79ce6d8b1c9083d708bcc0e834e9d42b28bce424 --- /dev/null +++ b/literature_understanding.py @@ -0,0 +1,8 @@ +from content import LIT_DESCRIPTION +from category_page_builder import build_category_page + +# Define the category for this page +CATEGORY_NAME = "Literature Understanding" + +def build_page(): + build_category_page(CATEGORY_NAME, LIT_DESCRIPTION) diff --git a/main_page.py b/main_page.py new file mode 100644 index 0000000000000000000000000000000000000000..2236a2c21f42bc99f162f22dcf4cbba9f551f6af --- /dev/null +++ b/main_page.py @@ -0,0 +1,81 @@ +import matplotlib +matplotlib.use('Agg') +import gradio as gr + + +from ui_components import create_leaderboard_display, get_full_leaderboard_data + +from content import ( + CITATION_BUTTON_LABEL, + CITATION_BUTTON_TEXT, + INTRO_PARAGRAPH +) + +# --- Global State for Viewers (simple caching) --- +CACHED_VIEWERS = {} +CACHED_TAG_MAPS = {} + +def build_page(): + with gr.Row(elem_id="intro-row"): + with gr.Column(scale=1): + gr.HTML(INTRO_PARAGRAPH, elem_id="intro-paragraph") + + with gr.Column(scale=1): + gr.Image( + value="assets/overall.svg", + show_label=False, + interactive=False, + show_download_button=False, + show_fullscreen_button=False, + show_share_button=False, + elem_id="diagram-image" + ) + + # --- Leaderboard Display Section --- + gr.Markdown("---") + CATEGORY_NAME = "Overall" + gr.HTML(f'

<h2>AstaBench {CATEGORY_NAME} Leaderboard (Aggregate)</h2>
', elem_id="main-header") + + with gr.Tabs() as tabs: + with gr.Tab("Results: Test Set") as test_tab: + test_df, test_tag_map = get_full_leaderboard_data("test") + if not test_df.empty: + gr.Markdown("**Test Set** results are reserved for final assessment. This helps ensure that the agent generalizes well to unseen problems.") + create_leaderboard_display( + full_df=test_df, + tag_map=test_tag_map, + category_name=CATEGORY_NAME, # Use our constant + split_name="test" + ) + else: + gr.Markdown("No data available for test split.") + with gr.Tab("Results: Validation Set") as validation_tab: + # 1. Load all necessary data for the "validation" split ONCE. + validation_df, validation_tag_map = get_full_leaderboard_data("validation") + # Check if data was loaded successfully before trying to display it + if not validation_df.empty: + gr.Markdown("**Validation Set** results are used during development to tune and compare agents before final testing.") + # 2. Render the display by calling the factory with the loaded data. + create_leaderboard_display( + full_df=validation_df, + tag_map=validation_tag_map, + category_name=CATEGORY_NAME, # Use our constant + split_name="validation" + ) + else: + gr.Markdown("No data available for validation split.") + + # hiding this for now till we have the real paper data + # with gr.Accordion("📙 Citation", open=False): + # gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False) + + + # JavaScript to show the TEST nav, hide the VALIDATION nav, AND fix the plots. + show_validation_js = """ + () => {setTimeout(() => { window.dispatchEvent(new Event('resize')) }, 0);} + """ + # Assign the pure JS functions to the select events. No Python `fn` is needed. + validation_tab.select(fn=None, inputs=None, outputs=None, js=show_validation_js) + +if __name__ == "__main__": + demo.launch() \ No newline at end of file diff --git a/mock_results/1.0.0-dev1/test.parquet b/mock_results/1.0.0-dev1/test.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3b219207b2b30be6795eed43e15cdbbcbb8afdb5 Binary files /dev/null and b/mock_results/1.0.0-dev1/test.parquet differ diff --git a/mock_results/1.0.0-dev1/validation.parquet b/mock_results/1.0.0-dev1/validation.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d2abfdebd7c1b8cb78b1a547945b13ceaa7ea7d7 Binary files /dev/null and b/mock_results/1.0.0-dev1/validation.parquet differ diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000000000000000000000000000000000000..509fcc5ff6b908a2a7213a324aeb38770214dd6c --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,4 @@ +black +isort +pytest~=8.4.1 +pytest-mock~=3.14.1 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ed16bea9991d5ea8b12b729c1a1cb62e2882deb --- /dev/null +++ b/requirements.txt @@ -0,0 +1,132 @@ +agent-eval==0.1.43 +aiobotocore==2.22.0 +aiofiles==24.1.0 +aiohappyeyeballs==2.6.1 +aiohttp==3.11.18 +aioitertools==0.12.0 +aiosignal==1.3.2 +annotated-types==0.7.0 +anyio==4.9.0 +APScheduler==3.11.0 +async-timeout==5.0.1 +attrs==25.3.0 +Authlib==1.5.2 +beautifulsoup4==4.13.4 +black==25.1.0 +botocore==1.37.3 +certifi==2025.4.26 +cffi==1.17.1 +charset-normalizer==3.4.2 +click==8.1.8 +contourpy==1.3.2 +cryptography==44.0.3 +cycler==0.12.1 +datasets==4.0.0 +debugpy==1.8.14 +dill==0.3.8 +distro==1.9.0 +docstring_parser==0.16 +exceptiongroup==1.2.2 
+fastapi==0.115.12 +ffmpy==0.5.0 +filelock==3.18.0 +fonttools==4.58.1 +frozenlist==1.6.0 +fsspec==2025.3.0 +gradio==5.30.0 +gradio_client==1.10.1 +gradio_modal==0.0.4 +groovy==0.1.2 +h11==0.16.0 +httpcore==1.0.9 +httpx==0.28.1 +huggingface-hub==0.30.2 +idna==3.10 +ijson==3.3.0 +importlib_metadata==8.7.0 +inspect_ai==0.3.104 +isort==6.0.1 +itsdangerous==2.2.0 +Jinja2==3.1.6 +jiter==0.9.0 +jmespath==1.0.1 +jsonlines==4.0.0 +jsonpatch==1.33 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2025.4.1 +kiwisolver==1.4.8 +linkify-it-py==2.0.3 +litellm==1.68.1 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +matplotlib==3.10.3 +mdit-py-plugins==0.4.2 +mdurl==0.1.2 +mmh3==5.1.0 +mplcursors==0.6 +multidict==6.4.3 +multiprocess==0.70.16 +mypy_extensions==1.1.0 +narwhals==1.38.2 +nest-asyncio==1.6.0 +numpy==2.2.5 +openai==1.75.0 +orjson==3.10.18 +packaging==25.0 +pandas==2.2.3 +pathspec==0.12.1 +pillow==11.2.1 +platformdirs==4.3.7 +plotly==6.0.1 +propcache==0.3.1 +psutil==7.0.0 +pyarrow==20.0.0 +pycparser==2.22 +pydantic==2.11.4 +pydantic_core==2.33.2 +pydub==0.25.1 +Pygments==2.19.1 +pyparsing==3.2.3 +python-dateutil==2.9.0.post0 +python-dotenv==1.1.0 +python-multipart==0.0.20 +pytz==2025.2 +PyYAML==6.0.2 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +rich==13.9.4 +rpds-py==0.24.0 +ruff==0.11.8 +s3fs==2025.3.0 +safehttpx==0.1.6 +seaborn==0.13.2 +semantic-version==2.10.0 +semver==3.0.4 +shellingham==1.5.4 +shortuuid==1.0.13 +six==1.17.0 +sniffio==1.3.1 +soupsieve==2.7 +starlette==0.46.2 +tenacity==9.1.2 +textual<3.0.0 +tiktoken==0.9.0 +tokenizers==0.21.1 +tomli==2.2.1 +tomlkit==0.13.2 +tqdm==4.67.1 +typer==0.15.3 +typing-inspection==0.4.0 +typing_extensions==4.13.2 +tzdata==2025.2 +tzlocal==5.3.1 +uc-micro-py==1.0.3 +urllib3==2.4.0 +uvicorn==0.34.2 +websockets==15.0.1 +wrapt==1.17.2 +xxhash==3.5.0 +yarl==1.20.0 +zipp==3.21.0 diff --git a/submission.py b/submission.py new file mode 100644 index 0000000000000000000000000000000000000000..9692fdc25893f010feec9cd119997cac7390b1d0 --- /dev/null +++ b/submission.py @@ -0,0 +1,453 @@ +import logging +import typing + +import matplotlib +from agenteval.cli import SUBMISSION_METADATA_FILENAME +from agenteval.models import SubmissionMetadata +from datasets.exceptions import DataFilesNotFoundError +from gradio_modal import Modal + +matplotlib.use('Agg') + +import os +import shutil +import tarfile +from datetime import datetime, timedelta, timezone +from email.utils import parseaddr + +import gradio as gr +import requests +from agenteval.leaderboard.upload import sanitize_path_component, _validate_path_component +from datasets import Dataset, DatasetDict, VerificationMode, load_dataset +from datasets.data_files import EmptyDatasetError +from huggingface_hub import HfApi + +import aliases +from config import ( + CONFIG_NAME, + CONTACT_DATASET, + EXTRACTED_DATA_DIR, + RESULTS_DATASET, + SUBMISSION_DATASET, +) +from content import ( + CITATION_BUTTON_LABEL, + CITATION_BUTTON_TEXT, + LEGAL_DISCLAIMER_TEXT, + SUBMISSION_CONFIRMATION, + format_error, + format_log, + format_warning, +) +from ui_components import build_openness_tooltip_content, build_tooling_tooltip_content + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + +api = HfApi() +MAX_UPLOAD_BYTES = 5e9 +os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True) + +# --- Submission Logic (largely unchanged from original, ensure LeaderboardSubmission and other deps are fine) --- +def try_load_dataset_submission(*args, **kwargs) -> DatasetDict: # Renamed to avoid conflict if LV 
has one + try: + return load_dataset(*args, **kwargs) + except EmptyDatasetError: + return DatasetDict() + except ValueError: # Handles cases where dataset is empty or ill-formed + return DatasetDict() + except DataFilesNotFoundError: + return DatasetDict() + +def upload_submission( + folder_path: str, + split: str, + submission_name: str, + hf_username: str, +) -> str: + total = 0 + for root, _, files in os.walk(folder_path): + for f_ul in files: + total += os.path.getsize(os.path.join(root, f_ul)) + if total > MAX_UPLOAD_BYTES: + raise ValueError( + f"Upload too large: exceeds {MAX_UPLOAD_BYTES // 1000000} MB limit." + ) + + # This is a copy of agenteval.upload.upload_folder_to_hf so we can use other api params. + # TODO in agenteval: When you mildly wrap another library call, always pass *args, **kwargs. + _validate_path_component(CONFIG_NAME, "config_name") + _validate_path_component(split, "split") + _validate_path_component(submission_name, "submission_name") + dataset_url = f"hf://datasets/{SUBMISSION_DATASET}/{CONFIG_NAME}/{split}/{submission_name}" + logger.info(f"Uploading dataset {dataset_url}") + api.upload_folder( + folder_path=folder_path, + path_in_repo=f"{CONFIG_NAME}/{split}/{submission_name}", + repo_id=SUBMISSION_DATASET, + repo_type="dataset", + # Reminder: This may be going into a public dataset. + # Don't put private information in commit message such as email. + commit_message=f'Submission from hf user "{hf_username}" to "{dataset_url}"', + ) + return dataset_url + +def show_loading_spinner(): + return gr.update(visible=True) + +def add_new_eval( + val_or_test: str, + agent_name: str | None, + agent_description: str, + agent_url: str, + openness: str | None, + degree_of_control: str | None, + path_to_file: typing.IO | None, + username: str, + role: str, + email: str, + email_opt_in: bool, + profile: gr.OAuthProfile, +): + if not agent_name: + return ( + format_warning("Please provide an agent name."), # error_message + gr.update(visible=True), # error_modal + gr.update(visible=False), # success_modal + gr.update(visible=False) # loading_modal + ) + + if path_to_file is None: + return ( + format_warning("Please attach a .tar.gz file."), # error_message + gr.update(visible=True), # error_modal + gr.update(visible=False), # success_modal + gr.update(visible=False) # loading_modal + ) + + logger.info(f"agent {agent_name}: Checking submission") + + # Load current eval_results for submission checks + # This is a bit redundant if display part reloads it, but submission needs its own consistent view + current_eval_results_for_submission = try_load_dataset_submission( + RESULTS_DATASET, + CONFIG_NAME, + download_mode="force_redownload", # Or a less aggressive mode + verification_mode=VerificationMode.NO_CHECKS, + ) + + submission_time = datetime.now(timezone.utc) + if not username or username.strip() == "": + username = profile.username # Default to HF username + + logger.debug(f"agent {agent_name}: User account age check {profile.username}") + try: + # Account age check disabled for launch. 
+ # https://github.com/allenai/astabench-issues/issues/419 + # if _is_hf_acct_too_new(submission_time, profile.username): + # return ( + # format_error("This account is not authorized to submit here (account too new)."), # error_message + # gr.update(visible=True), # error_modal + # gr.update(visible=False), # success_modal + # gr.update(visible=False) # loading_modal + # ) + pass + except Exception as e: + logger.warning(f"Error checking user account age: {e}") + return ( + format_error("Could not verify account age. Please try again later."), # error_message + gr.update(visible=True), # error_modal + gr.update(visible=False), # success_modal + gr.update(visible=False) # loading_modal + ) + + logger.debug(f"agent {agent_name}: Submission frequency check {profile.username}") + contact_infos = try_load_dataset_submission( + CONTACT_DATASET, CONFIG_NAME, download_mode="force_redownload", + verification_mode=VerificationMode.NO_CHECKS + ) + if _is_last_submission_too_recent( + contact_rows=contact_infos.get(val_or_test, []), + username=profile.username, + submission_time=submission_time, + ): + logger.info(f"agent {agent_name}: Denied submission because user {username} submitted recently") + return ( + format_error("You already submitted once in the last 24h for this split; please try again later."), # error_message + gr.update(visible=True), # error_modal + gr.update(visible=False), # success_modal + gr.update(visible=False) # loading_modal + ) + + logger.debug(f"agent {agent_name}: Email validation {email}") + _, parsed_mail = parseaddr(email) + if "@" not in parsed_mail: + return ( + format_warning("Please provide a valid email address."), # error_message + gr.update(visible=True), # error_modal + gr.update(visible=False), # success_modal + gr.update(visible=False) # loading_modal + ) + + logger.debug(f"agent {agent_name}: Duplicate submission check") + if val_or_test in current_eval_results_for_submission and len(current_eval_results_for_submission[val_or_test]) > 0: + existing_submissions = current_eval_results_for_submission[val_or_test].to_dict().get("submission", []) + for sub_item in existing_submissions: + if (sub_item.get("agent_name", "").lower() == agent_name.lower() and + sub_item.get("username", "").lower() == username.lower()): + return ( + format_warning("This agent name by this user has already been submitted to this split."), # error_message + gr.update(visible=True), # error_modal + gr.update(visible=False), # success_modal + gr.update(visible=False) # loading_modal + ) + + safe_username = sanitize_path_component(username) + safe_agent_name = sanitize_path_component(agent_name) + extracted_dir = os.path.join(EXTRACTED_DATA_DIR, f"{safe_username}_{safe_agent_name}") + + logger.debug(f"agent {agent_name}: File extraction to {extracted_dir}") + try: + if os.path.exists(extracted_dir): shutil.rmtree(extracted_dir) + os.makedirs(extracted_dir, exist_ok=True) + with tarfile.open(path_to_file.name, "r:gz") as tar: + members_extracted = 0 + for member in tar.getmembers(): + if not member.isreg(): continue + fname = os.path.basename(member.name) + if not fname or fname.startswith("."): continue + fobj = tar.extractfile(member) + if not fobj: continue + with open(os.path.join(extracted_dir, fname), "wb") as out: + out.write(fobj.read()) + members_extracted +=1 + if members_extracted == 0: + return ( + format_error("Submission tarball is empty or contains no valid files."), # error_message + gr.update(visible=True), # error_modal + gr.update(visible=False), # success_modal + 
gr.update(visible=False) # loading_modal + ) + except Exception as e: + return ( + format_error(f"Error extracting file: {e}. Ensure it's a valid .tar.gz."), # error_message + gr.update(visible=True), # error_modal + gr.update(visible=False), # success_modal + gr.update(visible=False) # loading_modal + ) + + submission_name = f"{safe_username}_{safe_agent_name}_{submission_time.strftime('%Y-%m-%d_%H-%M-%S')}" + + logger.debug(f"agent {agent_name}: Generate submission.json") + subm_meta = SubmissionMetadata( + agent_name=agent_name, + agent_description=agent_description, + agent_url=agent_url, + openness=openness, + tool_usage=degree_of_control, + username=username, + submit_time=submission_time, + ) + with open(os.path.join(extracted_dir, SUBMISSION_METADATA_FILENAME), "w", encoding="utf-8") as fp: + fp.write(subm_meta.model_dump_json(indent=2)) + + logger.info(f"agent {agent_name}: Upload raw (unscored) submission files") + try: + dataset_url = upload_submission(extracted_dir, val_or_test, submission_name, profile.username) + except ValueError as e: + return ( + format_error(str(e)), # error_message + gr.update(visible=True), # error_modal + gr.update(visible=False), # success_modal + gr.update(visible=False) # loading_modal + ) + except Exception as e: + return ( + format_error(f"Failed to upload raw submission: {e}"), # error_message + gr.update(visible=True), # error_modal + gr.update(visible=False), # success_modal + gr.update(visible=False) # loading_modal + ) + + logger.info(f"agent {agent_name}: Save contact information") + contact_info = subm_meta.model_dump() + contact_info["username_auth"] = profile.username + contact_info["email"] = email + contact_info["email_opt_in"] = email_opt_in + contact_info["role"] = role + contact_info["dataset_url"] = dataset_url + + logger.debug(f"agent {agent_name}: Contact info: {contact_info}") + if val_or_test in contact_infos: + contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info) + else: + contact_infos[val_or_test] = Dataset.from_list([contact_info]) + + try: + contact_infos.push_to_hub( + repo_id=CONTACT_DATASET, + config_name=CONFIG_NAME, + commit_message=f'Submission from hf user "{profile.username}" to "{dataset_url}"', + ) + except Exception as e: + return ( + format_error(f"Submission recorded, but contact info failed to save: {e}"), # error_message + gr.update(visible=True), # error_modal + gr.update(visible=False), # success_modal + gr.update(visible=False) # loading_modal + ) + + logger.info(f"Agent '{agent_name}' submitted successfully by '{username}' to '{val_or_test}' split.") + return ( + "", # message + gr.update(visible=False), # error_modal + gr.update(visible=True), # success_modal + gr.update(visible=False) # loading_modal + ) + + +def _is_hf_acct_too_new(submission_time: datetime, username: str): + user_data_resp = requests.get(f"https://huggingface.co/api/users/{username}/overview") + user_data_resp.raise_for_status() + creation_date_str = user_data_resp.json()["createdAt"] + created_at = datetime.strptime(creation_date_str, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc) + return submission_time - created_at < timedelta(days=60) + + +def _is_last_submission_too_recent(contact_rows, username, submission_time): + user_submission_dates = sorted( + row["submit_time"] for row in contact_rows if row["username_auth"] == username + ) + return user_submission_dates and (submission_time - user_submission_dates[-1] < timedelta(days=1)) + + +openness_label_html = f"""
+ Agent Openness + {build_openness_tooltip_content()} +
""" + + +agent_tooling_label_html = f"""
+ Agent Tooling + {build_tooling_tooltip_content()} +
""" + + +heading_html = """ +

🚀 Submit an agent for evaluation

+

Submit your agent to AstaBench for evaluation on real-world scientific tasks. Once submitted, your run will be reviewed by our team. If there are any issues, we’ll reach out within 5–7 business days. We’re working toward full automation, but in the meantime, human review helps ensure quality and trust.

+

How to run an evaluation

+

Please follow the steps in our README. You’ll upload your run file at the end of this form.

+""" + +# --- Submission Accordion --- +def build_page(): + with gr.Column(elem_id="submission-page-container"): + gr.HTML(heading_html) + gr.LoginButton() + with gr.Group(elem_classes="custom-form-group"): + gr.HTML(value="""

Submitter Information

""", elem_id="submitter-info-label-html") + gr.HTML(value="""

Username

""", elem_classes="form-label") + username_tb = gr.Textbox(label="This will show on the leaderboard. By default, we’ll use your Hugging Face username; but you can enter your organization name instead (e.g., university, company, or lab).") + gr.HTML(value="""

Role

""", elem_classes="form-label") + role = gr.Dropdown(label="Please select the role that most closely matches your current position. Helps us improve AstaBench for different user types. Not displayed on the leaderboard.", + interactive=True, + choices=[ + "Undergraduate Student", + "Masters Student", + "PhD Student", + "Postdoctoral Researcher", + "Academic Faculty (e.g., Professor, Lecturer)", + "Industry Researcher (e.g., Research Scientist, Applied Scientist)", + "Engineer or Developer (e.g., Software or ML Engineer)", + "Data Scientist or Analyst", + "Product or Program Manager", + "Startup Founder or Independent Researcher", + "Other" + ]) + gr.HTML(value="""

Contact email

""", elem_classes="form-label") + mail_tb = gr.Textbox(label="We'll only use your email to communicate about your submission.") + mail_opt_in = gr.Checkbox(label="I’m open to being contacted by email for user research studies or feedback opportunities.") + with gr.Group(elem_classes="custom-form-group"): + gr.HTML(value="""

Agent Information

""", elem_id="agent-info-label-html") + gr.HTML(value="""

Split

""", elem_classes="form-label") + level_of_test_radio = gr.Radio(choices=[ + ("Test set", "test"), + ("Validation set", "validation"), + ], elem_classes="form-label-fieldset", value="validation", label="The Test Set is used for final leaderboard rankings. The Validation Set is for development and iteration. Choose based on your evaluation goal.") + gr.HTML(value="""

Agent name

""", elem_classes="form-label") + agent_name_tb = gr.Textbox(label="This is how your agent will appear on the leaderboard. Use a clear, descriptive name (e.g., Asta Scholar QA, Perplexity Deep Research). Omit model names (e.g. GPT-4, Mistral) as they’ll be shown automatically based on your logs.") + gr.HTML(value="""

Agent description

""", elem_classes="form-label") + agent_desc_tb = gr.Textbox(label="Briefly describe your agent’s approach, core strategies, or what makes it distinct. This description may appear on the leaderboard.") + gr.HTML(value="""

URL

""", elem_classes="form-label") + agent_url_tb = gr.Textbox(label="Link to more information about your agent (e.g. GitHub repo, blog post, or website). This optional link may be shown on the leaderboard to let others explore your agent in more depth.") + gr.HTML(value=openness_label_html, elem_classes="form-label") + openness_radio = gr.Radio([aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS, aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS, aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE, aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY], elem_classes="form-label-fieldset", value=None, label="This affects how your submission is categorized on the leaderboard. Choose based on the availability of your code, model weights, or APIs.") + gr.HTML(value=agent_tooling_label_html, elem_classes="form-label") + degree_of_control_radio = gr.Radio([aliases.CANONICAL_TOOL_USAGE_STANDARD, aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE, aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM], elem_classes="form-label-fieldset",value=None, label="Choose based on the tools and the execution environment your agent used during evaluation.") + gr.HTML(value="""

Submission file

""", elem_classes="form-label") + gr.HTML("
Upload your run as a .tar.gz archive prepared using the steps in the README (“Submitting to the Leaderboard”).
") + file_upload_comp = gr.File( + show_label=False, + file_types=[".gz", ".tar.gz"], + ) + submit_eval_button = gr.Button("Submit Evaluation", elem_id="submission-button") + # Modals for loading spinner, success and error messages + with Modal(visible=False, elem_id="submission-modal") as loading_modal: + with gr.Column(elem_id="submission-modal-content"): + gr.HTML('

Processing your submission...

') + + with Modal(visible=False, elem_id="submission-modal") as error_modal: + with gr.Column(elem_id="submission-modal-content"): + gr.Markdown("## ⚠️ Error") + error_message = gr.Markdown() + + with Modal(visible=False, elem_id="success-modal") as success_modal: + with gr.Column(elem_id="submission-modal-content"): + gr.Markdown(SUBMISSION_CONFIRMATION) + with Modal(visible=False, elem_id="submission-modal") as disclaimer_modal: + with gr.Column(elem_id="legal-modal-content"): + gr.HTML(LEGAL_DISCLAIMER_TEXT) + with gr.Row(): + agree_button = gr.Button("I agree to the terms and conditions above", variant="primary") + + def accept_and_load(): + return [ + gr.update(visible=False), # Hide disclaimer_modal + gr.update(visible=True) # Show loading_modal + ] + + def show_disclaimer(): + return gr.update(visible=True) + + submit_eval_button.click( + fn=show_disclaimer, + inputs=None, + outputs=[disclaimer_modal] + ) + + agree_button.click( + fn=accept_and_load, + inputs=None, + outputs=[disclaimer_modal, loading_modal], + ).then( + fn=add_new_eval, + inputs=[ + level_of_test_radio, + agent_name_tb, + agent_desc_tb, + agent_url_tb, + openness_radio, + degree_of_control_radio, + file_upload_comp, + username_tb, + role, + mail_tb, + mail_opt_in + ], + outputs=[error_message, error_modal, success_modal, loading_modal], + ) + # hiding this for now till we have the real paper data + # with gr.Accordion("📙 Citation", open=False): + # gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, elem_id="citation-button-main", interactive=False) diff --git a/tests/integration/test_submission.py b/tests/integration/test_submission.py new file mode 100644 index 0000000000000000000000000000000000000000..61a46dce4c3cb44c4f31cfb25915db51758f9fcc --- /dev/null +++ b/tests/integration/test_submission.py @@ -0,0 +1,110 @@ +import json +import os +from datetime import datetime + +import gradio +import pytest +import pyarrow as pa +from agenteval.models import SubmissionMetadata +from datasets import load_dataset, VerificationMode +from huggingface_hub import HfApi, hf_hub_download + +from aliases import CANONICAL_TOOL_USAGE_STANDARD, CANONICAL_OPENNESS_CLOSED_UI_ONLY +from config import IS_INTERNAL, CONFIG_NAME, CONTACT_DATASET, SUBMISSION_DATASET +from submission import add_new_eval + +_hf = HfApi() + + +class TestSubmission: + @pytest.fixture(autouse=True) + def setup(self): + # These need to be set before imports are evaluated so all we can do here + # is check that they have been set correctly. + assert IS_INTERNAL == True + assert CONFIG_NAME == "continuous-integration" + + def test_add_new_eval(self, mocker): + # Bypass some checks so that the test can cover later parts of the code. + mocker.patch("submission._is_hf_acct_too_new", return_value=False) + mocker.patch("submission._is_last_submission_too_recent", return_value=False) + + # We use this to find records corresponding to this test. 
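+        # Because the description embeds a timestamp, it is unique per run and lets us find exactly this submission's records later.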
+ agent_description = f"CI run at {datetime.now().isoformat()}" + print(f"Using unique agent description: {agent_description}") + + print("Submitting test submission...") + with open(os.path.join(os.path.dirname(__file__), "test-submission.tar.gz"), "rb") as f: + result = add_new_eval( + val_or_test="test", + agent_name="TestSubmissionIntegration", + agent_description=agent_description, + agent_url="https://github.com/allenai/asta-bench-leaderboard/blob/main/tests/integration/test_submission.py", + openness=CANONICAL_OPENNESS_CLOSED_UI_ONLY, + degree_of_control=CANONICAL_TOOL_USAGE_STANDARD, + path_to_file=f, + username="test_user", + role="Other", + email="jasond+asta_testing@allenai.org", + email_opt_in=True, + profile=gradio.OAuthProfile({ + "name": "Test User", + "preferred_username": "test_user", + "profile": "test_user_profile", + "picture": "https://placecats.com/150/150", + }), + ) + + message, error_modal, success_modal, loading_modal = result + assert message == "" # Success + assert error_modal == {'__type__': 'update', 'visible': False} + assert success_modal == {'__type__': 'update', 'visible': True} + assert loading_modal == {'__type__': 'update', 'visible': False} + + print("Looking up contact record...") + contacts = load_dataset(path=CONTACT_DATASET, + name=CONFIG_NAME, + download_mode="force_redownload", + verification_mode=VerificationMode.NO_CHECKS) + # There should have been a new entry due to this test with our unique description. + found_contact = next(row for row in contacts['test'] if row['agent_description'] == agent_description) + assert found_contact + + # This contains an attribute that should lead us to files in the submissions dataset. + dataset_url = found_contact['dataset_url'] + print(f"Found dataset URL: {dataset_url}") + assert dataset_url.startswith( + "hf://datasets/allenai/asta-bench-internal-submissions/continuous-integration/test/") + + print("Checking submission dataset...") + # Commit message itself should link this and the contact record together unambiguously. + recent_commits = _hf.list_repo_commits(repo_type="dataset", repo_id=SUBMISSION_DATASET) + assert any(dataset_url in c.title for c in recent_commits) + + print("Checking that files are present...") + rel_path = dataset_url[len("hf://datasets/allenai/asta-bench-internal-submissions/"):] + ds_info = _hf.dataset_info(SUBMISSION_DATASET) + # These are the files in our test-submission.tar.gz + assert any(f"{rel_path}/eval_config.json" == f.rfilename for f in ds_info.siblings) + assert any(f"{rel_path}/task_sqa_solver_openscilm.eval" == f.rfilename for f in ds_info.siblings) + # This is the generated metadata put into the dataset itself. + assert any(f"{rel_path}/submission.json" == f.rfilename for f in ds_info.siblings) + + print("Checking contact record against submission.json...") + # Checks on contact record which is stored in a private dataset. + local_path = hf_hub_download(repo_type="dataset", + repo_id=SUBMISSION_DATASET, + filename=f"{rel_path}/submission.json") + with open(local_path) as f: + contact_from_json = json.load(f) + # Assert that all keys and values in submission.json are present in the contact record + for key, value_from_json in contact_from_json.items(): + value_from_dataset = found_contact[key] + if isinstance(value_from_dataset, datetime): + value_from_dataset = found_contact[key].isoformat().replace('+00:00', 'Z') + assert value_from_dataset == value_from_json + # submission.json should not contain sensitive PII, specifically, email. 
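+        # The contact record (stored in a private dataset) keeps the email; submission.json must not, since the submissions dataset may be public.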
+ assert 'email' in found_contact + assert 'email' not in contact_from_json + # submission.json is defined by a specific data model. + SubmissionMetadata.model_validate(contact_from_json) diff --git a/ui_components.py b/ui_components.py new file mode 100644 index 0000000000000000000000000000000000000000..ab8b6afbf5530bc8c04c3fa1280115d6ba1ac9e9 --- /dev/null +++ b/ui_components.py @@ -0,0 +1,885 @@ +import gradio as gr +import pandas as pd +import plotly.graph_objects as go +import os +import base64 + +from agenteval.leaderboard.view import LeaderboardViewer +from huggingface_hub import HfApi + +import aliases +from leaderboard_transformer import ( + DataTransformer, + transform_raw_dataframe, + create_pretty_tag_map, + INFORMAL_TO_FORMAL_NAME_MAP, + _plot_scatter_plotly, + format_cost_column, + format_score_column, + get_pareto_df, + clean_llm_base_list, +) +from config import ( + CONFIG_NAME, + EXTRACTED_DATA_DIR, + IS_INTERNAL, + RESULTS_DATASET, +) +from content import ( + create_gradio_anchor_id, + format_error, + get_benchmark_description, + hf_uri_to_web_url, + hyperlink, + SCATTER_DISCLAIMER, +) + +api = HfApi() +os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True) +# Global variables +COMBINED_ICON_MAP = { + aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: { + aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/os-ow-standard.svg", + aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/os-ow-equivalent.svg", + aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/os-ow-custom.svg", + }, + aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: { + aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/os-standard.svg", + aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/os-equivalent.svg", + aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/os-custom.svg", + }, + aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: { + aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/api-standard.svg", + aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/api-equivalent.svg", + aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/api-custom.svg", + }, + aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: { + aliases.CANONICAL_TOOL_USAGE_STANDARD: "assets/c-standard.svg", + aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "assets/c-equivalent.svg", + aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: "assets/c-custom.svg", + } +} + + +# it's important to do the tool usage first here, so that when +# we do openness, the tool usage changes get picked up +for openness in COMBINED_ICON_MAP: + for canonical_tool_usage, tool_usage_aliases in aliases.TOOL_USAGE_ALIASES.items(): + for tool_usage_alias in tool_usage_aliases: + COMBINED_ICON_MAP[openness][tool_usage_alias] = COMBINED_ICON_MAP[openness][canonical_tool_usage] + +for canonical_openness, openness_aliases in aliases.OPENNESS_ALIASES.items(): + for openness_alias in openness_aliases: + COMBINED_ICON_MAP[openness_alias] = COMBINED_ICON_MAP[canonical_openness] + + +OPENNESS_SVG_MAP = { + aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: { + "path": "assets/ellipse-pink.svg", + "description": "Code and models are open" + }, + aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: { + "path": "assets/ellipse-coral.svg", + "description": "Code is open but uses closed-weight models" + }, + aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: { + "path": "assets/ellipse-yellow.svg", + "description": "No access to code; API access only" + }, + aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: { + "path": "assets/ellipse-white.svg", + "description": "No access to code or API; UI access only" + }, 
+} +TOOLING_SVG_MAP = { + aliases.CANONICAL_TOOL_USAGE_STANDARD: { + "path": "assets/five-point-star.svg", + "description": "Uses only tools explicitly provided in state.tools" + }, + aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: { + "path": "assets/four-point-star.svg", + "description": "Custom tools for accessing an equivalent underlying environment" + }, + aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: { + "path": "assets/three-point-star.svg", + "description": f"Uses tools beyond constraints of {aliases.CANONICAL_TOOL_USAGE_STANDARD} or {aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE}" + }, +} + +def get_svg_as_data_uri(path: str) -> str: + """Reads an SVG file and returns it as a base64-encoded data URI.""" + try: + with open(path, "rb") as svg_file: + encoded_svg = base64.b64encode(svg_file.read()).decode("utf-8") + return f"data:image/svg+xml;base64,{encoded_svg}" + except FileNotFoundError: + print(f"Warning: SVG file not found at {path}") + return "" + +# Create a pre-loaded version of our map. This should be run ONCE when the app starts. +PRELOADED_URI_MAP = { + openness: { + tooling: get_svg_as_data_uri(path) + for tooling, path in tooling_map.items() + } + for openness, tooling_map in COMBINED_ICON_MAP.items() +} + +def get_combined_icon_html(row, uri_map): + """ + Looks up the correct icon URI from the pre-loaded map based on the row's + 'Openness' and 'Agent Tooling' values and returns an HTML tag. + """ + openness_val = row['Openness'] + tooling_val = row['Agent Tooling'] + uri = uri_map.get(openness_val, {}).get(tooling_val, "") + # The tooltip will show the exact combination for clarity. + tooltip = f"Openness: {openness_val}, Tooling: {tooling_val}" + + # Return the HTML string that Gradio will render in the DataFrame. + return f'{tooltip}' + +def create_svg_html(value, svg_map): + """ + Generates the absolute simplest HTML for an icon, without any extra text. + This version is compatible with gr.DataFrame. + """ + if pd.isna(value) or value not in svg_map: + return "" + + path_info = svg_map[value] + # Handle both old string format and new object format + if isinstance(path_info, dict): + path = path_info["path"] + else: + path = path_info + + src = get_svg_as_data_uri(path) + # Generate the HTML for the single icon, with NO text. + if src: + return f'{value}' + return "" + +def build_openness_tooltip_content() -> str: + """ + Generates the inner HTML for the Agent Openness tooltip card, + """ + descriptions = { + aliases.CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS: "Both code and ML models are open", + aliases.CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS: "Code is open but uses an ML model with closed-weights", + aliases.CANONICAL_OPENNESS_CLOSED_API_AVAILABLE: "No access to code; API access only", + aliases.CANONICAL_OPENNESS_CLOSED_UI_ONLY: "No access to code or API; UI access only", + } + html_items = [] + for name, info in OPENNESS_SVG_MAP.items(): + uri = get_svg_as_data_uri(info["path"]) + desc = descriptions.get(name, "") + + html_items.append(f""" +
+ {name} +
+ {name} + {desc} +
+
+ """) + + joined_items = "".join(html_items) + + return f""" + ⓘ + +

Agent Openness

+

Indicates how transparent and reproducible an agent is.

+
{joined_items}
+
+
""" + + +def build_pareto_tooltip_content() -> str: + """Generates the inner HTML for the Pareto tooltip card with final copy.""" + trophy_uri = get_svg_as_data_uri("assets/trophy.svg") + trophy_icon_html = f'' + return f""" +

On Pareto Frontier

+

The Pareto frontier represents the best balance between score and cost.

+

Agents on the frontier either achieve the highest score at a given cost or the lowest cost at a given score.

+ +
+ These agents are marked with this icon: + {trophy_icon_html} +
+ """ + +def build_tooling_tooltip_content() -> str: + """Generates the inner HTML for the Agent Tooling tooltip card.""" + descriptions = { + aliases.CANONICAL_TOOL_USAGE_STANDARD: "Uses only predefined tools from the evaluation environment (as defined in Inspect's state.tools).", + aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE: "Custom tools for accessing an equivalent underlying environment:", + aliases.CANONICAL_TOOL_USAGE_FULLY_CUSTOM: f"Uses tools beyond constraints of {aliases.CANONICAL_TOOL_USAGE_STANDARD} or {aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE}", + } + custom_interface_sub_list = """ + + """ + html_items = [] + for name, info in TOOLING_SVG_MAP.items(): + uri = get_svg_as_data_uri(info["path"]) + desc = descriptions.get(name, "") + + # Check if this is the special case that needs a sub-list + sub_list_html = custom_interface_sub_list if name == aliases.CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE else "" + + html_items.append(f""" +
+ {name} +
+ {name} + {desc} + {sub_list_html} +
+
+ """) + + joined_items = "".join(html_items) + + return f""" + ⓘ + +

Agent Tooling

+

Describes the tool usage and execution environment of the agent during evaluation.

+
{joined_items}
+
+
""" + + +def build_descriptions_tooltip_content(table) -> str: + """Generates the inner HTML for the Column Descriptions tooltip card depending on which kind of table.""" + if table == "Overall": + return """ +
Agent: Name of the evaluated agent.
+
Submitter: Organization or individual who submitted the agent for evaluation.
+
Models Used: Model(s) used by the agent. Hover over ⓘ to view all.
+
Overall Score: Macro-average of the four category-level average scores. Each category contributes equally.
+
Overall Cost: Macro-average cost per problem across all categories, in USD. Each category contributes equally.
+
Literature Understanding Score: Macro-average score across Literature Understanding benchmarks.
+
Literature Understanding Cost: Macro-average cost per problem (USD) across Literature Understanding benchmarks.
+
Code Execution Score: Macro-average score across Code & Execution benchmarks.
+
Code Execution Cost: Macro-average cost per problem (USD) across Code & Execution benchmarks.
+
Data Analysis Score: Macro-average score across Data Analysis benchmarks.
+
Data Analysis Cost: Macro-average cost per problem (USD) across Data Analysis benchmarks.
+
End-to-End Discovery Score: Macro-average score across End-to-End Discovery benchmarks.
+
End-to-End Discovery Cost: Macro-average cost per problem (USD) across End-to-End Discovery benchmarks.
+
Categories Attempted: Number of core categories with at least one benchmark attempted (out of 4).
+
Logs: View evaluation run logs (e.g., outputs, traces).
+ """ + elif table in ["Literature Understanding", "Code & Execution", "Data Analysis", "End-to-End Discovery"]: + return f""" +
Agent: Name of the evaluated agent.
+
Submitter: Organization or individual who submitted the agent for evaluation.
+
Models Used: Model(s) used by the agent. Hover over ⓘ to view all.
+
{table} Score: Macro-average score across {table} benchmarks.
+
{table} Cost: Macro-average cost per problem (USD) across {table} benchmarks.
+
Benchmark Score: Average (mean) score on the benchmark.
+
Benchmark Cost: Average (mean) cost per problem (USD) on the benchmark.
+
Benchmarks Attempted: Number of benchmarks attempted in this category (e.g., 3/5).
+
Logs: View evaluation run logs (e.g., outputs, traces).
+ """ + else: + # Fallback for any other table type, e.g., individual benchmarks + return f""" +
Agent: Name of the evaluated agent.
+
Submitter: Organization or individual who submitted the agent for evaluation.
+
Models Used: Model(s) used by the agent. Hover over ⓘ to view all.
+
Attempted Benchmark: Indicates whether the agent attempted this benchmark.
+
{table} Score: Score achieved by the agent on this benchmark.
+
{table} Cost: Cost incurred by the agent to solve this benchmark (in USD).
+
Logs: View evaluation run logs (e.g., outputs, traces).
+ """ + +# Dynamically generate the correct HTML for the legend parts +openness_html = " ".join([create_svg_html(name, OPENNESS_SVG_MAP) for name in OPENNESS_SVG_MAP]) +tooling_html = " ".join([create_svg_html(name, TOOLING_SVG_MAP) for name in TOOLING_SVG_MAP]) +# Create HTML for the "Openness" legend items for table +openness_html_items = [] +for name, info in OPENNESS_SVG_MAP.items(): + uri = get_svg_as_data_uri(info["path"]) + # Each item is now its own flexbox container to guarantee alignment + openness_html_items.append( + f'
' + f'{name}' + f'{name}' + f'
' + ) +openness_html = " ".join(openness_html_items) + +# Create HTML for the "Tooling" legend items for table +tooling_html_items = [] +for name, info in TOOLING_SVG_MAP.items(): + uri = get_svg_as_data_uri(info["path"]) + tooling_html_items.append( + f'
' + f'{name}' + f'{name}' + f'
' + ) +tooling_html = " ".join(tooling_html_items) + +pareto_tooltip_content = build_pareto_tooltip_content() +openness_tooltip_content = build_openness_tooltip_content() +tooling_tooltip_content = build_tooling_tooltip_content() + +def create_legend_markdown(which_table: str) -> str: + """ + Generates the complete HTML for the legend section, including tooltips. + This is used in the main leaderboard display. + """ + descriptions_tooltip_content = build_descriptions_tooltip_content(which_table) + trophy_uri = get_svg_as_data_uri("assets/trophy.svg") + legend_markdown = f""" +
+ +
+ Pareto + + ⓘ + {pareto_tooltip_content} + +
+ On Frontier + On frontier +
+
+ +
+ Agent Openness + {openness_tooltip_content} +
{openness_html}
+
+ +
+ Agent Tooling + {tooling_tooltip_content} +
{tooling_html}
+
+ +
+ Column Descriptions + + ⓘ + +

Column Descriptions

+
{descriptions_tooltip_content}
+
+
+
+
+ """ + return legend_markdown + +# Create HTML for plot legend with SVG icons and keys +openness_legend_items = [] +for name, info in OPENNESS_SVG_MAP.items(): + uri = get_svg_as_data_uri(info["path"]) + if uri: + openness_legend_items.append( + f'
' + f'{name}' + f'
' + f'
' + f'{name}' + f'
' + f'{info["description"]}' + f'
' + f'
' + ) + +tooling_legend_items = [] +for name, info in TOOLING_SVG_MAP.items(): + uri = get_svg_as_data_uri(info["path"]) + if uri: + tooling_legend_items.append( + f'
' + f'{name}' + f'
' + f'
' + f'{name}' + f'
' + f'{info["description"]}' + f'
' + f'
' + ) + +plot_legend_html = f""" +
+ +
+ Pareto +
+
+ + On frontier +
+
+
+
+ Agent Openness +
+ {''.join(openness_legend_items)} +
+
+
+ Agent Tooling +
+ {''.join(tooling_legend_items)} +
+
+
+"""; + +# --- Global State for Viewers (simple caching) --- +CACHED_VIEWERS = {} +CACHED_TAG_MAPS = {} + + +class DummyViewer: + """A mock viewer to be cached on error. It has a ._load() method + to ensure it behaves like the real LeaderboardViewer.""" + def __init__(self, error_df): + self._error_df = error_df + + def _load(self): + # The _load method returns the error DataFrame and an empty tag map + return self._error_df, {} + +def get_leaderboard_viewer_instance(split: str): + """ + Fetches the LeaderboardViewer for a split, using a cache to avoid + re-downloading data. On error, returns a stable DummyViewer object. + """ + global CACHED_VIEWERS, CACHED_TAG_MAPS + + if split in CACHED_VIEWERS: + # Cache hit: return the cached viewer and tag map + return CACHED_VIEWERS[split], CACHED_TAG_MAPS.get(split, {"Overall": []}) + + # --- Cache miss: try to load data from the source --- + try: + print(f"Using Hugging Face dataset for split '{split}': {RESULTS_DATASET}/{CONFIG_NAME}") + viewer = LeaderboardViewer( + repo_id=RESULTS_DATASET, + config=CONFIG_NAME, + split=split, + is_internal=IS_INTERNAL + ) + + # Simplify tag map creation + pretty_tag_map = create_pretty_tag_map(viewer.tag_map, INFORMAL_TO_FORMAL_NAME_MAP) + + # Cache the results for next time + CACHED_VIEWERS[split] = viewer + CACHED_TAG_MAPS[split] = pretty_tag_map # Cache the pretty map directly + + return viewer, pretty_tag_map + + except Exception as e: + # On ANY error, create a consistent error message and cache a DummyViewer + error_message = f"Error loading data for split '{split}': {e}" + print(format_error(error_message)) + + dummy_df = pd.DataFrame({"Message": [error_message]}) + dummy_viewer = DummyViewer(dummy_df) + dummy_tag_map = {"Overall": []} + + # Cache the dummy objects so we don't try to fetch again on this run + CACHED_VIEWERS[split] = dummy_viewer + CACHED_TAG_MAPS[split] = dummy_tag_map + + return dummy_viewer, dummy_tag_map + + +def create_leaderboard_display( + full_df: pd.DataFrame, + tag_map: dict, + category_name: str, + split_name: str +): + """ + This UI factory takes pre-loaded data and renders the main DataFrame and Plot + for a given category (e.g., "Overall" or "Literature Understanding"). + """ + # 1. Instantiate the transformer and get the specific view for this category. + # The function no longer loads data itself; it filters the data it receives. + transformer = DataTransformer(full_df, tag_map) + df_view, plots_dict = transformer.view(tag=category_name, use_plotly=True) + pareto_df = get_pareto_df(df_view) + # Get the list of agents on the frontier. We'll use this list later. 
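+    # Rows whose 'id' appears in pareto_df get the trophy icon in the 'Pareto' column built just below.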
+ trophy_uri = get_svg_as_data_uri("assets/trophy.svg") + trophy_icon_html = f'On Pareto Frontier' + if not pareto_df.empty and 'id' in pareto_df.columns: + pareto_agent_names = pareto_df['id'].tolist() + else: + pareto_agent_names = [] + df_view['Pareto'] = df_view.apply( + lambda row: trophy_icon_html if row['id'] in pareto_agent_names else '', + axis=1 + ) + # Create mapping for Openness / tooling + df_view['Icon'] = df_view.apply( + lambda row: get_combined_icon_html(row, PRELOADED_URI_MAP), + axis=1 # IMPORTANT: axis=1 tells pandas to process row-by-row + ) + + # Format cost columns + for col in df_view.columns: + if "Cost" in col: + df_view = format_cost_column(df_view, col) + + # Fill NaN scores with 0 + for col in df_view.columns: + if "Score" in col: + df_view = format_score_column(df_view, col) + scatter_plot = plots_dict.get('scatter_plot', go.Figure()) + #Make pretty and format the Models Used column + df_view['Models Used'] = df_view['Models Used'].apply(clean_llm_base_list) + df_view['Models Used'] = df_view['Models Used'].apply(format_llm_base_with_html) + # append the repro url to the end of the agent name + if 'Source' in df_view.columns: + df_view['Agent'] = df_view.apply( + lambda row: f"{row['Agent']} {row['Source']}" if pd.notna(row['Source']) and row['Source'] else row['Agent'], + axis=1 + ) + + all_cols = df_view.columns.tolist() + # Remove pareto and Icon columns and insert it at the beginning + all_cols.insert(0, all_cols.pop(all_cols.index('Icon'))) + all_cols.insert(0, all_cols.pop(all_cols.index('Pareto'))) + df_view = df_view[all_cols] + # Drop internally used columns that are not needed in the display + columns_to_drop = ['id', 'Openness', 'Agent Tooling', 'Source'] + df_view = df_view.drop(columns=columns_to_drop, errors='ignore') + + df_headers = df_view.columns.tolist() + df_datatypes = [] + for col in df_headers: + if col == "Logs" or "Cost" in col or "Score" in col: + df_datatypes.append("markdown") + elif col in ["Agent","Icon","Models Used", "Pareto"]: + df_datatypes.append("html") + else: + df_datatypes.append("str") + + header_rename_map = { + "Pareto": "", + "Icon": "", + } + # 2. Create the final list of headers for display. + df_view = df_view.rename(columns=header_rename_map) + # Dynamically set widths for the DataFrame columns + fixed_start_widths = [40, 40, 200, 100, 200] + num_score_cost_cols = 0 + remaining_headers = df_headers[len(fixed_start_widths):] + for col in remaining_headers: + if "Score" in col or "Cost" in col: + num_score_cost_cols += 1 + dynamic_widths = [90] * num_score_cost_cols + fixed_end_widths = [90, 100, 50] + # 5. Combine all the lists to create the final, fully dynamic list. 
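+    # Note: the concatenated width list is expected to line up one-to-one with the displayed columns (two icon columns, the leading text columns, one width per Score/Cost column, then the trailing columns).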
+ final_column_widths = fixed_start_widths + dynamic_widths + fixed_end_widths + + with gr.Row(): + with gr.Column(scale=3): + plot_component = gr.Plot( + value=scatter_plot, + show_label=False, + ) + gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer") + with gr.Column(scale=1): + gr.HTML(value=plot_legend_html) + + # Put table and key into an accordion + with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"): + dataframe_component = gr.DataFrame( + headers=df_headers, + value=df_view, + datatype=df_datatypes, + interactive=False, + wrap=True, + column_widths=final_column_widths, + elem_classes=["wrap-header-df"], + show_search="search", + elem_id="main-leaderboard" + ) + legend_markdown = create_legend_markdown(category_name) + gr.HTML(value=legend_markdown, elem_id="legend-markdown") + + # Return the components so they can be referenced elsewhere. + return plot_component, dataframe_component + +# # --- Detailed Benchmark Display --- +def create_benchmark_details_display( + full_df: pd.DataFrame, + tag_map: dict, + category_name: str, + validation: bool = False, +): + """ + Generates a detailed breakdown for each benchmark within a given category. + For each benchmark, it creates a title, a filtered table, and a scatter plot. + Args: + full_df (pd.DataFrame): The complete, "pretty" dataframe for the entire split. + tag_map (dict): The "pretty" tag map to find the list of benchmarks. + category_name (str): The main category to display details for (e.g., "Literature Understanding"). + """ + # 1. Get the list of benchmarks for the selected category + benchmark_names = tag_map.get(category_name, []) + + if not benchmark_names: + gr.Markdown(f"No detailed benchmarks found for the category: {category_name}") + return + + gr.HTML(f'

{category_name} Detailed Benchmark Results

') + gr.Markdown("---") + # 2. Loop through each benchmark and create its UI components + for benchmark_name in benchmark_names: + anchor_id = create_gradio_anchor_id(benchmark_name, validation) + gr.HTML( + f""" +

{benchmark_name} Leaderboard 🔗

+
{get_benchmark_description(benchmark_name, validation)}
+ + """ + ) + + # 3. Prepare the data for this specific benchmark's table and plot + benchmark_score_col = f"{benchmark_name} Score" + benchmark_cost_col = f"{benchmark_name} Cost" + + # Define the columns needed for the detailed table + table_cols = ['Agent','Source','Openness','Agent Tooling', 'Submitter', 'Date', benchmark_score_col, benchmark_cost_col,'Logs','id', 'Models Used'] + + # Filter to only columns that actually exist in the full dataframe + existing_table_cols = [col for col in table_cols if col in full_df.columns] + + if benchmark_score_col not in existing_table_cols: + gr.Markdown(f"Score data for {benchmark_name} not available.") + continue # Skip to the next benchmark if score is missing + + # Create a specific DataFrame for the table view + benchmark_table_df = full_df[existing_table_cols].copy() + pareto_df = get_pareto_df(benchmark_table_df) + # Get the list of agents on the frontier. We'll use this list later. + trophy_uri = get_svg_as_data_uri("assets/trophy.svg") + trophy_icon_html = f'On Pareto Frontier' + if not pareto_df.empty and 'id' in pareto_df.columns: + pareto_agent_names = pareto_df['id'].tolist() + else: + pareto_agent_names = [] + benchmark_table_df['Pareto'] = benchmark_table_df.apply( + lambda row: trophy_icon_html if row['id'] in pareto_agent_names else '', + axis=1 + ) + + benchmark_table_df['Icon'] = benchmark_table_df.apply( + lambda row: get_combined_icon_html(row, PRELOADED_URI_MAP), + axis=1 # IMPORTANT: axis=1 tells pandas to process row-by-row + ) + + #Make pretty and format the Models Used column + benchmark_table_df['Models Used'] = benchmark_table_df['Models Used'].apply(clean_llm_base_list) + benchmark_table_df['Models Used'] = benchmark_table_df['Models Used'].apply(format_llm_base_with_html) + # append the repro url to the end of the agent name + if 'Source' in benchmark_table_df.columns: + benchmark_table_df['Agent'] = benchmark_table_df.apply( + lambda row: f"{row['Agent']} {row['Source']}" if row['Source'] else row['Agent'], + axis=1 + ) + + # Calculated and add "Benchmark Attempted" column + def check_benchmark_status(row): + has_score = pd.notna(row.get(benchmark_score_col)) + has_cost = pd.notna(row.get(benchmark_cost_col)) + if has_score and has_cost: + return "✅" + if has_score or has_cost: + return "⚠️" + return "🚫 " + + # Apply the function to create the new column + benchmark_table_df['Attempted Benchmark'] = benchmark_table_df.apply(check_benchmark_status, axis=1) + # Sort the DataFrame + if benchmark_score_col in benchmark_table_df.columns: + benchmark_table_df = benchmark_table_df.sort_values( + by=benchmark_score_col, ascending=False, na_position='last' + ) + # 1. 
Format the cost and score columns + benchmark_table_df = format_cost_column(benchmark_table_df, benchmark_cost_col) + benchmark_table_df = format_score_column(benchmark_table_df, benchmark_score_col) + desired_cols_in_order = [ + 'Pareto', + 'Icon', + 'Agent', + 'Submitter', + 'Models Used', + 'Attempted Benchmark', + benchmark_score_col, + benchmark_cost_col, + 'Date', + 'Logs' + ] + for col in desired_cols_in_order: + if col not in benchmark_table_df.columns: + benchmark_table_df[col] = pd.NA # Add as an empty column + benchmark_table_df = benchmark_table_df[desired_cols_in_order] + # Rename columns for a cleaner table display, as requested + benchmark_table_df.rename({ + benchmark_score_col: 'Score', + benchmark_cost_col: 'Cost', + }, inplace=True) + # Ensure the 'Logs' column is formatted correctly + df_headers = benchmark_table_df.columns.tolist() + df_datatypes = [] + for col in df_headers: + if "Logs" in col or "Cost" in col or "Score" in col: + df_datatypes.append("markdown") + elif col in ["Agent", "Icon", "Models Used", "Pareto"]: + df_datatypes.append("html") + else: + df_datatypes.append("str") + # Remove Pareto, Openness, and Agent Tooling from the headers + header_rename_map = { + "Pareto": "", + "Icon": "", + } + # 2. Create the final list of headers for display. + benchmark_table_df = benchmark_table_df.rename(columns=header_rename_map) + benchmark_plot = _plot_scatter_plotly( + data=full_df, + x=benchmark_cost_col, + y=benchmark_score_col, + agent_col="Agent", + name=benchmark_name + ) + with gr.Row(): + with gr.Column(scale=3): + gr.Plot(value=benchmark_plot, show_label=False) + gr.Markdown(value=SCATTER_DISCLAIMER, elem_id="scatter-disclaimer") + with gr.Column(scale=1): + gr.HTML(value=plot_legend_html) + + # Put table and key into an accordion + with gr.Accordion("Show / Hide Table View", open=True, elem_id="leaderboard-accordion"): + gr.DataFrame( + headers=df_headers, + value=benchmark_table_df, + datatype=df_datatypes, + interactive=False, + wrap=True, + column_widths=[40, 40, 200, 150, 175, 85, 100, 100, 80, 40], + show_search="search", + elem_classes=["wrap-header-df"] + ) + legend_markdown = create_legend_markdown(benchmark_name) + gr.HTML(value=legend_markdown, elem_id="legend-markdown") + +def get_full_leaderboard_data(split: str) -> tuple[pd.DataFrame, dict]: + """ + Loads and transforms the complete dataset for a given split. + This function handles caching and returns the final "pretty" DataFrame and tag map. + """ + viewer_or_data, raw_tag_map = get_leaderboard_viewer_instance(split) + + if isinstance(viewer_or_data, (LeaderboardViewer, DummyViewer)): + raw_df, _ = viewer_or_data._load() + if raw_df.empty: + return pd.DataFrame(), {} + + pretty_df = transform_raw_dataframe(raw_df) + pretty_tag_map = create_pretty_tag_map(raw_tag_map, INFORMAL_TO_FORMAL_NAME_MAP) + if "Logs" in pretty_df.columns: + def format_log_entry_to_html(raw_uri): + if pd.isna(raw_uri) or raw_uri == "": return "" + web_url = hf_uri_to_web_url(str(raw_uri)) + return hyperlink(web_url, "🔗") if web_url else "" + # Apply the function to the "Logs" column + pretty_df["Logs"] = pretty_df["Logs"].apply(format_log_entry_to_html) + + if "Source" in pretty_df.columns: + def format_source_url_to_html(raw_url): + # Handle empty or NaN values, returning a blank string. + if pd.isna(raw_url) or raw_url == "": return "" + # Assume 'source_url' is already a valid web URL and doesn't need conversion. + return hyperlink(str(raw_url), "🔗") + # Apply the function to the "source_url" column. 
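+            # hyperlink() renders the raw repro URL as a 🔗 link, which the display code later appends after the agent name.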
+ pretty_df["Source"] = pretty_df["Source"].apply(format_source_url_to_html) + return pretty_df, pretty_tag_map + + # Fallback for unexpected types + return pd.DataFrame(), {} +def create_sub_navigation_bar(tag_map: dict, category_name: str, validation: bool = False) -> gr.HTML: + """ + Builds the entire sub-navigation bar as a single, self-contained HTML component. + This bypasses Gradio's layout components, giving us full control. + """ + benchmark_names = tag_map.get(category_name, []) + if not benchmark_names: + # Return an empty HTML component to prevent errors + return gr.HTML() + + # Start building the list of HTML button elements as strings + html_buttons = [] + for name in benchmark_names: + target_id = create_gradio_anchor_id(name, validation) + + # Create a standard HTML button. + # The onclick attribute calls our global JS function directly. + # Note the mix of double and single quotes. + button_str = f""" + + """ + html_buttons.append(button_str) + + # Join the button strings and wrap them in a single div container + # This container will be our flexbox row. + full_html = f""" + + """ + + # Return the entire navigation bar as one single Gradio HTML component + return gr.HTML(full_html) + +def format_llm_base_with_html(value): + """ + Formats the 'Models Used' cell value. + If the value is a list with more than 1 element, it returns an + HTML with the full list in a hover-over tooltip. + If it's a single-element list, it returns just that element. + Otherwise, it returns the original value. + """ + if isinstance(value, list): + if len(value) > 1: + # Join the list items with a newline character for a clean tooltip + tooltip_text = "\n".join(map(str, value)) + # Return an HTML span with the title attribute for the tooltip + return f'{value[0]} (+ {len(value) - 1}) ⓘ' + if len(value) == 1: + # If only one item, just return that item + return value[0] + # Return the value as-is if it's not a list or is an empty list + return value