Spaces:
Running
Running
openhands
openhands
committed on
Commit
·
1027cfb
1
Parent(s):
085a012
Convert to JSONL data format and remove agent-eval dependency
Browse files
- Replace parquet files with JSONL for better readability and simplicity
- Remove agent-eval library dependency to avoid secret requirements
- Create simple_data_loader.py as lightweight replacement for LeaderboardViewer
- Create submission_utils.py for submission handling functions
- Simplify Dockerfile (no SSH keys or secrets needed)
- Reduce requirements.txt to essential packages only
- Generate mock data for all 6 benchmarks (swe-bench, multi-swe-bench, swe-bench-multimodal, swt-bench, commit0, gaia)
- Update aliases.py to define constants locally
- Fix imports in submission.py and test files
Co-authored-by: openhands <openhands@all-hands.dev>
- Dockerfile +7 -18
- aliases.py +8 -9
- app.py +10 -4
- generate_mock_data.py +0 -102
- generate_mock_jsonl.py +177 -0
- mock_results/1.0.0-dev1/agenteval.json +74 -0
- mock_results/1.0.0-dev1/commit0.jsonl +5 -0
- mock_results/1.0.0-dev1/gaia.jsonl +5 -0
- mock_results/1.0.0-dev1/multi-swe-bench.jsonl +5 -0
- mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl +5 -0
- mock_results/1.0.0-dev1/swe-bench.jsonl +5 -0
- mock_results/1.0.0-dev1/swt-bench.jsonl +5 -0
- requirements.txt +8 -127
- simple_data_loader.py +117 -0
- submission.py +6 -8
- submission_utils.py +88 -0
- tests/integration/test_submission.py +1 -3
- ui_components.py +8 -6
Dockerfile
CHANGED
|
@@ -1,9 +1,8 @@
|
|
| 1 |
FROM python:3.10-slim
|
| 2 |
|
| 3 |
-
|
| 4 |
-
# (0) Install SSH client tools (and git, if you're pulling via SSH)
|
| 5 |
RUN apt-get update && \
|
| 6 |
-
apt-get install -y --no-install-recommends
|
| 7 |
rm -rf /var/lib/apt/lists/*
|
| 8 |
|
| 9 |
# The two following lines are requirements for the Dev Mode to be functional
|
|
@@ -11,23 +10,13 @@ RUN apt-get update && \
|
|
| 11 |
RUN useradd -m -u 1000 user
|
| 12 |
WORKDIR /app
|
| 13 |
|
| 14 |
-
|
| 15 |
-
# (2) Copy dependencies manifest
|
| 16 |
COPY --chown=user requirements.txt requirements.txt
|
| 17 |
|
| 18 |
-
#
|
| 19 |
-
RUN --
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
cat /run/secrets/AGENTEVAL_DEPLOY_KEY > /root/.ssh/id_ed25519 && chmod 600 /root/.ssh/id_ed25519 && \
|
| 23 |
-
cat /run/secrets/ASTABENCH_DEPLOY_KEY > /root/.ssh/id_astabench && chmod 600 /root/.ssh/id_astabench && \
|
| 24 |
-
ssh-keyscan github.com >> /root/.ssh/known_hosts && \
|
| 25 |
-
printf 'Host github.com\n User git\n IdentityFile /root/.ssh/id_ed25519\n IdentityFile /root/.ssh/id_astabench\n StrictHostKeyChecking no\n' >> /root/.ssh/config && \
|
| 26 |
-
# rewrite all GitHub HTTPS URLs to SSH so nested deps install via SSH
|
| 27 |
-
git config --global url."ssh://git@github.com/".insteadOf "https://github.com/" && \
|
| 28 |
-
pip install --no-cache-dir --upgrade -r requirements.txt
|
| 29 |
-
|
| 30 |
-
# (4) Copy in your Gradio app code
|
| 31 |
COPY . .
|
| 32 |
RUN mkdir -p /home/user/data && chown -R user:user /home/user/data
|
| 33 |
|
|
|
|
| 1 |
FROM python:3.10-slim
|
| 2 |
|
| 3 |
+
# Install git for cloning results repository
|
|
|
|
| 4 |
RUN apt-get update && \
|
| 5 |
+
apt-get install -y --no-install-recommends git && \
|
| 6 |
rm -rf /var/lib/apt/lists/*
|
| 7 |
|
| 8 |
# The two following lines are requirements for the Dev Mode to be functional
|
|
|
|
| 10 |
RUN useradd -m -u 1000 user
|
| 11 |
WORKDIR /app
|
| 12 |
|
| 13 |
+
# Copy dependencies manifest
|
|
|
|
| 14 |
COPY --chown=user requirements.txt requirements.txt
|
| 15 |
|
| 16 |
+
# Install dependencies (no secrets needed)
|
| 17 |
+
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
| 18 |
+
|
| 19 |
+
# Copy in your Gradio app code
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
COPY . .
|
| 21 |
RUN mkdir -p /home/user/data && chown -R user:user /home/user/data
|
| 22 |
|
aliases.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
| 1 |
-
from agenteval
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
)
|
| 10 |
|
| 11 |
|
| 12 |
OPENNESS_ALIASES = {
|
|
|
|
| 1 |
+
# Define constants that were previously imported from agenteval
|
| 2 |
+
CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS = "open_source_open_weights"
|
| 3 |
+
CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS = "open_source_closed_weights"
|
| 4 |
+
CANONICAL_OPENNESS_CLOSED_API_AVAILABLE = "closed_api_available"
|
| 5 |
+
CANONICAL_OPENNESS_CLOSED_UI_ONLY = "closed_ui_only"
|
| 6 |
+
CANONICAL_TOOL_USAGE_STANDARD = "standard"
|
| 7 |
+
CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE = "custom_interface"
|
| 8 |
+
CANONICAL_TOOL_USAGE_FULLY_CUSTOM = "fully_custom"
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
OPENNESS_ALIASES = {
|
app.py
CHANGED
|
@@ -239,18 +239,24 @@ demo = gr.Blocks(
|
|
| 239 |
with demo.route("Home", "/home"):
|
| 240 |
build_main_page()
|
| 241 |
|
| 242 |
-
with demo.route("
|
| 243 |
build_lit_page()
|
| 244 |
|
| 245 |
-
with demo.route("
|
| 246 |
build_c_and_e_page()
|
| 247 |
|
| 248 |
-
with demo.route("
|
| 249 |
build_data_analysis_page()
|
| 250 |
|
| 251 |
-
with demo.route("
|
| 252 |
build_e2e_page()
|
| 253 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
with demo.route("About", "/about"):
|
| 255 |
build_about_page()
|
| 256 |
|
|
|
|
| 239 |
with demo.route("Home", "/home"):
|
| 240 |
build_main_page()
|
| 241 |
|
| 242 |
+
with demo.route("SWE-bench", "/swe-bench"):
|
| 243 |
build_lit_page()
|
| 244 |
|
| 245 |
+
with demo.route("Multi-SWE-bench", "/multi-swe-bench"):
|
| 246 |
build_c_and_e_page()
|
| 247 |
|
| 248 |
+
with demo.route("SWE-bench Multimodal", "/swe-bench-multimodal"):
|
| 249 |
build_data_analysis_page()
|
| 250 |
|
| 251 |
+
with demo.route("SWT-bench", "/swt-bench"):
|
| 252 |
build_e2e_page()
|
| 253 |
|
| 254 |
+
with demo.route("Commit0", "/commit0"):
|
| 255 |
+
build_lit_page()
|
| 256 |
+
|
| 257 |
+
with demo.route("GAIA", "/gaia"):
|
| 258 |
+
build_c_and_e_page()
|
| 259 |
+
|
| 260 |
with demo.route("About", "/about"):
|
| 261 |
build_about_page()
|
| 262 |
|
generate_mock_data.py
DELETED
|
@@ -1,102 +0,0 @@
|
|
| 1 |
-
"""Generate mock results data in agenteval format for OpenHands Index."""
|
| 2 |
-
import json
|
| 3 |
-
import pandas as pd
|
| 4 |
-
import pyarrow as pa
|
| 5 |
-
import pyarrow.parquet as pq
|
| 6 |
-
from pathlib import Path
|
| 7 |
-
|
| 8 |
-
# Load the suite config
|
| 9 |
-
with open("data/1.0.0-dev1/agenteval.json") as f:
|
| 10 |
-
suite_config_data = json.load(f)
|
| 11 |
-
|
| 12 |
-
suite_config = suite_config_data["suite_config"]
|
| 13 |
-
|
| 14 |
-
# Mock agents
|
| 15 |
-
agents = [
|
| 16 |
-
{
|
| 17 |
-
"name": "OpenHands CodeAct v2.1",
|
| 18 |
-
"source_url": "https://github.com/OpenHands/OpenHands"
|
| 19 |
-
},
|
| 20 |
-
{
|
| 21 |
-
"name": "Aider",
|
| 22 |
-
"source_url": "https://github.com/paul-gauthier/aider"
|
| 23 |
-
},
|
| 24 |
-
{
|
| 25 |
-
"name": "SWE-agent",
|
| 26 |
-
"source_url": "https://github.com/princeton-nlp/SWE-agent"
|
| 27 |
-
}
|
| 28 |
-
]
|
| 29 |
-
|
| 30 |
-
def create_mock_results(split_name):
|
| 31 |
-
"""Create mock results for a split."""
|
| 32 |
-
split_config = next(s for s in suite_config["splits"] if s["name"] == split_name)
|
| 33 |
-
|
| 34 |
-
rows = []
|
| 35 |
-
for agent in agents:
|
| 36 |
-
# Create results for each task
|
| 37 |
-
results = []
|
| 38 |
-
for task in split_config["tasks"]:
|
| 39 |
-
task_name = task["name"]
|
| 40 |
-
primary_metric = task["primary_metric"]
|
| 41 |
-
|
| 42 |
-
# Generate mock score (different for each agent)
|
| 43 |
-
base_score = 0.3 + (hash(agent["name"]) % 50) / 100
|
| 44 |
-
score = base_score + (hash(task_name) % 30) / 100
|
| 45 |
-
score = min(score, 1.0)
|
| 46 |
-
|
| 47 |
-
task_result = {
|
| 48 |
-
"task_name": task_name,
|
| 49 |
-
"eval_spec": {
|
| 50 |
-
"model": "gpt-4",
|
| 51 |
-
"solver": f"openhands/{task_name}",
|
| 52 |
-
},
|
| 53 |
-
"metrics": [
|
| 54 |
-
{
|
| 55 |
-
"name": primary_metric,
|
| 56 |
-
"value": score
|
| 57 |
-
}
|
| 58 |
-
],
|
| 59 |
-
"model_usages": []
|
| 60 |
-
}
|
| 61 |
-
results.append(task_result)
|
| 62 |
-
|
| 63 |
-
# Create row
|
| 64 |
-
row = {
|
| 65 |
-
"suite_config": suite_config,
|
| 66 |
-
"split": split_name,
|
| 67 |
-
"results": results,
|
| 68 |
-
"submission": {
|
| 69 |
-
"agent_name": agent["name"],
|
| 70 |
-
"source_url": agent["source_url"],
|
| 71 |
-
"openness": "open-source/open-weights",
|
| 72 |
-
"tool_usage": "standard"
|
| 73 |
-
}
|
| 74 |
-
}
|
| 75 |
-
rows.append(row)
|
| 76 |
-
|
| 77 |
-
return rows
|
| 78 |
-
|
| 79 |
-
# Create mock data for both splits
|
| 80 |
-
all_rows = []
|
| 81 |
-
for split in ["validation", "test"]:
|
| 82 |
-
all_rows.extend(create_mock_results(split))
|
| 83 |
-
|
| 84 |
-
# Convert to DataFrame
|
| 85 |
-
df = pd.DataFrame(all_rows)
|
| 86 |
-
|
| 87 |
-
# Save as parquet
|
| 88 |
-
output_dir = Path("mock_results/1.0.0-dev1")
|
| 89 |
-
output_dir.mkdir(parents=True, exist_ok=True)
|
| 90 |
-
|
| 91 |
-
# Save validation split
|
| 92 |
-
validation_df = df[df["split"] == "validation"]
|
| 93 |
-
validation_df.to_parquet(output_dir / "validation.parquet", index=False)
|
| 94 |
-
|
| 95 |
-
# Save test split
|
| 96 |
-
test_df = df[df["split"] == "test"]
|
| 97 |
-
test_df.to_parquet(output_dir / "test.parquet", index=False)
|
| 98 |
-
|
| 99 |
-
print(f"Created mock data:")
|
| 100 |
-
print(f" - Validation: {len(validation_df)} rows")
|
| 101 |
-
print(f" - Test: {len(test_df)} rows")
|
| 102 |
-
print(f" - Output: {output_dir}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
generate_mock_jsonl.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Generate mock results data in JSONL format for OpenHands Index."""
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
|
| 7 |
+
# Define the 6 benchmarks
|
| 8 |
+
# Benchmark registry: one entry per leaderboard benchmark.
#   tags           -> copied into both the suite config and every JSONL record
#   metric         -> metric identifier written to every JSONL record
#   metric_display -> human-readable label (not written by this script)
BENCHMARKS = {
    "swe-bench": {
        "tags": ["swe-bench"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)",
    },
    "multi-swe-bench": {
        "tags": ["multi-swe-bench"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)",
    },
    "swe-bench-multimodal": {
        "tags": ["swe-bench-multimodal"],
        "metric": "resolve_rate",
        "metric_display": "Resolve Rate (%)",
    },
    "swt-bench": {
        "tags": ["swt-bench"],
        "metric": "success_rate",
        "metric_display": "Success Rate (%)",
    },
    "commit0": {
        "tags": ["commit0"],
        "metric": "test_pass_rate",
        "metric_display": "Test Pass Rate (%)",
    },
    "gaia": {
        "tags": ["gaia"],
        "metric": "accuracy",
        "metric_display": "Accuracy (%)",
    },
}

# Mock agents with realistic scores.
# Each agent must provide a score for every key in BENCHMARKS.
MOCK_AGENTS = [
    {
        "agent_name": "OpenHands CodeAct v2.1",
        "llm_base": "claude-3-5-sonnet-20241022",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "scores": {
            "swe-bench": 48.3,
            "multi-swe-bench": 35.2,
            "swe-bench-multimodal": 42.1,
            "swt-bench": 65.4,
            "commit0": 71.2,
            "gaia": 58.7,
        },
    },
    {
        "agent_name": "OpenHands CodeAct v2.0",
        "llm_base": "gpt-4o-2024-11-20",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "scores": {
            "swe-bench": 45.1,
            "multi-swe-bench": 32.8,
            "swe-bench-multimodal": 39.5,
            "swt-bench": 62.3,
            "commit0": 68.9,
            "gaia": 55.2,
        },
    },
    {
        "agent_name": "AutoCodeRover",
        "llm_base": "gpt-4-turbo-2024-04-09",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "scores": {
            "swe-bench": 38.7,
            "multi-swe-bench": 28.4,
            "swe-bench-multimodal": 34.2,
            "swt-bench": 54.1,
            "commit0": 61.5,
            "gaia": 48.3,
        },
    },
    {
        "agent_name": "Agentless",
        "llm_base": "gpt-4o-mini-2024-07-18",
        "openness": "closed_api_available",
        "tool_usage": "standard",
        "scores": {
            "swe-bench": 32.5,
            "multi-swe-bench": 24.1,
            "swe-bench-multimodal": 28.9,
            "swt-bench": 47.8,
            "commit0": 55.3,
            "gaia": 42.1,
        },
    },
    {
        "agent_name": "SWE-Agent",
        "llm_base": "claude-3-opus-20240229",
        "openness": "closed_api_available",
        "tool_usage": "custom_interface",
        "scores": {
            "swe-bench": 29.8,
            "multi-swe-bench": 21.5,
            "swe-bench-multimodal": 25.7,
            "swt-bench": 44.2,
            "commit0": 52.1,
            "gaia": 39.4,
        },
    },
]


def generate_mock_data(output_dir="mock_results/1.0.0-dev1", submission_time=None):
    """Generate mock JSONL files for all benchmarks.

    Writes one ``<benchmark>.jsonl`` file per entry in ``BENCHMARKS`` (one
    JSON record per agent in ``MOCK_AGENTS``) plus an ``agenteval.json``
    suite config listing every benchmark as a split.

    Args:
        output_dir: Directory (str or Path) to write into; created if
            missing. Defaults to the location the script always used.
        submission_time: ``datetime`` stamped on every record. Defaults to
            the current local time, captured once per run.

    Raises:
        KeyError: If an agent has no score for one of the benchmarks.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Take the timestamp once so every record of a run carries the same
    # value; calling datetime.now() per record made them drift apart by
    # microseconds. The naive ISO format of the original is kept.
    if submission_time is None:
        submission_time = datetime.now()
    timestamp = submission_time.isoformat()

    # Create agenteval.json config
    config = {
        "suite_config": {
            "name": "openhands-index",
            "version": "1.0.0-dev1",
            "splits": [],
        }
    }

    # Generate data for each benchmark
    for benchmark_name, benchmark_info in BENCHMARKS.items():
        print(f"Generating mock data for {benchmark_name}...")

        # Each benchmark is its own split containing a single task.
        config["suite_config"]["splits"].append({
            "name": benchmark_name,
            "tasks": [{
                "name": benchmark_name,
                "tags": benchmark_info["tags"],
            }],
        })

        # Generate JSONL file (explicit encoding keeps output platform-independent)
        jsonl_path = output_dir / f"{benchmark_name}.jsonl"
        with open(jsonl_path, "w", encoding="utf-8") as f:
            for agent in MOCK_AGENTS:
                score = agent["scores"][benchmark_name]
                record = {
                    "agent_name": agent["agent_name"],
                    "llm_base": agent["llm_base"],
                    "openness": agent["openness"],
                    "tool_usage": agent["tool_usage"],
                    "score": score,
                    "metric": benchmark_info["metric"],
                    "submission_time": timestamp,
                    "tags": benchmark_info["tags"],
                    # Additional metadata: synthetic values derived
                    # linearly from the score so they look plausible.
                    "total_cost": round(10.0 + score * 0.5, 2),
                    "total_runtime": round(300 + score * 5, 1),
                }
                f.write(json.dumps(record) + "\n")

        print(f" Created {jsonl_path}")

    # Write config file
    config_path = output_dir / "agenteval.json"
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)
    print(f"\nCreated config: {config_path}")

    print("\n✓ Mock data generation complete!")
    print(f" Location: {output_dir}")
    print(f" Benchmarks: {', '.join(BENCHMARKS.keys())}")
    print(f" Agents: {len(MOCK_AGENTS)}")


if __name__ == "__main__":
    generate_mock_data()
|
mock_results/1.0.0-dev1/agenteval.json
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"suite_config": {
|
| 3 |
+
"name": "openhands-index",
|
| 4 |
+
"version": "1.0.0-dev1",
|
| 5 |
+
"splits": [
|
| 6 |
+
{
|
| 7 |
+
"name": "swe-bench",
|
| 8 |
+
"tasks": [
|
| 9 |
+
{
|
| 10 |
+
"name": "swe-bench",
|
| 11 |
+
"tags": [
|
| 12 |
+
"swe-bench"
|
| 13 |
+
]
|
| 14 |
+
}
|
| 15 |
+
]
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"name": "multi-swe-bench",
|
| 19 |
+
"tasks": [
|
| 20 |
+
{
|
| 21 |
+
"name": "multi-swe-bench",
|
| 22 |
+
"tags": [
|
| 23 |
+
"multi-swe-bench"
|
| 24 |
+
]
|
| 25 |
+
}
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"name": "swe-bench-multimodal",
|
| 30 |
+
"tasks": [
|
| 31 |
+
{
|
| 32 |
+
"name": "swe-bench-multimodal",
|
| 33 |
+
"tags": [
|
| 34 |
+
"swe-bench-multimodal"
|
| 35 |
+
]
|
| 36 |
+
}
|
| 37 |
+
]
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"name": "swt-bench",
|
| 41 |
+
"tasks": [
|
| 42 |
+
{
|
| 43 |
+
"name": "swt-bench",
|
| 44 |
+
"tags": [
|
| 45 |
+
"swt-bench"
|
| 46 |
+
]
|
| 47 |
+
}
|
| 48 |
+
]
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"name": "commit0",
|
| 52 |
+
"tasks": [
|
| 53 |
+
{
|
| 54 |
+
"name": "commit0",
|
| 55 |
+
"tags": [
|
| 56 |
+
"commit0"
|
| 57 |
+
]
|
| 58 |
+
}
|
| 59 |
+
]
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"name": "gaia",
|
| 63 |
+
"tasks": [
|
| 64 |
+
{
|
| 65 |
+
"name": "gaia",
|
| 66 |
+
"tags": [
|
| 67 |
+
"gaia"
|
| 68 |
+
]
|
| 69 |
+
}
|
| 70 |
+
]
|
| 71 |
+
}
|
| 72 |
+
]
|
| 73 |
+
}
|
| 74 |
+
}
|
mock_results/1.0.0-dev1/commit0.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
|
| 2 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
|
| 3 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
|
| 4 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
|
| 5 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
|
mock_results/1.0.0-dev1/gaia.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
|
| 2 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
|
| 3 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 4 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 5 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
|
mock_results/1.0.0-dev1/multi-swe-bench.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
|
| 2 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
|
| 3 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
|
| 4 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
|
| 5 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
|
mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
|
| 2 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
|
| 3 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
|
| 4 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
|
| 5 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
|
mock_results/1.0.0-dev1/swe-bench.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
|
| 2 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
|
| 3 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
|
| 4 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
|
| 5 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
|
mock_results/1.0.0-dev1/swt-bench.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
|
| 2 |
+
{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
|
| 3 |
+
{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
|
| 4 |
+
{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
|
| 5 |
+
{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
|
requirements.txt
CHANGED
|
@@ -1,132 +1,13 @@
|
|
| 1 |
-
|
| 2 |
-
aiobotocore==2.22.0
|
| 3 |
-
aiofiles==24.1.0
|
| 4 |
-
aiohappyeyeballs==2.6.1
|
| 5 |
-
aiohttp==3.11.18
|
| 6 |
-
aioitertools==0.12.0
|
| 7 |
-
aiosignal==1.3.2
|
| 8 |
-
annotated-types==0.7.0
|
| 9 |
-
anyio==4.9.0
|
| 10 |
-
APScheduler==3.11.0
|
| 11 |
-
async-timeout==5.0.1
|
| 12 |
-
attrs==25.3.0
|
| 13 |
-
Authlib==1.5.2
|
| 14 |
-
beautifulsoup4==4.13.4
|
| 15 |
-
black==25.1.0
|
| 16 |
-
botocore==1.37.3
|
| 17 |
-
certifi==2025.4.26
|
| 18 |
-
cffi==1.17.1
|
| 19 |
-
charset-normalizer==3.4.2
|
| 20 |
-
click==8.1.8
|
| 21 |
-
contourpy==1.3.2
|
| 22 |
-
cryptography==44.0.3
|
| 23 |
-
cycler==0.12.1
|
| 24 |
-
datasets==4.0.0
|
| 25 |
-
debugpy==1.8.14
|
| 26 |
-
dill==0.3.8
|
| 27 |
-
distro==1.9.0
|
| 28 |
-
docstring_parser==0.16
|
| 29 |
-
exceptiongroup==1.2.2
|
| 30 |
-
fastapi==0.115.12
|
| 31 |
-
ffmpy==0.5.0
|
| 32 |
-
filelock==3.18.0
|
| 33 |
-
fonttools==4.58.1
|
| 34 |
-
frozenlist==1.6.0
|
| 35 |
-
fsspec==2025.3.0
|
| 36 |
gradio==5.30.0
|
| 37 |
-
gradio_client==1.10.1
|
| 38 |
-
gradio_modal==0.0.4
|
| 39 |
-
groovy==0.1.2
|
| 40 |
-
h11==0.16.0
|
| 41 |
-
httpcore==1.0.9
|
| 42 |
-
httpx==0.28.1
|
| 43 |
-
huggingface-hub==0.30.2
|
| 44 |
-
idna==3.10
|
| 45 |
-
ijson==3.3.0
|
| 46 |
-
importlib_metadata==8.7.0
|
| 47 |
-
inspect_ai==0.3.104
|
| 48 |
-
isort==6.0.1
|
| 49 |
-
itsdangerous==2.2.0
|
| 50 |
-
Jinja2==3.1.6
|
| 51 |
-
jiter==0.9.0
|
| 52 |
-
jmespath==1.0.1
|
| 53 |
-
jsonlines==4.0.0
|
| 54 |
-
jsonpatch==1.33
|
| 55 |
-
jsonpointer==3.0.0
|
| 56 |
-
jsonschema==4.23.0
|
| 57 |
-
jsonschema-specifications==2025.4.1
|
| 58 |
-
kiwisolver==1.4.8
|
| 59 |
-
linkify-it-py==2.0.3
|
| 60 |
-
litellm==1.68.1
|
| 61 |
-
markdown-it-py==3.0.0
|
| 62 |
-
MarkupSafe==3.0.2
|
| 63 |
-
matplotlib==3.10.3
|
| 64 |
-
mdit-py-plugins==0.4.2
|
| 65 |
-
mdurl==0.1.2
|
| 66 |
-
mmh3==5.1.0
|
| 67 |
-
mplcursors==0.6
|
| 68 |
-
multidict==6.4.3
|
| 69 |
-
multiprocess==0.70.16
|
| 70 |
-
mypy_extensions==1.1.0
|
| 71 |
-
narwhals==1.38.2
|
| 72 |
-
nest-asyncio==1.6.0
|
| 73 |
-
numpy==2.2.5
|
| 74 |
-
openai==1.75.0
|
| 75 |
-
orjson==3.10.18
|
| 76 |
-
packaging==25.0
|
| 77 |
pandas==2.2.3
|
| 78 |
-
pathspec==0.12.1
|
| 79 |
-
pillow==11.2.1
|
| 80 |
-
platformdirs==4.3.7
|
| 81 |
plotly==6.0.1
|
| 82 |
-
propcache==0.3.1
|
| 83 |
-
psutil==7.0.0
|
| 84 |
-
pyarrow==20.0.0
|
| 85 |
-
pycparser==2.22
|
| 86 |
-
pydantic==2.11.4
|
| 87 |
-
pydantic_core==2.33.2
|
| 88 |
-
pydub==0.25.1
|
| 89 |
-
Pygments==2.19.1
|
| 90 |
-
pyparsing==3.2.3
|
| 91 |
-
python-dateutil==2.9.0.post0
|
| 92 |
-
python-dotenv==1.1.0
|
| 93 |
-
python-multipart==0.0.20
|
| 94 |
-
pytz==2025.2
|
| 95 |
-
PyYAML==6.0.2
|
| 96 |
-
referencing==0.36.2
|
| 97 |
-
regex==2024.11.6
|
| 98 |
requests==2.32.3
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
seaborn==0.13.2
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
shellingham==1.5.4
|
| 108 |
-
shortuuid==1.0.13
|
| 109 |
-
six==1.17.0
|
| 110 |
-
sniffio==1.3.1
|
| 111 |
-
soupsieve==2.7
|
| 112 |
-
starlette==0.46.2
|
| 113 |
-
tenacity==9.1.2
|
| 114 |
-
textual<3.0.0
|
| 115 |
-
tiktoken==0.9.0
|
| 116 |
-
tokenizers==0.21.1
|
| 117 |
-
tomli==2.2.1
|
| 118 |
-
tomlkit==0.13.2
|
| 119 |
-
tqdm==4.67.1
|
| 120 |
-
typer==0.15.3
|
| 121 |
-
typing-inspection==0.4.0
|
| 122 |
-
typing_extensions==4.13.2
|
| 123 |
-
tzdata==2025.2
|
| 124 |
-
tzlocal==5.3.1
|
| 125 |
-
uc-micro-py==1.0.3
|
| 126 |
-
urllib3==2.4.0
|
| 127 |
-
uvicorn==0.34.2
|
| 128 |
-
websockets==15.0.1
|
| 129 |
-
wrapt==1.17.2
|
| 130 |
-
xxhash==3.5.0
|
| 131 |
-
yarl==1.20.0
|
| 132 |
-
zipp==3.21.0
|
|
|
|
| 1 |
+
# Core dependencies
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
gradio==5.30.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
pandas==2.2.3
|
|
|
|
|
|
|
|
|
|
| 4 |
plotly==6.0.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
requests==2.32.3
|
| 6 |
+
huggingface-hub==0.30.2
|
| 7 |
+
APScheduler==3.11.0
|
| 8 |
+
|
| 9 |
+
# Additional dependencies for UI and processing
|
| 10 |
+
matplotlib==3.10.3
|
| 11 |
seaborn==0.13.2
|
| 12 |
+
Pillow==11.2.1
|
| 13 |
+
PyYAML==6.0.2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
simple_data_loader.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Simple data loader for OpenHands Index leaderboard.
|
| 3 |
+
Loads JSONL files from local directory or GitHub repository.
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import json
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class SimpleLeaderboardViewer:
    """Read one leaderboard split from a directory of JSONL files.

    Simple replacement for agent-eval's LeaderboardViewer. The expected
    layout is ``<data_dir>/<config>/<split>.jsonl`` with an optional
    ``<data_dir>/<config>/agenteval.json`` suite description next to it.
    """

    def __init__(self, data_dir: str, config: str, split: str):
        """
        Args:
            data_dir: Path to data directory
            config: Config name (e.g., "1.0.0-dev1")
            split: Split name (e.g., "validation" or "test")

        Raises:
            KeyError: If ``agenteval.json`` exists but has no top-level
                ``"suite_config"`` key, or a split/task entry lacks ``"name"``.
        """
        self.data_dir = Path(data_dir)
        self.config = config
        self.split = split
        self.config_path = self.data_dir / config

        # Load suite configuration, falling back to an empty suite when the
        # config directory ships no agenteval.json.
        config_file = self.config_path / "agenteval.json"
        if config_file.exists():
            # JSON is UTF-8 by specification; do not depend on the host locale.
            with open(config_file, encoding="utf-8") as f:
                suite_config = json.load(f)
            self.suite_config = suite_config["suite_config"]
        else:
            self.suite_config = {
                "name": "openhands-index",
                "version": config,
                "splits": [],
            }

        # Build tag map from config: tag -> names of the tasks carrying it,
        # restricted to the split this viewer was created for.
        self.tag_map = {}
        for split_config in self.suite_config.get("splits", []):
            if split_config["name"] != split:
                continue
            for task in split_config.get("tasks", []):
                for tag in task.get("tags", []):
                    self.tag_map.setdefault(tag, []).append(task["name"])

    def _load(self):
        """Load the JSONL file for the split and return DataFrame and tag map.

        Returns:
            A ``(dataframe, tag_map)`` tuple. On any failure (missing file,
            empty file, unreadable or malformed content) the dataframe has a
            single ``"Message"`` column describing the problem and the tag
            map is empty; this method does not raise.
        """
        jsonl_file = self.config_path / f"{self.split}.jsonl"

        if not jsonl_file.exists():
            # Return a one-row dataframe carrying the error message
            return pd.DataFrame({
                "Message": [f"No data found for split '{self.split}'. Expected file: {jsonl_file}"]
            }), {}

        # Deliberately broad: callers always get a DataFrame back, so every
        # read/parse failure is reported as a message row instead of raised.
        try:
            with open(jsonl_file, "r", encoding="utf-8") as f:
                # Blank lines are skipped; every other line is one JSON record.
                records = [json.loads(line) for line in f if line.strip()]

            if not records:
                return pd.DataFrame({
                    "Message": [f"No data in file: {jsonl_file}"]
                }), {}

            return pd.DataFrame(records), self.tag_map
        except Exception as e:
            return pd.DataFrame({
                "Message": [f"Error loading data: {e}"]
            }), {}

    def get_dataframe(self):
        """Get the raw dataframe (or the one-row message frame on failure)."""
        df, _ = self._load()
        return df
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def load_mock_data_locally(data_dir: str = "mock_results"):
    """
    Load mock data from local directory for testing.

    Every sub-directory of ``data_dir`` is treated as a config, and every
    ``*.jsonl`` file inside it as one split of that config.

    Args:
        data_dir: Path to mock results directory

    Returns:
        Dictionary mapping split names to SimpleLeaderboardViewer instances.
        Empty when ``data_dir`` does not exist or is not a directory (a
        warning is printed in that case). Viewers are keyed by split name
        only, so if several configs contain the same split the config whose
        directory name sorts last wins.
    """
    viewers = {}
    data_path = Path(data_dir)

    # is_dir() is False for a missing path and for a regular file alike, so a
    # stray file at this location is reported rather than crashing iterdir().
    if not data_path.is_dir():
        print(f"Warning: Mock data directory '{data_dir}' not found")
        return viewers

    # iterdir()/glob() yield entries in arbitrary order; sort so that the
    # outcome of same-named splits across configs is deterministic.
    config_dirs = sorted(entry for entry in data_path.iterdir() if entry.is_dir())
    for config_dir in config_dirs:
        # Find all JSONL files (each represents a split)
        for jsonl_file in sorted(config_dir.glob("*.jsonl")):
            split_name = jsonl_file.stem
            viewers[split_name] = SimpleLeaderboardViewer(
                data_dir=str(data_path),
                config=config_dir.name,
                split=split_name,
            )

    return viewers
|
submission.py
CHANGED
|
@@ -2,11 +2,6 @@ import logging
|
|
| 2 |
import typing
|
| 3 |
|
| 4 |
import matplotlib
|
| 5 |
-
from agenteval.cli import SUBMISSION_METADATA_FILENAME
|
| 6 |
-
from agenteval.models import SubmissionMetadata
|
| 7 |
-
from datasets.exceptions import DataFilesNotFoundError
|
| 8 |
-
from gradio_modal import Modal
|
| 9 |
-
|
| 10 |
matplotlib.use('Agg')
|
| 11 |
|
| 12 |
import os
|
|
@@ -17,12 +12,15 @@ from email.utils import parseaddr
|
|
| 17 |
|
| 18 |
import gradio as gr
|
| 19 |
import requests
|
| 20 |
-
from agenteval.leaderboard.upload import sanitize_path_component, _validate_path_component
|
| 21 |
-
from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
|
| 22 |
-
from datasets.data_files import EmptyDatasetError
|
| 23 |
from huggingface_hub import HfApi
|
| 24 |
|
| 25 |
import aliases
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
from config import (
|
| 27 |
CONFIG_NAME,
|
| 28 |
CONTACT_DATASET,
|
|
|
|
| 2 |
import typing
|
| 3 |
|
| 4 |
import matplotlib
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
matplotlib.use('Agg')
|
| 6 |
|
| 7 |
import os
|
|
|
|
| 12 |
|
| 13 |
import gradio as gr
|
| 14 |
import requests
|
|
|
|
|
|
|
|
|
|
| 15 |
from huggingface_hub import HfApi
|
| 16 |
|
| 17 |
import aliases
|
| 18 |
+
from submission_utils import (
|
| 19 |
+
SUBMISSION_METADATA_FILENAME,
|
| 20 |
+
SubmissionMetadata,
|
| 21 |
+
sanitize_path_component,
|
| 22 |
+
_validate_path_component
|
| 23 |
+
)
|
| 24 |
from config import (
|
| 25 |
CONFIG_NAME,
|
| 26 |
CONTACT_DATASET,
|
submission_utils.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utilities for submission handling, replacing agent-eval dependencies.
|
| 3 |
+
"""
|
| 4 |
+
import re
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Constants
|
| 10 |
+
SUBMISSION_METADATA_FILENAME = "metadata.json"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Simple SubmissionMetadata class
|
| 14 |
+
class SubmissionMetadata:
    """Simple metadata for submissions.

    Accepts arbitrary keyword arguments; only the names listed in
    ``_FIELDS`` are stored, each defaulting to an empty string when absent.
    Unknown keywords are ignored.
    """

    # Single source of truth for the stored attributes and the key order of
    # to_dict(), so the constructor and the serializer cannot drift apart.
    _FIELDS = (
        "agent_name",
        "llm_base",
        "openness",
        "tool_usage",
        "submitter_name",
        "submitter_email",
    )

    def __init__(self, **kwargs):
        for field_name in self._FIELDS:
            setattr(self, field_name, kwargs.get(field_name, ""))

    def to_dict(self):
        """Return the metadata as a plain dict, keys in ``_FIELDS`` order."""
        return {field_name: getattr(self, field_name) for field_name in self._FIELDS}

    def __repr__(self):
        shown = ", ".join(
            f"{field_name}={getattr(self, field_name)!r}" for field_name in self._FIELDS
        )
        return f"{type(self).__name__}({shown})"
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# Path validation functions
|
| 36 |
+
def _validate_path_component(component: str, allow_underscores: bool = True) -> None:
|
| 37 |
+
"""
|
| 38 |
+
Validate a single path component.
|
| 39 |
+
|
| 40 |
+
Args:
|
| 41 |
+
component: The path component to validate
|
| 42 |
+
allow_underscores: Whether to allow underscores in the component
|
| 43 |
+
|
| 44 |
+
Raises:
|
| 45 |
+
ValueError: If the component is invalid
|
| 46 |
+
"""
|
| 47 |
+
if not component:
|
| 48 |
+
raise ValueError("Path component cannot be empty")
|
| 49 |
+
|
| 50 |
+
if component in (".", ".."):
|
| 51 |
+
raise ValueError(f"Path component cannot be '{component}'")
|
| 52 |
+
|
| 53 |
+
# Check for invalid characters
|
| 54 |
+
pattern = r'^[a-zA-Z0-9_\-\.]+$' if allow_underscores else r'^[a-zA-Z0-9\-\.]+$'
|
| 55 |
+
if not re.match(pattern, component):
|
| 56 |
+
raise ValueError(
|
| 57 |
+
f"Path component '{component}' contains invalid characters. "
|
| 58 |
+
f"Only alphanumeric, hyphens, dots{', and underscores' if allow_underscores else ''} are allowed."
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def sanitize_path_component(component: str, replacement: str = "_") -> str:
    """
    Sanitize a path component by replacing invalid characters.

    Characters other than ASCII letters, digits, underscores, hyphens and
    dots are replaced, leading/trailing dots and hyphens are stripped, and
    runs of the replacement string are collapsed to a single occurrence.

    Args:
        component: The path component to sanitize
        replacement: The string to use for replacing invalid characters.
            It is inserted literally; an empty string simply drops the
            invalid characters.

    Returns:
        Sanitized path component, or "unnamed" if nothing usable remains
    """
    if not component:
        return "unnamed"

    # A callable replacement is inserted verbatim, so a backslash in
    # `replacement` is not interpreted as a regex template escape.
    def _literal(_match):
        return replacement

    # Replace any non-alphanumeric, non-hyphen, non-dot, non-underscore with replacement
    sanitized = re.sub(r'[^a-zA-Z0-9_\-\.]', _literal, component)

    # Remove leading/trailing dots or hyphens
    sanitized = sanitized.strip('.-')

    # Collapse multiple replacements into one. Skipped for an empty
    # replacement: there is nothing to collapse and the pattern "+" alone
    # is not a valid regular expression.
    if replacement:
        sanitized = re.sub(f'(?:{re.escape(replacement)})+', _literal, sanitized)

    if not sanitized:
        return "unnamed"

    return sanitized
|
tests/integration/test_submission.py
CHANGED
|
@@ -4,14 +4,12 @@ from datetime import datetime
|
|
| 4 |
|
| 5 |
import gradio
|
| 6 |
import pytest
|
| 7 |
-
import pyarrow as pa
|
| 8 |
-
from agenteval.models import SubmissionMetadata
|
| 9 |
-
from datasets import load_dataset, VerificationMode
|
| 10 |
from huggingface_hub import HfApi, hf_hub_download
|
| 11 |
|
| 12 |
from aliases import CANONICAL_TOOL_USAGE_STANDARD, CANONICAL_OPENNESS_CLOSED_UI_ONLY
|
| 13 |
from config import IS_INTERNAL, CONFIG_NAME, CONTACT_DATASET, SUBMISSION_DATASET
|
| 14 |
from submission import add_new_eval
|
|
|
|
| 15 |
|
| 16 |
_hf = HfApi()
|
| 17 |
|
|
|
|
| 4 |
|
| 5 |
import gradio
|
| 6 |
import pytest
|
|
|
|
|
|
|
|
|
|
| 7 |
from huggingface_hub import HfApi, hf_hub_download
|
| 8 |
|
| 9 |
from aliases import CANONICAL_TOOL_USAGE_STANDARD, CANONICAL_OPENNESS_CLOSED_UI_ONLY
|
| 10 |
from config import IS_INTERNAL, CONFIG_NAME, CONTACT_DATASET, SUBMISSION_DATASET
|
| 11 |
from submission import add_new_eval
|
| 12 |
+
from submission_utils import SubmissionMetadata
|
| 13 |
|
| 14 |
_hf = HfApi()
|
| 15 |
|
ui_components.py
CHANGED
|
@@ -4,10 +4,10 @@ import plotly.graph_objects as go
|
|
| 4 |
import os
|
| 5 |
import base64
|
| 6 |
|
| 7 |
-
from agenteval.leaderboard.view import LeaderboardViewer
|
| 8 |
from huggingface_hub import HfApi
|
| 9 |
|
| 10 |
import aliases
|
|
|
|
| 11 |
from leaderboard_transformer import (
|
| 12 |
DataTransformer,
|
| 13 |
transform_raw_dataframe,
|
|
@@ -473,12 +473,14 @@ def get_leaderboard_viewer_instance(split: str):
|
|
| 473 |
|
| 474 |
# --- Cache miss: try to load data from the source ---
|
| 475 |
try:
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
|
|
|
|
|
|
|
|
|
| 479 |
config=CONFIG_NAME,
|
| 480 |
-
split=split
|
| 481 |
-
is_internal=IS_INTERNAL
|
| 482 |
)
|
| 483 |
|
| 484 |
# Simplify tag map creation
|
|
|
|
| 4 |
import os
|
| 5 |
import base64
|
| 6 |
|
|
|
|
| 7 |
from huggingface_hub import HfApi
|
| 8 |
|
| 9 |
import aliases
|
| 10 |
+
from simple_data_loader import SimpleLeaderboardViewer
|
| 11 |
from leaderboard_transformer import (
|
| 12 |
DataTransformer,
|
| 13 |
transform_raw_dataframe,
|
|
|
|
| 473 |
|
| 474 |
# --- Cache miss: try to load data from the source ---
|
| 475 |
try:
|
| 476 |
+
# First try to load from extracted data directory (local mock data)
|
| 477 |
+
data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
|
| 478 |
+
|
| 479 |
+
print(f"Loading data for split '{split}' from: {data_dir}/{CONFIG_NAME}")
|
| 480 |
+
viewer = SimpleLeaderboardViewer(
|
| 481 |
+
data_dir=data_dir,
|
| 482 |
config=CONFIG_NAME,
|
| 483 |
+
split=split
|
|
|
|
| 484 |
)
|
| 485 |
|
| 486 |
# Simplify tag map creation
|