Spaces:

OpenHands
/

openhands-index

Running

openhands openhands committed on Nov 24, 2025

Commit

1027cfb

1 Parent(s): 085a012

Convert to JSONL data format and remove agent-eval dependency

- Replace parquet files with JSONL for better readability and simplicity
- Remove agent-eval library dependency to avoid secret requirements
- Create simple_data_loader.py as lightweight replacement for LeaderboardViewer
- Create submission_utils.py for submission handling functions
- Simplify Dockerfile (no SSH keys or secrets needed)
- Reduce requirements.txt to essential packages only
- Generate mock data for all 6 benchmarks (swe-bench, multi-swe-bench, swe-bench-multimodal, swt-bench, commit0, gaia)
- Update aliases.py to define constants locally
- Fix imports in submission.py and test files

Co-authored-by: openhands <openhands@all-hands.dev>

Files changed (18) hide show

Dockerfile +7 -18
aliases.py +8 -9
app.py +10 -4
generate_mock_data.py +0 -102
generate_mock_jsonl.py +177 -0
mock_results/1.0.0-dev1/agenteval.json +74 -0
mock_results/1.0.0-dev1/commit0.jsonl +5 -0
mock_results/1.0.0-dev1/gaia.jsonl +5 -0
mock_results/1.0.0-dev1/multi-swe-bench.jsonl +5 -0
mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl +5 -0
mock_results/1.0.0-dev1/swe-bench.jsonl +5 -0
mock_results/1.0.0-dev1/swt-bench.jsonl +5 -0
requirements.txt +8 -127
simple_data_loader.py +117 -0
submission.py +6 -8
submission_utils.py +88 -0
tests/integration/test_submission.py +1 -3
ui_components.py +8 -6

Dockerfile CHANGED Viewed

@@ -1,9 +1,8 @@
 FROM python:3.10-slim
-# (0) Install SSH client tools (and git, if you're pulling via SSH)
 RUN apt-get update && \
-    apt-get install -y --no-install-recommends openssh-client git && \
     rm -rf /var/lib/apt/lists/*
 # The two following lines are requirements for the Dev Mode to be functional
@@ -11,23 +10,13 @@ RUN apt-get update && \
 RUN useradd -m -u 1000 user
 WORKDIR /app
-# (2) Copy dependencies manifest
 COPY --chown=user requirements.txt requirements.txt
-# (3) Install dependencies, mounting SSH keys and optional HTTPS creds
-RUN --mount=type=secret,id=AGENTEVAL_DEPLOY_KEY,mode=0400,required=true \
-    --mount=type=secret,id=ASTABENCH_DEPLOY_KEY,mode=0400,required=true \
-    mkdir -p /root/.ssh && chmod 700 /root/.ssh && \
-    cat /run/secrets/AGENTEVAL_DEPLOY_KEY > /root/.ssh/id_ed25519 && chmod 600 /root/.ssh/id_ed25519 && \
-    cat /run/secrets/ASTABENCH_DEPLOY_KEY > /root/.ssh/id_astabench && chmod 600 /root/.ssh/id_astabench && \
-    ssh-keyscan github.com >> /root/.ssh/known_hosts && \
-    printf 'Host github.com\n  User git\n  IdentityFile /root/.ssh/id_ed25519\n  IdentityFile /root/.ssh/id_astabench\n  StrictHostKeyChecking no\n' >> /root/.ssh/config && \
-    # rewrite all GitHub HTTPS URLs to SSH so nested deps install via SSH
-    git config --global url."ssh://git@github.com/".insteadOf "https://github.com/" && \
-    pip install --no-cache-dir --upgrade -r requirements.txt
-# (4) Copy in your Gradio app code
 COPY . .
 RUN mkdir -p /home/user/data && chown -R user:user /home/user/data

 FROM python:3.10-slim
+# Install git for cloning results repository
 RUN apt-get update && \
+    apt-get install -y --no-install-recommends git && \
     rm -rf /var/lib/apt/lists/*
 # The two following lines are requirements for the Dev Mode to be functional
 RUN useradd -m -u 1000 user
 WORKDIR /app
+# Copy dependencies manifest
 COPY --chown=user requirements.txt requirements.txt
+# Install dependencies (no secrets needed)
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+# Copy in your Gradio app code
 COPY . .
 RUN mkdir -p /home/user/data && chown -R user:user /home/user/data

aliases.py CHANGED Viewed

@@ -1,12 +1,11 @@
-from agenteval.config import (
-    OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS as CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS,
-    OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS as CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS,
-    OPENNESS_CLOSED_API_AVAILABLE as CANONICAL_OPENNESS_CLOSED_API_AVAILABLE,
-    OPENNESS_CLOSED_UI_ONLY as CANONICAL_OPENNESS_CLOSED_UI_ONLY,
-    TOOL_USAGE_STANDARD as CANONICAL_TOOL_USAGE_STANDARD,
-    TOOL_USAGE_CUSTOM_INTERFACE as CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE,
-    TOOL_USAGE_FULLY_CUSTOM as CANONICAL_TOOL_USAGE_FULLY_CUSTOM,
-)
 OPENNESS_ALIASES = {

+# Define constants that were previously imported from agenteval
+CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS = "open_source_open_weights"
+CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS = "open_source_closed_weights"
+CANONICAL_OPENNESS_CLOSED_API_AVAILABLE = "closed_api_available"
+CANONICAL_OPENNESS_CLOSED_UI_ONLY = "closed_ui_only"
+CANONICAL_TOOL_USAGE_STANDARD = "standard"
+CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE = "custom_interface"
+CANONICAL_TOOL_USAGE_FULLY_CUSTOM = "fully_custom"
 OPENNESS_ALIASES = {

app.py CHANGED Viewed

@@ -239,18 +239,24 @@ demo = gr.Blocks(
 with demo.route("Home", "/home"):
     build_main_page()
-with demo.route("Literature Understanding", "/literature-understanding"):
     build_lit_page()
-with demo.route("Code & Execution", "/code-execution"):
     build_c_and_e_page()
-with demo.route("Data Analysis", "/data-analysis"):
     build_data_analysis_page()
-with demo.route("End-to-End Discovery", "/discovery"):
     build_e2e_page()
 with demo.route("About", "/about"):
     build_about_page()

 with demo.route("Home", "/home"):
     build_main_page()
+with demo.route("SWE-bench", "/swe-bench"):
     build_lit_page()
+with demo.route("Multi-SWE-bench", "/multi-swe-bench"):
     build_c_and_e_page()
+with demo.route("SWE-bench Multimodal", "/swe-bench-multimodal"):
     build_data_analysis_page()
+with demo.route("SWT-bench", "/swt-bench"):
     build_e2e_page()
+with demo.route("Commit0", "/commit0"):
+    build_lit_page()
+with demo.route("GAIA", "/gaia"):
+    build_c_and_e_page()
 with demo.route("About", "/about"):
     build_about_page()

generate_mock_data.py DELETED Viewed

@@ -1,102 +0,0 @@
-"""Generate mock results data in agenteval format for OpenHands Index."""
-import json
-import pandas as pd
-import pyarrow as pa
-import pyarrow.parquet as pq
-from pathlib import Path
-# Load the suite config
-with open("data/1.0.0-dev1/agenteval.json") as f:
-    suite_config_data = json.load(f)
-suite_config = suite_config_data["suite_config"]
-# Mock agents
-agents = [
-    {
-        "name": "OpenHands CodeAct v2.1",
-        "source_url": "https://github.com/OpenHands/OpenHands"
-    },
-    {
-        "name": "Aider",
-        "source_url": "https://github.com/paul-gauthier/aider"
-    },
-    {
-        "name": "SWE-agent",
-        "source_url": "https://github.com/princeton-nlp/SWE-agent"
-    }
-]
-def create_mock_results(split_name):
-    """Create mock results for a split."""
-    split_config = next(s for s in suite_config["splits"] if s["name"] == split_name)
-    rows = []
-    for agent in agents:
-        # Create results for each task
-        results = []
-        for task in split_config["tasks"]:
-            task_name = task["name"]
-            primary_metric = task["primary_metric"]
-            # Generate mock score (different for each agent)
-            base_score = 0.3 + (hash(agent["name"]) % 50) / 100
-            score = base_score + (hash(task_name) % 30) / 100
-            score = min(score, 1.0)
-            task_result = {
-                "task_name": task_name,
-                "eval_spec": {
-                    "model": "gpt-4",
-                    "solver": f"openhands/{task_name}",
-                },
-                "metrics": [
-                    {
-                        "name": primary_metric,
-                        "value": score
-                    }
-                ],
-                "model_usages": []
-            }
-            results.append(task_result)
-        # Create row
-        row = {
-            "suite_config": suite_config,
-            "split": split_name,
-            "results": results,
-            "submission": {
-                "agent_name": agent["name"],
-                "source_url": agent["source_url"],
-                "openness": "open-source/open-weights",
-                "tool_usage": "standard"
-            }
-        }
-        rows.append(row)
-    return rows
-# Create mock data for both splits
-all_rows = []
-for split in ["validation", "test"]:
-    all_rows.extend(create_mock_results(split))
-# Convert to DataFrame
-df = pd.DataFrame(all_rows)
-# Save as parquet
-output_dir = Path("mock_results/1.0.0-dev1")
-output_dir.mkdir(parents=True, exist_ok=True)
-# Save validation split
-validation_df = df[df["split"] == "validation"]
-validation_df.to_parquet(output_dir / "validation.parquet", index=False)
-# Save test split
-test_df = df[df["split"] == "test"]
-test_df.to_parquet(output_dir / "test.parquet", index=False)
-print(f"Created mock data:")
-print(f"  - Validation: {len(validation_df)} rows")
-print(f"  - Test: {len(test_df)} rows")
-print(f"  - Output: {output_dir}")

generate_mock_jsonl.py ADDED Viewed

	@@ -0,0 +1,177 @@

+"""Generate mock results data in JSONL format for OpenHands Index."""
+import json
+import os
+from pathlib import Path
+from datetime import datetime
+# Define the 6 benchmarks
+BENCHMARKS = {
+    "swe-bench": {
+        "tags": ["swe-bench"],
+        "metric": "resolve_rate",
+        "metric_display": "Resolve Rate (%)"
+    },
+    "multi-swe-bench": {
+        "tags": ["multi-swe-bench"],
+        "metric": "resolve_rate",
+        "metric_display": "Resolve Rate (%)"
+    },
+    "swe-bench-multimodal": {
+        "tags": ["swe-bench-multimodal"],
+        "metric": "resolve_rate",
+        "metric_display": "Resolve Rate (%)"
+    },
+    "swt-bench": {
+        "tags": ["swt-bench"],
+        "metric": "success_rate",
+        "metric_display": "Success Rate (%)"
+    },
+    "commit0": {
+        "tags": ["commit0"],
+        "metric": "test_pass_rate",
+        "metric_display": "Test Pass Rate (%)"
+    },
+    "gaia": {
+        "tags": ["gaia"],
+        "metric": "accuracy",
+        "metric_display": "Accuracy (%)"
+    }
+}
+# Mock agents with realistic scores
+MOCK_AGENTS = [
+    {
+        "agent_name": "OpenHands CodeAct v2.1",
+        "llm_base": "claude-3-5-sonnet-20241022",
+        "openness": "closed_api_available",
+        "tool_usage": "standard",
+        "scores": {
+            "swe-bench": 48.3,
+            "multi-swe-bench": 35.2,
+            "swe-bench-multimodal": 42.1,
+            "swt-bench": 65.4,
+            "commit0": 71.2,
+            "gaia": 58.7
+        }
+    },
+    {
+        "agent_name": "OpenHands CodeAct v2.0",
+        "llm_base": "gpt-4o-2024-11-20",
+        "openness": "closed_api_available",
+        "tool_usage": "standard",
+        "scores": {
+            "swe-bench": 45.1,
+            "multi-swe-bench": 32.8,
+            "swe-bench-multimodal": 39.5,
+            "swt-bench": 62.3,
+            "commit0": 68.9,
+            "gaia": 55.2
+        }
+    },
+    {
+        "agent_name": "AutoCodeRover",
+        "llm_base": "gpt-4-turbo-2024-04-09",
+        "openness": "closed_api_available",
+        "tool_usage": "standard",
+        "scores": {
+            "swe-bench": 38.7,
+            "multi-swe-bench": 28.4,
+            "swe-bench-multimodal": 34.2,
+            "swt-bench": 54.1,
+            "commit0": 61.5,
+            "gaia": 48.3
+        }
+    },
+    {
+        "agent_name": "Agentless",
+        "llm_base": "gpt-4o-mini-2024-07-18",
+        "openness": "closed_api_available",
+        "tool_usage": "standard",
+        "scores": {
+            "swe-bench": 32.5,
+            "multi-swe-bench": 24.1,
+            "swe-bench-multimodal": 28.9,
+            "swt-bench": 47.8,
+            "commit0": 55.3,
+            "gaia": 42.1
+        }
+    },
+    {
+        "agent_name": "SWE-Agent",
+        "llm_base": "claude-3-opus-20240229",
+        "openness": "closed_api_available",
+        "tool_usage": "custom_interface",
+        "scores": {
+            "swe-bench": 29.8,
+            "multi-swe-bench": 21.5,
+            "swe-bench-multimodal": 25.7,
+            "swt-bench": 44.2,
+            "commit0": 52.1,
+            "gaia": 39.4
+        }
+    },
+]
+def generate_mock_data():
+    """Generate mock JSONL files for all benchmarks."""
+    output_dir = Path("mock_results/1.0.0-dev1")
+    output_dir.mkdir(parents=True, exist_ok=True)
+    # Create agenteval.json config
+    config = {
+        "suite_config": {
+            "name": "openhands-index",
+            "version": "1.0.0-dev1",
+            "splits": []
+        }
+    }
+    # Generate data for each benchmark
+    for benchmark_name, benchmark_info in BENCHMARKS.items():
+        print(f"Generating mock data for {benchmark_name}...")
+        # Add to config
+        config["suite_config"]["splits"].append({
+            "name": benchmark_name,
+            "tasks": [{
+                "name": benchmark_name,
+                "tags": benchmark_info["tags"]
+            }]
+        })
+        # Generate JSONL file
+        jsonl_path = output_dir / f"{benchmark_name}.jsonl"
+        with open(jsonl_path, 'w') as f:
+            for agent in MOCK_AGENTS:
+                record = {
+                    "agent_name": agent["agent_name"],
+                    "llm_base": agent["llm_base"],
+                    "openness": agent["openness"],
+                    "tool_usage": agent["tool_usage"],
+                    "score": agent["scores"][benchmark_name],
+                    "metric": benchmark_info["metric"],
+                    "submission_time": datetime.now().isoformat(),
+                    "tags": benchmark_info["tags"],
+                    # Additional metadata
+                    "total_cost": round(10.0 + agent["scores"][benchmark_name] * 0.5, 2),
+                    "total_runtime": round(300 + agent["scores"][benchmark_name] * 5, 1),
+                }
+                f.write(json.dumps(record) + '\n')
+        print(f"  Created {jsonl_path}")
+    # Write config file
+    config_path = output_dir / "agenteval.json"
+    with open(config_path, 'w') as f:
+        json.dump(config, f, indent=2)
+    print(f"\nCreated config: {config_path}")
+    print("\n✓ Mock data generation complete!")
+    print(f"  Location: {output_dir}")
+    print(f"  Benchmarks: {', '.join(BENCHMARKS.keys())}")
+    print(f"  Agents: {len(MOCK_AGENTS)}")
+if __name__ == "__main__":
+    generate_mock_data()

mock_results/1.0.0-dev1/agenteval.json ADDED Viewed

	@@ -0,0 +1,74 @@

+{
+  "suite_config": {
+    "name": "openhands-index",
+    "version": "1.0.0-dev1",
+    "splits": [
+      {
+        "name": "swe-bench",
+        "tasks": [
+          {
+            "name": "swe-bench",
+            "tags": [
+              "swe-bench"
+            ]
+          }
+        ]
+      },
+      {
+        "name": "multi-swe-bench",
+        "tasks": [
+          {
+            "name": "multi-swe-bench",
+            "tags": [
+              "multi-swe-bench"
+            ]
+          }
+        ]
+      },
+      {
+        "name": "swe-bench-multimodal",
+        "tasks": [
+          {
+            "name": "swe-bench-multimodal",
+            "tags": [
+              "swe-bench-multimodal"
+            ]
+          }
+        ]
+      },
+      {
+        "name": "swt-bench",
+        "tasks": [
+          {
+            "name": "swt-bench",
+            "tags": [
+              "swt-bench"
+            ]
+          }
+        ]
+      },
+      {
+        "name": "commit0",
+        "tasks": [
+          {
+            "name": "commit0",
+            "tags": [
+              "commit0"
+            ]
+          }
+        ]
+      },
+      {
+        "name": "gaia",
+        "tasks": [
+          {
+            "name": "gaia",
+            "tags": [
+              "gaia"
+            ]
+          }
+        ]
+      }
+    ]
+  }
+}

mock_results/1.0.0-dev1/commit0.jsonl ADDED Viewed

	@@ -0,0 +1,5 @@

+{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
+{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
+{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
+{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
+{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}

mock_results/1.0.0-dev1/gaia.jsonl ADDED Viewed

	@@ -0,0 +1,5 @@

+{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
+{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
+{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
+{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
+{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}

mock_results/1.0.0-dev1/multi-swe-bench.jsonl ADDED Viewed

	@@ -0,0 +1,5 @@

+{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
+{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
+{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
+{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
+{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}

mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl ADDED Viewed

	@@ -0,0 +1,5 @@

+{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
+{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
+{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
+{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
+{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}

mock_results/1.0.0-dev1/swe-bench.jsonl ADDED Viewed

	@@ -0,0 +1,5 @@

+{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
+{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
+{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
+{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
+{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}

mock_results/1.0.0-dev1/swt-bench.jsonl ADDED Viewed

	@@ -0,0 +1,5 @@

+{"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
+{"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
+{"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
+{"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
+{"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}

requirements.txt CHANGED Viewed

@@ -1,132 +1,13 @@
-agent-eval==0.1.43
-aiobotocore==2.22.0
-aiofiles==24.1.0
-aiohappyeyeballs==2.6.1
-aiohttp==3.11.18
-aioitertools==0.12.0
-aiosignal==1.3.2
-annotated-types==0.7.0
-anyio==4.9.0
-APScheduler==3.11.0
-async-timeout==5.0.1
-attrs==25.3.0
-Authlib==1.5.2
-beautifulsoup4==4.13.4
-black==25.1.0
-botocore==1.37.3
-certifi==2025.4.26
-cffi==1.17.1
-charset-normalizer==3.4.2
-click==8.1.8
-contourpy==1.3.2
-cryptography==44.0.3
-cycler==0.12.1
-datasets==4.0.0
-debugpy==1.8.14
-dill==0.3.8
-distro==1.9.0
-docstring_parser==0.16
-exceptiongroup==1.2.2
-fastapi==0.115.12
-ffmpy==0.5.0
-filelock==3.18.0
-fonttools==4.58.1
-frozenlist==1.6.0
-fsspec==2025.3.0
 gradio==5.30.0
-gradio_client==1.10.1
-gradio_modal==0.0.4
-groovy==0.1.2
-h11==0.16.0
-httpcore==1.0.9
-httpx==0.28.1
-huggingface-hub==0.30.2
-idna==3.10
-ijson==3.3.0
-importlib_metadata==8.7.0
-inspect_ai==0.3.104
-isort==6.0.1
-itsdangerous==2.2.0
-Jinja2==3.1.6
-jiter==0.9.0
-jmespath==1.0.1
-jsonlines==4.0.0
-jsonpatch==1.33
-jsonpointer==3.0.0
-jsonschema==4.23.0
-jsonschema-specifications==2025.4.1
-kiwisolver==1.4.8
-linkify-it-py==2.0.3
-litellm==1.68.1
-markdown-it-py==3.0.0
-MarkupSafe==3.0.2
-matplotlib==3.10.3
-mdit-py-plugins==0.4.2
-mdurl==0.1.2
-mmh3==5.1.0
-mplcursors==0.6
-multidict==6.4.3
-multiprocess==0.70.16
-mypy_extensions==1.1.0
-narwhals==1.38.2
-nest-asyncio==1.6.0
-numpy==2.2.5
-openai==1.75.0
-orjson==3.10.18
-packaging==25.0
 pandas==2.2.3
-pathspec==0.12.1
-pillow==11.2.1
-platformdirs==4.3.7
 plotly==6.0.1
-propcache==0.3.1
-psutil==7.0.0
-pyarrow==20.0.0
-pycparser==2.22
-pydantic==2.11.4
-pydantic_core==2.33.2
-pydub==0.25.1
-Pygments==2.19.1
-pyparsing==3.2.3
-python-dateutil==2.9.0.post0
-python-dotenv==1.1.0
-python-multipart==0.0.20
-pytz==2025.2
-PyYAML==6.0.2
-referencing==0.36.2
-regex==2024.11.6
 requests==2.32.3
-rich==13.9.4
-rpds-py==0.24.0
-ruff==0.11.8
-s3fs==2025.3.0
-safehttpx==0.1.6
 seaborn==0.13.2
-semantic-version==2.10.0
-semver==3.0.4
-shellingham==1.5.4
-shortuuid==1.0.13
-six==1.17.0
-sniffio==1.3.1
-soupsieve==2.7
-starlette==0.46.2
-tenacity==9.1.2
-textual<3.0.0
-tiktoken==0.9.0
-tokenizers==0.21.1
-tomli==2.2.1
-tomlkit==0.13.2
-tqdm==4.67.1
-typer==0.15.3
-typing-inspection==0.4.0
-typing_extensions==4.13.2
-tzdata==2025.2
-tzlocal==5.3.1
-uc-micro-py==1.0.3
-urllib3==2.4.0
-uvicorn==0.34.2
-websockets==15.0.1
-wrapt==1.17.2
-xxhash==3.5.0
-yarl==1.20.0
-zipp==3.21.0

+# Core dependencies
 gradio==5.30.0
 pandas==2.2.3
 plotly==6.0.1
 requests==2.32.3
+huggingface-hub==0.30.2
+APScheduler==3.11.0
+# Additional dependencies for UI and processing
+matplotlib==3.10.3
 seaborn==0.13.2
+Pillow==11.2.1
+PyYAML==6.0.2

simple_data_loader.py ADDED Viewed

	@@ -0,0 +1,117 @@

+"""
+Simple data loader for OpenHands Index leaderboard.
+Loads JSONL files from a local directory (for example, a local checkout of the results repository).
+"""
+import os
+import pandas as pd
+import json
+from pathlib import Path
+class SimpleLeaderboardViewer:
+    """Simple replacement for agent-eval's LeaderboardViewer."""
+    def __init__(self, data_dir: str, config: str, split: str):
+        """
+        Args:
+            data_dir: Path to data directory
+            config: Config name (e.g., "1.0.0-dev1")
+            split: Split name, i.e. the benchmark whose "<split>.jsonl" file is loaded (e.g., "swe-bench" or "gaia")
+        """
+        self.data_dir = Path(data_dir)
+        self.config = config
+        self.split = split
+        self.config_path = self.data_dir / config
+        # Load suite configuration
+        config_file = self.config_path / "agenteval.json"
+        if config_file.exists():
+            with open(config_file) as f:
+                suite_config = json.load(f)
+                self.suite_config = suite_config["suite_config"]
+        else:
+            self.suite_config = {
+                "name": "openhands-index",
+                "version": config,
+                "splits": []
+            }
+        # Build tag map from config
+        self.tag_map = {}
+        for split_config in self.suite_config.get("splits", []):
+            if split_config["name"] == split:
+                for task in split_config.get("tasks", []):
+                    for tag in task.get("tags", []):
+                        if tag not in self.tag_map:
+                            self.tag_map[tag] = []
+                        self.tag_map[tag].append(task["name"])
+    def _load(self):
+        """Load the JSONL file for the split and return DataFrame and tag map."""
+        jsonl_file = self.config_path / f"{self.split}.jsonl"
+        if not jsonl_file.exists():
+            # Return a one-row placeholder DataFrame carrying the error message, plus an empty tag map
+            return pd.DataFrame({
+                "Message": [f"No data found for split '{self.split}'. Expected file: {jsonl_file}"]
+            }), {}
+        try:
+            # Read JSONL file
+            records = []
+            with open(jsonl_file, 'r') as f:
+                for line in f:
+                    if line.strip():
+                        records.append(json.loads(line))
+            if not records:
+                return pd.DataFrame({
+                    "Message": [f"No data in file: {jsonl_file}"]
+                }), {}
+            df = pd.DataFrame(records)
+            return df, self.tag_map
+        except Exception as e:
+            return pd.DataFrame({
+                "Message": [f"Error loading data: {e}"]
+            }), {}
+    def get_dataframe(self):
+        """Get the raw dataframe."""
+        df, _ = self._load()
+        return df
+def load_mock_data_locally(data_dir: str = "mock_results"):
+    """
+    Load mock data from local directory for testing.
+    Args:
+        data_dir: Path to mock results directory
+    Returns:
+        Dictionary mapping split names to SimpleLeaderboardViewer instances
+    """
+    viewers = {}
+    data_path = Path(data_dir)
+    if not data_path.exists():
+        print(f"Warning: Mock data directory '{data_dir}' not found")
+        return viewers
+    # Find all config directories
+    for config_dir in data_path.iterdir():
+        if config_dir.is_dir():
+            config_name = config_dir.name
+            # Find all JSONL files (each represents a split)
+            for jsonl_file in config_dir.glob("*.jsonl"):
+                split_name = jsonl_file.stem
+                viewer = SimpleLeaderboardViewer(
+                    data_dir=str(data_path),
+                    config=config_name,
+                    split=split_name
+                )
+                viewers[split_name] = viewer
+    return viewers

submission.py CHANGED Viewed

@@ -2,11 +2,6 @@ import logging
 import typing
 import matplotlib
-from agenteval.cli import SUBMISSION_METADATA_FILENAME
-from agenteval.models import SubmissionMetadata
-from datasets.exceptions import DataFilesNotFoundError
-from gradio_modal import Modal
 matplotlib.use('Agg')
 import os
@@ -17,12 +12,15 @@ from email.utils import parseaddr
 import gradio as gr
 import requests
-from agenteval.leaderboard.upload import sanitize_path_component, _validate_path_component
-from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
-from datasets.data_files import EmptyDatasetError
 from huggingface_hub import HfApi
 import aliases
 from config import (
     CONFIG_NAME,
     CONTACT_DATASET,

 import typing
 import matplotlib
 matplotlib.use('Agg')
 import os
 import gradio as gr
 import requests
 from huggingface_hub import HfApi
 import aliases
+from submission_utils import (
+    SUBMISSION_METADATA_FILENAME,
+    SubmissionMetadata,
+    sanitize_path_component,
+    _validate_path_component
+)
 from config import (
     CONFIG_NAME,
     CONTACT_DATASET,

submission_utils.py ADDED Viewed

	@@ -0,0 +1,88 @@

+"""
+Utilities for submission handling, replacing agent-eval dependencies.
+"""
+import re
+from pathlib import Path
+from typing import Optional
+# Constants
+SUBMISSION_METADATA_FILENAME = "metadata.json"
+# Simple SubmissionMetadata class
+class SubmissionMetadata:
+    """Plain container for the descriptive fields attached to a submission."""
+    # Names of the stored fields, in the order to_dict() emits them.
+    _FIELD_NAMES = (
+        "agent_name",
+        "llm_base",
+        "openness",
+        "tool_usage",
+        "submitter_name",
+        "submitter_email",
+    )
+    def __init__(self, **kwargs):
+        # A field that is not supplied defaults to ""; unrecognised keywords are ignored.
+        for field_name in self._FIELD_NAMES:
+            setattr(self, field_name, kwargs.get(field_name, ""))
+    def to_dict(self):
+        """Return the current field values as a plain dictionary."""
+        return {field_name: getattr(self, field_name) for field_name in self._FIELD_NAMES}
+# Path validation functions
+def _validate_path_component(component: str, allow_underscores: bool = True) -> None:
+    """
+    Validate a single path component.
+    A valid component is non-empty, is not "." or "..", and consists solely of
+    ASCII letters, digits, hyphens, dots and (optionally) underscores.
+    Args:
+        component: The path component to validate
+        allow_underscores: Whether to allow underscores in the component
+    Raises:
+        ValueError: If the component is invalid
+    """
+    if not component:
+        raise ValueError("Path component cannot be empty")
+    if component in (".", ".."):
+        raise ValueError(f"Path component cannot be '{component}'")
+    # Check for invalid characters.
+    # re.fullmatch is used instead of re.match with a "$" anchor: "$" also
+    # matches just before a trailing newline, which let "name\n" pass.
+    pattern = r'[a-zA-Z0-9_\-\.]+' if allow_underscores else r'[a-zA-Z0-9\-\.]+'
+    if not re.fullmatch(pattern, component):
+        raise ValueError(
+            f"Path component '{component}' contains invalid characters. "
+            f"Only alphanumeric, hyphens, dots{', and underscores' if allow_underscores else ''} are allowed."
+        )
+def sanitize_path_component(component: str, replacement: str = "_") -> str:
+    """
+    Sanitize a path component by replacing invalid characters.
+    Args:
+        component: The path component to sanitize
+        replacement: The character to use for replacing invalid characters.
+            An empty string removes invalid characters instead of replacing them.
+    Returns:
+        Sanitized path component, or "unnamed" when the input is empty or
+        nothing is left after sanitizing.
+    """
+    if not component:
+        return "unnamed"
+    # Replace any non-alphanumeric, non-hyphen, non-dot, non-underscore with replacement
+    sanitized = re.sub(r'[^a-zA-Z0-9_\-\.]', replacement, component)
+    # Remove leading/trailing dots or hyphens
+    sanitized = sanitized.strip('.-')
+    # Collapse multiple replacements into one.
+    # Skipped for an empty replacement: there is nothing to collapse, and the
+    # pattern built from "" would be the invalid regex "+" (raises re.error).
+    if replacement:
+        sanitized = re.sub(f'{re.escape(replacement)}+', replacement, sanitized)
+    if not sanitized:
+        return "unnamed"
+    return sanitized

tests/integration/test_submission.py CHANGED Viewed

@@ -4,14 +4,12 @@ from datetime import datetime
 import gradio
 import pytest
-import pyarrow as pa
-from agenteval.models import SubmissionMetadata
-from datasets import load_dataset, VerificationMode
 from huggingface_hub import HfApi, hf_hub_download
 from aliases import CANONICAL_TOOL_USAGE_STANDARD, CANONICAL_OPENNESS_CLOSED_UI_ONLY
 from config import IS_INTERNAL, CONFIG_NAME, CONTACT_DATASET, SUBMISSION_DATASET
 from submission import add_new_eval
 _hf = HfApi()

 import gradio
 import pytest
 from huggingface_hub import HfApi, hf_hub_download
 from aliases import CANONICAL_TOOL_USAGE_STANDARD, CANONICAL_OPENNESS_CLOSED_UI_ONLY
 from config import IS_INTERNAL, CONFIG_NAME, CONTACT_DATASET, SUBMISSION_DATASET
 from submission import add_new_eval
+from submission_utils import SubmissionMetadata
 _hf = HfApi()

ui_components.py CHANGED Viewed

@@ -4,10 +4,10 @@ import plotly.graph_objects as go
 import os
 import base64
-from agenteval.leaderboard.view import LeaderboardViewer
 from huggingface_hub import HfApi
 import aliases
 from leaderboard_transformer import (
     DataTransformer,
     transform_raw_dataframe,
@@ -473,12 +473,14 @@ def get_leaderboard_viewer_instance(split: str):
     # --- Cache miss: try to load data from the source ---
     try:
-        print(f"Using Hugging Face dataset for split '{split}': {RESULTS_DATASET}/{CONFIG_NAME}")
-        viewer = LeaderboardViewer(
-            repo_id=RESULTS_DATASET,
             config=CONFIG_NAME,
-            split=split,
-            is_internal=IS_INTERNAL
         )
         # Simplify tag map creation

 import os
 import base64
 from huggingface_hub import HfApi
 import aliases
+from simple_data_loader import SimpleLeaderboardViewer
 from leaderboard_transformer import (
     DataTransformer,
     transform_raw_dataframe,
     # --- Cache miss: try to load data from the source ---
     try:
+        # First try to load from extracted data directory (local mock data)
+        data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
+        print(f"Loading data for split '{split}' from: {data_dir}/{CONFIG_NAME}")
+        viewer = SimpleLeaderboardViewer(
+            data_dir=data_dir,
             config=CONFIG_NAME,
+            split=split
         )
         # Simplify tag map creation