openhands openhands commited on
Commit
1027cfb
·
1 Parent(s): 085a012

Convert to JSONL data format and remove agent-eval dependency

Browse files

- Replace parquet files with JSONL for better readability and simplicity
- Remove agent-eval library dependency to avoid secret requirements
- Create simple_data_loader.py as lightweight replacement for LeaderboardViewer
- Create submission_utils.py for submission handling functions
- Simplify Dockerfile (no SSH keys or secrets needed)
- Reduce requirements.txt to essential packages only
- Generate mock data for all 6 benchmarks (swe-bench, multi-swe-bench, swe-bench-multimodal, swt-bench, commit0, gaia)
- Update aliases.py to define constants locally
- Fix imports in submission.py and test files

Co-authored-by: openhands <openhands@all-hands.dev>

Dockerfile CHANGED
@@ -1,9 +1,8 @@
1
  FROM python:3.10-slim
2
 
3
-
4
- # (0) Install SSH client tools (and git, if you're pulling via SSH)
5
  RUN apt-get update && \
6
- apt-get install -y --no-install-recommends openssh-client git && \
7
  rm -rf /var/lib/apt/lists/*
8
 
9
  # The two following lines are requirements for the Dev Mode to be functional
@@ -11,23 +10,13 @@ RUN apt-get update && \
11
  RUN useradd -m -u 1000 user
12
  WORKDIR /app
13
 
14
-
15
- # (2) Copy dependencies manifest
16
  COPY --chown=user requirements.txt requirements.txt
17
 
18
- # (3) Install dependencies, mounting SSH keys and optional HTTPS creds
19
- RUN --mount=type=secret,id=AGENTEVAL_DEPLOY_KEY,mode=0400,required=true \
20
- --mount=type=secret,id=ASTABENCH_DEPLOY_KEY,mode=0400,required=true \
21
- mkdir -p /root/.ssh && chmod 700 /root/.ssh && \
22
- cat /run/secrets/AGENTEVAL_DEPLOY_KEY > /root/.ssh/id_ed25519 && chmod 600 /root/.ssh/id_ed25519 && \
23
- cat /run/secrets/ASTABENCH_DEPLOY_KEY > /root/.ssh/id_astabench && chmod 600 /root/.ssh/id_astabench && \
24
- ssh-keyscan github.com >> /root/.ssh/known_hosts && \
25
- printf 'Host github.com\n User git\n IdentityFile /root/.ssh/id_ed25519\n IdentityFile /root/.ssh/id_astabench\n StrictHostKeyChecking no\n' >> /root/.ssh/config && \
26
- # rewrite all GitHub HTTPS URLs to SSH so nested deps install via SSH
27
- git config --global url."ssh://git@github.com/".insteadOf "https://github.com/" && \
28
- pip install --no-cache-dir --upgrade -r requirements.txt
29
-
30
- # (4) Copy in your Gradio app code
31
  COPY . .
32
  RUN mkdir -p /home/user/data && chown -R user:user /home/user/data
33
 
 
1
  FROM python:3.10-slim
2
 
3
+ # Install git for cloning results repository
 
4
  RUN apt-get update && \
5
+ apt-get install -y --no-install-recommends git && \
6
  rm -rf /var/lib/apt/lists/*
7
 
8
  # The two following lines are requirements for the Dev Mode to be functional
 
10
  RUN useradd -m -u 1000 user
11
  WORKDIR /app
12
 
13
+ # Copy dependencies manifest
 
14
  COPY --chown=user requirements.txt requirements.txt
15
 
16
+ # Install dependencies (no secrets needed)
17
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
18
+
19
+ # Copy in your Gradio app code
 
 
 
 
 
 
 
 
 
20
  COPY . .
21
  RUN mkdir -p /home/user/data && chown -R user:user /home/user/data
22
 
aliases.py CHANGED
@@ -1,12 +1,11 @@
1
- from agenteval.config import (
2
- OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS as CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS,
3
- OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS as CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS,
4
- OPENNESS_CLOSED_API_AVAILABLE as CANONICAL_OPENNESS_CLOSED_API_AVAILABLE,
5
- OPENNESS_CLOSED_UI_ONLY as CANONICAL_OPENNESS_CLOSED_UI_ONLY,
6
- TOOL_USAGE_STANDARD as CANONICAL_TOOL_USAGE_STANDARD,
7
- TOOL_USAGE_CUSTOM_INTERFACE as CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE,
8
- TOOL_USAGE_FULLY_CUSTOM as CANONICAL_TOOL_USAGE_FULLY_CUSTOM,
9
- )
10
 
11
 
12
  OPENNESS_ALIASES = {
 
1
+ # Define constants that were previously imported from agenteval
2
+ CANONICAL_OPENNESS_OPEN_SOURCE_OPEN_WEIGHTS = "open_source_open_weights"
3
+ CANONICAL_OPENNESS_OPEN_SOURCE_CLOSED_WEIGHTS = "open_source_closed_weights"
4
+ CANONICAL_OPENNESS_CLOSED_API_AVAILABLE = "closed_api_available"
5
+ CANONICAL_OPENNESS_CLOSED_UI_ONLY = "closed_ui_only"
6
+ CANONICAL_TOOL_USAGE_STANDARD = "standard"
7
+ CANONICAL_TOOL_USAGE_CUSTOM_INTERFACE = "custom_interface"
8
+ CANONICAL_TOOL_USAGE_FULLY_CUSTOM = "fully_custom"
 
9
 
10
 
11
  OPENNESS_ALIASES = {
app.py CHANGED
@@ -239,18 +239,24 @@ demo = gr.Blocks(
239
  with demo.route("Home", "/home"):
240
  build_main_page()
241
 
242
- with demo.route("Literature Understanding", "/literature-understanding"):
243
  build_lit_page()
244
 
245
- with demo.route("Code & Execution", "/code-execution"):
246
  build_c_and_e_page()
247
 
248
- with demo.route("Data Analysis", "/data-analysis"):
249
  build_data_analysis_page()
250
 
251
- with demo.route("End-to-End Discovery", "/discovery"):
252
  build_e2e_page()
253
 
 
 
 
 
 
 
254
  with demo.route("About", "/about"):
255
  build_about_page()
256
 
 
239
  with demo.route("Home", "/home"):
240
  build_main_page()
241
 
242
+ with demo.route("SWE-bench", "/swe-bench"):
243
  build_lit_page()
244
 
245
+ with demo.route("Multi-SWE-bench", "/multi-swe-bench"):
246
  build_c_and_e_page()
247
 
248
+ with demo.route("SWE-bench Multimodal", "/swe-bench-multimodal"):
249
  build_data_analysis_page()
250
 
251
+ with demo.route("SWT-bench", "/swt-bench"):
252
  build_e2e_page()
253
 
254
+ with demo.route("Commit0", "/commit0"):
255
+ build_lit_page()
256
+
257
+ with demo.route("GAIA", "/gaia"):
258
+ build_c_and_e_page()
259
+
260
  with demo.route("About", "/about"):
261
  build_about_page()
262
 
generate_mock_data.py DELETED
@@ -1,102 +0,0 @@
1
- """Generate mock results data in agenteval format for OpenHands Index."""
2
- import json
3
- import pandas as pd
4
- import pyarrow as pa
5
- import pyarrow.parquet as pq
6
- from pathlib import Path
7
-
8
- # Load the suite config
9
- with open("data/1.0.0-dev1/agenteval.json") as f:
10
- suite_config_data = json.load(f)
11
-
12
- suite_config = suite_config_data["suite_config"]
13
-
14
- # Mock agents
15
- agents = [
16
- {
17
- "name": "OpenHands CodeAct v2.1",
18
- "source_url": "https://github.com/OpenHands/OpenHands"
19
- },
20
- {
21
- "name": "Aider",
22
- "source_url": "https://github.com/paul-gauthier/aider"
23
- },
24
- {
25
- "name": "SWE-agent",
26
- "source_url": "https://github.com/princeton-nlp/SWE-agent"
27
- }
28
- ]
29
-
30
- def create_mock_results(split_name):
31
- """Create mock results for a split."""
32
- split_config = next(s for s in suite_config["splits"] if s["name"] == split_name)
33
-
34
- rows = []
35
- for agent in agents:
36
- # Create results for each task
37
- results = []
38
- for task in split_config["tasks"]:
39
- task_name = task["name"]
40
- primary_metric = task["primary_metric"]
41
-
42
- # Generate mock score (different for each agent)
43
- base_score = 0.3 + (hash(agent["name"]) % 50) / 100
44
- score = base_score + (hash(task_name) % 30) / 100
45
- score = min(score, 1.0)
46
-
47
- task_result = {
48
- "task_name": task_name,
49
- "eval_spec": {
50
- "model": "gpt-4",
51
- "solver": f"openhands/{task_name}",
52
- },
53
- "metrics": [
54
- {
55
- "name": primary_metric,
56
- "value": score
57
- }
58
- ],
59
- "model_usages": []
60
- }
61
- results.append(task_result)
62
-
63
- # Create row
64
- row = {
65
- "suite_config": suite_config,
66
- "split": split_name,
67
- "results": results,
68
- "submission": {
69
- "agent_name": agent["name"],
70
- "source_url": agent["source_url"],
71
- "openness": "open-source/open-weights",
72
- "tool_usage": "standard"
73
- }
74
- }
75
- rows.append(row)
76
-
77
- return rows
78
-
79
- # Create mock data for both splits
80
- all_rows = []
81
- for split in ["validation", "test"]:
82
- all_rows.extend(create_mock_results(split))
83
-
84
- # Convert to DataFrame
85
- df = pd.DataFrame(all_rows)
86
-
87
- # Save as parquet
88
- output_dir = Path("mock_results/1.0.0-dev1")
89
- output_dir.mkdir(parents=True, exist_ok=True)
90
-
91
- # Save validation split
92
- validation_df = df[df["split"] == "validation"]
93
- validation_df.to_parquet(output_dir / "validation.parquet", index=False)
94
-
95
- # Save test split
96
- test_df = df[df["split"] == "test"]
97
- test_df.to_parquet(output_dir / "test.parquet", index=False)
98
-
99
- print(f"Created mock data:")
100
- print(f" - Validation: {len(validation_df)} rows")
101
- print(f" - Test: {len(test_df)} rows")
102
- print(f" - Output: {output_dir}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
generate_mock_jsonl.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generate mock results data in JSONL format for OpenHands Index."""
2
+ import json
3
+ import os
4
+ from pathlib import Path
5
+ from datetime import datetime
6
+
7
+ # Define the 6 benchmarks
8
+ BENCHMARKS = {
9
+ "swe-bench": {
10
+ "tags": ["swe-bench"],
11
+ "metric": "resolve_rate",
12
+ "metric_display": "Resolve Rate (%)"
13
+ },
14
+ "multi-swe-bench": {
15
+ "tags": ["multi-swe-bench"],
16
+ "metric": "resolve_rate",
17
+ "metric_display": "Resolve Rate (%)"
18
+ },
19
+ "swe-bench-multimodal": {
20
+ "tags": ["swe-bench-multimodal"],
21
+ "metric": "resolve_rate",
22
+ "metric_display": "Resolve Rate (%)"
23
+ },
24
+ "swt-bench": {
25
+ "tags": ["swt-bench"],
26
+ "metric": "success_rate",
27
+ "metric_display": "Success Rate (%)"
28
+ },
29
+ "commit0": {
30
+ "tags": ["commit0"],
31
+ "metric": "test_pass_rate",
32
+ "metric_display": "Test Pass Rate (%)"
33
+ },
34
+ "gaia": {
35
+ "tags": ["gaia"],
36
+ "metric": "accuracy",
37
+ "metric_display": "Accuracy (%)"
38
+ }
39
+ }
40
+
41
+ # Mock agents with realistic scores
42
+ MOCK_AGENTS = [
43
+ {
44
+ "agent_name": "OpenHands CodeAct v2.1",
45
+ "llm_base": "claude-3-5-sonnet-20241022",
46
+ "openness": "closed_api_available",
47
+ "tool_usage": "standard",
48
+ "scores": {
49
+ "swe-bench": 48.3,
50
+ "multi-swe-bench": 35.2,
51
+ "swe-bench-multimodal": 42.1,
52
+ "swt-bench": 65.4,
53
+ "commit0": 71.2,
54
+ "gaia": 58.7
55
+ }
56
+ },
57
+ {
58
+ "agent_name": "OpenHands CodeAct v2.0",
59
+ "llm_base": "gpt-4o-2024-11-20",
60
+ "openness": "closed_api_available",
61
+ "tool_usage": "standard",
62
+ "scores": {
63
+ "swe-bench": 45.1,
64
+ "multi-swe-bench": 32.8,
65
+ "swe-bench-multimodal": 39.5,
66
+ "swt-bench": 62.3,
67
+ "commit0": 68.9,
68
+ "gaia": 55.2
69
+ }
70
+ },
71
+ {
72
+ "agent_name": "AutoCodeRover",
73
+ "llm_base": "gpt-4-turbo-2024-04-09",
74
+ "openness": "closed_api_available",
75
+ "tool_usage": "standard",
76
+ "scores": {
77
+ "swe-bench": 38.7,
78
+ "multi-swe-bench": 28.4,
79
+ "swe-bench-multimodal": 34.2,
80
+ "swt-bench": 54.1,
81
+ "commit0": 61.5,
82
+ "gaia": 48.3
83
+ }
84
+ },
85
+ {
86
+ "agent_name": "Agentless",
87
+ "llm_base": "gpt-4o-mini-2024-07-18",
88
+ "openness": "closed_api_available",
89
+ "tool_usage": "standard",
90
+ "scores": {
91
+ "swe-bench": 32.5,
92
+ "multi-swe-bench": 24.1,
93
+ "swe-bench-multimodal": 28.9,
94
+ "swt-bench": 47.8,
95
+ "commit0": 55.3,
96
+ "gaia": 42.1
97
+ }
98
+ },
99
+ {
100
+ "agent_name": "SWE-Agent",
101
+ "llm_base": "claude-3-opus-20240229",
102
+ "openness": "closed_api_available",
103
+ "tool_usage": "custom_interface",
104
+ "scores": {
105
+ "swe-bench": 29.8,
106
+ "multi-swe-bench": 21.5,
107
+ "swe-bench-multimodal": 25.7,
108
+ "swt-bench": 44.2,
109
+ "commit0": 52.1,
110
+ "gaia": 39.4
111
+ }
112
+ },
113
+ ]
114
+
115
+
116
+ def generate_mock_data():
117
+ """Generate mock JSONL files for all benchmarks."""
118
+ output_dir = Path("mock_results/1.0.0-dev1")
119
+ output_dir.mkdir(parents=True, exist_ok=True)
120
+
121
+ # Create agenteval.json config
122
+ config = {
123
+ "suite_config": {
124
+ "name": "openhands-index",
125
+ "version": "1.0.0-dev1",
126
+ "splits": []
127
+ }
128
+ }
129
+
130
+ # Generate data for each benchmark
131
+ for benchmark_name, benchmark_info in BENCHMARKS.items():
132
+ print(f"Generating mock data for {benchmark_name}...")
133
+
134
+ # Add to config
135
+ config["suite_config"]["splits"].append({
136
+ "name": benchmark_name,
137
+ "tasks": [{
138
+ "name": benchmark_name,
139
+ "tags": benchmark_info["tags"]
140
+ }]
141
+ })
142
+
143
+ # Generate JSONL file
144
+ jsonl_path = output_dir / f"{benchmark_name}.jsonl"
145
+ with open(jsonl_path, 'w') as f:
146
+ for agent in MOCK_AGENTS:
147
+ record = {
148
+ "agent_name": agent["agent_name"],
149
+ "llm_base": agent["llm_base"],
150
+ "openness": agent["openness"],
151
+ "tool_usage": agent["tool_usage"],
152
+ "score": agent["scores"][benchmark_name],
153
+ "metric": benchmark_info["metric"],
154
+ "submission_time": datetime.now().isoformat(),
155
+ "tags": benchmark_info["tags"],
156
+ # Additional metadata
157
+ "total_cost": round(10.0 + agent["scores"][benchmark_name] * 0.5, 2),
158
+ "total_runtime": round(300 + agent["scores"][benchmark_name] * 5, 1),
159
+ }
160
+ f.write(json.dumps(record) + '\n')
161
+
162
+ print(f" Created {jsonl_path}")
163
+
164
+ # Write config file
165
+ config_path = output_dir / "agenteval.json"
166
+ with open(config_path, 'w') as f:
167
+ json.dump(config, f, indent=2)
168
+ print(f"\nCreated config: {config_path}")
169
+
170
+ print("\n✓ Mock data generation complete!")
171
+ print(f" Location: {output_dir}")
172
+ print(f" Benchmarks: {', '.join(BENCHMARKS.keys())}")
173
+ print(f" Agents: {len(MOCK_AGENTS)}")
174
+
175
+
176
+ if __name__ == "__main__":
177
+ generate_mock_data()
mock_results/1.0.0-dev1/agenteval.json ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "suite_config": {
3
+ "name": "openhands-index",
4
+ "version": "1.0.0-dev1",
5
+ "splits": [
6
+ {
7
+ "name": "swe-bench",
8
+ "tasks": [
9
+ {
10
+ "name": "swe-bench",
11
+ "tags": [
12
+ "swe-bench"
13
+ ]
14
+ }
15
+ ]
16
+ },
17
+ {
18
+ "name": "multi-swe-bench",
19
+ "tasks": [
20
+ {
21
+ "name": "multi-swe-bench",
22
+ "tags": [
23
+ "multi-swe-bench"
24
+ ]
25
+ }
26
+ ]
27
+ },
28
+ {
29
+ "name": "swe-bench-multimodal",
30
+ "tasks": [
31
+ {
32
+ "name": "swe-bench-multimodal",
33
+ "tags": [
34
+ "swe-bench-multimodal"
35
+ ]
36
+ }
37
+ ]
38
+ },
39
+ {
40
+ "name": "swt-bench",
41
+ "tasks": [
42
+ {
43
+ "name": "swt-bench",
44
+ "tags": [
45
+ "swt-bench"
46
+ ]
47
+ }
48
+ ]
49
+ },
50
+ {
51
+ "name": "commit0",
52
+ "tasks": [
53
+ {
54
+ "name": "commit0",
55
+ "tags": [
56
+ "commit0"
57
+ ]
58
+ }
59
+ ]
60
+ },
61
+ {
62
+ "name": "gaia",
63
+ "tasks": [
64
+ {
65
+ "name": "gaia",
66
+ "tags": [
67
+ "gaia"
68
+ ]
69
+ }
70
+ ]
71
+ }
72
+ ]
73
+ }
74
+ }
mock_results/1.0.0-dev1/commit0.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 71.2, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093416", "tags": ["commit0"], "total_cost": 45.6, "total_runtime": 656.0}
2
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 68.9, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093428", "tags": ["commit0"], "total_cost": 44.45, "total_runtime": 644.5}
3
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 61.5, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093436", "tags": ["commit0"], "total_cost": 40.75, "total_runtime": 607.5}
4
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.3, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093445", "tags": ["commit0"], "total_cost": 37.65, "total_runtime": 576.5}
5
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 52.1, "metric": "test_pass_rate", "submission_time": "2025-11-24T19:56:00.093450", "tags": ["commit0"], "total_cost": 36.05, "total_runtime": 560.5}
mock_results/1.0.0-dev1/gaia.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 58.7, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093541", "tags": ["gaia"], "total_cost": 39.35, "total_runtime": 593.5}
2
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 55.2, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093556", "tags": ["gaia"], "total_cost": 37.6, "total_runtime": 576.0}
3
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093563", "tags": ["gaia"], "total_cost": 34.15, "total_runtime": 541.5}
4
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093568", "tags": ["gaia"], "total_cost": 31.05, "total_runtime": 510.5}
5
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 39.4, "metric": "accuracy", "submission_time": "2025-11-24T19:56:00.093574", "tags": ["gaia"], "total_cost": 29.7, "total_runtime": 497.0}
mock_results/1.0.0-dev1/multi-swe-bench.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 35.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093026", "tags": ["multi-swe-bench"], "total_cost": 27.6, "total_runtime": 476.0}
2
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093040", "tags": ["multi-swe-bench"], "total_cost": 26.4, "total_runtime": 464.0}
3
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.4, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093048", "tags": ["multi-swe-bench"], "total_cost": 24.2, "total_runtime": 442.0}
4
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 24.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093058", "tags": ["multi-swe-bench"], "total_cost": 22.05, "total_runtime": 420.5}
5
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 21.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093067", "tags": ["multi-swe-bench"], "total_cost": 20.75, "total_runtime": 407.5}
mock_results/1.0.0-dev1/swe-bench-multimodal.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 42.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093172", "tags": ["swe-bench-multimodal"], "total_cost": 31.05, "total_runtime": 510.5}
2
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 39.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093187", "tags": ["swe-bench-multimodal"], "total_cost": 29.75, "total_runtime": 497.5}
3
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 34.2, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093200", "tags": ["swe-bench-multimodal"], "total_cost": 27.1, "total_runtime": 471.0}
4
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 28.9, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093208", "tags": ["swe-bench-multimodal"], "total_cost": 24.45, "total_runtime": 444.5}
5
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 25.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.093218", "tags": ["swe-bench-multimodal"], "total_cost": 22.85, "total_runtime": 428.5}
mock_results/1.0.0-dev1/swe-bench.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 48.3, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092865", "tags": ["swe-bench"], "total_cost": 34.15, "total_runtime": 541.5}
2
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 45.1, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092895", "tags": ["swe-bench"], "total_cost": 32.55, "total_runtime": 525.5}
3
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 38.7, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092908", "tags": ["swe-bench"], "total_cost": 29.35, "total_runtime": 493.5}
4
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 32.5, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092916", "tags": ["swe-bench"], "total_cost": 26.25, "total_runtime": 462.5}
5
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 29.8, "metric": "resolve_rate", "submission_time": "2025-11-24T19:56:00.092922", "tags": ["swe-bench"], "total_cost": 24.9, "total_runtime": 449.0}
mock_results/1.0.0-dev1/swt-bench.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"agent_name": "OpenHands CodeAct v2.1", "llm_base": "claude-3-5-sonnet-20241022", "openness": "closed_api_available", "tool_usage": "standard", "score": 65.4, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093304", "tags": ["swt-bench"], "total_cost": 42.7, "total_runtime": 627.0}
2
+ {"agent_name": "OpenHands CodeAct v2.0", "llm_base": "gpt-4o-2024-11-20", "openness": "closed_api_available", "tool_usage": "standard", "score": 62.3, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093317", "tags": ["swt-bench"], "total_cost": 41.15, "total_runtime": 611.5}
3
+ {"agent_name": "AutoCodeRover", "llm_base": "gpt-4-turbo-2024-04-09", "openness": "closed_api_available", "tool_usage": "standard", "score": 54.1, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093326", "tags": ["swt-bench"], "total_cost": 37.05, "total_runtime": 570.5}
4
+ {"agent_name": "Agentless", "llm_base": "gpt-4o-mini-2024-07-18", "openness": "closed_api_available", "tool_usage": "standard", "score": 47.8, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093333", "tags": ["swt-bench"], "total_cost": 33.9, "total_runtime": 539.0}
5
+ {"agent_name": "SWE-Agent", "llm_base": "claude-3-opus-20240229", "openness": "closed_api_available", "tool_usage": "custom_interface", "score": 44.2, "metric": "success_rate", "submission_time": "2025-11-24T19:56:00.093338", "tags": ["swt-bench"], "total_cost": 32.1, "total_runtime": 521.0}
requirements.txt CHANGED
@@ -1,132 +1,13 @@
1
- agent-eval==0.1.43
2
- aiobotocore==2.22.0
3
- aiofiles==24.1.0
4
- aiohappyeyeballs==2.6.1
5
- aiohttp==3.11.18
6
- aioitertools==0.12.0
7
- aiosignal==1.3.2
8
- annotated-types==0.7.0
9
- anyio==4.9.0
10
- APScheduler==3.11.0
11
- async-timeout==5.0.1
12
- attrs==25.3.0
13
- Authlib==1.5.2
14
- beautifulsoup4==4.13.4
15
- black==25.1.0
16
- botocore==1.37.3
17
- certifi==2025.4.26
18
- cffi==1.17.1
19
- charset-normalizer==3.4.2
20
- click==8.1.8
21
- contourpy==1.3.2
22
- cryptography==44.0.3
23
- cycler==0.12.1
24
- datasets==4.0.0
25
- debugpy==1.8.14
26
- dill==0.3.8
27
- distro==1.9.0
28
- docstring_parser==0.16
29
- exceptiongroup==1.2.2
30
- fastapi==0.115.12
31
- ffmpy==0.5.0
32
- filelock==3.18.0
33
- fonttools==4.58.1
34
- frozenlist==1.6.0
35
- fsspec==2025.3.0
36
  gradio==5.30.0
37
- gradio_client==1.10.1
38
- gradio_modal==0.0.4
39
- groovy==0.1.2
40
- h11==0.16.0
41
- httpcore==1.0.9
42
- httpx==0.28.1
43
- huggingface-hub==0.30.2
44
- idna==3.10
45
- ijson==3.3.0
46
- importlib_metadata==8.7.0
47
- inspect_ai==0.3.104
48
- isort==6.0.1
49
- itsdangerous==2.2.0
50
- Jinja2==3.1.6
51
- jiter==0.9.0
52
- jmespath==1.0.1
53
- jsonlines==4.0.0
54
- jsonpatch==1.33
55
- jsonpointer==3.0.0
56
- jsonschema==4.23.0
57
- jsonschema-specifications==2025.4.1
58
- kiwisolver==1.4.8
59
- linkify-it-py==2.0.3
60
- litellm==1.68.1
61
- markdown-it-py==3.0.0
62
- MarkupSafe==3.0.2
63
- matplotlib==3.10.3
64
- mdit-py-plugins==0.4.2
65
- mdurl==0.1.2
66
- mmh3==5.1.0
67
- mplcursors==0.6
68
- multidict==6.4.3
69
- multiprocess==0.70.16
70
- mypy_extensions==1.1.0
71
- narwhals==1.38.2
72
- nest-asyncio==1.6.0
73
- numpy==2.2.5
74
- openai==1.75.0
75
- orjson==3.10.18
76
- packaging==25.0
77
  pandas==2.2.3
78
- pathspec==0.12.1
79
- pillow==11.2.1
80
- platformdirs==4.3.7
81
  plotly==6.0.1
82
- propcache==0.3.1
83
- psutil==7.0.0
84
- pyarrow==20.0.0
85
- pycparser==2.22
86
- pydantic==2.11.4
87
- pydantic_core==2.33.2
88
- pydub==0.25.1
89
- Pygments==2.19.1
90
- pyparsing==3.2.3
91
- python-dateutil==2.9.0.post0
92
- python-dotenv==1.1.0
93
- python-multipart==0.0.20
94
- pytz==2025.2
95
- PyYAML==6.0.2
96
- referencing==0.36.2
97
- regex==2024.11.6
98
  requests==2.32.3
99
- rich==13.9.4
100
- rpds-py==0.24.0
101
- ruff==0.11.8
102
- s3fs==2025.3.0
103
- safehttpx==0.1.6
104
  seaborn==0.13.2
105
- semantic-version==2.10.0
106
- semver==3.0.4
107
- shellingham==1.5.4
108
- shortuuid==1.0.13
109
- six==1.17.0
110
- sniffio==1.3.1
111
- soupsieve==2.7
112
- starlette==0.46.2
113
- tenacity==9.1.2
114
- textual<3.0.0
115
- tiktoken==0.9.0
116
- tokenizers==0.21.1
117
- tomli==2.2.1
118
- tomlkit==0.13.2
119
- tqdm==4.67.1
120
- typer==0.15.3
121
- typing-inspection==0.4.0
122
- typing_extensions==4.13.2
123
- tzdata==2025.2
124
- tzlocal==5.3.1
125
- uc-micro-py==1.0.3
126
- urllib3==2.4.0
127
- uvicorn==0.34.2
128
- websockets==15.0.1
129
- wrapt==1.17.2
130
- xxhash==3.5.0
131
- yarl==1.20.0
132
- zipp==3.21.0
 
1
+ # Core dependencies
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  gradio==5.30.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  pandas==2.2.3
 
 
 
4
  plotly==6.0.1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  requests==2.32.3
6
+ huggingface-hub==0.30.2
7
+ APScheduler==3.11.0
8
+
9
+ # Additional dependencies for UI and processing
10
+ matplotlib==3.10.3
11
  seaborn==0.13.2
12
+ Pillow==11.2.1
13
+ PyYAML==6.0.2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
simple_data_loader.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Simple data loader for OpenHands Index leaderboard.
3
+ Loads JSONL files from local directory or GitHub repository.
4
+ """
5
+ import os
6
+ import pandas as pd
7
+ import json
8
+ from pathlib import Path
9
+
10
+
11
class SimpleLeaderboardViewer:
    """Simple replacement for agent-eval's LeaderboardViewer."""

    def __init__(self, data_dir: str, config: str, split: str):
        """
        Args:
            data_dir: Path to data directory
            config: Config name (e.g., "1.0.0-dev1")
            split: Split name (e.g., "validation" or "test")
        """
        self.data_dir = Path(data_dir)
        self.config = config
        self.split = split
        self.config_path = self.data_dir / config

        # Read the suite configuration if one is present; otherwise fall
        # back to an empty stub so the viewer still works without it.
        config_file = self.config_path / "agenteval.json"
        if config_file.exists():
            with open(config_file) as fh:
                self.suite_config = json.load(fh)["suite_config"]
        else:
            self.suite_config = {
                "name": "openhands-index",
                "version": config,
                "splits": []
            }

        # Map each tag to the task names carrying it, restricted to the
        # requested split.
        self.tag_map = {}
        relevant = (s for s in self.suite_config.get("splits", [])
                    if s["name"] == split)
        for split_cfg in relevant:
            for task in split_cfg.get("tasks", []):
                for tag in task.get("tags", []):
                    self.tag_map.setdefault(tag, []).append(task["name"])

    def _load(self):
        """Load the JSONL file for the split and return DataFrame and tag map."""
        jsonl_file = self.config_path / f"{self.split}.jsonl"

        if not jsonl_file.exists():
            # Surface the problem as a one-row "Message" frame rather than raising.
            msg = f"No data found for split '{self.split}'. Expected file: {jsonl_file}"
            return pd.DataFrame({"Message": [msg]}), {}

        try:
            with open(jsonl_file, 'r') as fh:
                records = [json.loads(line) for line in fh if line.strip()]

            if not records:
                return pd.DataFrame({"Message": [f"No data in file: {jsonl_file}"]}), {}

            return pd.DataFrame(records), self.tag_map
        except Exception as exc:
            return pd.DataFrame({"Message": [f"Error loading data: {exc}"]}), {}

    def get_dataframe(self):
        """Get the raw dataframe."""
        return self._load()[0]
83
+
84
+
85
+ def load_mock_data_locally(data_dir: str = "mock_results"):
86
+ """
87
+ Load mock data from local directory for testing.
88
+
89
+ Args:
90
+ data_dir: Path to mock results directory
91
+
92
+ Returns:
93
+ Dictionary mapping split names to SimpleLeaderboardViewer instances
94
+ """
95
+ viewers = {}
96
+ data_path = Path(data_dir)
97
+
98
+ if not data_path.exists():
99
+ print(f"Warning: Mock data directory '{data_dir}' not found")
100
+ return viewers
101
+
102
+ # Find all config directories
103
+ for config_dir in data_path.iterdir():
104
+ if config_dir.is_dir():
105
+ config_name = config_dir.name
106
+
107
+ # Find all JSONL files (each represents a split)
108
+ for jsonl_file in config_dir.glob("*.jsonl"):
109
+ split_name = jsonl_file.stem
110
+ viewer = SimpleLeaderboardViewer(
111
+ data_dir=str(data_path),
112
+ config=config_name,
113
+ split=split_name
114
+ )
115
+ viewers[split_name] = viewer
116
+
117
+ return viewers
submission.py CHANGED
@@ -2,11 +2,6 @@ import logging
2
  import typing
3
 
4
  import matplotlib
5
- from agenteval.cli import SUBMISSION_METADATA_FILENAME
6
- from agenteval.models import SubmissionMetadata
7
- from datasets.exceptions import DataFilesNotFoundError
8
- from gradio_modal import Modal
9
-
10
  matplotlib.use('Agg')
11
 
12
  import os
@@ -17,12 +12,15 @@ from email.utils import parseaddr
17
 
18
  import gradio as gr
19
  import requests
20
- from agenteval.leaderboard.upload import sanitize_path_component, _validate_path_component
21
- from datasets import Dataset, DatasetDict, VerificationMode, load_dataset
22
- from datasets.data_files import EmptyDatasetError
23
  from huggingface_hub import HfApi
24
 
25
  import aliases
 
 
 
 
 
 
26
  from config import (
27
  CONFIG_NAME,
28
  CONTACT_DATASET,
 
2
  import typing
3
 
4
  import matplotlib
 
 
 
 
 
5
  matplotlib.use('Agg')
6
 
7
  import os
 
12
 
13
  import gradio as gr
14
  import requests
 
 
 
15
  from huggingface_hub import HfApi
16
 
17
  import aliases
18
+ from submission_utils import (
19
+ SUBMISSION_METADATA_FILENAME,
20
+ SubmissionMetadata,
21
+ sanitize_path_component,
22
+ _validate_path_component
23
+ )
24
  from config import (
25
  CONFIG_NAME,
26
  CONTACT_DATASET,
submission_utils.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utilities for submission handling, replacing agent-eval dependencies.
3
+ """
4
+ import re
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+
9
# Constants
# Filename of the metadata JSON expected alongside each submission upload.
SUBMISSION_METADATA_FILENAME = "metadata.json"
11
+
12
+
13
# Simple SubmissionMetadata class
class SubmissionMetadata:
    """Simple metadata for submissions.

    Accepts the metadata fields as keyword arguments; any missing field
    defaults to the empty string, and unknown keywords are ignored.
    """

    # Single source of truth for the metadata fields, in output order.
    _FIELDS = (
        "agent_name",
        "llm_base",
        "openness",
        "tool_usage",
        "submitter_name",
        "submitter_email",
    )

    def __init__(self, **kwargs):
        for field in self._FIELDS:
            setattr(self, field, kwargs.get(field, ""))

    def to_dict(self):
        """Return the metadata as a plain dict of the known fields."""
        return {field: getattr(self, field) for field in self._FIELDS}
33
+
34
+
35
+ # Path validation functions
36
+ def _validate_path_component(component: str, allow_underscores: bool = True) -> None:
37
+ """
38
+ Validate a single path component.
39
+
40
+ Args:
41
+ component: The path component to validate
42
+ allow_underscores: Whether to allow underscores in the component
43
+
44
+ Raises:
45
+ ValueError: If the component is invalid
46
+ """
47
+ if not component:
48
+ raise ValueError("Path component cannot be empty")
49
+
50
+ if component in (".", ".."):
51
+ raise ValueError(f"Path component cannot be '{component}'")
52
+
53
+ # Check for invalid characters
54
+ pattern = r'^[a-zA-Z0-9_\-\.]+$' if allow_underscores else r'^[a-zA-Z0-9\-\.]+$'
55
+ if not re.match(pattern, component):
56
+ raise ValueError(
57
+ f"Path component '{component}' contains invalid characters. "
58
+ f"Only alphanumeric, hyphens, dots{', and underscores' if allow_underscores else ''} are allowed."
59
+ )
60
+
61
+
62
def sanitize_path_component(component: str, replacement: str = "_") -> str:
    """
    Sanitize a path component by replacing invalid characters.

    Args:
        component: The path component to sanitize
        replacement: The character to use for replacing invalid characters

    Returns:
        Sanitized path component ("unnamed" when nothing usable remains)
    """
    if not component:
        return "unnamed"

    # Substitute anything outside [alnum _ - .] with the replacement char,
    # then trim leading/trailing dots and hyphens.
    cleaned = re.sub(r'[^a-zA-Z0-9_\-\.]', replacement, component).strip('.-')

    # Collapse runs of the replacement character into a single occurrence.
    cleaned = re.sub(f'{re.escape(replacement)}+', replacement, cleaned)

    return cleaned or "unnamed"
tests/integration/test_submission.py CHANGED
@@ -4,14 +4,12 @@ from datetime import datetime
4
 
5
  import gradio
6
  import pytest
7
- import pyarrow as pa
8
- from agenteval.models import SubmissionMetadata
9
- from datasets import load_dataset, VerificationMode
10
  from huggingface_hub import HfApi, hf_hub_download
11
 
12
  from aliases import CANONICAL_TOOL_USAGE_STANDARD, CANONICAL_OPENNESS_CLOSED_UI_ONLY
13
  from config import IS_INTERNAL, CONFIG_NAME, CONTACT_DATASET, SUBMISSION_DATASET
14
  from submission import add_new_eval
 
15
 
16
  _hf = HfApi()
17
 
 
4
 
5
  import gradio
6
  import pytest
 
 
 
7
  from huggingface_hub import HfApi, hf_hub_download
8
 
9
  from aliases import CANONICAL_TOOL_USAGE_STANDARD, CANONICAL_OPENNESS_CLOSED_UI_ONLY
10
  from config import IS_INTERNAL, CONFIG_NAME, CONTACT_DATASET, SUBMISSION_DATASET
11
  from submission import add_new_eval
12
+ from submission_utils import SubmissionMetadata
13
 
14
  _hf = HfApi()
15
 
ui_components.py CHANGED
@@ -4,10 +4,10 @@ import plotly.graph_objects as go
4
  import os
5
  import base64
6
 
7
- from agenteval.leaderboard.view import LeaderboardViewer
8
  from huggingface_hub import HfApi
9
 
10
  import aliases
 
11
  from leaderboard_transformer import (
12
  DataTransformer,
13
  transform_raw_dataframe,
@@ -473,12 +473,14 @@ def get_leaderboard_viewer_instance(split: str):
473
 
474
  # --- Cache miss: try to load data from the source ---
475
  try:
476
- print(f"Using Hugging Face dataset for split '{split}': {RESULTS_DATASET}/{CONFIG_NAME}")
477
- viewer = LeaderboardViewer(
478
- repo_id=RESULTS_DATASET,
 
 
 
479
  config=CONFIG_NAME,
480
- split=split,
481
- is_internal=IS_INTERNAL
482
  )
483
 
484
  # Simplify tag map creation
 
4
  import os
5
  import base64
6
 
 
7
  from huggingface_hub import HfApi
8
 
9
  import aliases
10
+ from simple_data_loader import SimpleLeaderboardViewer
11
  from leaderboard_transformer import (
12
  DataTransformer,
13
  transform_raw_dataframe,
 
473
 
474
  # --- Cache miss: try to load data from the source ---
475
  try:
476
+ # First try to load from extracted data directory (local mock data)
477
+ data_dir = EXTRACTED_DATA_DIR if os.path.exists(EXTRACTED_DATA_DIR) else "mock_results"
478
+
479
+ print(f"Loading data for split '{split}' from: {data_dir}/{CONFIG_NAME}")
480
+ viewer = SimpleLeaderboardViewer(
481
+ data_dir=data_dir,
482
  config=CONFIG_NAME,
483
+ split=split
 
484
  )
485
 
486
  # Simplify tag map creation