IgorSlinko committed on
Commit
781ed01
·
1 Parent(s): fd3977c

Add SWE-bench leaderboard viewer with S3 trajectory download

Browse files

- Gradio app showing bash-only leaderboard
- Download trajectories from S3 bucket
- UV-based dependency management

Files changed (7) hide show
  1. .env.example +1 -0
  2. .gitignore +8 -0
  3. app.py +172 -0
  4. pyproject.toml +26 -0
  5. src/__init__.py +0 -0
  6. src/download_swebench_leaderboard.py +84 -0
  7. uv.lock +0 -0
.env.example ADDED
@@ -0,0 +1 @@
 
 
1
+ HF_TOKEN=
.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ .env
2
+ .venv/
3
+ __pycache__/
4
+ *.pyc
5
+ data/
6
+ .DS_Store
7
+
8
+
app.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import subprocess
4
+ from pathlib import Path
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+
9
+ from src.download_swebench_leaderboard import download_leaderboard, get_leaderboard
10
+
11
# Local working directory; the paths below are all located beneath it.
DATA_DIR = Path("data")
# Parent directory of the per-model trajectory downloads.
TRAJS_DIR = DATA_DIR.joinpath("swebench_trajs")
# Fixed filename under which the leaderboard snapshot is cached.
LEADERBOARD_CACHE = DATA_DIR.joinpath("swebench_leaderboard_latest.json")
# S3 prefix that the trajectory downloads are copied from.
S3_BUCKET = "s3://swe-bench-experiments/bash-only"
15
+
16
+
17
def load_or_download_leaderboard():
    """Return the leaderboard JSON, downloading it first if no cache exists.

    When ``LEADERBOARD_CACHE`` is missing, ``download_leaderboard`` fetches a
    timestamped snapshot into ``DATA_DIR``; that file is then moved to the
    cache path so later calls skip the download.

    Returns:
        The parsed JSON document.
    """
    if not LEADERBOARD_CACHE.exists():
        snapshot = download_leaderboard(output_dir=str(DATA_DIR))
        # os.replace overwrites an existing target on every platform, whereas
        # os.rename raises on Windows if the cache file appeared in between
        # the exists() check and this move.
        os.replace(snapshot, LEADERBOARD_CACHE)

    # The snapshot is written as UTF-8 with ensure_ascii=False, so it has to
    # be decoded as UTF-8 rather than with the platform default encoding.
    with open(LEADERBOARD_CACHE, encoding="utf-8") as f:
        return json.load(f)
26
+
27
+
28
def get_bash_only_df():
    """Build the table of bash-only leaderboard results shown in the UI.

    Returns:
        A DataFrame with one row per result and the columns ``name``,
        ``date``, ``cost``, ``instance_cost``, ``instance_calls``, ``folder``,
        ``os_model`` and ``os_system``. An empty DataFrame is returned when
        the leaderboard data has no "bash-only" entry.
    """
    data = load_or_download_leaderboard()
    leaderboards = data.get("leaderboards", [])
    # .get() so an entry without a "name" key is skipped instead of raising.
    bash_only = next((lb for lb in leaderboards if lb.get("name") == "bash-only"), None)

    if not bash_only:
        return pd.DataFrame()

    rows = [
        {
            "name": r.get("name", ""),
            "date": r.get("date", ""),
            # `or 0` also covers keys that are present but null in the JSON,
            # which round() would otherwise reject with a TypeError.
            "cost": round(r.get("cost") or 0, 2),
            "instance_cost": round(r.get("instance_cost") or 0, 4),
            "instance_calls": r.get("instance_calls") or 0,
            "folder": r.get("folder", ""),
            "os_model": "✅" if r.get("os_model") else "❌",
            "os_system": "✅" if r.get("os_system") else "❌",
        }
        for r in bash_only.get("results", [])
    ]
    return pd.DataFrame(rows)
51
+
52
+
53
def get_model_details(folder: str):
    """Look up a bash-only leaderboard result by its ``folder`` value.

    Args:
        folder: Value of the ``folder`` field of the wanted result.

    Returns:
        A ``(model, error)`` pair: the matching result dict and ``None`` on
        success, or ``None`` and a human-readable message on failure.
    """
    if not folder:
        return None, "Select a model from the table"

    data = load_or_download_leaderboard()
    leaderboards = data.get("leaderboards", [])
    # .get() so an entry without a "name" key is skipped instead of raising.
    bash_only = next((lb for lb in leaderboards if lb.get("name") == "bash-only"), None)

    if not bash_only:
        return None, "Leaderboard not found"

    # A leaderboard without a "results" list is treated as having no models.
    results = bash_only.get("results", [])
    model = next((r for r in results if r.get("folder") == folder), None)
    if not model:
        return None, f"Model with folder '{folder}' not found"

    return model, None
69
+
70
+
71
def _count_trajectory_files(directory):
    """Count ``*/*.traj.json`` files, falling back to top-level ``*.json`` files."""
    nested = sum(1 for _ in directory.glob("*/*.traj.json"))
    return nested or sum(1 for _ in directory.glob("*.json"))


def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
    """Copy a model's trajectories from S3 into ``TRAJS_DIR / folder``.

    The copy is done by shelling out to ``aws s3 cp --recursive`` with
    ``--no-sign-request`` and a 10 minute timeout. A non-empty target
    directory is treated as an earlier completed download and is not fetched
    again.

    Args:
        folder: ``folder`` value of the leaderboard result to download.
        progress: Gradio progress tracker; only the start of the download is
            reported through it.

    Returns:
        A status message for the UI, prefixed with ✅ on success or ❌ on
        failure. Failures are reported through the message, not raised.
    """
    # Local import: only needed to clean up after a failed download.
    import shutil

    if not folder:
        return "❌ No model selected"

    model, error = get_model_details(folder)
    if error:
        return f"❌ {error}"

    output_dir = TRAJS_DIR / folder
    if output_dir.exists() and any(output_dir.iterdir()):
        file_count = _count_trajectory_files(output_dir)
        return f"✅ Already downloaded: {output_dir}\n\n{file_count} trajectory files"

    s3_path = f"{S3_BUCKET}/{folder}/trajs/"
    output_dir.mkdir(parents=True, exist_ok=True)

    progress(0, desc="Starting S3 download...")

    try:
        result = subprocess.run(
            ["aws", "s3", "cp", "--recursive", s3_path, str(output_dir), "--no-sign-request"],
            capture_output=True,
            text=True,
            timeout=600,
        )
    except subprocess.TimeoutExpired:
        failure = "❌ Download timed out (>10 min)"
    except FileNotFoundError:
        failure = "❌ AWS CLI not found. Install with: pip install awscli"
    except Exception as e:
        # UI boundary: surface unexpected failures as a status message.
        failure = f"❌ Error: {e}"
    else:
        failure = None if result.returncode == 0 else f"❌ S3 download failed:\n{result.stderr}"

    if failure:
        # The directory was empty or absent before this call, so anything in
        # it now is a partial download. Remove it; otherwise the next attempt
        # would hit the "Already downloaded" shortcut above.
        shutil.rmtree(output_dir, ignore_errors=True)
        return failure

    file_count = _count_trajectory_files(output_dir)
    summary = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files"

    per_instance = model.get("per_instance_details") or {}
    total_count = len(per_instance)
    # Without per-instance details there is no resolve rate to report (and
    # the percentage would divide by zero), so the line is left out.
    if total_count:
        resolved_count = sum(
            1 for v in per_instance.values() if isinstance(v, dict) and v.get("resolved")
        )
        rate = 100 * resolved_count / total_count
        summary += f"\nResolved: {resolved_count}/{total_count} ({rate:.1f}%)"

    return summary
116
+
117
+
118
def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
    """Handle a click on the leaderboard table.

    Args:
        evt: Gradio selection event; ``evt.index`` is either a single index
            or a list/tuple whose first element is the row index.
        df: The table contents the selection refers to.

    Returns:
        ``(folder, name, button_update)`` for the selected row, where the
        update makes the download button interactive. With no selection the
        two strings are empty and the button is left unchanged.
    """
    selection = evt.index
    if selection is None:
        return "", "", gr.update()

    if isinstance(selection, (list, tuple)):
        row_idx = selection[0]
    else:
        row_idx = selection

    picked = df.iloc[row_idx]
    return picked["folder"], picked["name"], gr.update(interactive=True)
128
+
129
+
130
def build_app():
    """Assemble the Gradio Blocks UI: leaderboard table plus a download panel.

    Returns:
        The ``gr.Blocks`` application, not yet launched.
    """
    leaderboard_df = get_bash_only_df()

    with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
        gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard")
        gr.Markdown("Select a model to use as base for cost analysis")

        with gr.Row():
            # Wide column: the read-only leaderboard table.
            with gr.Column(scale=3):
                table = gr.Dataframe(
                    value=leaderboard_df,
                    label="Bash-Only Leaderboard",
                    interactive=False,
                    wrap=True,
                )

            # Narrow column: current selection and the download controls.
            with gr.Column(scale=1):
                gr.Markdown("### Selected Model")
                name_box = gr.Textbox(label="Model Name", interactive=False)
                folder_box = gr.Textbox(label="Folder ID", interactive=False)

                # Starts disabled; on_row_select enables it once a row is picked.
                fetch_button = gr.Button("📥 Download Trajectories", interactive=False)
                status_box = gr.Textbox(label="Status", interactive=False, lines=3)

        # Output order matches on_row_select's return: folder, name, button update.
        table.select(
            fn=on_row_select,
            inputs=[table],
            outputs=[folder_box, name_box, fetch_button],
        )

        fetch_button.click(
            fn=download_trajectories_from_s3,
            inputs=[folder_box],
            outputs=[status_box],
        )

    return app


# Build and launch the UI only when this file is run as a script.
if __name__ == "__main__":
    app = build_app()
    app.launch()
172
+
pyproject.toml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "routing-money-calculation"
3
+ version = "0.1.0"
4
+ description = "Rough estimate of routing cost for AI agents"
5
+ readme = "README.md"
6
+ license = "Apache-2.0"
7
+ requires-python = ">=3.10"
8
+ dependencies = [
9
+ "gradio>=6.0.2",
10
+ "pandas>=2.0.0",
11
+ "requests>=2.31.0",
12
+ "python-dotenv>=1.0.0",
13
+ ]
14
+
15
+ [project.optional-dependencies]
16
+ dev = [
17
+ "ruff>=0.8.0",
18
+ ]
19
+
20
+ [tool.ruff]
21
+ line-length = 100
22
+ target-version = "py310"
23
+
24
+ [tool.ruff.lint]
25
+ select = ["E", "F", "I", "W"]
26
+
src/__init__.py ADDED
File without changes
src/download_swebench_leaderboard.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from datetime import datetime
4
+
5
+ import requests
6
+
7
+
8
# Raw GitHub URL of data/leaderboards.json in the swe-bench.github.io repository.
# Split across two literals to stay within the configured line length.
LEADERBOARD_URL = (
    "https://raw.githubusercontent.com/SWE-bench/swe-bench.github.io"
    "/master/data/leaderboards.json"
)
9
+
10
+
11
def download_leaderboard(output_dir: str = "data", timeout: float = 30.0) -> str:
    """Download the leaderboard JSON and save it as a timestamped file.

    Args:
        output_dir: Directory to write into; created if it does not exist.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        Path of the written file,
        ``<output_dir>/swebench_leaderboard_<YYYYmmdd_HHMMSS>.json``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    response = requests.get(LEADERBOARD_URL, timeout=timeout)
    response.raise_for_status()

    data = response.json()

    os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = os.path.join(output_dir, f"swebench_leaderboard_{timestamp}.json")

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"Downloaded leaderboard to {filename}")
    print(f"Available keys: {list(data.keys()) if isinstance(data, dict) else 'list'}")

    return filename
29
+
30
+
31
def get_leaderboard(leaderboard_name: str = "bash-only", timeout: float = 30.0) -> list:
    """Fetch the leaderboard JSON and return the results of one leaderboard.

    Args:
        leaderboard_name: Value of the ``name`` field of the wanted leaderboard.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The ``results`` list of the matching leaderboard (empty if the entry
        has no ``results`` key).

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
        ValueError: If no leaderboard has the requested name.
    """
    response = requests.get(LEADERBOARD_URL, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    leaderboards = data.get("leaderboards", [])
    for lb in leaderboards:
        if lb.get("name") == leaderboard_name:
            return lb.get("results", [])

    # List the names that do exist so a mistyped name is easy to correct.
    available = [lb.get("name") for lb in leaderboards]
    raise ValueError(f"Leaderboard '{leaderboard_name}' not found. Available: {available}")
43
+
44
+
45
def main():
    """CLI entry point: download the leaderboard and optionally outline it.

    With ``--show-structure`` the saved file is re-read and, if its top level
    is a JSON object, one summary line is printed per key.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Download SWE-bench leaderboard data")
    cli.add_argument(
        "--output-dir", default="data", help="Output directory for downloaded data"
    )
    cli.add_argument(
        "--show-structure", action="store_true", help="Print the structure of the JSON data"
    )
    options = cli.parse_args()

    saved_path = download_leaderboard(output_dir=options.output_dir)
    if not options.show_structure:
        return

    with open(saved_path, "r", encoding="utf-8") as fh:
        payload = json.load(fh)

    # Only a top-level object has keys to describe.
    if not isinstance(payload, dict):
        return

    print("\nJSON structure:")
    for key, value in payload.items():
        if isinstance(value, dict):
            print(f" {key}: dict with keys {list(value.keys())[:5]}...")
        elif isinstance(value, list):
            print(f" {key}: list with {len(value)} items")
            if value:
                head = value[0]
                # Show the first element's keys, or its type if it is not a dict.
                first_keys = list(head.keys()) if isinstance(head, dict) else type(head)
                print(f" First item keys: {first_keys}")
        else:
            print(f" {key}: {type(value).__name__}")


# Run the CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()
84
+
uv.lock ADDED
The diff for this file is too large to render. See raw diff