IgorSlinko's picture
Add SWE-bench leaderboard viewer with S3 trajectory download
781ed01
raw
history blame
5.47 kB
import json
import os
import subprocess
from pathlib import Path
import gradio as gr
import pandas as pd
from src.download_swebench_leaderboard import download_leaderboard, get_leaderboard
# Root directory for everything this app stores on disk (relative to the CWD).
DATA_DIR = Path("data")
# Per-model trajectory downloads are placed in sub-folders of this directory.
TRAJS_DIR = DATA_DIR.joinpath("swebench_trajs")
# Local copy of the leaderboard JSON, reused on subsequent runs.
LEADERBOARD_CACHE = DATA_DIR.joinpath("swebench_leaderboard_latest.json")
# S3 prefix under which each model folder's trajectories are looked up.
S3_BUCKET = "s3://swe-bench-experiments/bash-only"
def load_or_download_leaderboard():
    """Return the parsed leaderboard JSON, downloading it if it is not cached.

    When ``LEADERBOARD_CACHE`` does not exist, ``download_leaderboard`` is
    called with ``DATA_DIR`` as its output directory and the file it reports
    is moved to the cache path. The cache file is then read and parsed.

    Returns:
        The object produced by ``json.load`` on the cache file. Callers in
        this module treat it as a dict with a ``"leaderboards"`` key.
    """
    if not LEADERBOARD_CACHE.exists():
        # Make sure the destination directory exists before moving into it.
        DATA_DIR.mkdir(parents=True, exist_ok=True)
        filename = download_leaderboard(output_dir=str(DATA_DIR))
        # os.replace overwrites an existing destination on every platform,
        # whereas os.rename raises on Windows if the cache appeared meanwhile.
        os.replace(filename, LEADERBOARD_CACHE)
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(LEADERBOARD_CACHE, encoding="utf-8") as f:
        return json.load(f)
def get_bash_only_df():
    """Build the table shown in the UI from the "bash-only" leaderboard.

    Returns:
        A ``pandas.DataFrame`` with one row per leaderboard result and the
        columns ``name``, ``date``, ``cost``, ``instance_cost``,
        ``instance_calls``, ``folder``, ``os_model`` and ``os_system``.
        An empty DataFrame is returned when no leaderboard named
        "bash-only" is present.
    """
    data = load_or_download_leaderboard()
    leaderboards = data.get("leaderboards", [])
    # .get() so an entry without a "name" key is skipped instead of raising.
    bash_only = next(
        (lb for lb in leaderboards if lb.get("name") == "bash-only"),
        None,
    )
    if not bash_only:
        return pd.DataFrame()

    rows = [
        {
            "name": r.get("name", ""),
            "date": r.get("date", ""),
            # `or 0` also covers keys that are present but null in the JSON,
            # which would otherwise make round() raise TypeError.
            "cost": round(r.get("cost") or 0, 2),
            "instance_cost": round(r.get("instance_cost") or 0, 4),
            "instance_calls": r.get("instance_calls", 0),
            "folder": r.get("folder", ""),
            # Check mark / cross mark for the two boolean-like flags.
            "os_model": "✅" if r.get("os_model") else "❌",
            "os_system": "✅" if r.get("os_system") else "❌",
        }
        for r in bash_only.get("results", [])
    ]
    return pd.DataFrame(rows)
def get_model_details(folder: str):
    """Look up one "bash-only" leaderboard entry by its folder id.

    Args:
        folder: Value of the entry's ``"folder"`` field.

    Returns:
        A ``(model, error)`` pair. On success ``model`` is the matching
        result entry and ``error`` is ``None``; otherwise ``model`` is
        ``None`` and ``error`` is a human-readable message.
    """
    if not folder:
        return None, "Select a model from the table"

    payload = load_or_download_leaderboard()

    # Find the first leaderboard called "bash-only".
    board = None
    for candidate in payload.get("leaderboards", []):
        if candidate["name"] == "bash-only":
            board = candidate
            break
    if not board:
        return None, "Leaderboard not found"

    # Find the first result whose folder matches the requested one.
    match = None
    for entry in board["results"]:
        if entry.get("folder") == folder:
            match = entry
            break
    if not match:
        return None, f"Model with folder '{folder}' not found"

    return match, None
def _count_trajectory_files(output_dir):
    """Count trajectory files under *output_dir*.

    Files matching ``*/*.traj.json`` are counted first; if there are none,
    top-level ``*.json`` files are counted instead.
    """
    nested = sum(1 for _ in output_dir.glob("*/*.traj.json"))
    if nested:
        return nested
    return sum(1 for _ in output_dir.glob("*.json"))


def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
    """Download a model's trajectories from S3 and return a status message.

    The files are copied with the AWS CLI (unsigned request) from
    ``{S3_BUCKET}/{folder}/trajs/`` into ``TRAJS_DIR / folder``. If that
    directory already contains anything, nothing is downloaded.

    Args:
        folder: Folder id of the model, as listed in the leaderboard.
        progress: Gradio progress tracker; called once before the download.

    Returns:
        A status string for the UI. Failures are reported in the string
        rather than raised.
    """
    if not folder:
        return "❌ No model selected"

    model, error = get_model_details(folder)
    if error:
        return f"❌ {error}"

    output_dir = TRAJS_DIR / folder
    # NOTE(review): any content counts as "already downloaded", so a partial
    # download left behind by a timeout is not retried - confirm intended.
    if output_dir.exists() and any(output_dir.iterdir()):
        file_count = _count_trajectory_files(output_dir)
        return f"✅ Already downloaded: {output_dir}\n\n{file_count} trajectory files"

    s3_path = f"{S3_BUCKET}/{folder}/trajs/"
    output_dir.mkdir(parents=True, exist_ok=True)
    progress(0, desc="Starting S3 download...")

    try:
        result = subprocess.run(
            ["aws", "s3", "cp", "--recursive", s3_path, str(output_dir), "--no-sign-request"],
            capture_output=True,
            text=True,
            timeout=600,
        )
    except subprocess.TimeoutExpired:
        return "❌ Download timed out (>10 min)"
    except FileNotFoundError:
        # Raised when the `aws` executable cannot be found.
        return "❌ AWS CLI not found. Install with: pip install awscli"
    except Exception as e:
        # UI boundary: report the problem in the status box instead of raising.
        return f"❌ Error: {e}"

    if result.returncode != 0:
        return f"❌ S3 download failed:\n{result.stderr}"

    file_count = _count_trajectory_files(output_dir)

    per_instance = model.get("per_instance_details") or {}
    resolved_count = sum(
        1
        for v in per_instance.values()
        if isinstance(v, dict) and v.get("resolved")
    )
    total_count = len(per_instance)

    resolved_line = f"Resolved: {resolved_count}/{total_count}"
    # Only show a percentage when there is something to divide by; a model
    # without per-instance details must not turn a successful download
    # into a "division by zero" error message.
    if total_count:
        resolved_line += f" ({100 * resolved_count / total_count:.1f}%)"

    return f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\n{resolved_line}"
def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
    """Handle a selection in the leaderboard table.

    Args:
        evt: Gradio selection event; ``evt.index`` is either a single
            position or a list/tuple whose first item is the row position.
        df: The DataFrame currently shown in the table.

    Returns:
        A ``(folder, name, button_update)`` triple. With no selection the
        two strings are empty and the button is left unchanged; otherwise
        the button is made interactive.
    """
    selection = evt.index
    if selection is None:
        return "", "", gr.update()

    if isinstance(selection, (list, tuple)):
        position = selection[0]
    else:
        position = selection

    record = df.iloc[position]
    return record["folder"], record["name"], gr.update(interactive=True)
def build_app():
    """Assemble the Gradio UI and return the ``gr.Blocks`` app.

    Layout: the leaderboard table on the left, and on the right the
    selected model's name and folder, a download button and a status box.
    Selecting a table cell fills in the model fields and enables the
    button; clicking the button runs the S3 trajectory download.
    """
    df = get_bash_only_df()

    with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
        gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard")
        gr.Markdown("Select a model to use as base for cost analysis")

        with gr.Row():
            with gr.Column(scale=3):
                leaderboard_table = gr.Dataframe(
                    value=df,
                    label="Bash-Only Leaderboard",
                    interactive=False,
                    wrap=True,
                )
            with gr.Column(scale=1):
                gr.Markdown("### Selected Model")
                selected_name = gr.Textbox(label="Model Name", interactive=False)
                selected_folder = gr.Textbox(label="Folder ID", interactive=False)
                # Starts disabled; on_row_select enables it once a row is chosen.
                download_btn = gr.Button("📥 Download Trajectories", interactive=False)
                download_status = gr.Textbox(label="Status", interactive=False, lines=3)

        # Output order must match on_row_select's return order:
        # (folder, name, button update).
        leaderboard_table.select(
            fn=on_row_select,
            inputs=[leaderboard_table],
            outputs=[selected_folder, selected_name, download_btn],
        )
        download_btn.click(
            fn=download_trajectories_from_s3,
            inputs=[selected_folder],
            outputs=[download_status],
        )

    return app
if __name__ == "__main__":
    # Script entry point: build the UI and start the Gradio server.
    app = build_app()
    app.launch()