Commit
·
bca0945
1
Parent(s):
745add3
v0.3.14: Add logging, support alternative trajectory format, fix token columns
- Add logging setup with file handler (logs/app.log) and console output
- Log all parsing errors with full traceback (exc_info=True)
- Add unhandled exception hook to capture crashes
- Add logs/ to .gitignore
- Support alternative trajectory format (Llama 4 Scout style)
- Detect 'trajectory' array format vs 'messages' format
- New _parse_trajectory_format_to_steps() for trajectory format
- Extract api_calls from trajectory length when model_stats missing
- Add ensure_token_columns() to guarantee token columns exist
- Apply ensure_token_columns in load_all_trajectories, load_all_trajectories_calculated
- Fix KeyError: 'completion_tokens' for models with missing token data
- .gitignore +1 -0
- app.py +157 -15
.gitignore
CHANGED
|
@@ -3,6 +3,7 @@
|
|
| 3 |
__pycache__/
|
| 4 |
*.pyc
|
| 5 |
data/
|
|
|
|
| 6 |
.DS_Store
|
| 7 |
|
| 8 |
|
|
|
|
| 3 |
__pycache__/
|
| 4 |
*.pyc
|
| 5 |
data/
|
| 6 |
+
logs/
|
| 7 |
.DS_Store
|
| 8 |
|
| 9 |
|
app.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
| 1 |
import json
|
|
|
|
| 2 |
import os
|
| 3 |
import random
|
| 4 |
import re
|
| 5 |
import subprocess
|
|
|
|
| 6 |
from pathlib import Path
|
| 7 |
|
| 8 |
import gradio as gr
|
|
@@ -23,6 +25,29 @@ LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
|
|
| 23 |
LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"
|
| 24 |
S3_BUCKET = "s3://swe-bench-experiments/bash-only"
|
| 25 |
LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
_litellm_prices_cache = None
|
| 28 |
_trajectories_cache = {}
|
|
@@ -106,6 +131,11 @@ def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
|
|
| 106 |
data = json.load(f)
|
| 107 |
|
| 108 |
messages = data.get("messages", [])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
if not messages:
|
| 110 |
return []
|
| 111 |
|
|
@@ -151,6 +181,40 @@ def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
|
|
| 151 |
return steps
|
| 152 |
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
def get_default_overhead(model_name: str) -> float:
|
| 155 |
"""Get default tokenizer overhead for model provider"""
|
| 156 |
model_lower = model_name.lower() if model_name else ""
|
|
@@ -212,13 +276,33 @@ def apply_no_cache(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 212 |
return df
|
| 213 |
|
| 214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
|
| 216 |
"""Load trajectories with self-calculated token counts using calculate_routing_tokens"""
|
| 217 |
global _calculated_tokens_cache
|
| 218 |
|
| 219 |
cache_key = f"calculated_{folder}"
|
| 220 |
if cache_key in _calculated_tokens_cache:
|
| 221 |
-
return _calculated_tokens_cache[cache_key]
|
| 222 |
|
| 223 |
trajectory_steps = load_all_trajectory_steps(folder)
|
| 224 |
|
|
@@ -251,9 +335,9 @@ def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
|
|
| 251 |
"cache_creation_tokens": cache_creation,
|
| 252 |
})
|
| 253 |
except Exception as e:
|
| 254 |
-
|
| 255 |
|
| 256 |
-
df = pd.DataFrame(rows)
|
| 257 |
_calculated_tokens_cache[cache_key] = df
|
| 258 |
return df
|
| 259 |
|
|
@@ -301,7 +385,7 @@ def load_all_trajectory_steps(folder: str) -> dict[str, list[dict]]:
|
|
| 301 |
if steps:
|
| 302 |
result[instance_id] = steps
|
| 303 |
except Exception as e:
|
| 304 |
-
|
| 305 |
|
| 306 |
_trajectory_steps_cache[cache_key] = result
|
| 307 |
return result
|
|
@@ -519,10 +603,29 @@ def parse_trajectory(traj_path: Path) -> dict:
|
|
| 519 |
model_config = config.get("model", {})
|
| 520 |
model_name = model_config.get("cost_calc_model_override", model_config.get("model_name", ""))
|
| 521 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 522 |
result = {
|
| 523 |
"instance_id": data.get("instance_id", traj_path.stem),
|
| 524 |
"model_name": model_name,
|
| 525 |
-
"api_calls":
|
| 526 |
"instance_cost": model_stats.get("instance_cost", 0),
|
| 527 |
"prompt_tokens": 0,
|
| 528 |
"completion_tokens": 0,
|
|
@@ -555,7 +658,7 @@ def load_all_trajectories(folder: str) -> pd.DataFrame:
|
|
| 555 |
global _trajectories_cache
|
| 556 |
|
| 557 |
if folder in _trajectories_cache:
|
| 558 |
-
return _trajectories_cache[folder]
|
| 559 |
|
| 560 |
output_dir = TRAJS_DIR / folder
|
| 561 |
|
|
@@ -574,9 +677,9 @@ def load_all_trajectories(folder: str) -> pd.DataFrame:
|
|
| 574 |
try:
|
| 575 |
rows.append(parse_trajectory(traj_path))
|
| 576 |
except Exception as e:
|
| 577 |
-
|
| 578 |
|
| 579 |
-
df = pd.DataFrame(rows)
|
| 580 |
_trajectories_cache[folder] = df
|
| 581 |
return df
|
| 582 |
|
|
@@ -981,8 +1084,8 @@ def get_prices_for_folder(folder: str) -> tuple[dict, str]:
|
|
| 981 |
return result, model_hint
|
| 982 |
|
| 983 |
|
| 984 |
-
def
|
| 985 |
-
if
|
| 986 |
return (
|
| 987 |
"", "",
|
| 988 |
gr.update(visible=False),
|
|
@@ -994,7 +1097,6 @@ def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
|
|
| 994 |
gr.update(value=1.0),
|
| 995 |
)
|
| 996 |
|
| 997 |
-
row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
|
| 998 |
row = df.iloc[row_idx]
|
| 999 |
folder = row["folder"]
|
| 1000 |
name = row["name"]
|
|
@@ -1023,6 +1125,18 @@ def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
|
|
| 1023 |
)
|
| 1024 |
|
| 1025 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1026 |
def create_routed_token_chart(base_tokens: dict, additional_models: list):
|
| 1027 |
"""
|
| 1028 |
Create grouped bar chart for tokens by type, comparing base vs additional models.
|
|
@@ -1155,8 +1269,8 @@ def build_app():
|
|
| 1155 |
with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
|
| 1156 |
trajectories_state = gr.State(None)
|
| 1157 |
|
| 1158 |
-
gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.
|
| 1159 |
-
gr.Markdown("Select a model
|
| 1160 |
|
| 1161 |
with gr.Row():
|
| 1162 |
with gr.Column(scale=3):
|
|
@@ -1165,6 +1279,7 @@ def build_app():
|
|
| 1165 |
label="Bash-Only Leaderboard",
|
| 1166 |
interactive=False,
|
| 1167 |
wrap=True,
|
|
|
|
| 1168 |
)
|
| 1169 |
|
| 1170 |
with gr.Column(visible=False) as analysis_section:
|
|
@@ -1777,7 +1892,23 @@ def build_app():
|
|
| 1777 |
outputs=[selected_folder, selected_name, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
|
| 1778 |
)
|
| 1779 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1780 |
def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache, progress=gr.Progress()):
|
|
|
|
| 1781 |
empty_result = (
|
| 1782 |
"",
|
| 1783 |
gr.update(visible=False),
|
|
@@ -1789,10 +1920,12 @@ def build_app():
|
|
| 1789 |
)
|
| 1790 |
|
| 1791 |
if not folder:
|
|
|
|
| 1792 |
yield empty_result
|
| 1793 |
return
|
| 1794 |
|
| 1795 |
if not check_trajectories_downloaded(folder):
|
|
|
|
| 1796 |
yield (
|
| 1797 |
"⏳ Downloading trajectories...",
|
| 1798 |
gr.update(visible=False),
|
|
@@ -1802,8 +1935,10 @@ def build_app():
|
|
| 1802 |
None,
|
| 1803 |
gr.update(visible=False),
|
| 1804 |
)
|
|
|
|
| 1805 |
status, _ = download_trajectories_from_s3(folder)
|
| 1806 |
if "❌" in status:
|
|
|
|
| 1807 |
yield (
|
| 1808 |
status,
|
| 1809 |
gr.update(visible=False),
|
|
@@ -1814,6 +1949,7 @@ def build_app():
|
|
| 1814 |
gr.update(visible=False),
|
| 1815 |
)
|
| 1816 |
return
|
|
|
|
| 1817 |
|
| 1818 |
yield (
|
| 1819 |
"⏳ Loading trajectories...",
|
|
@@ -1825,15 +1961,19 @@ def build_app():
|
|
| 1825 |
gr.update(visible=False),
|
| 1826 |
)
|
| 1827 |
|
| 1828 |
-
|
| 1829 |
-
|
|
|
|
|
|
|
| 1830 |
df_calc["api_calls"] = df_meta["api_calls"].values
|
| 1831 |
df_calc["instance_cost"] = df_meta["instance_cost"].values
|
|
|
|
| 1832 |
trajectory_steps = load_all_trajectory_steps(folder)
|
| 1833 |
|
| 1834 |
state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps}
|
| 1835 |
|
| 1836 |
if df_meta.empty:
|
|
|
|
| 1837 |
yield (
|
| 1838 |
"❌ No trajectories found",
|
| 1839 |
gr.update(visible=False),
|
|
@@ -1845,6 +1985,7 @@ def build_app():
|
|
| 1845 |
)
|
| 1846 |
return
|
| 1847 |
|
|
|
|
| 1848 |
fig_steps, fig_cost, _, _, _ = create_basic_histograms(
|
| 1849 |
df_meta, input_price, cache_read_price, cache_creation_price, completion_price
|
| 1850 |
)
|
|
@@ -1867,6 +2008,7 @@ def build_app():
|
|
| 1867 |
df_calc_processed, input_price, cache_read_price, cache_creation_price, completion_price
|
| 1868 |
)
|
| 1869 |
|
|
|
|
| 1870 |
yield (
|
| 1871 |
f"✅ Loaded {len(df_meta)} trajectories",
|
| 1872 |
gr.update(visible=True),
|
|
|
|
| 1 |
import json
|
| 2 |
+
import logging
|
| 3 |
import os
|
| 4 |
import random
|
| 5 |
import re
|
| 6 |
import subprocess
|
| 7 |
+
import sys
|
| 8 |
from pathlib import Path
|
| 9 |
|
| 10 |
import gradio as gr
|
|
|
|
| 25 |
LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"
|
| 26 |
S3_BUCKET = "s3://swe-bench-experiments/bash-only"
|
| 27 |
LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
|
| 28 |
+
# Logging: persist everything to logs/app.log and mirror it to stdout so
# parsing errors and unhandled crashes are captured across runs.
LOG_DIR = Path("logs")
LOG_DIR.mkdir(parents=True, exist_ok=True)  # ensure logs/ exists before FileHandler opens the file
LOG_FILE = LOG_DIR / "app.log"

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
    force=True,  # replace any handlers already installed (e.g. by imported libraries)
)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _log_unhandled(exc_type, exc_value, exc_traceback):
|
| 44 |
+
if issubclass(exc_type, KeyboardInterrupt):
|
| 45 |
+
sys.__excepthook__(exc_type, exc_value, exc_traceback)
|
| 46 |
+
return
|
| 47 |
+
logging.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
sys.excepthook = _log_unhandled
|
| 51 |
|
| 52 |
_litellm_prices_cache = None
|
| 53 |
_trajectories_cache = {}
|
|
|
|
| 131 |
data = json.load(f)
|
| 132 |
|
| 133 |
messages = data.get("messages", [])
|
| 134 |
+
trajectory_data = data.get("trajectory", [])
|
| 135 |
+
|
| 136 |
+
if not messages and trajectory_data:
|
| 137 |
+
return _parse_trajectory_format_to_steps(trajectory_data, model_name)
|
| 138 |
+
|
| 139 |
if not messages:
|
| 140 |
return []
|
| 141 |
|
|
|
|
| 181 |
return steps
|
| 182 |
|
| 183 |
|
| 184 |
+
def _parse_trajectory_format_to_steps(trajectory_data: list, model_name: str) -> list[dict]:
    """
    Parse alternative trajectory format (with "trajectory" array) into steps.

    Each entry of *trajectory_data* is expected to carry "query" (list of
    chat-message dicts), "response" (assistant text) and "observation"
    (tool output) keys.  Returns one step dict per entry, with token counts
    produced by the tokenizer selected for *model_name*.
    """
    count_tokens, _ = get_tokenizer(model_name)

    steps = []
    for i, traj_step in enumerate(trajectory_data):
        query = traj_step.get("query", [])
        response_text = traj_step.get("response", "")
        observation_text = traj_step.get("observation", "")

        # System/user prompt tokens are counted only on step 0 — presumably
        # because later queries repeat the same prefix; TODO confirm against
        # the Llama 4 Scout trajectory layout.
        system_user_tokens = 0
        if i == 0:
            for q in query:
                content = q.get("content", "")
                if isinstance(content, list):
                    # Structured (multi-part) message content: serialize
                    # to JSON before counting tokens over it.
                    content = json.dumps(content)
                system_user_tokens += count_tokens(str(content))

        completion_tokens = count_tokens(str(response_text)) if response_text else 0
        # None (not 0) when there is no observation, so callers can tell
        # "no tool output" apart from an empty/zero-token one.
        observation_tokens = count_tokens(str(observation_text)) if observation_text else None

        step = {
            "model": model_name,
            "system_user": system_user_tokens,
            "completion": completion_tokens,
            "observation": observation_tokens,
        }
        steps.append(step)

    return steps
|
| 216 |
+
|
| 217 |
+
|
| 218 |
def get_default_overhead(model_name: str) -> float:
|
| 219 |
"""Get default tokenizer overhead for model provider"""
|
| 220 |
model_lower = model_name.lower() if model_name else ""
|
|
|
|
| 276 |
return df
|
| 277 |
|
| 278 |
|
| 279 |
+
def ensure_token_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Ensure token-related columns exist and are numeric.

    Guarantees that the four token columns are present and hold plain
    ints (missing columns become 0; non-numeric or NaN values coerce to
    0).  ``total_tokens`` is normalized too when already present.  The
    input frame is never mutated; ``None``/empty frames pass through.
    """
    if df is None or df.empty:
        return df

    normalized = df.copy()

    def _as_int(series):
        # Coerce strings/None/NaN to plain ints, defaulting to 0.
        return pd.to_numeric(series, errors="coerce").fillna(0).astype(int)

    for column in (
        "prompt_tokens",
        "completion_tokens",
        "cache_read_tokens",
        "cache_creation_tokens",
    ):
        if column not in normalized.columns:
            normalized[column] = 0
        normalized[column] = _as_int(normalized[column])

    if "total_tokens" in normalized.columns:
        normalized["total_tokens"] = _as_int(normalized["total_tokens"])

    return normalized
|
| 297 |
+
|
| 298 |
+
|
| 299 |
def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
|
| 300 |
"""Load trajectories with self-calculated token counts using calculate_routing_tokens"""
|
| 301 |
global _calculated_tokens_cache
|
| 302 |
|
| 303 |
cache_key = f"calculated_{folder}"
|
| 304 |
if cache_key in _calculated_tokens_cache:
|
| 305 |
+
return ensure_token_columns(_calculated_tokens_cache[cache_key])
|
| 306 |
|
| 307 |
trajectory_steps = load_all_trajectory_steps(folder)
|
| 308 |
|
|
|
|
| 335 |
"cache_creation_tokens": cache_creation,
|
| 336 |
})
|
| 337 |
except Exception as e:
|
| 338 |
+
logging.error("Error calculating tokens for %s: %s", instance_id, e, exc_info=True)
|
| 339 |
|
| 340 |
+
df = ensure_token_columns(pd.DataFrame(rows))
|
| 341 |
_calculated_tokens_cache[cache_key] = df
|
| 342 |
return df
|
| 343 |
|
|
|
|
| 385 |
if steps:
|
| 386 |
result[instance_id] = steps
|
| 387 |
except Exception as e:
|
| 388 |
+
logging.error("Error parsing steps for %s: %s", traj_path, e, exc_info=True)
|
| 389 |
|
| 390 |
_trajectory_steps_cache[cache_key] = result
|
| 391 |
return result
|
|
|
|
| 603 |
model_config = config.get("model", {})
|
| 604 |
model_name = model_config.get("cost_calc_model_override", model_config.get("model_name", ""))
|
| 605 |
|
| 606 |
+
trajectory_steps = data.get("trajectory", [])
|
| 607 |
+
is_trajectory_format = len(trajectory_steps) > 0 and "messages" not in data
|
| 608 |
+
|
| 609 |
+
if is_trajectory_format and not model_name:
|
| 610 |
+
for step in trajectory_steps:
|
| 611 |
+
query = step.get("query", [])
|
| 612 |
+
for q in query:
|
| 613 |
+
if q.get("role") == "system":
|
| 614 |
+
content = q.get("content", "")
|
| 615 |
+
if "llama" in content.lower() or "meta" in content.lower():
|
| 616 |
+
model_name = "llama"
|
| 617 |
+
break
|
| 618 |
+
if model_name:
|
| 619 |
+
break
|
| 620 |
+
|
| 621 |
+
api_calls = model_stats.get("api_calls", 0)
|
| 622 |
+
if api_calls == 0 and is_trajectory_format:
|
| 623 |
+
api_calls = len(trajectory_steps)
|
| 624 |
+
|
| 625 |
result = {
|
| 626 |
"instance_id": data.get("instance_id", traj_path.stem),
|
| 627 |
"model_name": model_name,
|
| 628 |
+
"api_calls": api_calls,
|
| 629 |
"instance_cost": model_stats.get("instance_cost", 0),
|
| 630 |
"prompt_tokens": 0,
|
| 631 |
"completion_tokens": 0,
|
|
|
|
| 658 |
global _trajectories_cache
|
| 659 |
|
| 660 |
if folder in _trajectories_cache:
|
| 661 |
+
return ensure_token_columns(_trajectories_cache[folder])
|
| 662 |
|
| 663 |
output_dir = TRAJS_DIR / folder
|
| 664 |
|
|
|
|
| 677 |
try:
|
| 678 |
rows.append(parse_trajectory(traj_path))
|
| 679 |
except Exception as e:
|
| 680 |
+
logging.error("Error parsing %s: %s", traj_path, e, exc_info=True)
|
| 681 |
|
| 682 |
+
df = ensure_token_columns(pd.DataFrame(rows))
|
| 683 |
_trajectories_cache[folder] = df
|
| 684 |
return df
|
| 685 |
|
|
|
|
| 1084 |
return result, model_hint
|
| 1085 |
|
| 1086 |
|
| 1087 |
+
def _build_selection_payload(row_idx: int | None, df: pd.DataFrame):
|
| 1088 |
+
if df is None or df.empty or row_idx is None:
|
| 1089 |
return (
|
| 1090 |
"", "",
|
| 1091 |
gr.update(visible=False),
|
|
|
|
| 1097 |
gr.update(value=1.0),
|
| 1098 |
)
|
| 1099 |
|
|
|
|
| 1100 |
row = df.iloc[row_idx]
|
| 1101 |
folder = row["folder"]
|
| 1102 |
name = row["name"]
|
|
|
|
| 1125 |
)
|
| 1126 |
|
| 1127 |
|
| 1128 |
+
def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
    """Translate a leaderboard click event into the shared selection payload."""
    selected = None
    if evt is not None and evt.index is not None:
        idx = evt.index
        # Gradio may deliver the index as (row, col) or as a bare int.
        selected = idx[0] if isinstance(idx, (list, tuple)) else idx
    return _build_selection_payload(selected, df)
|
| 1133 |
+
|
| 1134 |
+
|
| 1135 |
+
def select_first_row(df: pd.DataFrame):
    """Build the selection payload for the first leaderboard row.

    Falls back to the cleared payload when the frame is ``None`` or empty.
    """
    has_rows = df is not None and not df.empty
    return _build_selection_payload(0 if has_rows else None, df)
|
| 1138 |
+
|
| 1139 |
+
|
| 1140 |
def create_routed_token_chart(base_tokens: dict, additional_models: list):
|
| 1141 |
"""
|
| 1142 |
Create grouped bar chart for tokens by type, comparing base vs additional models.
|
|
|
|
| 1269 |
with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
|
| 1270 |
trajectories_state = gr.State(None)
|
| 1271 |
|
| 1272 |
+
gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.14`")
|
| 1273 |
+
gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
|
| 1274 |
|
| 1275 |
with gr.Row():
|
| 1276 |
with gr.Column(scale=3):
|
|
|
|
| 1279 |
label="Bash-Only Leaderboard",
|
| 1280 |
interactive=False,
|
| 1281 |
wrap=True,
|
| 1282 |
+
elem_id="leaderboard-table",
|
| 1283 |
)
|
| 1284 |
|
| 1285 |
with gr.Column(visible=False) as analysis_section:
|
|
|
|
| 1892 |
outputs=[selected_folder, selected_name, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
|
| 1893 |
)
|
| 1894 |
|
| 1895 |
+
app.load(
|
| 1896 |
+
fn=select_first_row,
|
| 1897 |
+
inputs=[leaderboard_table],
|
| 1898 |
+
outputs=[selected_folder, selected_name, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
|
| 1899 |
+
js="""
|
| 1900 |
+
(data) => {
|
| 1901 |
+
const row = gradioApp()?.querySelector('#leaderboard-table table tbody tr');
|
| 1902 |
+
if (row) {
|
| 1903 |
+
row.click();
|
| 1904 |
+
}
|
| 1905 |
+
return data;
|
| 1906 |
+
}
|
| 1907 |
+
""",
|
| 1908 |
+
)
|
| 1909 |
+
|
| 1910 |
def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache, progress=gr.Progress()):
|
| 1911 |
+
progress(0, desc="Ready")
|
| 1912 |
empty_result = (
|
| 1913 |
"",
|
| 1914 |
gr.update(visible=False),
|
|
|
|
| 1920 |
)
|
| 1921 |
|
| 1922 |
if not folder:
|
| 1923 |
+
progress(1, desc="No folder selected")
|
| 1924 |
yield empty_result
|
| 1925 |
return
|
| 1926 |
|
| 1927 |
if not check_trajectories_downloaded(folder):
|
| 1928 |
+
progress(0.1, desc="Preparing download")
|
| 1929 |
yield (
|
| 1930 |
"⏳ Downloading trajectories...",
|
| 1931 |
gr.update(visible=False),
|
|
|
|
| 1935 |
None,
|
| 1936 |
gr.update(visible=False),
|
| 1937 |
)
|
| 1938 |
+
progress(0.3, desc="Downloading")
|
| 1939 |
status, _ = download_trajectories_from_s3(folder)
|
| 1940 |
if "❌" in status:
|
| 1941 |
+
progress(1, desc="Download failed")
|
| 1942 |
yield (
|
| 1943 |
status,
|
| 1944 |
gr.update(visible=False),
|
|
|
|
| 1949 |
gr.update(visible=False),
|
| 1950 |
)
|
| 1951 |
return
|
| 1952 |
+
progress(0.45, desc="Loading trajectories")
|
| 1953 |
|
| 1954 |
yield (
|
| 1955 |
"⏳ Loading trajectories...",
|
|
|
|
| 1961 |
gr.update(visible=False),
|
| 1962 |
)
|
| 1963 |
|
| 1964 |
+
progress(0.6, desc="Reading metadata")
|
| 1965 |
+
df_meta = ensure_token_columns(load_all_trajectories(folder))
|
| 1966 |
+
progress(0.7, desc="Reading calculated")
|
| 1967 |
+
df_calc = ensure_token_columns(load_all_trajectories_calculated(folder))
|
| 1968 |
df_calc["api_calls"] = df_meta["api_calls"].values
|
| 1969 |
df_calc["instance_cost"] = df_meta["instance_cost"].values
|
| 1970 |
+
progress(0.8, desc="Reading steps")
|
| 1971 |
trajectory_steps = load_all_trajectory_steps(folder)
|
| 1972 |
|
| 1973 |
state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps}
|
| 1974 |
|
| 1975 |
if df_meta.empty:
|
| 1976 |
+
progress(1, desc="No trajectories found")
|
| 1977 |
yield (
|
| 1978 |
"❌ No trajectories found",
|
| 1979 |
gr.update(visible=False),
|
|
|
|
| 1985 |
)
|
| 1986 |
return
|
| 1987 |
|
| 1988 |
+
progress(0.9, desc="Building charts")
|
| 1989 |
fig_steps, fig_cost, _, _, _ = create_basic_histograms(
|
| 1990 |
df_meta, input_price, cache_read_price, cache_creation_price, completion_price
|
| 1991 |
)
|
|
|
|
| 2008 |
df_calc_processed, input_price, cache_read_price, cache_creation_price, completion_price
|
| 2009 |
)
|
| 2010 |
|
| 2011 |
+
progress(1, desc="Done")
|
| 2012 |
yield (
|
| 2013 |
f"✅ Loaded {len(df_meta)} trajectories",
|
| 2014 |
gr.update(visible=True),
|