Commit
·
10a0b43
1
Parent(s):
748e866
Add metadata-based single trajectory statistics, fix calculated formula (v0.3.37)
Browse files
- Add "One trajectory statistics. Metadata from .traj" section
- Fix "One trajectory statistics. Calculated" to use same formula as aggregate charts
- uncached_input = prompt_tokens - cache_read - cache_creation
- app.py +376 -10
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -174,6 +174,276 @@ def calculate_per_step_tokens(steps: list[dict]) -> list[dict]:
|
|
| 174 |
return result
|
| 175 |
|
| 176 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
def create_single_trajectory_chart(steps: list[dict]):
|
| 178 |
"""Create stacked bar chart for a single trajectory showing tokens per step."""
|
| 179 |
import plotly.graph_objects as go
|
|
@@ -184,10 +454,11 @@ def create_single_trajectory_chart(steps: list[dict]):
|
|
| 184 |
per_step_data = calculate_per_step_tokens(steps)
|
| 185 |
|
| 186 |
x_labels = [f"Step {d['step']}" for d in per_step_data]
|
| 187 |
-
uncached = [d["uncached_input"] / 1e3 for d in per_step_data]
|
| 188 |
cache_read = [d["cache_read"] / 1e3 for d in per_step_data]
|
| 189 |
cache_creation = [d["cache_creation"] / 1e3 for d in per_step_data]
|
| 190 |
completion = [d["completion"] / 1e3 for d in per_step_data]
|
|
|
|
|
|
|
| 191 |
|
| 192 |
fig = go.Figure()
|
| 193 |
|
|
@@ -244,10 +515,16 @@ def create_single_trajectory_cost_chart(steps: list[dict], input_price: float, c
|
|
| 244 |
per_step_data = calculate_per_step_tokens(steps)
|
| 245 |
|
| 246 |
x_labels = [f"Step {d['step']}" for d in per_step_data]
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
|
| 252 |
fig = go.Figure()
|
| 253 |
|
|
@@ -842,8 +1119,34 @@ def parse_trajectory(traj_path: Path) -> dict:
|
|
| 842 |
result["prompt_tokens"] += usage.get("prompt_tokens", 0) or 0
|
| 843 |
result["completion_tokens"] += usage.get("completion_tokens", 0) or 0
|
| 844 |
result["total_tokens"] += usage.get("total_tokens", 0) or 0
|
| 845 |
-
|
| 846 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 847 |
|
| 848 |
return result
|
| 849 |
|
|
@@ -1515,7 +1818,7 @@ def build_app():
|
|
| 1515 |
""")
|
| 1516 |
trajectories_state = gr.State(None)
|
| 1517 |
|
| 1518 |
-
gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.
|
| 1519 |
gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
|
| 1520 |
|
| 1521 |
with gr.Row():
|
|
@@ -1558,6 +1861,14 @@ def build_app():
|
|
| 1558 |
with gr.Row():
|
| 1559 |
plot_cost_breakdown_calc = gr.Plot(label="Cost per Trajectory")
|
| 1560 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1561 |
with gr.Accordion("One trajectory statistics. Calculated from .traj messages", open=False, visible=False) as single_traj_accordion:
|
| 1562 |
with gr.Row():
|
| 1563 |
single_traj_dropdown = gr.Dropdown(label="Select Issue", choices=[], interactive=True)
|
|
@@ -2387,6 +2698,10 @@ def build_app():
|
|
| 2387 |
gr.update(),
|
| 2388 |
gr.update(),
|
| 2389 |
gr.update(),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2390 |
)
|
| 2391 |
|
| 2392 |
if not folder:
|
|
@@ -2408,6 +2723,10 @@ def build_app():
|
|
| 2408 |
gr.update(),
|
| 2409 |
gr.update(),
|
| 2410 |
gr.update(),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2411 |
)
|
| 2412 |
progress(0.3, desc="Downloading")
|
| 2413 |
status, _ = download_trajectories_from_s3(folder)
|
|
@@ -2425,6 +2744,10 @@ def build_app():
|
|
| 2425 |
gr.update(),
|
| 2426 |
gr.update(),
|
| 2427 |
gr.update(),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2428 |
)
|
| 2429 |
return
|
| 2430 |
progress(0.45, desc="Loading trajectories")
|
|
@@ -2441,6 +2764,10 @@ def build_app():
|
|
| 2441 |
gr.update(),
|
| 2442 |
gr.update(),
|
| 2443 |
gr.update(),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2444 |
)
|
| 2445 |
|
| 2446 |
progress(0.6, desc="Reading metadata")
|
|
@@ -2449,8 +2776,10 @@ def build_app():
|
|
| 2449 |
df_calc = ensure_token_columns(load_all_trajectories_calculated(folder))
|
| 2450 |
df_calc["api_calls"] = df_meta["api_calls"].values
|
| 2451 |
df_calc["instance_cost"] = df_meta["instance_cost"].values
|
| 2452 |
-
progress(0.
|
| 2453 |
trajectory_steps = load_all_trajectory_steps(folder)
|
|
|
|
|
|
|
| 2454 |
|
| 2455 |
model_details, _ = get_model_details(folder)
|
| 2456 |
resolved_instances = {}
|
|
@@ -2459,7 +2788,7 @@ def build_app():
|
|
| 2459 |
for inst_id, details in per_instance.items():
|
| 2460 |
resolved_instances[inst_id] = details.get("resolved", False)
|
| 2461 |
|
| 2462 |
-
state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps, "resolved": resolved_instances}
|
| 2463 |
|
| 2464 |
if df_meta.empty:
|
| 2465 |
progress(1, desc="No trajectories found")
|
|
@@ -2475,6 +2804,10 @@ def build_app():
|
|
| 2475 |
gr.update(),
|
| 2476 |
gr.update(),
|
| 2477 |
gr.update(),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2478 |
)
|
| 2479 |
return
|
| 2480 |
|
|
@@ -2504,6 +2837,10 @@ def build_app():
|
|
| 2504 |
issue_ids = sorted(trajectory_steps.keys())
|
| 2505 |
first_issue = issue_ids[0] if issue_ids else None
|
| 2506 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2507 |
progress(1, desc="Done")
|
| 2508 |
yield (
|
| 2509 |
f"✅ Loaded {len(df_meta)} trajectories",
|
|
@@ -2517,6 +2854,10 @@ def build_app():
|
|
| 2517 |
gr.update(choices=issue_ids, value=first_issue),
|
| 2518 |
gr.update(),
|
| 2519 |
gr.update(),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2520 |
)
|
| 2521 |
|
| 2522 |
def on_single_traj_select(state_data, issue_id, input_price, cache_read_price, cache_creation_price, completion_price):
|
|
@@ -2530,6 +2871,17 @@ def build_app():
|
|
| 2530 |
cost_chart = create_single_trajectory_cost_chart(steps, input_price, cache_read_price, cache_creation_price, completion_price)
|
| 2531 |
return tokens_chart, cost_chart
|
| 2532 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2533 |
analyze_btn.click(
|
| 2534 |
fn=load_and_analyze,
|
| 2535 |
inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache],
|
|
@@ -2545,11 +2897,19 @@ def build_app():
|
|
| 2545 |
single_traj_dropdown,
|
| 2546 |
single_traj_plot,
|
| 2547 |
single_traj_cost_plot,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2548 |
],
|
| 2549 |
).then(
|
| 2550 |
fn=on_single_traj_select,
|
| 2551 |
inputs=[trajectories_state, single_traj_dropdown, price_input, price_cache_read, price_cache_creation, price_completion],
|
| 2552 |
outputs=[single_traj_plot, single_traj_cost_plot],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2553 |
)
|
| 2554 |
|
| 2555 |
def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
|
|
@@ -2613,6 +2973,12 @@ def build_app():
|
|
| 2613 |
outputs=[single_traj_plot, single_traj_cost_plot],
|
| 2614 |
)
|
| 2615 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2616 |
thinking_overhead.change(
|
| 2617 |
fn=on_calc_options_change,
|
| 2618 |
inputs=calc_options_inputs,
|
|
|
|
| 174 |
return result
|
| 175 |
|
| 176 |
|
| 177 |
+
def _parse_usage_from_log_line(line: str) -> dict | None:
|
| 178 |
+
"""
|
| 179 |
+
Parse usage info from log line containing ModelResponse or similar format.
|
| 180 |
+
Returns dict with prompt_tokens, completion_tokens, cached_tokens, etc.
|
| 181 |
+
"""
|
| 182 |
+
if "usage=" not in line:
|
| 183 |
+
return None
|
| 184 |
+
|
| 185 |
+
result = {}
|
| 186 |
+
|
| 187 |
+
for field in ["completion_tokens", "prompt_tokens", "total_tokens"]:
|
| 188 |
+
match = re.search(rf'{field}=(\d+)', line)
|
| 189 |
+
if match:
|
| 190 |
+
result[field] = int(match.group(1))
|
| 191 |
+
|
| 192 |
+
cached_match = re.search(r'cached_tokens=(\d+)', line)
|
| 193 |
+
if cached_match:
|
| 194 |
+
result["cached_tokens"] = int(cached_match.group(1))
|
| 195 |
+
|
| 196 |
+
return result if result else None
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def _parse_old_format_log(log_path: Path) -> list[dict]:
    """
    Parse old SWE-agent format .info.log file to extract per-step token usage.

    Each line containing ``usage=Usage(`` from which counters can be parsed
    becomes one step, numbered from 0 in file order.

    Args:
        log_path: Path to the ``.info.log`` file.

    Returns:
        A list of dicts with keys ``step``, ``cache_read``, ``uncached_input``,
        ``completion`` and ``cache_creation`` (always 0 here, since only a
        ``cached_tokens`` counter is read from the log). If reading fails, the
        error is logged at debug level and the steps parsed so far are returned.
    """
    result = []

    try:
        # errors="replace": a stray non-UTF-8 byte must not abort the read and
        # silently truncate the step list; the usage counters are plain ASCII.
        with open(log_path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                if "usage=Usage(" not in line:
                    continue

                usage = _parse_usage_from_log_line(line)
                if not usage:
                    continue

                prompt_tokens = usage.get("prompt_tokens", 0)
                completion_tokens = usage.get("completion_tokens", 0)
                cached_tokens = usage.get("cached_tokens", 0)

                result.append({
                    "step": len(result),
                    "cache_read": cached_tokens,
                    # Clamp so a cached count above the prompt count cannot go negative.
                    "uncached_input": max(0, prompt_tokens - cached_tokens),
                    "completion": completion_tokens,
                    "cache_creation": 0,
                })
    except Exception as e:
        # Deliberately best-effort: the caller treats an empty list as "no data".
        logging.debug("Error parsing log file %s: %s", log_path, e)

    return result
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def parse_trajectory_metadata_per_step(traj_path: Path) -> list[dict]:
    """
    Parse trajectory file and extract per-step metadata from usage fields.

    Supports both new format (.traj.json with messages[].extra.response.usage)
    and old format (.traj with separate .info.log file).

    Only ``assistant`` messages are considered. A message's own ``usage`` key
    takes precedence over ``extra.response.usage``. Message entries or usage
    values that are not dicts are skipped instead of aborting the whole file.

    Args:
        traj_path: Path to the trajectory JSON file.

    Returns:
        List of per-step data, numbered from 0 in message order:
        [{step: 0, cache_read: X, uncached_input: Y, completion: Z, cache_creation: W}, ...]
        Empty when neither the trajectory nor a sibling .info.log yields usage.
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    messages = data.get("messages", []) or []
    result = []

    for msg in messages:
        if not isinstance(msg, dict) or msg.get("role") != "assistant":
            continue

        usage = None
        if "usage" in msg:
            usage = msg["usage"]
        elif "extra" in msg and isinstance(msg["extra"], dict):
            response = msg["extra"].get("response", {})
            if isinstance(response, dict):
                usage = response.get("usage", {})

        if not usage or not isinstance(usage, dict):
            continue

        prompt_tokens = usage.get("prompt_tokens", 0) or 0
        completion_tokens = usage.get("completion_tokens", 0) or 0
        cache_read = usage.get("cache_read_input_tokens", 0) or 0
        cache_creation = usage.get("cache_creation_input_tokens", 0) or 0

        # Fall back to prompt_tokens_details.cached_tokens when the dedicated
        # cache-read counter is absent or zero.
        prompt_tokens_details = usage.get("prompt_tokens_details", {})
        if isinstance(prompt_tokens_details, dict):
            cached_from_details = prompt_tokens_details.get("cached_tokens", 0) or 0
            if cached_from_details > 0 and cache_read == 0:
                cache_read = cached_from_details

        result.append({
            "step": len(result),
            "cache_read": cache_read,
            # Clamp at zero so inconsistent counters never yield a negative bar.
            "uncached_input": max(0, prompt_tokens - cache_read - cache_creation),
            "completion": completion_tokens,
            "cache_creation": cache_creation,
        })

    if not result:
        # No usage in the trajectory itself: look for a log file next to it,
        # first by swapping the suffix, then by stripping ".traj" from the stem.
        log_path = traj_path.with_suffix(".info.log")
        if not log_path.exists():
            base_name = traj_path.stem.replace(".traj", "")
            log_path = traj_path.parent / f"{base_name}.info.log"

        if log_path.exists():
            result = _parse_old_format_log(log_path)

    return result
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
def load_all_trajectory_metadata_steps(folder: str) -> dict[str, list[dict]]:
    """
    Collect per-step usage metadata for every trajectory under a folder.

    Trajectory files are looked up inside ``TRAJS_DIR / folder`` using a
    series of glob patterns; the first pattern that matches anything decides
    the file set. Files that fail to parse are logged and skipped, and files
    that yield no steps are left out of the result.

    Returns:
        dict mapping instance_id -> list of per-step metadata
    """
    output_dir = TRAJS_DIR / folder

    # Probed in order; stop at the first layout that actually has files.
    candidate_patterns = ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json")
    traj_files = []
    for pattern in candidate_patterns:
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    steps_by_instance = {}
    for traj_path in traj_files:
        try:
            instance_id = traj_path.stem.replace(".traj", "")
            parsed_steps = parse_trajectory_metadata_per_step(traj_path)
        except Exception as e:
            logging.error("Error parsing metadata steps for %s: %s", traj_path, e, exc_info=True)
            continue
        if parsed_steps:
            steps_by_instance[instance_id] = parsed_steps

    return steps_by_instance
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
def create_single_trajectory_meta_chart(steps: list[dict]):
    """Create stacked bar chart for a single trajectory showing metadata tokens per step.

    Args:
        steps: Per-step dicts with keys ``step``, ``uncached_input``,
            ``cache_read``, ``cache_creation`` and ``completion`` (token counts).

    Returns:
        A plotly ``Figure`` with one stacked bar per step, values in thousands
        of tokens, or ``None`` when ``steps`` is empty.
    """
    if not steps:
        return None

    # Imported only once there is something to draw.
    import plotly.graph_objects as go

    x_labels = [f"Step {d['step']}" for d in steps]

    # (legend name, key in the step dict, bar colour), in stacking order.
    series = (
        ("Uncached Input", "uncached_input", "#EF553B"),
        ("Cache Read", "cache_read", "#19D3F3"),
        ("Cache Creation", "cache_creation", "#FFA15A"),
        ("Completion", "completion", "#AB63FA"),
    )

    fig = go.Figure()

    for name, key, color in series:
        fig.add_trace(go.Bar(
            name=name,
            x=x_labels,
            y=[d[key] / 1e3 for d in steps],
            marker_color=color,
            # The x label already reads "Step N"; prefixing another "Step "
            # would render "Step Step N" in the hover box.
            hovertemplate=f"%{{x}}<br>{name}: %{{y:.2f}}K<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Tokens (K)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
def create_single_trajectory_meta_cost_chart(steps: list[dict], input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create stacked bar chart for a single trajectory showing metadata cost per step.

    Each token count is multiplied by its price and divided by 1e6, i.e. the
    prices are applied per million tokens.

    Args:
        steps: Per-step dicts with keys ``step``, ``uncached_input``,
            ``cache_read``, ``cache_creation`` and ``completion`` (token counts).
        input_price: Price applied to uncached input tokens.
        cache_read_price: Price applied to cache-read tokens.
        cache_creation_price: Price applied to cache-creation tokens.
        completion_price: Price applied to completion tokens.

    Returns:
        A plotly ``Figure`` with one stacked bar per step, or ``None`` when
        ``steps`` is empty.
    """
    if not steps:
        return None

    # Imported only once there is something to draw.
    import plotly.graph_objects as go

    x_labels = [f"Step {d['step']}" for d in steps]

    # (legend name, key in the step dict, price, bar colour), in stacking order.
    series = (
        ("Uncached Input", "uncached_input", input_price, "#EF553B"),
        ("Cache Read", "cache_read", cache_read_price, "#19D3F3"),
        ("Cache Creation", "cache_creation", cache_creation_price, "#FFA15A"),
        ("Completion", "completion", completion_price, "#AB63FA"),
    )

    fig = go.Figure()

    for name, key, price, color in series:
        fig.add_trace(go.Bar(
            name=name,
            x=x_labels,
            y=[d[key] * price / 1e6 for d in steps],
            marker_color=color,
            # The x label already reads "Step N"; prefixing another "Step "
            # would render "Step Step N" in the hover box.
            hovertemplate=f"%{{x}}<br>{name}: $%{{y:.4f}}<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig
|
| 445 |
+
|
| 446 |
+
|
| 447 |
def create_single_trajectory_chart(steps: list[dict]):
|
| 448 |
"""Create stacked bar chart for a single trajectory showing tokens per step."""
|
| 449 |
import plotly.graph_objects as go
|
|
|
|
| 454 |
per_step_data = calculate_per_step_tokens(steps)
|
| 455 |
|
| 456 |
x_labels = [f"Step {d['step']}" for d in per_step_data]
|
|
|
|
| 457 |
cache_read = [d["cache_read"] / 1e3 for d in per_step_data]
|
| 458 |
cache_creation = [d["cache_creation"] / 1e3 for d in per_step_data]
|
| 459 |
completion = [d["completion"] / 1e3 for d in per_step_data]
|
| 460 |
+
prompt_tokens = [(d["cache_read"] + d["uncached_input"]) / 1e3 for d in per_step_data]
|
| 461 |
+
uncached = [max(0, p - cr - cc) for p, cr, cc in zip(prompt_tokens, cache_read, cache_creation)]
|
| 462 |
|
| 463 |
fig = go.Figure()
|
| 464 |
|
|
|
|
| 515 |
per_step_data = calculate_per_step_tokens(steps)
|
| 516 |
|
| 517 |
x_labels = [f"Step {d['step']}" for d in per_step_data]
|
| 518 |
+
cache_read = [d["cache_read"] for d in per_step_data]
|
| 519 |
+
cache_creation = [d["cache_creation"] for d in per_step_data]
|
| 520 |
+
completion = [d["completion"] for d in per_step_data]
|
| 521 |
+
prompt_tokens = [d["cache_read"] + d["uncached_input"] for d in per_step_data]
|
| 522 |
+
uncached = [max(0, p - cr - cc) for p, cr, cc in zip(prompt_tokens, cache_read, cache_creation)]
|
| 523 |
+
|
| 524 |
+
uncached_cost = [u * input_price / 1e6 for u in uncached]
|
| 525 |
+
cache_read_cost = [cr * cache_read_price / 1e6 for cr in cache_read]
|
| 526 |
+
cache_creation_cost = [cc * cache_creation_price / 1e6 for cc in cache_creation]
|
| 527 |
+
completion_cost = [c * completion_price / 1e6 for c in completion]
|
| 528 |
|
| 529 |
fig = go.Figure()
|
| 530 |
|
|
|
|
| 1119 |
result["prompt_tokens"] += usage.get("prompt_tokens", 0) or 0
|
| 1120 |
result["completion_tokens"] += usage.get("completion_tokens", 0) or 0
|
| 1121 |
result["total_tokens"] += usage.get("total_tokens", 0) or 0
|
| 1122 |
+
|
| 1123 |
+
cache_read = usage.get("cache_read_input_tokens", 0) or 0
|
| 1124 |
+
cache_creation = usage.get("cache_creation_input_tokens", 0) or 0
|
| 1125 |
+
|
| 1126 |
+
prompt_tokens_details = usage.get("prompt_tokens_details", {})
|
| 1127 |
+
if isinstance(prompt_tokens_details, dict):
|
| 1128 |
+
cached_from_details = prompt_tokens_details.get("cached_tokens", 0) or 0
|
| 1129 |
+
if cached_from_details > 0 and cache_read == 0:
|
| 1130 |
+
cache_read = cached_from_details
|
| 1131 |
+
|
| 1132 |
+
result["cache_read_tokens"] += cache_read
|
| 1133 |
+
result["cache_creation_tokens"] += cache_creation
|
| 1134 |
+
|
| 1135 |
+
if result["prompt_tokens"] == 0 and result["completion_tokens"] == 0:
|
| 1136 |
+
log_path = traj_path.with_suffix(".info.log")
|
| 1137 |
+
if not log_path.exists():
|
| 1138 |
+
base_name = traj_path.stem.replace(".traj", "")
|
| 1139 |
+
log_path = traj_path.parent / f"{base_name}.info.log"
|
| 1140 |
+
|
| 1141 |
+
if log_path.exists():
|
| 1142 |
+
steps = _parse_old_format_log(log_path)
|
| 1143 |
+
for step_data in steps:
|
| 1144 |
+
result["prompt_tokens"] += step_data["cache_read"] + step_data["uncached_input"]
|
| 1145 |
+
result["completion_tokens"] += step_data["completion"]
|
| 1146 |
+
result["cache_read_tokens"] += step_data["cache_read"]
|
| 1147 |
+
result["total_tokens"] = result["prompt_tokens"] + result["completion_tokens"]
|
| 1148 |
+
if result["api_calls"] == 0:
|
| 1149 |
+
result["api_calls"] = len(steps)
|
| 1150 |
|
| 1151 |
return result
|
| 1152 |
|
|
|
|
| 1818 |
""")
|
| 1819 |
trajectories_state = gr.State(None)
|
| 1820 |
|
| 1821 |
+
gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.37`")
|
| 1822 |
gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
|
| 1823 |
|
| 1824 |
with gr.Row():
|
|
|
|
| 1861 |
with gr.Row():
|
| 1862 |
plot_cost_breakdown_calc = gr.Plot(label="Cost per Trajectory")
|
| 1863 |
|
| 1864 |
+
with gr.Accordion("One trajectory statistics. Metadata from .traj", open=False, visible=False) as single_traj_meta_accordion:
|
| 1865 |
+
with gr.Row():
|
| 1866 |
+
single_traj_meta_dropdown = gr.Dropdown(label="Select Issue", choices=[], interactive=True)
|
| 1867 |
+
with gr.Row():
|
| 1868 |
+
single_traj_meta_plot = gr.Plot(label="Tokens per Step (stacked)")
|
| 1869 |
+
with gr.Row():
|
| 1870 |
+
single_traj_meta_cost_plot = gr.Plot(label="Cost per Step (stacked) ($)")
|
| 1871 |
+
|
| 1872 |
with gr.Accordion("One trajectory statistics. Calculated from .traj messages", open=False, visible=False) as single_traj_accordion:
|
| 1873 |
with gr.Row():
|
| 1874 |
single_traj_dropdown = gr.Dropdown(label="Select Issue", choices=[], interactive=True)
|
|
|
|
| 2698 |
gr.update(),
|
| 2699 |
gr.update(),
|
| 2700 |
gr.update(),
|
| 2701 |
+
gr.update(visible=False),
|
| 2702 |
+
gr.update(),
|
| 2703 |
+
gr.update(),
|
| 2704 |
+
gr.update(),
|
| 2705 |
)
|
| 2706 |
|
| 2707 |
if not folder:
|
|
|
|
| 2723 |
gr.update(),
|
| 2724 |
gr.update(),
|
| 2725 |
gr.update(),
|
| 2726 |
+
gr.update(visible=False),
|
| 2727 |
+
gr.update(),
|
| 2728 |
+
gr.update(),
|
| 2729 |
+
gr.update(),
|
| 2730 |
)
|
| 2731 |
progress(0.3, desc="Downloading")
|
| 2732 |
status, _ = download_trajectories_from_s3(folder)
|
|
|
|
| 2744 |
gr.update(),
|
| 2745 |
gr.update(),
|
| 2746 |
gr.update(),
|
| 2747 |
+
gr.update(visible=False),
|
| 2748 |
+
gr.update(),
|
| 2749 |
+
gr.update(),
|
| 2750 |
+
gr.update(),
|
| 2751 |
)
|
| 2752 |
return
|
| 2753 |
progress(0.45, desc="Loading trajectories")
|
|
|
|
| 2764 |
gr.update(),
|
| 2765 |
gr.update(),
|
| 2766 |
gr.update(),
|
| 2767 |
+
gr.update(visible=False),
|
| 2768 |
+
gr.update(),
|
| 2769 |
+
gr.update(),
|
| 2770 |
+
gr.update(),
|
| 2771 |
)
|
| 2772 |
|
| 2773 |
progress(0.6, desc="Reading metadata")
|
|
|
|
| 2776 |
df_calc = ensure_token_columns(load_all_trajectories_calculated(folder))
|
| 2777 |
df_calc["api_calls"] = df_meta["api_calls"].values
|
| 2778 |
df_calc["instance_cost"] = df_meta["instance_cost"].values
|
| 2779 |
+
progress(0.75, desc="Reading steps")
|
| 2780 |
trajectory_steps = load_all_trajectory_steps(folder)
|
| 2781 |
+
progress(0.8, desc="Reading metadata steps")
|
| 2782 |
+
metadata_steps = load_all_trajectory_metadata_steps(folder)
|
| 2783 |
|
| 2784 |
model_details, _ = get_model_details(folder)
|
| 2785 |
resolved_instances = {}
|
|
|
|
| 2788 |
for inst_id, details in per_instance.items():
|
| 2789 |
resolved_instances[inst_id] = details.get("resolved", False)
|
| 2790 |
|
| 2791 |
+
state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps, "metadata_steps": metadata_steps, "resolved": resolved_instances}
|
| 2792 |
|
| 2793 |
if df_meta.empty:
|
| 2794 |
progress(1, desc="No trajectories found")
|
|
|
|
| 2804 |
gr.update(),
|
| 2805 |
gr.update(),
|
| 2806 |
gr.update(),
|
| 2807 |
+
gr.update(visible=False),
|
| 2808 |
+
gr.update(),
|
| 2809 |
+
gr.update(),
|
| 2810 |
+
gr.update(),
|
| 2811 |
)
|
| 2812 |
return
|
| 2813 |
|
|
|
|
| 2837 |
issue_ids = sorted(trajectory_steps.keys())
|
| 2838 |
first_issue = issue_ids[0] if issue_ids else None
|
| 2839 |
|
| 2840 |
+
meta_issue_ids = sorted(metadata_steps.keys())
|
| 2841 |
+
first_meta_issue = meta_issue_ids[0] if meta_issue_ids else None
|
| 2842 |
+
has_meta_steps = len(meta_issue_ids) > 0
|
| 2843 |
+
|
| 2844 |
progress(1, desc="Done")
|
| 2845 |
yield (
|
| 2846 |
f"✅ Loaded {len(df_meta)} trajectories",
|
|
|
|
| 2854 |
gr.update(choices=issue_ids, value=first_issue),
|
| 2855 |
gr.update(),
|
| 2856 |
gr.update(),
|
| 2857 |
+
gr.update(visible=has_meta_steps),
|
| 2858 |
+
gr.update(choices=meta_issue_ids, value=first_meta_issue),
|
| 2859 |
+
gr.update(),
|
| 2860 |
+
gr.update(),
|
| 2861 |
)
|
| 2862 |
|
| 2863 |
def on_single_traj_select(state_data, issue_id, input_price, cache_read_price, cache_creation_price, completion_price):
|
|
|
|
| 2871 |
cost_chart = create_single_trajectory_cost_chart(steps, input_price, cache_read_price, cache_creation_price, completion_price)
|
| 2872 |
return tokens_chart, cost_chart
|
| 2873 |
|
| 2874 |
+
def on_single_traj_meta_select(state_data, issue_id, input_price, cache_read_price, cache_creation_price, completion_price):
    """Build the metadata token and cost charts for the selected issue.

    Returns a ``(tokens_chart, cost_chart)`` pair, or ``(None, None)`` when no
    state is loaded, no issue is selected, or the issue has no entry under
    the state's ``"metadata_steps"`` mapping.
    """
    nothing_to_show = (None, None)

    if state_data is None or not issue_id:
        return nothing_to_show

    steps_by_issue = state_data.get("metadata_steps", {})
    if issue_id not in steps_by_issue:
        return nothing_to_show

    selected_steps = steps_by_issue[issue_id]
    tokens_fig = create_single_trajectory_meta_chart(selected_steps)
    cost_fig = create_single_trajectory_meta_cost_chart(
        selected_steps,
        input_price,
        cache_read_price,
        cache_creation_price,
        completion_price,
    )
    return tokens_fig, cost_fig
|
| 2884 |
+
|
| 2885 |
analyze_btn.click(
|
| 2886 |
fn=load_and_analyze,
|
| 2887 |
inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache],
|
|
|
|
| 2897 |
single_traj_dropdown,
|
| 2898 |
single_traj_plot,
|
| 2899 |
single_traj_cost_plot,
|
| 2900 |
+
single_traj_meta_accordion,
|
| 2901 |
+
single_traj_meta_dropdown,
|
| 2902 |
+
single_traj_meta_plot,
|
| 2903 |
+
single_traj_meta_cost_plot,
|
| 2904 |
],
|
| 2905 |
).then(
|
| 2906 |
fn=on_single_traj_select,
|
| 2907 |
inputs=[trajectories_state, single_traj_dropdown, price_input, price_cache_read, price_cache_creation, price_completion],
|
| 2908 |
outputs=[single_traj_plot, single_traj_cost_plot],
|
| 2909 |
+
).then(
|
| 2910 |
+
fn=on_single_traj_meta_select,
|
| 2911 |
+
inputs=[trajectories_state, single_traj_meta_dropdown, price_input, price_cache_read, price_cache_creation, price_completion],
|
| 2912 |
+
outputs=[single_traj_meta_plot, single_traj_meta_cost_plot],
|
| 2913 |
)
|
| 2914 |
|
| 2915 |
def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
|
|
|
|
| 2973 |
outputs=[single_traj_plot, single_traj_cost_plot],
|
| 2974 |
)
|
| 2975 |
|
| 2976 |
+
single_traj_meta_dropdown.change(
|
| 2977 |
+
fn=on_single_traj_meta_select,
|
| 2978 |
+
inputs=[trajectories_state, single_traj_meta_dropdown, price_input, price_cache_read, price_cache_creation, price_completion],
|
| 2979 |
+
outputs=[single_traj_meta_plot, single_traj_meta_cost_plot],
|
| 2980 |
+
)
|
| 2981 |
+
|
| 2982 |
thinking_overhead.change(
|
| 2983 |
fn=on_calc_options_change,
|
| 2984 |
inputs=calc_options_inputs,
|
requirements.txt
CHANGED
|
@@ -6,3 +6,4 @@ python-dotenv>=1.0.0
|
|
| 6 |
tiktoken>=0.12.0
|
| 7 |
awscli
|
| 8 |
|
|
|
|
|
|
| 6 |
tiktoken>=0.12.0
|
| 7 |
awscli
|
| 8 |
|
| 9 |
+
|