IgorSlinko committed on
Commit
7e7e3a1
·
1 Parent(s): 781ed01

Add trajectory analysis with cost breakdown

Browse files

- Add 6 analysis plots: API calls, cost distribution, token usage, cost by token type, billable tokens per instance, cost breakdown per instance
- Load token prices from litellm model_prices_and_context_window.json
- Show ✅/❌ indicators for auto-loaded vs manual price fields
- Move analysis section under leaderboard table in left column
- Add tight margins to Plotly charts for better layout
- Use gr.State for folder storage instead of hidden textbox

Files changed (3) hide show
  1. app.py +487 -19
  2. pyproject.toml +1 -0
  3. uv.lock +24 -0
app.py CHANGED
@@ -5,13 +5,71 @@ from pathlib import Path
5
 
6
  import gradio as gr
7
  import pandas as pd
 
 
 
8
 
9
- from src.download_swebench_leaderboard import download_leaderboard, get_leaderboard
10
 
11
  DATA_DIR = Path("data")
12
  TRAJS_DIR = DATA_DIR / "swebench_trajs"
13
  LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
 
14
  S3_BUCKET = "s3://swe-bench-experiments/bash-only"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
 
17
  def load_or_download_leaderboard():
@@ -46,8 +104,7 @@ def get_bash_only_df():
46
  "os_system": "✅" if r.get("os_system") else "❌",
47
  })
48
 
49
- df = pd.DataFrame(rows)
50
- return df
51
 
52
 
53
  def get_model_details(folder: str):
@@ -68,18 +125,27 @@ def get_model_details(folder: str):
68
  return model, None
69
 
70
 
 
 
 
 
 
 
 
71
  def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
72
  if not folder:
73
- return "❌ No model selected"
74
 
75
  model, error = get_model_details(folder)
76
  if error:
77
- return f"❌ {error}"
78
 
79
  output_dir = TRAJS_DIR / folder
80
  if output_dir.exists() and any(output_dir.iterdir()):
81
  file_count = len(list(output_dir.glob("*/*.traj.json")))
82
- return f"✅ Already downloaded: {output_dir}\n\n{file_count} trajectory files"
 
 
83
 
84
  s3_path = f"{S3_BUCKET}/{folder}/trajs/"
85
  output_dir.mkdir(parents=True, exist_ok=True)
@@ -95,7 +161,7 @@ def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
95
  )
96
 
97
  if result.returncode != 0:
98
- return f"❌ S3 download failed:\n{result.stderr}"
99
 
100
  file_count = len(list(output_dir.glob("*/*.traj.json")))
101
  if file_count == 0:
@@ -105,62 +171,464 @@ def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
105
  resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
106
  total_count = len(per_instance)
107
 
108
- return f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({100*resolved_count/total_count:.1f}%)"
 
109
 
110
  except subprocess.TimeoutExpired:
111
- return "❌ Download timed out (>10 min)"
112
  except FileNotFoundError:
113
- return "❌ AWS CLI not found. Install with: pip install awscli"
114
  except Exception as e:
115
- return f"❌ Error: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
 
118
  def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
119
  if evt.index is None:
120
- return "", "", gr.update()
 
 
 
 
 
 
 
 
121
 
122
  row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
123
  row = df.iloc[row_idx]
124
  folder = row["folder"]
125
  name = row["name"]
126
 
127
- return folder, name, gr.update(interactive=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
 
129
 
130
  def build_app():
131
- df = get_bash_only_df()
132
 
133
  with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
 
 
134
  gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard")
135
  gr.Markdown("Select a model to use as base for cost analysis")
136
 
137
  with gr.Row():
138
  with gr.Column(scale=3):
139
  leaderboard_table = gr.Dataframe(
140
- value=df,
141
  label="Bash-Only Leaderboard",
142
  interactive=False,
143
  wrap=True,
144
  )
145
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  with gr.Column(scale=1):
 
147
  gr.Markdown("### Selected Model")
148
  selected_name = gr.Textbox(label="Model Name", interactive=False)
149
- selected_folder = gr.Textbox(label="Folder ID", interactive=False)
150
 
151
  download_btn = gr.Button("📥 Download Trajectories", interactive=False)
152
  download_status = gr.Textbox(label="Status", interactive=False, lines=3)
153
 
 
 
 
 
 
 
 
 
 
154
  leaderboard_table.select(
155
  fn=on_row_select,
156
  inputs=[leaderboard_table],
157
- outputs=[selected_folder, selected_name, download_btn],
158
  )
159
 
160
  download_btn.click(
161
  fn=download_trajectories_from_s3,
162
  inputs=[selected_folder],
163
- outputs=[download_status],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  )
165
 
166
  return app
@@ -168,5 +636,5 @@ def build_app():
168
 
169
  if __name__ == "__main__":
170
  app = build_app()
 
171
  app.launch()
172
-
 
5
 
6
  import gradio as gr
7
  import pandas as pd
8
+ import plotly.express as px
9
+ import plotly.graph_objects as go
10
+ import requests
11
 
12
+ from src.download_swebench_leaderboard import download_leaderboard
13
 
14
  DATA_DIR = Path("data")
15
  TRAJS_DIR = DATA_DIR / "swebench_trajs"
16
  LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
17
+ LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"
18
  S3_BUCKET = "s3://swe-bench-experiments/bash-only"
19
+ LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
20
+
21
+ _litellm_prices_cache = None
22
+ _trajectories_cache = {}
23
+
24
+
25
def get_litellm_prices() -> dict:
    """Return the litellm model price table, loading it at most once per process.

    Lookup order:
      1. the in-memory ``_litellm_prices_cache``;
      2. the JSON file at ``LITELLM_PRICES_CACHE``;
      3. a download from ``LITELLM_PRICES_URL`` (written back to the file
         on a best-effort basis).

    Returns:
        The parsed price mapping, or an empty dict when the table could not
        be obtained. The result (including the empty dict) is memoized so
        that a failing network is not retried on every call.
    """
    global _litellm_prices_cache
    if _litellm_prices_cache is not None:
        return _litellm_prices_cache

    if LITELLM_PRICES_CACHE.exists():
        try:
            with open(LITELLM_PRICES_CACHE, encoding="utf-8") as f:
                cached = json.load(f)
        except (OSError, ValueError) as e:
            # A truncated or corrupt cache file must not break the app;
            # fall through and fetch a fresh copy instead.
            print(f"Ignoring unreadable price cache {LITELLM_PRICES_CACHE}: {e}")
        else:
            if isinstance(cached, dict):
                _litellm_prices_cache = cached
                return _litellm_prices_cache

    try:
        response = requests.get(LITELLM_PRICES_URL, timeout=30)
        response.raise_for_status()
        downloaded = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Could not download litellm prices: {e}")
        _litellm_prices_cache = {}
        return _litellm_prices_cache

    _litellm_prices_cache = downloaded if isinstance(downloaded, dict) else {}

    # Persisting is optional: a write failure must not throw away prices
    # that were downloaded successfully.
    try:
        DATA_DIR.mkdir(parents=True, exist_ok=True)
        with open(LITELLM_PRICES_CACHE, "w", encoding="utf-8") as f:
            json.dump(_litellm_prices_cache, f)
    except OSError as e:
        print(f"Could not write price cache {LITELLM_PRICES_CACHE}: {e}")

    return _litellm_prices_cache
47
+
48
+
49
+ def get_model_prices(model_name: str) -> dict | None:
50
+ if not model_name:
51
+ return None
52
+
53
+ prices = get_litellm_prices()
54
+
55
+ clean_name = model_name.replace("anthropic/", "").replace("openai/", "")
56
+
57
+ candidates = [
58
+ model_name,
59
+ clean_name,
60
+ f"anthropic/{clean_name}",
61
+ f"openai/{clean_name}",
62
+ ]
63
+
64
+ for key in candidates:
65
+ if key in prices:
66
+ return prices[key]
67
+
68
+ for key, value in prices.items():
69
+ if clean_name in key or model_name in key:
70
+ return value
71
+
72
+ return None
73
 
74
 
75
  def load_or_download_leaderboard():
 
104
  "os_system": "✅" if r.get("os_system") else "❌",
105
  })
106
 
107
+ return pd.DataFrame(rows)
 
108
 
109
 
110
  def get_model_details(folder: str):
 
125
  return model, None
126
 
127
 
128
def check_trajectories_downloaded(folder: str) -> bool:
    """Report whether ``TRAJS_DIR / folder`` is a non-empty directory.

    Args:
        folder: Leaderboard folder id; a falsy value yields ``False``.

    Returns:
        ``True`` only when the directory exists and contains at least one
        entry.
    """
    if not folder:
        return False
    output_dir = TRAJS_DIR / folder
    # is_dir() rather than exists(): iterdir() raises NotADirectoryError when
    # the path happens to be a regular file.
    return output_dir.is_dir() and any(output_dir.iterdir())
133
+
134
+
135
  def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
136
  if not folder:
137
+ return "❌ No model selected", gr.update(visible=False)
138
 
139
  model, error = get_model_details(folder)
140
  if error:
141
+ return f"❌ {error}", gr.update(visible=False)
142
 
143
  output_dir = TRAJS_DIR / folder
144
  if output_dir.exists() and any(output_dir.iterdir()):
145
  file_count = len(list(output_dir.glob("*/*.traj.json")))
146
+ if file_count == 0:
147
+ file_count = len(list(output_dir.glob("*.json")))
148
+ return f"✅ Already downloaded: {output_dir}\n\n{file_count} trajectory files", gr.update(visible=True)
149
 
150
  s3_path = f"{S3_BUCKET}/{folder}/trajs/"
151
  output_dir.mkdir(parents=True, exist_ok=True)
 
161
  )
162
 
163
  if result.returncode != 0:
164
+ return f"❌ S3 download failed:\n{result.stderr}", gr.update(visible=False)
165
 
166
  file_count = len(list(output_dir.glob("*/*.traj.json")))
167
  if file_count == 0:
 
171
  resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
172
  total_count = len(per_instance)
173
 
174
+ status = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({100*resolved_count/total_count:.1f}%)"
175
+ return status, gr.update(visible=True)
176
 
177
  except subprocess.TimeoutExpired:
178
+ return "❌ Download timed out (>10 min)", gr.update(visible=False)
179
  except FileNotFoundError:
180
+ return "❌ AWS CLI not found. Install with: pip install awscli", gr.update(visible=False)
181
  except Exception as e:
182
+ return f"❌ Error: {e}", gr.update(visible=False)
183
+
184
+
185
def parse_trajectory(traj_path: Path) -> dict:
    """Summarize one trajectory JSON file as a flat record.

    Reads ``info.model_stats`` for the call count and cost, ``info.config.model``
    for the model name, and sums the token counters found in each message's
    ``usage`` (or, failing that, ``extra.response.usage``).

    Args:
        traj_path: Path of the trajectory JSON file.

    Returns:
        A dict with the keys ``instance_id``, ``model_name``, ``api_calls``,
        ``instance_cost``, ``prompt_tokens``, ``completion_tokens``,
        ``total_tokens``, ``cache_read_tokens`` and ``cache_creation_tokens``.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # `or {}` instead of a .get() default: a key that is present but null
    # would otherwise hand None to the next .get() in the chain.
    info = data.get("info") or {}
    model_stats = info.get("model_stats") or {}
    model_config = (info.get("config") or {}).get("model") or {}
    # Fall back to model_name when the override is missing, null or empty.
    model_name = (
        model_config.get("cost_calc_model_override")
        or model_config.get("model_name")
        or ""
    )

    # "foo.traj.json" has the stem "foo.traj"; drop the inner suffix so the
    # fallback id is just "foo".
    fallback_id = traj_path.stem.removesuffix(".traj")

    result = {
        "instance_id": data.get("instance_id", fallback_id),
        "model_name": model_name,
        "api_calls": model_stats.get("api_calls", 0) or 0,
        "instance_cost": model_stats.get("instance_cost", 0) or 0,
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "total_tokens": 0,
        "cache_read_tokens": 0,
        "cache_creation_tokens": 0,
    }

    # (key in result, key in the usage payload)
    usage_fields = (
        ("prompt_tokens", "prompt_tokens"),
        ("completion_tokens", "completion_tokens"),
        ("total_tokens", "total_tokens"),
        ("cache_read_tokens", "cache_read_input_tokens"),
        ("cache_creation_tokens", "cache_creation_input_tokens"),
    )

    for msg in data.get("messages") or []:
        if not isinstance(msg, dict):
            continue

        usage = None
        if "usage" in msg:
            usage = msg["usage"]
        elif isinstance(msg.get("extra"), dict):
            response = msg["extra"].get("response", {})
            if isinstance(response, dict):
                usage = response.get("usage", {})

        if not isinstance(usage, dict):
            continue

        for result_key, usage_key in usage_fields:
            # Counters may be present but null; count those as zero.
            result[result_key] += usage.get(usage_key, 0) or 0

    return result
225
+
226
+
227
def load_all_trajectories(folder: str) -> pd.DataFrame:
    """Parse every trajectory file of a model folder into one DataFrame.

    Files are searched under ``TRAJS_DIR / folder`` using, in order, the
    patterns ``*/*.traj.json``, ``*.traj.json`` and ``*.json``; the first
    pattern that matches anything wins. Files that fail to parse are reported
    on stdout and skipped.

    Args:
        folder: Leaderboard folder id.

    Returns:
        One row per parsed trajectory (see ``parse_trajectory``); an empty
        DataFrame when nothing could be loaded. Non-empty results are
        memoized in ``_trajectories_cache``.
    """
    if folder in _trajectories_cache:
        return _trajectories_cache[folder]

    output_dir = TRAJS_DIR / folder

    traj_files = []
    for pattern in ("*/*.traj.json", "*.traj.json", "*.json"):
        # sorted() makes the row order independent of directory listing order.
        traj_files = sorted(output_dir.glob(pattern))
        if traj_files:
            break

    rows = []
    for traj_path in traj_files:
        try:
            rows.append(parse_trajectory(traj_path))
        except Exception as e:
            # Best effort: one bad file should not hide the rest.
            print(f"Error parsing {traj_path}: {e}")

    df = pd.DataFrame(rows)
    # Do not memoize an empty result, otherwise a folder inspected before its
    # files exist would stay "empty" for the life of the process.
    if not df.empty:
        _trajectories_cache[folder] = df
    return df
251
+
252
+
253
def create_basic_histograms(df: pd.DataFrame, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build the five overview figures for a set of parsed trajectories.

    Args:
        df: One row per instance with the columns ``api_calls``,
            ``instance_cost``, ``prompt_tokens``, ``completion_tokens``,
            ``cache_read_tokens`` and ``cache_creation_tokens``.
        cache_read_price: Price of cache-read tokens in $ per 1M tokens.
        cache_creation_price: Price of cache-creation tokens in $ per 1M tokens.
        completion_price: Price of completion tokens in $ per 1M tokens.
            A price of ``None`` is treated as 0.

    Returns:
        ``(fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked)``,
        or five ``None`` values when ``df`` is empty.
    """
    if df.empty:
        return None, None, None, None, None

    # A cleared number input can arrive as None; treat it as "no price".
    cache_read_price = cache_read_price or 0
    cache_creation_price = cache_creation_price or 0
    completion_price = completion_price or 0

    tight_margin = dict(l=40, r=20, t=40, b=40)

    # --- Histogram of API calls per instance -------------------------------
    fig_steps = px.histogram(
        df,
        x="api_calls",
        nbins=30,
        title="Distribution of API Calls (Steps) per Instance",
        color_discrete_sequence=["#636EFA"],
    )
    fig_steps.update_layout(
        xaxis_title="API Calls (Steps)",
        yaxis_title="Number of Instances",
        showlegend=False,
        margin=tight_margin,
    )
    _add_corner_note(
        fig_steps,
        f"Mean: {df['api_calls'].mean():.1f} | Median: {df['api_calls'].median():.0f}",
    )

    # --- Histogram of recorded cost per instance ---------------------------
    fig_cost = px.histogram(
        df,
        x="instance_cost",
        nbins=30,
        title="Distribution of Cost per Instance ($)",
        color_discrete_sequence=["#00CC96"],
    )
    fig_cost.update_layout(
        xaxis_title="Cost ($)",
        yaxis_title="Number of Instances",
        showlegend=False,
        margin=tight_margin,
    )
    _add_corner_note(
        fig_cost,
        f"Mean: ${df['instance_cost'].mean():.4f} | Total: ${df['instance_cost'].sum():.2f}",
    )

    # --- Total tokens by type ----------------------------------------------
    total_prompt = df["prompt_tokens"].sum()
    total_completion = df["completion_tokens"].sum()
    total_cache_read = df["cache_read_tokens"].sum()
    total_cache_creation = df["cache_creation_tokens"].sum()

    token_data = pd.DataFrame({
        "Token Type": ["Prompt", "Completion", "Cache Read", "Cache Creation"],
        "Total Tokens": [total_prompt, total_completion, total_cache_read, total_cache_creation],
    })

    fig_tokens = px.bar(
        token_data,
        x="Token Type",
        y="Total Tokens",
        title="Total Tokens by Type",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#AB63FA", "#19D3F3", "#FFA15A"],
    )
    fig_tokens.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Total Tokens",
        showlegend=False,
        margin=tight_margin,
    )
    _add_corner_note(fig_tokens, f"Total: {token_data['Total Tokens'].sum():,.0f}")

    # --- Total cost by token type ------------------------------------------
    # Prompt tokens are not billed separately here (included in cache).
    cost_completion = total_completion * completion_price / 1e6
    cost_cache_read = total_cache_read * cache_read_price / 1e6
    cost_cache_creation = total_cache_creation * cache_creation_price / 1e6

    cost_data = pd.DataFrame({
        "Token Type": ["Completion", "Cache Read", "Cache Creation"],
        "Cost ($)": [cost_completion, cost_cache_read, cost_cache_creation],
    })

    fig_tokens_cost = px.bar(
        cost_data,
        x="Token Type",
        y="Cost ($)",
        title="Total Cost by Token Type ($)",
        color="Token Type",
        color_discrete_sequence=["#AB63FA", "#19D3F3", "#FFA15A"],
    )
    fig_tokens_cost.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Cost ($)",
        showlegend=False,
        margin=tight_margin,
    )
    total_cost = cost_completion + cost_cache_read + cost_cache_creation
    _add_corner_note(fig_tokens_cost, f"Total: ${total_cost:.2f}")

    # --- Stacked billable tokens per instance ------------------------------
    df_sorted = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
    df_sorted["instance_idx"] = range(len(df_sorted))

    fig_stacked = go.Figure()
    for label, column, color in (
        ("Cache Read", "cache_read_tokens", "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", "#FFA15A"),
        ("Completion", "completion_tokens", "#AB63FA"),
    ):
        fig_stacked.add_trace(_token_bar(df_sorted, column, label, color))

    fig_stacked.update_layout(
        barmode="stack",
        title="Billable Tokens per Instance (stacked)",
        xaxis_title="Instance (sorted by cache read)",
        yaxis_title="Tokens",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )

    return fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked


def _add_corner_note(fig, text: str) -> None:
    """Place a short summary note near the top-right corner of ``fig``."""
    fig.add_annotation(
        text=text,
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )


def _token_bar(df_sorted: pd.DataFrame, column: str, label: str, color: str):
    """Return one bar trace of ``column`` against ``instance_idx``."""
    return go.Bar(
        name=label,
        x=df_sorted["instance_idx"],
        y=df_sorted[column],
        marker_color=color,
        hovertemplate="Instance: %{x}<br>" + label + ": %{y:,.0f}<extra></extra>",
    )
402
+
403
+
404
def create_cost_breakdown(df: pd.DataFrame, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build a stacked bar chart of estimated cost per instance.

    Cost per instance is ``tokens * price / 1e6`` for each of cache-read,
    cache-creation and completion tokens. Instances are ordered by
    descending ``cache_read_tokens``.

    Args:
        df: One row per instance with the columns ``cache_read_tokens``,
            ``cache_creation_tokens`` and ``completion_tokens``.
        cache_read_price: Price of cache-read tokens in $ per 1M tokens.
        cache_creation_price: Price of cache-creation tokens in $ per 1M tokens.
        completion_price: Price of completion tokens in $ per 1M tokens.
            A price of ``None`` is treated as 0.

    Returns:
        The figure, or ``None`` when ``df`` is empty.
    """
    if df.empty:
        return None

    df_sorted = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
    df_sorted["instance_idx"] = range(len(df_sorted))

    # (legend label, token column, $/1M price, bar colour). A cleared number
    # input can arrive as None, which would break both the arithmetic and
    # the ":.2f" formatting below, so coerce it to 0.
    series = (
        ("Cache Read", "cache_read_tokens", cache_read_price or 0, "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", cache_creation_price or 0, "#FFA15A"),
        ("Completion", "completion_tokens", completion_price or 0, "#AB63FA"),
    )

    fig = go.Figure()
    total_cost = 0.0
    for label, column, price, color in series:
        cost = df_sorted[column] * price / 1e6
        total_cost += cost.sum()
        fig.add_trace(go.Bar(
            name=f"{label} (${price:.2f}/1M)",
            x=df_sorted["instance_idx"],
            y=cost,
            marker_color=color,
            hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        title="Cost Breakdown per Instance",
        xaxis_title="Instance (sorted by cache read)",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )

    fig.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=14),
        bgcolor="white",
    )

    return fig
465
+
466
+
467
def extract_model_from_folder(folder: str) -> str:
    """Return the model part of a leaderboard folder id.

    Everything after the second underscore is taken as the model name, e.g.
    '20251124_mini-v1.16.0_claude-opus-4-5-20251101' gives
    'claude-opus-4-5-20251101'. A folder with fewer than three
    underscore-separated parts is returned unchanged; a falsy folder gives ''.
    """
    if not folder:
        return ""
    # At most two splits: the third piece is the untouched remainder.
    pieces = folder.split("_", 2)
    if len(pieces) < 3:
        return folder
    return pieces[2]
475
+
476
+
477
def get_prices_for_folder(folder: str) -> tuple[float, float, float, str]:
    """Look up litellm prices for the model encoded in a folder name.

    Args:
        folder: Leaderboard folder id; the model name is derived from it with
            ``extract_model_from_folder``.

    Returns:
        ``(cache_read, cache_creation, completion, model_name)`` with prices
        in $ per 1M tokens. A price is 0 when the model or the individual
        price field is not available; ``model_name`` is '' when no model
        name could be derived from the folder.
    """
    model_hint = extract_model_from_folder(folder)
    if not model_hint:
        return 0, 0, 0, ""

    prices = get_model_prices(model_hint)
    if not prices:
        return 0, 0, 0, model_hint

    def per_million(field: str) -> float:
        # `or 0` also covers a field that is present but null, which would
        # otherwise raise TypeError on the multiplication.
        return (prices.get(field) or 0) * 1e6

    return (
        per_million("cache_read_input_token_cost"),
        per_million("cache_creation_input_token_cost"),
        per_million("output_cost_per_token"),
        model_hint,
    )
491
 
492
 
493
  def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
494
  if evt.index is None:
495
+ return (
496
+ "", "",
497
+ gr.update(interactive=False),
498
+ gr.update(visible=False),
499
+ gr.update(value=0, label="💲 Cache Read"),
500
+ gr.update(value=0, label="💲 Cache Creation"),
501
+ gr.update(value=0, label="💲 Completion"),
502
+ ""
503
+ )
504
 
505
  row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
506
  row = df.iloc[row_idx]
507
  folder = row["folder"]
508
  name = row["name"]
509
 
510
+ show_analyze = check_trajectories_downloaded(folder)
511
+
512
+ cache_read, cache_creation, completion, model_hint = get_prices_for_folder(folder)
513
+
514
+ def price_update(value, name):
515
+ if value > 0:
516
+ return gr.update(value=value, label=f"✅ {name}")
517
+ else:
518
+ return gr.update(value=value, label=f"❌ {name}")
519
+
520
+ return (
521
+ folder, name,
522
+ gr.update(interactive=True),
523
+ gr.update(visible=show_analyze),
524
+ price_update(cache_read, "Cache Read"),
525
+ price_update(cache_creation, "Cache Creation"),
526
+ price_update(completion, "Completion"),
527
+ model_hint
528
+ )
529
 
530
 
531
  def build_app():
532
+ leaderboard_df = get_bash_only_df()
533
 
534
  with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
535
+ trajectories_state = gr.State(None)
536
+
537
  gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard")
538
  gr.Markdown("Select a model to use as base for cost analysis")
539
 
540
  with gr.Row():
541
  with gr.Column(scale=3):
542
  leaderboard_table = gr.Dataframe(
543
+ value=leaderboard_df,
544
  label="Bash-Only Leaderboard",
545
  interactive=False,
546
  wrap=True,
547
  )
548
 
549
+ with gr.Column(visible=False) as analysis_section:
550
+ gr.Markdown("## 📊 Trajectory Analysis")
551
+
552
+ with gr.Row():
553
+ plot_steps = gr.Plot(label="API Calls Distribution")
554
+ plot_cost = gr.Plot(label="Cost Distribution")
555
+
556
+ with gr.Row():
557
+ plot_tokens = gr.Plot(label="Token Usage by Type")
558
+ plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)")
559
+
560
+ with gr.Row():
561
+ plot_stacked = gr.Plot(label="Billable Tokens per Instance")
562
+
563
+ with gr.Row():
564
+ plot_cost_breakdown = gr.Plot(label="Cost Breakdown per Instance ($)")
565
+
566
  with gr.Column(scale=1):
567
+ selected_folder = gr.State("")
568
  gr.Markdown("### Selected Model")
569
  selected_name = gr.Textbox(label="Model Name", interactive=False)
 
570
 
571
  download_btn = gr.Button("📥 Download Trajectories", interactive=False)
572
  download_status = gr.Textbox(label="Status", interactive=False, lines=3)
573
 
574
+ analyze_btn = gr.Button("📊 Load & Analyze", visible=False, variant="primary")
575
+
576
+ gr.Markdown("---")
577
+ gr.Markdown("### 💰 Token Prices ($/1M) · *[litellm](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)*")
578
+ detected_model = gr.Textbox(label="Detected Model", interactive=False)
579
+ price_cache_read = gr.Number(label="💲 Cache Read", value=0, precision=2)
580
+ price_cache_creation = gr.Number(label="💲 Cache Creation", value=0, precision=2)
581
+ price_completion = gr.Number(label="💲 Completion", value=0, precision=2)
582
+
583
  leaderboard_table.select(
584
  fn=on_row_select,
585
  inputs=[leaderboard_table],
586
+ outputs=[selected_folder, selected_name, download_btn, analyze_btn, price_cache_read, price_cache_creation, price_completion, detected_model],
587
  )
588
 
589
  download_btn.click(
590
  fn=download_trajectories_from_s3,
591
  inputs=[selected_folder],
592
+ outputs=[download_status, analyze_btn],
593
+ )
594
+
595
+ def load_and_analyze(folder, cache_read_price, cache_creation_price, completion_price):
596
+ empty_result = (
597
+ gr.update(visible=False),
598
+ None, None, None, None, None, None,
599
+ )
600
+
601
+ if not folder:
602
+ yield empty_result
603
+ return
604
+
605
+ yield (
606
+ gr.update(visible=True),
607
+ None, None, None, None, None, None,
608
+ )
609
+
610
+ df = load_all_trajectories(folder)
611
+ if df.empty:
612
+ yield empty_result
613
+ return
614
+
615
+ fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked = create_basic_histograms(
616
+ df, cache_read_price, cache_creation_price, completion_price
617
+ )
618
+ fig_cost_breakdown = create_cost_breakdown(df, cache_read_price, cache_creation_price, completion_price)
619
+
620
+ yield (
621
+ gr.update(visible=True),
622
+ fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown,
623
+ )
624
+
625
+ analyze_btn.click(
626
+ fn=load_and_analyze,
627
+ inputs=[selected_folder, price_cache_read, price_cache_creation, price_completion],
628
+ outputs=[
629
+ analysis_section,
630
+ plot_steps, plot_cost, plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown,
631
+ ],
632
  )
633
 
634
  return app
 
636
 
637
  if __name__ == "__main__":
638
  app = build_app()
639
+ app.queue()
640
  app.launch()
 
pyproject.toml CHANGED
@@ -8,6 +8,7 @@ requires-python = ">=3.10"
8
  dependencies = [
9
  "gradio>=6.0.2",
10
  "pandas>=2.0.0",
 
11
  "requests>=2.31.0",
12
  "python-dotenv>=1.0.0",
13
  ]
 
8
  dependencies = [
9
  "gradio>=6.0.2",
10
  "pandas>=2.0.0",
11
+ "plotly>=5.18.0",
12
  "requests>=2.31.0",
13
  "python-dotenv>=1.0.0",
14
  ]
uv.lock CHANGED
@@ -615,6 +615,15 @@ wheels = [
615
  { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" },
616
  ]
617
 
 
 
 
 
 
 
 
 
 
618
  [[package]]
619
  name = "numpy"
620
  version = "2.2.6"
@@ -1016,6 +1025,19 @@ wheels = [
1016
  { url = "https://files.pythonhosted.org/packages/95/7e/f896623c3c635a90537ac093c6a618ebe1a90d87206e42309cb5d98a1b9e/pillow-12.0.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:b290fd8aa38422444d4b50d579de197557f182ef1068b75f5aa8558638b8d0a5", size = 6997850, upload-time = "2025-10-15T18:24:11.495Z" },
1017
  ]
1018
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1019
  [[package]]
1020
  name = "pydantic"
1021
  version = "2.12.4"
@@ -1305,6 +1327,7 @@ source = { virtual = "." }
1305
  dependencies = [
1306
  { name = "gradio" },
1307
  { name = "pandas" },
 
1308
  { name = "python-dotenv" },
1309
  { name = "requests" },
1310
  ]
@@ -1318,6 +1341,7 @@ dev = [
1318
  requires-dist = [
1319
  { name = "gradio", specifier = ">=6.0.2" },
1320
  { name = "pandas", specifier = ">=2.0.0" },
 
1321
  { name = "python-dotenv", specifier = ">=1.0.0" },
1322
  { name = "requests", specifier = ">=2.31.0" },
1323
  { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.8.0" },
 
615
  { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" },
616
  ]
617
 
618
+ [[package]]
619
+ name = "narwhals"
620
+ version = "2.13.0"
621
+ source = { registry = "https://pypi.org/simple" }
622
+ sdist = { url = "https://files.pythonhosted.org/packages/89/ea/f82ef99ced4d03c33bb314c9b84a08a0a86c448aaa11ffd6256b99538aa5/narwhals-2.13.0.tar.gz", hash = "sha256:ee94c97f4cf7cfeebbeca8d274784df8b3d7fd3f955ce418af998d405576fdd9", size = 594555, upload-time = "2025-12-01T13:54:05.329Z" }
623
+ wheels = [
624
+ { url = "https://files.pythonhosted.org/packages/87/0d/1861d1599571974b15b025e12b142d8e6b42ad66c8a07a89cb0fc21f1e03/narwhals-2.13.0-py3-none-any.whl", hash = "sha256:9b795523c179ca78204e3be53726da374168f906e38de2ff174c2363baaaf481", size = 426407, upload-time = "2025-12-01T13:54:03.861Z" },
625
+ ]
626
+
627
  [[package]]
628
  name = "numpy"
629
  version = "2.2.6"
 
1025
  { url = "https://files.pythonhosted.org/packages/95/7e/f896623c3c635a90537ac093c6a618ebe1a90d87206e42309cb5d98a1b9e/pillow-12.0.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:b290fd8aa38422444d4b50d579de197557f182ef1068b75f5aa8558638b8d0a5", size = 6997850, upload-time = "2025-10-15T18:24:11.495Z" },
1026
  ]
1027
 
1028
+ [[package]]
1029
+ name = "plotly"
1030
+ version = "6.5.0"
1031
+ source = { registry = "https://pypi.org/simple" }
1032
+ dependencies = [
1033
+ { name = "narwhals" },
1034
+ { name = "packaging" },
1035
+ ]
1036
+ sdist = { url = "https://files.pythonhosted.org/packages/94/05/1199e2a03ce6637960bc1e951ca0f928209a48cfceb57355806a88f214cf/plotly-6.5.0.tar.gz", hash = "sha256:d5d38224883fd38c1409bef7d6a8dc32b74348d39313f3c52ca998b8e447f5c8", size = 7013624, upload-time = "2025-11-17T18:39:24.523Z" }
1037
+ wheels = [
1038
+ { url = "https://files.pythonhosted.org/packages/e7/c3/3031c931098de393393e1f93a38dc9ed6805d86bb801acc3cf2d5bd1e6b7/plotly-6.5.0-py3-none-any.whl", hash = "sha256:5ac851e100367735250206788a2b1325412aa4a4917a4fe3e6f0bc5aa6f3d90a", size = 9893174, upload-time = "2025-11-17T18:39:20.351Z" },
1039
+ ]
1040
+
1041
  [[package]]
1042
  name = "pydantic"
1043
  version = "2.12.4"
 
1327
  dependencies = [
1328
  { name = "gradio" },
1329
  { name = "pandas" },
1330
+ { name = "plotly" },
1331
  { name = "python-dotenv" },
1332
  { name = "requests" },
1333
  ]
 
1341
  requires-dist = [
1342
  { name = "gradio", specifier = ">=6.0.2" },
1343
  { name = "pandas", specifier = ">=2.0.0" },
1344
+ { name = "plotly", specifier = ">=5.18.0" },
1345
  { name = "python-dotenv", specifier = ">=1.0.0" },
1346
  { name = "requests", specifier = ">=2.31.0" },
1347
  { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.8.0" },