IgorSlinko committed on
Commit
10a0b43
·
1 Parent(s): 748e866

Add metadata-based single trajectory statistics, fix calculated formula (v0.3.37)

Browse files

- Add "One trajectory statistics. Metadata from .traj" section
- Fix "One trajectory statistics. Calculated" to use same formula as aggregate charts
- uncached_input = prompt_tokens - cache_read - cache_creation

Files changed (2) hide show
  1. app.py +376 -10
  2. requirements.txt +1 -0
app.py CHANGED
@@ -174,6 +174,276 @@ def calculate_per_step_tokens(steps: list[dict]) -> list[dict]:
174
  return result
175
 
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  def create_single_trajectory_chart(steps: list[dict]):
178
  """Create stacked bar chart for a single trajectory showing tokens per step."""
179
  import plotly.graph_objects as go
@@ -184,10 +454,11 @@ def create_single_trajectory_chart(steps: list[dict]):
184
  per_step_data = calculate_per_step_tokens(steps)
185
 
186
  x_labels = [f"Step {d['step']}" for d in per_step_data]
187
- uncached = [d["uncached_input"] / 1e3 for d in per_step_data]
188
  cache_read = [d["cache_read"] / 1e3 for d in per_step_data]
189
  cache_creation = [d["cache_creation"] / 1e3 for d in per_step_data]
190
  completion = [d["completion"] / 1e3 for d in per_step_data]
 
 
191
 
192
  fig = go.Figure()
193
 
@@ -244,10 +515,16 @@ def create_single_trajectory_cost_chart(steps: list[dict], input_price: float, c
244
  per_step_data = calculate_per_step_tokens(steps)
245
 
246
  x_labels = [f"Step {d['step']}" for d in per_step_data]
247
- uncached_cost = [d["uncached_input"] * input_price / 1e6 for d in per_step_data]
248
- cache_read_cost = [d["cache_read"] * cache_read_price / 1e6 for d in per_step_data]
249
- cache_creation_cost = [d["cache_creation"] * cache_creation_price / 1e6 for d in per_step_data]
250
- completion_cost = [d["completion"] * completion_price / 1e6 for d in per_step_data]
 
 
 
 
 
 
251
 
252
  fig = go.Figure()
253
 
@@ -842,8 +1119,34 @@ def parse_trajectory(traj_path: Path) -> dict:
842
  result["prompt_tokens"] += usage.get("prompt_tokens", 0) or 0
843
  result["completion_tokens"] += usage.get("completion_tokens", 0) or 0
844
  result["total_tokens"] += usage.get("total_tokens", 0) or 0
845
- result["cache_read_tokens"] += usage.get("cache_read_input_tokens", 0) or 0
846
- result["cache_creation_tokens"] += usage.get("cache_creation_input_tokens", 0) or 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
847
 
848
  return result
849
 
@@ -1515,7 +1818,7 @@ def build_app():
1515
  """)
1516
  trajectories_state = gr.State(None)
1517
 
1518
- gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.36`")
1519
  gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
1520
 
1521
  with gr.Row():
@@ -1558,6 +1861,14 @@ def build_app():
1558
  with gr.Row():
1559
  plot_cost_breakdown_calc = gr.Plot(label="Cost per Trajectory")
1560
 
 
 
 
 
 
 
 
 
1561
  with gr.Accordion("One trajectory statistics. Calculated from .traj messages", open=False, visible=False) as single_traj_accordion:
1562
  with gr.Row():
1563
  single_traj_dropdown = gr.Dropdown(label="Select Issue", choices=[], interactive=True)
@@ -2387,6 +2698,10 @@ def build_app():
2387
  gr.update(),
2388
  gr.update(),
2389
  gr.update(),
 
 
 
 
2390
  )
2391
 
2392
  if not folder:
@@ -2408,6 +2723,10 @@ def build_app():
2408
  gr.update(),
2409
  gr.update(),
2410
  gr.update(),
 
 
 
 
2411
  )
2412
  progress(0.3, desc="Downloading")
2413
  status, _ = download_trajectories_from_s3(folder)
@@ -2425,6 +2744,10 @@ def build_app():
2425
  gr.update(),
2426
  gr.update(),
2427
  gr.update(),
 
 
 
 
2428
  )
2429
  return
2430
  progress(0.45, desc="Loading trajectories")
@@ -2441,6 +2764,10 @@ def build_app():
2441
  gr.update(),
2442
  gr.update(),
2443
  gr.update(),
 
 
 
 
2444
  )
2445
 
2446
  progress(0.6, desc="Reading metadata")
@@ -2449,8 +2776,10 @@ def build_app():
2449
  df_calc = ensure_token_columns(load_all_trajectories_calculated(folder))
2450
  df_calc["api_calls"] = df_meta["api_calls"].values
2451
  df_calc["instance_cost"] = df_meta["instance_cost"].values
2452
- progress(0.8, desc="Reading steps")
2453
  trajectory_steps = load_all_trajectory_steps(folder)
 
 
2454
 
2455
  model_details, _ = get_model_details(folder)
2456
  resolved_instances = {}
@@ -2459,7 +2788,7 @@ def build_app():
2459
  for inst_id, details in per_instance.items():
2460
  resolved_instances[inst_id] = details.get("resolved", False)
2461
 
2462
- state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps, "resolved": resolved_instances}
2463
 
2464
  if df_meta.empty:
2465
  progress(1, desc="No trajectories found")
@@ -2475,6 +2804,10 @@ def build_app():
2475
  gr.update(),
2476
  gr.update(),
2477
  gr.update(),
 
 
 
 
2478
  )
2479
  return
2480
 
@@ -2504,6 +2837,10 @@ def build_app():
2504
  issue_ids = sorted(trajectory_steps.keys())
2505
  first_issue = issue_ids[0] if issue_ids else None
2506
 
 
 
 
 
2507
  progress(1, desc="Done")
2508
  yield (
2509
  f"✅ Loaded {len(df_meta)} trajectories",
@@ -2517,6 +2854,10 @@ def build_app():
2517
  gr.update(choices=issue_ids, value=first_issue),
2518
  gr.update(),
2519
  gr.update(),
 
 
 
 
2520
  )
2521
 
2522
  def on_single_traj_select(state_data, issue_id, input_price, cache_read_price, cache_creation_price, completion_price):
@@ -2530,6 +2871,17 @@ def build_app():
2530
  cost_chart = create_single_trajectory_cost_chart(steps, input_price, cache_read_price, cache_creation_price, completion_price)
2531
  return tokens_chart, cost_chart
2532
 
 
 
 
 
 
 
 
 
 
 
 
2533
  analyze_btn.click(
2534
  fn=load_and_analyze,
2535
  inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache],
@@ -2545,11 +2897,19 @@ def build_app():
2545
  single_traj_dropdown,
2546
  single_traj_plot,
2547
  single_traj_cost_plot,
 
 
 
 
2548
  ],
2549
  ).then(
2550
  fn=on_single_traj_select,
2551
  inputs=[trajectories_state, single_traj_dropdown, price_input, price_cache_read, price_cache_creation, price_completion],
2552
  outputs=[single_traj_plot, single_traj_cost_plot],
 
 
 
 
2553
  )
2554
 
2555
  def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
@@ -2613,6 +2973,12 @@ def build_app():
2613
  outputs=[single_traj_plot, single_traj_cost_plot],
2614
  )
2615
 
 
 
 
 
 
 
2616
  thinking_overhead.change(
2617
  fn=on_calc_options_change,
2618
  inputs=calc_options_inputs,
 
174
  return result
175
 
176
 
177
+ def _parse_usage_from_log_line(line: str) -> dict | None:
178
+ """
179
+ Parse usage info from log line containing ModelResponse or similar format.
180
+ Returns dict with prompt_tokens, completion_tokens, cached_tokens, etc.
181
+ """
182
+ if "usage=" not in line:
183
+ return None
184
+
185
+ result = {}
186
+
187
+ for field in ["completion_tokens", "prompt_tokens", "total_tokens"]:
188
+ match = re.search(rf'{field}=(\d+)', line)
189
+ if match:
190
+ result[field] = int(match.group(1))
191
+
192
+ cached_match = re.search(r'cached_tokens=(\d+)', line)
193
+ if cached_match:
194
+ result["cached_tokens"] = int(cached_match.group(1))
195
+
196
+ return result if result else None
197
+
198
+
199
def _parse_old_format_log(log_path: Path) -> list[dict]:
    """
    Extract per-step token usage from an old-format SWE-agent ``.info.log``.

    Every line containing ``usage=Usage(`` is treated as one model call
    (one step).  Old logs expose only a single cached-token counter, so
    ``cache_creation`` is always reported as 0 and the uncached input is
    derived as ``prompt_tokens - cached_tokens`` (floored at zero).

    Parsing is best-effort: any I/O or decode error is logged at debug
    level and the steps collected so far are returned.
    """
    steps: list[dict] = []
    try:
        with open(log_path, "r", encoding="utf-8") as fh:
            for raw_line in fh:
                if "usage=Usage(" not in raw_line:
                    continue

                usage = _parse_usage_from_log_line(raw_line)
                if not usage:
                    continue

                prompt = usage.get("prompt_tokens", 0)
                cached = usage.get("cached_tokens", 0)
                steps.append({
                    "step": len(steps),
                    "cache_read": cached,
                    "uncached_input": max(0, prompt - cached),
                    "completion": usage.get("completion_tokens", 0),
                    "cache_creation": 0,
                })
    except Exception as exc:
        logging.debug("Error parsing log file %s: %s", log_path, exc)

    return steps
234
+
235
+
236
def parse_trajectory_metadata_per_step(traj_path: Path) -> list[dict]:
    """
    Extract per-step token usage recorded in a trajectory file's metadata.

    Supports the new format (``.traj.json`` with per-message ``usage`` or
    ``extra.response.usage``) and falls back to the old format (a sibling
    ``.info.log`` file) when no assistant message carries usage data.

    Returns one dict per assistant turn:
    ``{step, cache_read, uncached_input, completion, cache_creation}``.
    """
    with open(traj_path, "r", encoding="utf-8") as fh:
        payload = json.load(fh)

    steps: list[dict] = []
    for msg in payload.get("messages", []):
        if msg.get("role") != "assistant":
            continue

        # Usage may sit directly on the message or under extra.response.
        usage = None
        if "usage" in msg:
            usage = msg["usage"]
        elif isinstance(msg.get("extra"), dict):
            response = msg["extra"].get("response", {})
            if isinstance(response, dict):
                usage = response.get("usage", {})

        if not usage:
            continue

        prompt = usage.get("prompt_tokens", 0) or 0
        completion = usage.get("completion_tokens", 0) or 0
        cache_read = usage.get("cache_read_input_tokens", 0) or 0
        cache_creation = usage.get("cache_creation_input_tokens", 0) or 0

        # OpenAI-style responses report the cached amount inside
        # prompt_tokens_details instead of a top-level field.
        details = usage.get("prompt_tokens_details", {})
        if isinstance(details, dict):
            detail_cached = details.get("cached_tokens", 0) or 0
            if detail_cached > 0 and cache_read == 0:
                cache_read = detail_cached

        steps.append({
            "step": len(steps),
            "cache_read": cache_read,
            "uncached_input": max(0, prompt - cache_read - cache_creation),
            "completion": completion,
            "cache_creation": cache_creation,
        })

    if not steps:
        # Old-format trajectories keep usage in a side-car .info.log file.
        log_path = traj_path.with_suffix(".info.log")
        if not log_path.exists():
            base_name = traj_path.stem.replace(".traj", "")
            log_path = traj_path.parent / f"{base_name}.info.log"

        if log_path.exists():
            steps = _parse_old_format_log(log_path)

    return steps
297
+
298
+
299
def load_all_trajectory_metadata_steps(folder: str) -> dict[str, list[dict]]:
    """
    Collect per-step metadata for every trajectory under ``TRAJS_DIR/folder``.

    Trajectory files are located by trying a sequence of glob patterns from
    the most specific layout to the most generic; the first pattern that
    matches anything wins.

    Returns:
        dict mapping instance_id -> list of per-step metadata.  Files that
        fail to parse are logged and skipped.
    """
    output_dir = TRAJS_DIR / folder

    traj_files: list = []
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    per_instance: dict[str, list[dict]] = {}
    for traj_path in traj_files:
        try:
            instance_id = traj_path.stem.replace(".traj", "")
            steps = parse_trajectory_metadata_per_step(traj_path)
            if steps:
                per_instance[instance_id] = steps
        except Exception as exc:
            logging.error("Error parsing metadata steps for %s: %s", traj_path, exc, exc_info=True)

    return per_instance
329
+
330
+
331
def create_single_trajectory_meta_chart(steps: list[dict]):
    """Create a stacked bar chart of metadata token counts per step for one trajectory."""
    import plotly.graph_objects as go

    if not steps:
        return None

    x_labels = [f"Step {d['step']}" for d in steps]

    # (trace label, per-step values in thousands of tokens, bar color)
    series = [
        ("Uncached Input", [d["uncached_input"] / 1e3 for d in steps], "#EF553B"),
        ("Cache Read", [d["cache_read"] / 1e3 for d in steps], "#19D3F3"),
        ("Cache Creation", [d["cache_creation"] / 1e3 for d in steps], "#FFA15A"),
        ("Completion", [d["completion"] / 1e3 for d in steps], "#AB63FA"),
    ]

    fig = go.Figure()
    for label, values, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=values,
            marker_color=color,
            hovertemplate=f"Step %{{x}}<br>{label}: %{{y:.2f}}K<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Tokens (K)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig
387
+
388
+
389
def create_single_trajectory_meta_cost_chart(steps: list[dict], input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create a stacked bar chart of metadata-derived cost per step for one trajectory.

    Prices are expressed per million tokens, so each component cost is
    ``tokens * price / 1e6``.
    """
    import plotly.graph_objects as go

    if not steps:
        return None

    x_labels = [f"Step {d['step']}" for d in steps]

    # (trace label, per-step cost in dollars, bar color)
    series = [
        ("Uncached Input", [d["uncached_input"] * input_price / 1e6 for d in steps], "#EF553B"),
        ("Cache Read", [d["cache_read"] * cache_read_price / 1e6 for d in steps], "#19D3F3"),
        ("Cache Creation", [d["cache_creation"] * cache_creation_price / 1e6 for d in steps], "#FFA15A"),
        ("Completion", [d["completion"] * completion_price / 1e6 for d in steps], "#AB63FA"),
    ]

    fig = go.Figure()
    for label, values, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=values,
            marker_color=color,
            hovertemplate=f"Step %{{x}}<br>{label}: $%{{y:.4f}}<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig
445
+
446
+
447
  def create_single_trajectory_chart(steps: list[dict]):
448
  """Create stacked bar chart for a single trajectory showing tokens per step."""
449
  import plotly.graph_objects as go
 
454
  per_step_data = calculate_per_step_tokens(steps)
455
 
456
  x_labels = [f"Step {d['step']}" for d in per_step_data]
 
457
  cache_read = [d["cache_read"] / 1e3 for d in per_step_data]
458
  cache_creation = [d["cache_creation"] / 1e3 for d in per_step_data]
459
  completion = [d["completion"] / 1e3 for d in per_step_data]
460
+ prompt_tokens = [(d["cache_read"] + d["uncached_input"]) / 1e3 for d in per_step_data]
461
+ uncached = [max(0, p - cr - cc) for p, cr, cc in zip(prompt_tokens, cache_read, cache_creation)]
462
 
463
  fig = go.Figure()
464
 
 
515
  per_step_data = calculate_per_step_tokens(steps)
516
 
517
  x_labels = [f"Step {d['step']}" for d in per_step_data]
518
+ cache_read = [d["cache_read"] for d in per_step_data]
519
+ cache_creation = [d["cache_creation"] for d in per_step_data]
520
+ completion = [d["completion"] for d in per_step_data]
521
+ prompt_tokens = [d["cache_read"] + d["uncached_input"] for d in per_step_data]
522
+ uncached = [max(0, p - cr - cc) for p, cr, cc in zip(prompt_tokens, cache_read, cache_creation)]
523
+
524
+ uncached_cost = [u * input_price / 1e6 for u in uncached]
525
+ cache_read_cost = [cr * cache_read_price / 1e6 for cr in cache_read]
526
+ cache_creation_cost = [cc * cache_creation_price / 1e6 for cc in cache_creation]
527
+ completion_cost = [c * completion_price / 1e6 for c in completion]
528
 
529
  fig = go.Figure()
530
 
 
1119
  result["prompt_tokens"] += usage.get("prompt_tokens", 0) or 0
1120
  result["completion_tokens"] += usage.get("completion_tokens", 0) or 0
1121
  result["total_tokens"] += usage.get("total_tokens", 0) or 0
1122
+
1123
+ cache_read = usage.get("cache_read_input_tokens", 0) or 0
1124
+ cache_creation = usage.get("cache_creation_input_tokens", 0) or 0
1125
+
1126
+ prompt_tokens_details = usage.get("prompt_tokens_details", {})
1127
+ if isinstance(prompt_tokens_details, dict):
1128
+ cached_from_details = prompt_tokens_details.get("cached_tokens", 0) or 0
1129
+ if cached_from_details > 0 and cache_read == 0:
1130
+ cache_read = cached_from_details
1131
+
1132
+ result["cache_read_tokens"] += cache_read
1133
+ result["cache_creation_tokens"] += cache_creation
1134
+
1135
+ if result["prompt_tokens"] == 0 and result["completion_tokens"] == 0:
1136
+ log_path = traj_path.with_suffix(".info.log")
1137
+ if not log_path.exists():
1138
+ base_name = traj_path.stem.replace(".traj", "")
1139
+ log_path = traj_path.parent / f"{base_name}.info.log"
1140
+
1141
+ if log_path.exists():
1142
+ steps = _parse_old_format_log(log_path)
1143
+ for step_data in steps:
1144
+ result["prompt_tokens"] += step_data["cache_read"] + step_data["uncached_input"]
1145
+ result["completion_tokens"] += step_data["completion"]
1146
+ result["cache_read_tokens"] += step_data["cache_read"]
1147
+ result["total_tokens"] = result["prompt_tokens"] + result["completion_tokens"]
1148
+ if result["api_calls"] == 0:
1149
+ result["api_calls"] = len(steps)
1150
 
1151
  return result
1152
 
 
1818
  """)
1819
  trajectories_state = gr.State(None)
1820
 
1821
+ gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.37`")
1822
  gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
1823
 
1824
  with gr.Row():
 
1861
  with gr.Row():
1862
  plot_cost_breakdown_calc = gr.Plot(label="Cost per Trajectory")
1863
 
1864
+ with gr.Accordion("One trajectory statistics. Metadata from .traj", open=False, visible=False) as single_traj_meta_accordion:
1865
+ with gr.Row():
1866
+ single_traj_meta_dropdown = gr.Dropdown(label="Select Issue", choices=[], interactive=True)
1867
+ with gr.Row():
1868
+ single_traj_meta_plot = gr.Plot(label="Tokens per Step (stacked)")
1869
+ with gr.Row():
1870
+ single_traj_meta_cost_plot = gr.Plot(label="Cost per Step (stacked) ($)")
1871
+
1872
  with gr.Accordion("One trajectory statistics. Calculated from .traj messages", open=False, visible=False) as single_traj_accordion:
1873
  with gr.Row():
1874
  single_traj_dropdown = gr.Dropdown(label="Select Issue", choices=[], interactive=True)
 
2698
  gr.update(),
2699
  gr.update(),
2700
  gr.update(),
2701
+ gr.update(visible=False),
2702
+ gr.update(),
2703
+ gr.update(),
2704
+ gr.update(),
2705
  )
2706
 
2707
  if not folder:
 
2723
  gr.update(),
2724
  gr.update(),
2725
  gr.update(),
2726
+ gr.update(visible=False),
2727
+ gr.update(),
2728
+ gr.update(),
2729
+ gr.update(),
2730
  )
2731
  progress(0.3, desc="Downloading")
2732
  status, _ = download_trajectories_from_s3(folder)
 
2744
  gr.update(),
2745
  gr.update(),
2746
  gr.update(),
2747
+ gr.update(visible=False),
2748
+ gr.update(),
2749
+ gr.update(),
2750
+ gr.update(),
2751
  )
2752
  return
2753
  progress(0.45, desc="Loading trajectories")
 
2764
  gr.update(),
2765
  gr.update(),
2766
  gr.update(),
2767
+ gr.update(visible=False),
2768
+ gr.update(),
2769
+ gr.update(),
2770
+ gr.update(),
2771
  )
2772
 
2773
  progress(0.6, desc="Reading metadata")
 
2776
  df_calc = ensure_token_columns(load_all_trajectories_calculated(folder))
2777
  df_calc["api_calls"] = df_meta["api_calls"].values
2778
  df_calc["instance_cost"] = df_meta["instance_cost"].values
2779
+ progress(0.75, desc="Reading steps")
2780
  trajectory_steps = load_all_trajectory_steps(folder)
2781
+ progress(0.8, desc="Reading metadata steps")
2782
+ metadata_steps = load_all_trajectory_metadata_steps(folder)
2783
 
2784
  model_details, _ = get_model_details(folder)
2785
  resolved_instances = {}
 
2788
  for inst_id, details in per_instance.items():
2789
  resolved_instances[inst_id] = details.get("resolved", False)
2790
 
2791
+ state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps, "metadata_steps": metadata_steps, "resolved": resolved_instances}
2792
 
2793
  if df_meta.empty:
2794
  progress(1, desc="No trajectories found")
 
2804
  gr.update(),
2805
  gr.update(),
2806
  gr.update(),
2807
+ gr.update(visible=False),
2808
+ gr.update(),
2809
+ gr.update(),
2810
+ gr.update(),
2811
  )
2812
  return
2813
 
 
2837
  issue_ids = sorted(trajectory_steps.keys())
2838
  first_issue = issue_ids[0] if issue_ids else None
2839
 
2840
+ meta_issue_ids = sorted(metadata_steps.keys())
2841
+ first_meta_issue = meta_issue_ids[0] if meta_issue_ids else None
2842
+ has_meta_steps = len(meta_issue_ids) > 0
2843
+
2844
  progress(1, desc="Done")
2845
  yield (
2846
  f"✅ Loaded {len(df_meta)} trajectories",
 
2854
  gr.update(choices=issue_ids, value=first_issue),
2855
  gr.update(),
2856
  gr.update(),
2857
+ gr.update(visible=has_meta_steps),
2858
+ gr.update(choices=meta_issue_ids, value=first_meta_issue),
2859
+ gr.update(),
2860
+ gr.update(),
2861
  )
2862
 
2863
  def on_single_traj_select(state_data, issue_id, input_price, cache_read_price, cache_creation_price, completion_price):
 
2871
  cost_chart = create_single_trajectory_cost_chart(steps, input_price, cache_read_price, cache_creation_price, completion_price)
2872
  return tokens_chart, cost_chart
2873
 
2874
def on_single_traj_meta_select(state_data, issue_id, input_price, cache_read_price, cache_creation_price, completion_price):
    """Render the token and cost charts for the issue chosen in the metadata dropdown.

    Returns ``(None, None)`` when no state is loaded or the issue has no
    per-step metadata.
    """
    if state_data is None or not issue_id:
        return None, None
    meta_steps = state_data.get("metadata_steps", {})
    if issue_id not in meta_steps:
        return None, None
    steps = meta_steps[issue_id]
    return (
        create_single_trajectory_meta_chart(steps),
        create_single_trajectory_meta_cost_chart(steps, input_price, cache_read_price, cache_creation_price, completion_price),
    )
2884
+
2885
  analyze_btn.click(
2886
  fn=load_and_analyze,
2887
  inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache],
 
2897
  single_traj_dropdown,
2898
  single_traj_plot,
2899
  single_traj_cost_plot,
2900
+ single_traj_meta_accordion,
2901
+ single_traj_meta_dropdown,
2902
+ single_traj_meta_plot,
2903
+ single_traj_meta_cost_plot,
2904
  ],
2905
  ).then(
2906
  fn=on_single_traj_select,
2907
  inputs=[trajectories_state, single_traj_dropdown, price_input, price_cache_read, price_cache_creation, price_completion],
2908
  outputs=[single_traj_plot, single_traj_cost_plot],
2909
+ ).then(
2910
+ fn=on_single_traj_meta_select,
2911
+ inputs=[trajectories_state, single_traj_meta_dropdown, price_input, price_cache_read, price_cache_creation, price_completion],
2912
+ outputs=[single_traj_meta_plot, single_traj_meta_cost_plot],
2913
  )
2914
 
2915
  def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
 
2973
  outputs=[single_traj_plot, single_traj_cost_plot],
2974
  )
2975
 
2976
+ single_traj_meta_dropdown.change(
2977
+ fn=on_single_traj_meta_select,
2978
+ inputs=[trajectories_state, single_traj_meta_dropdown, price_input, price_cache_read, price_cache_creation, price_completion],
2979
+ outputs=[single_traj_meta_plot, single_traj_meta_cost_plot],
2980
+ )
2981
+
2982
  thinking_overhead.change(
2983
  fn=on_calc_options_change,
2984
  inputs=calc_options_inputs,
requirements.txt CHANGED
@@ -6,3 +6,4 @@ python-dotenv>=1.0.0
6
  tiktoken>=0.12.0
7
  awscli
8
 
 
 
6
  tiktoken>=0.12.0
7
  awscli
8
 
9
+