IgorSlinko committed on
Commit 9399ab7 · 1 Parent(s): 723fdc8

Major improvements to token calculation and UX


- Pre-calculate both Metadata and Calculated tokens on Load & Analyze
- Instant switching between token sources (no reload needed)
- Fix token calculation: only count user messages that have an assistant response (see the sketch after this list)
- Provider-specific tokenizer overhead: Claude=1.24, Gemini/OpenAI=1.0
- Case-insensitive model name matching for litellm prices
- Add '% resolved' column, remove 'os_system' from leaderboard table
- Rename 'Instance' to 'Trajectory' throughout UI
- Sort bottom charts by total tokens (not cache_read)
- Make last two charts side-by-side
- Add 'Use Cache' checkbox for Calculated mode
- Fix division by zero when total_count=0
- Copy api_calls and instance_cost from metadata to calculated data
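
For orientation, here is a minimal sketch of the per-turn accounting that the token-calculation fix targets. It mirrors the loop visible in the app.py hunks below, but the example message_tokens input and the lines marked "assumed" (completion and cache-read updates) are illustrative assumptions, not the exact implementation.

# Sketch, not the real app.py: only user turns that are followed by an
# assistant response contribute to the billed totals; the removed
# `elif is_last` branch used to also bill a trailing turn with no reply.
message_tokens = [  # example input: per-message token counts by role
    {"role": "user", "tokens": 120},
    {"role": "assistant", "tokens": 30},
    {"role": "user", "tokens": 15},  # trailing turn with no reply: now ignored
]

prompt_tokens = completion_tokens = 0
cache_read_tokens = cache_creation_tokens = 0
context_so_far = cached_context = 0

for i, mt in enumerate(message_tokens):
    context_so_far += mt["tokens"]
    next_is_assistant = (i + 1 < len(message_tokens)
                         and message_tokens[i + 1]["role"] == "assistant")
    if not next_is_assistant:
        continue  # skip turns with no assistant reply

    prompt_tokens += context_so_far
    cache_read_tokens += cached_context                # assumed detail
    assistant_tokens = message_tokens[i + 1]["tokens"]
    completion_tokens += assistant_tokens              # assumed detail
    cache_creation_tokens += (context_so_far - cached_context) + assistant_tokens
    cached_context = context_so_far + assistant_tokens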

Files changed (1)
  1. app.py +177 -97
app.py CHANGED
@@ -28,22 +28,33 @@ _trajectories_cache = {}
 _calculated_tokens_cache = {}
 
 
+def get_default_overhead(model_name: str) -> float:
+    """Get default tokenizer overhead for model provider"""
+    model_lower = model_name.lower() if model_name else ""
+
+    if "claude" in model_lower or "anthropic" in model_lower:
+        return 1.24
+    elif "gemini" in model_lower or "google" in model_lower:
+        return 1.0
+    elif "gpt" in model_lower or "openai" in model_lower or "o1" in model_lower or "o3" in model_lower:
+        return 1.0
+    else:
+        return 1.0
+
+
 def get_tokenizer(model_name: str):
     """Get appropriate tokenizer for model. Returns (tokenizer_func, name)"""
     global _tokenizer_cache
-
+
     model_lower = model_name.lower() if model_name else ""
-
-    # Determine tokenizer type
+
     if "gpt-4o" in model_lower or "o1" in model_lower or "o3" in model_lower:
         tokenizer_name = "o200k_base"
     elif "gpt" in model_lower or "claude" in model_lower or "anthropic" in model_lower:
         tokenizer_name = "cl100k_base"
     elif "gemini" in model_lower or "google" in model_lower:
-        # Gemini uses ~3.23 chars per token (calculated from actual API responses)
         return lambda text: int(len(text) / 3.23), "gemini_approx"
     else:
-        # Default to cl100k_base for unknown models
         tokenizer_name = "cl100k_base"
 
     if tokenizer_name not in _tokenizer_cache:
@@ -75,8 +86,7 @@ def calculate_tokens_from_trajectory(traj_path: Path, model_name: str) -> dict:
         return {"prompt_tokens": 0, "completion_tokens": 0, "cache_read_tokens": 0, "cache_creation_tokens": 0, "api_calls": 0}
 
     count_tokens, _ = get_tokenizer(model_name)
-
-    # Calculate tokens for each message
+
     message_tokens = []
     for msg in messages:
         content = msg.get("content", "")
@@ -108,7 +118,6 @@ def calculate_tokens_from_trajectory(traj_path: Path, model_name: str) -> dict:
         context_so_far += mt["tokens"]
 
         next_is_assistant = (i + 1 < len(message_tokens) and message_tokens[i + 1]["role"] == "assistant")
-        is_last = (i == len(message_tokens) - 1)
 
         if next_is_assistant:
             prompt_tokens += context_so_far
@@ -117,10 +126,6 @@ def calculate_tokens_from_trajectory(traj_path: Path, model_name: str) -> dict:
             assistant_tokens = message_tokens[i + 1]["tokens"]
             cache_creation_tokens += (context_so_far - cached_context) + assistant_tokens
             cached_context = context_so_far + assistant_tokens
-        elif is_last:
-            prompt_tokens += context_so_far
-            cache_read_tokens += cached_context
-            cache_creation_tokens += context_so_far - cached_context
 
     return {
         "prompt_tokens": prompt_tokens,
@@ -145,6 +150,17 @@ def apply_thinking_overhead(df: pd.DataFrame, overhead: float) -> pd.DataFrame:
     return df
 
 
+def apply_no_cache(df: pd.DataFrame) -> pd.DataFrame:
+    """Convert all tokens to uncached input + completion (no caching)"""
+    if df.empty:
+        return df
+
+    df = df.copy()
+    df["cache_read_tokens"] = 0
+    df["cache_creation_tokens"] = 0
+    return df
+
+
 def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
     """Load trajectories with self-calculated token counts"""
     global _calculated_tokens_cache
@@ -220,6 +236,11 @@ def get_litellm_prices() -> dict:
     return _litellm_prices_cache
 
 
+def normalize_model_name(name: str) -> str:
+    """Normalize model name for comparison: lowercase, remove separators"""
+    return re.sub(r'[-_./]', '', name.lower())
+
+
 def get_model_prices(model_name: str) -> dict | None:
     if not model_name:
         return None
@@ -227,8 +248,7 @@ def get_model_prices(model_name: str) -> dict | None:
     prices = get_litellm_prices()
 
     clean_name = model_name.replace("anthropic/", "").replace("openai/", "")
-
-    # Try without date suffix (e.g., gemini-3-pro-preview-20251118 -> gemini-3-pro-preview)
+
     name_without_date = re.sub(r'-\d{8}$', '', clean_name)
 
     candidates = [
@@ -245,9 +265,16 @@ def get_model_prices(model_name: str) -> dict | None:
         if key in prices:
             return prices[key]
 
-    # Fuzzy match
+    normalized_name = normalize_model_name(clean_name)
+    normalized_no_date = normalize_model_name(name_without_date)
+
     for key, value in prices.items():
-        if clean_name in key or model_name in key or name_without_date in key:
+        key_normalized = normalize_model_name(key)
+        if normalized_name in key_normalized or normalized_no_date in key_normalized:
+            return value
+        key_last_part = key.split('/')[-1] if '/' in key else key
+        key_last_normalized = normalize_model_name(key_last_part)
+        if normalized_name == key_last_normalized or normalized_no_date == key_last_normalized:
             return value
 
     return None
@@ -274,15 +301,21 @@ def get_bash_only_df():
 
     rows = []
     for r in bash_only["results"]:
+        resolved_pct = r.get("resolved", 0)
+        if isinstance(resolved_pct, (int, float)):
+            resolved_str = f"{resolved_pct:.1f}%"
+        else:
+            resolved_str = str(resolved_pct)
+
         rows.append({
             "name": r.get("name", ""),
+            "% resolved": resolved_str,
            "date": r.get("date", ""),
            "cost": round(r.get("cost", 0), 2),
            "instance_cost": round(r.get("instance_cost", 0), 4),
            "instance_calls": r.get("instance_calls", 0),
            "folder": r.get("folder", ""),
            "os_model": "✅" if r.get("os_model") else "❌",
-            "os_system": "✅" if r.get("os_system") else "❌",
         })
 
     return pd.DataFrame(rows)
@@ -352,7 +385,12 @@ def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
         resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
         total_count = len(per_instance)
 
-        status = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({100*resolved_count/total_count:.1f}%)"
+        if total_count > 0:
+            resolved_pct = f"{100*resolved_count/total_count:.1f}%"
+        else:
+            resolved_pct = "N/A"
+
+        status = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({resolved_pct})"
         return status, gr.update(visible=True)
 
     except subprocess.TimeoutExpired:
@@ -520,32 +558,34 @@ def create_token_charts(df: pd.DataFrame, input_price: float, cache_read_price:
 
     fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)
 
-    # Stacked bar chart
-    df_sorted = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
-    df_sorted["instance_idx"] = range(len(df_sorted))
+    # Stacked bar chart - sort by total tokens (sum of all stacked)
+    df_sorted = df.copy()
     df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
+    df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"]
+    df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
+    df_sorted["trajectory_idx"] = range(len(df_sorted))
 
     fig_stacked = go.Figure()
     fig_stacked.add_trace(go.Bar(
-        name="Uncached Input", x=df_sorted["instance_idx"], y=df_sorted["uncached_input_tokens"],
-        marker_color="#EF553B", hovertemplate="Instance: %{x}<br>Uncached Input: %{y:,.0f}<extra></extra>",
+        name="Uncached Input", x=df_sorted["trajectory_idx"], y=df_sorted["uncached_input_tokens"],
+        marker_color="#EF553B", hovertemplate="Trajectory: %{x}<br>Uncached Input: %{y:,.0f}<extra></extra>",
     ))
     fig_stacked.add_trace(go.Bar(
-        name="Cache Read", x=df_sorted["instance_idx"], y=df_sorted["cache_read_tokens"],
-        marker_color="#19D3F3", hovertemplate="Instance: %{x}<br>Cache Read: %{y:,.0f}<extra></extra>",
+        name="Cache Read", x=df_sorted["trajectory_idx"], y=df_sorted["cache_read_tokens"],
+        marker_color="#19D3F3", hovertemplate="Trajectory: %{x}<br>Cache Read: %{y:,.0f}<extra></extra>",
     ))
     fig_stacked.add_trace(go.Bar(
-        name="Cache Creation", x=df_sorted["instance_idx"], y=df_sorted["cache_creation_tokens"],
-        marker_color="#FFA15A", hovertemplate="Instance: %{x}<br>Cache Creation: %{y:,.0f}<extra></extra>",
+        name="Cache Creation", x=df_sorted["trajectory_idx"], y=df_sorted["cache_creation_tokens"],
+        marker_color="#FFA15A", hovertemplate="Trajectory: %{x}<br>Cache Creation: %{y:,.0f}<extra></extra>",
     ))
     fig_stacked.add_trace(go.Bar(
-        name="Completion", x=df_sorted["instance_idx"], y=df_sorted["completion_tokens"],
-        marker_color="#AB63FA", hovertemplate="Instance: %{x}<br>Completion: %{y:,.0f}<extra></extra>",
+        name="Completion", x=df_sorted["trajectory_idx"], y=df_sorted["completion_tokens"],
+        marker_color="#AB63FA", hovertemplate="Trajectory: %{x}<br>Completion: %{y:,.0f}<extra></extra>",
     ))
     fig_stacked.update_layout(
         barmode="stack",
-        title="Billable Tokens per Instance (stacked)",
-        xaxis_title="Instance (sorted by cache read)",
+        title="Tokens per Trajectory (stacked)",
+        xaxis_title="Trajectory (sorted by total tokens)",
         yaxis_title="Tokens",
         legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
         margin=dict(l=50, r=20, t=60, b=40),
@@ -562,12 +602,12 @@ def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_pri
         df,
         x="api_calls",
         nbins=30,
-        title="Distribution of API Calls (Steps) per Instance",
+        title="Distribution of API Calls (Steps) per Trajectory",
         color_discrete_sequence=["#636EFA"],
     )
     fig_steps.update_layout(
         xaxis_title="API Calls (Steps)",
-        yaxis_title="Number of Instances",
+        yaxis_title="Number of Trajectories",
         showlegend=False,
         margin=dict(l=40, r=20, t=40, b=40),
     )
@@ -587,7 +627,7 @@ def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_pri
     )
     fig_cost.update_layout(
         xaxis_title="Cost ($)",
-        yaxis_title="Number of Instances",
+        yaxis_title="Number of Trajectories",
         showlegend=False,
         margin=dict(l=40, r=20, t=40, b=40),
     )
@@ -601,7 +641,7 @@ def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_pri
     total_completion = df["completion_tokens"].sum()
     total_cache_read = df["cache_read_tokens"].sum()
     total_cache_creation = df["cache_creation_tokens"].sum()
-    # Uncached input = prompt - cache_read - cache_creation (per instance, then sum)
+    # Uncached input = prompt - cache_read - cache_creation (per trajectory, then sum)
     df_temp = df.copy()
     df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
     total_uncached_input = df_temp["uncached_input"].sum()
@@ -637,49 +677,51 @@ def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_pri
     # Cost by token type (use separate function)
     fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)
 
-    df_sorted = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
-    df_sorted["instance_idx"] = range(len(df_sorted))
-    # Uncached input = prompt - cache_read - cache_creation
+    # Sort by total tokens (sum of all stacked)
+    df_sorted = df.copy()
    df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
+    df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"]
+    df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
+    df_sorted["trajectory_idx"] = range(len(df_sorted))
 
     fig_stacked = go.Figure()
 
     fig_stacked.add_trace(go.Bar(
         name="Uncached Input",
-        x=df_sorted["instance_idx"],
+        x=df_sorted["trajectory_idx"],
         y=df_sorted["uncached_input_tokens"],
         marker_color="#EF553B",
-        hovertemplate="Instance: %{x}<br>Uncached Input: %{y:,.0f}<extra></extra>",
+        hovertemplate="Trajectory: %{x}<br>Uncached Input: %{y:,.0f}<extra></extra>",
     ))
 
     fig_stacked.add_trace(go.Bar(
         name="Cache Read",
-        x=df_sorted["instance_idx"],
+        x=df_sorted["trajectory_idx"],
         y=df_sorted["cache_read_tokens"],
         marker_color="#19D3F3",
-        hovertemplate="Instance: %{x}<br>Cache Read: %{y:,.0f}<extra></extra>",
+        hovertemplate="Trajectory: %{x}<br>Cache Read: %{y:,.0f}<extra></extra>",
     ))
 
     fig_stacked.add_trace(go.Bar(
         name="Cache Creation",
-        x=df_sorted["instance_idx"],
+        x=df_sorted["trajectory_idx"],
         y=df_sorted["cache_creation_tokens"],
         marker_color="#FFA15A",
-        hovertemplate="Instance: %{x}<br>Cache Creation: %{y:,.0f}<extra></extra>",
+        hovertemplate="Trajectory: %{x}<br>Cache Creation: %{y:,.0f}<extra></extra>",
     ))
 
     fig_stacked.add_trace(go.Bar(
         name="Completion",
-        x=df_sorted["instance_idx"],
+        x=df_sorted["trajectory_idx"],
         y=df_sorted["completion_tokens"],
         marker_color="#AB63FA",
-        hovertemplate="Instance: %{x}<br>Completion: %{y:,.0f}<extra></extra>",
+        hovertemplate="Trajectory: %{x}<br>Completion: %{y:,.0f}<extra></extra>",
     ))
 
     fig_stacked.update_layout(
         barmode="stack",
-        title="Billable Tokens per Instance (stacked)",
-        xaxis_title="Instance (sorted by cache read)",
+        title="Tokens per Trajectory (stacked)",
+        xaxis_title="Trajectory (sorted by total tokens)",
         yaxis_title="Tokens",
         legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
         margin=dict(l=50, r=20, t=60, b=40),
@@ -692,11 +734,12 @@ def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price
     if df.empty:
         return None
 
-    df_sorted = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
-    df_sorted["instance_idx"] = range(len(df_sorted))
-
-    # Uncached input = prompt - cache_read - cache_creation
+    # Sort by total tokens (sum of all stacked)
+    df_sorted = df.copy()
     df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
+    df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"]
+    df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
+    df_sorted["trajectory_idx"] = range(len(df_sorted))
 
     df_sorted["cost_uncached_input"] = df_sorted["uncached_input_tokens"] * input_price / 1e6
     df_sorted["cost_cache_read"] = df_sorted["cache_read_tokens"] * cache_read_price / 1e6
@@ -707,34 +750,34 @@ def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price
 
     fig.add_trace(go.Bar(
         name=f"Uncached Input (${input_price:.2f}/1M)",
-        x=df_sorted["instance_idx"],
+        x=df_sorted["trajectory_idx"],
         y=df_sorted["cost_uncached_input"],
         marker_color="#EF553B",
-        hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
+        hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
     ))
 
     fig.add_trace(go.Bar(
         name=f"Cache Read (${cache_read_price:.2f}/1M)",
-        x=df_sorted["instance_idx"],
+        x=df_sorted["trajectory_idx"],
         y=df_sorted["cost_cache_read"],
         marker_color="#19D3F3",
-        hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
+        hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
     ))
 
     fig.add_trace(go.Bar(
         name=f"Cache Creation (${cache_creation_price:.2f}/1M)",
-        x=df_sorted["instance_idx"],
+        x=df_sorted["trajectory_idx"],
         y=df_sorted["cost_cache_creation"],
         marker_color="#FFA15A",
-        hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
+        hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
     ))
 
     fig.add_trace(go.Bar(
         name=f"Completion (${completion_price:.2f}/1M)",
-        x=df_sorted["instance_idx"],
+        x=df_sorted["trajectory_idx"],
         y=df_sorted["cost_completion"],
         marker_color="#AB63FA",
-        hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
+        hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
     ))
 
     total_cost = (
@@ -746,8 +789,8 @@ def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price
 
     fig.update_layout(
         barmode="stack",
-        title="Cost Breakdown per Instance",
-        xaxis_title="Instance (sorted by cache read)",
+        title="Cost per Trajectory",
+        xaxis_title="Trajectory (sorted by total tokens)",
         yaxis_title="Cost ($)",
         legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
         margin=dict(l=50, r=20, t=60, b=40),
@@ -829,14 +872,15 @@ def get_prices_for_folder(folder: str) -> tuple[dict, str]:
 def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
     if evt.index is None:
         return (
-            "", "",
-            gr.update(interactive=False),
-            gr.update(visible=False),
+            "", "",
+            gr.update(interactive=False),
+            gr.update(visible=False),
             gr.update(value=0, label="💲 Input"),
             gr.update(value=0, label="💲 Cache Read"),
             gr.update(value=0, label="💲 Cache Creation"),
             gr.update(value=0, label="💲 Completion"),
-            ""
+            "",
+            gr.update(value=1.0),
        )
 
     row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
@@ -847,6 +891,7 @@ def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
     show_analyze = check_trajectories_downloaded(folder)
 
     prices_dict, model_hint = get_prices_for_folder(folder)
+    default_overhead = get_default_overhead(model_hint)
 
     def price_update(price_info, name):
         value = price_info["value"]
@@ -858,14 +903,15 @@ def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
            return gr.update(value=0, label=f"❌ {name}")
 
     return (
-        folder, name,
-        gr.update(interactive=True),
-        gr.update(visible=show_analyze),
+        folder, name,
+        gr.update(interactive=True),
+        gr.update(visible=show_analyze),
         price_update(prices_dict["input"], "Input"),
         price_update(prices_dict["cache_read"], "Cache Read"),
         price_update(prices_dict["cache_creation"], "Cache Creation"),
         price_update(prices_dict["completion"], "Completion"),
-        model_hint
+        model_hint,
+        gr.update(value=default_overhead),
    )
 
 
@@ -899,10 +945,8 @@ def build_app():
                    plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)")
 
                with gr.Row():
-                    plot_stacked = gr.Plot(label="Billable Tokens per Instance")
-
-                with gr.Row():
-                    plot_cost_breakdown = gr.Plot(label="Cost Breakdown per Instance ($)")
+                    plot_stacked = gr.Plot(label="Tokens per Trajectory")
+                    plot_cost_breakdown = gr.Plot(label="Cost per Trajectory ($)")
 
            with gr.Column(scale=1):
                selected_folder = gr.State("")
@@ -935,20 +979,27 @@ def build_app():
                    info="Multiplier for Calculated tokens (tiktoken → native)",
                    visible=False,
                )
+                use_cache = gr.Checkbox(
+                    label="Use Cache",
+                    value=True,
+                    info="If disabled, all tokens are Uncached Input or Completion",
+                    visible=False,
+                )
 
-        def update_overhead_visibility(source):
-            return gr.update(visible=(source == "Calculated"))
+        def update_calculated_options_visibility(source):
+            is_calc = source == "Calculated"
+            return gr.update(visible=is_calc), gr.update(visible=is_calc)
 
        token_source.change(
-            fn=update_overhead_visibility,
+            fn=update_calculated_options_visibility,
            inputs=[token_source],
-            outputs=[thinking_overhead],
+            outputs=[thinking_overhead, use_cache],
        )
 
        leaderboard_table.select(
            fn=on_row_select,
            inputs=[leaderboard_table],
-            outputs=[selected_folder, selected_name, download_btn, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model],
+            outputs=[selected_folder, selected_name, download_btn, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
        )
 
        download_btn.click(
@@ -957,7 +1008,7 @@ def build_app():
            outputs=[download_status, analyze_btn],
        )
 
-        def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead):
+        def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
            empty_result = (
                gr.update(visible=False),
                None, None, None, None, None, None,
@@ -974,11 +1025,19 @@ def build_app():
                None,
            )
 
+            df_meta = load_all_trajectories(folder)
+            df_calc = load_all_trajectories_calculated(folder)
+            df_calc["api_calls"] = df_meta["api_calls"].values
+            df_calc["instance_cost"] = df_meta["instance_cost"].values
+
+            state_data = {"meta": df_meta, "calculated": df_calc}
+
            if source == "Metadata":
-                df = load_all_trajectories(folder)
+                df = df_meta
            else:
-                df = load_all_trajectories_calculated(folder)
-                df = apply_thinking_overhead(df, overhead)
+                df = apply_thinking_overhead(df_calc.copy(), overhead)
+                if not with_cache:
+                    df = apply_no_cache(df)
 
            if df.empty:
                yield empty_result
@@ -992,12 +1051,12 @@ def build_app():
            yield (
                gr.update(visible=True),
                fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown,
-                df,
+                state_data,
            )
 
        analyze_btn.click(
            fn=load_and_analyze,
-            inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead],
+            inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache],
            outputs=[
                analysis_section,
                plot_steps, plot_cost, plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown,
@@ -1005,14 +1064,25 @@ def build_app():
            ],
        )
 
-        def recalculate_costs(df, input_price, cache_read_price, cache_creation_price, completion_price):
-            if df is None or (isinstance(df, pd.DataFrame) and df.empty):
+        def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
+            if state_data is None:
                return None, None
+
+            if source == "Metadata":
+                df = state_data["meta"]
+            else:
+                df = apply_thinking_overhead(state_data["calculated"].copy(), overhead)
+                if not with_cache:
+                    df = apply_no_cache(df)
+
+            if df.empty:
+                return None, None
+
            fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)
            fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price)
            return fig_tokens_cost, fig_cost_breakdown
 
-        price_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion]
+        price_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache]
        price_outputs = [plot_tokens_cost, plot_cost_breakdown]
 
        price_input.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
@@ -1020,16 +1090,17 @@ def build_app():
        price_cache_creation.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
        price_completion.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
 
-        def on_source_change(folder, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead):
+        def on_source_change(state_data, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
            """Recalculate only token-dependent charts when source changes"""
-            if not folder:
+            if state_data is None:
                return None, None, None, None
 
            if source == "Metadata":
-                df = load_all_trajectories(folder)
+                df = state_data["meta"]
            else:
-                df = load_all_trajectories_calculated(folder)
-                df = apply_thinking_overhead(df, overhead)
+                df = apply_thinking_overhead(state_data["calculated"].copy(), overhead)
+                if not with_cache:
+                    df = apply_no_cache(df)
 
            if df.empty:
                return None, None, None, None
@@ -1041,16 +1112,25 @@ def build_app():
 
            return fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown
 
+        source_change_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache]
+        source_change_outputs = [plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown]
+
        token_source.change(
            fn=on_source_change,
-            inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead],
-            outputs=[plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown],
+            inputs=source_change_inputs,
+            outputs=source_change_outputs,
        )
 
        thinking_overhead.change(
            fn=on_source_change,
-            inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead],
-            outputs=[plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown],
+            inputs=source_change_inputs,
+            outputs=source_change_outputs,
+        )
+
+        use_cache.change(
+            fn=on_source_change,
+            inputs=source_change_inputs,
+            outputs=source_change_outputs,
        )
 
        return app