IgorSlinko committed on
Commit 9399ab7 · 1 Parent(s): 723fdc8

Major improvements to token calculation and UX


- Pre-calculate both Metadata and Calculated tokens on Load & Analyze
- Instant switching between token sources (no reload needed)
- Fix token calculation: only count user messages that have an assistant response (see the sketch after this list)
- Provider-specific tokenizer overhead: Claude=1.24, Gemini/OpenAI=1.0
- Case-insensitive model name matching for litellm prices
- Add '% resolved' column, remove 'os_system' from leaderboard table
- Rename 'Instance' to 'Trajectory' throughout UI
- Sort bottom charts by total tokens (not cache_read)
- Make last two charts side-by-side
- Add 'Use Cache' checkbox for Calculated mode
- Fix division by zero when total_count=0
- Copy api_calls and instance_cost from metadata to calculated data
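
For orientation, here is a minimal sketch of the per-turn accounting that the token-calculation fix targets. It mirrors the loop visible in the app.py hunks below, but the example message_tokens input and the lines marked "assumed" (completion and cache-read updates) are illustrative assumptions, not the exact implementation.

# Sketch, not the real app.py: only user turns that are followed by an
# assistant response contribute to the billed totals; the removed
# `elif is_last` branch used to also bill a trailing turn with no reply.
message_tokens = [  # example input: per-message token counts by role
    {"role": "user", "tokens": 120},
    {"role": "assistant", "tokens": 30},
    {"role": "user", "tokens": 15},  # trailing turn with no reply: now ignored
]

prompt_tokens = completion_tokens = 0
cache_read_tokens = cache_creation_tokens = 0
context_so_far = cached_context = 0

for i, mt in enumerate(message_tokens):
    context_so_far += mt["tokens"]
    next_is_assistant = (i + 1 < len(message_tokens)
                         and message_tokens[i + 1]["role"] == "assistant")
    if not next_is_assistant:
        continue  # skip turns with no assistant reply

    prompt_tokens += context_so_far
    cache_read_tokens += cached_context                # assumed detail
    assistant_tokens = message_tokens[i + 1]["tokens"]
    completion_tokens += assistant_tokens              # assumed detail
    cache_creation_tokens += (context_so_far - cached_context) + assistant_tokens
    cached_context = context_so_far + assistant_tokens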

Files changed (1)
  1. app.py +177 -97
app.py CHANGED
@@ -28,22 +28,33 @@ _trajectories_cache = {}
 _calculated_tokens_cache = {}
 
 
+def get_default_overhead(model_name: str) -> float:
+    """Get default tokenizer overhead for model provider"""
+    model_lower = model_name.lower() if model_name else ""
+
+    if "claude" in model_lower or "anthropic" in model_lower:
+        return 1.24
+    elif "gemini" in model_lower or "google" in model_lower:
+        return 1.0
+    elif "gpt" in model_lower or "openai" in model_lower or "o1" in model_lower or "o3" in model_lower:
+        return 1.0
+    else:
+        return 1.0
+
+
 def get_tokenizer(model_name: str):
     """Get appropriate tokenizer for model. Returns (tokenizer_func, name)"""
     global _tokenizer_cache
-
+
     model_lower = model_name.lower() if model_name else ""
-
-    # Determine tokenizer type
+
     if "gpt-4o" in model_lower or "o1" in model_lower or "o3" in model_lower:
         tokenizer_name = "o200k_base"
     elif "gpt" in model_lower or "claude" in model_lower or "anthropic" in model_lower:
         tokenizer_name = "cl100k_base"
     elif "gemini" in model_lower or "google" in model_lower:
-        # Gemini uses ~3.23 chars per token (calculated from actual API responses)
         return lambda text: int(len(text) / 3.23), "gemini_approx"
     else:
-        # Default to cl100k_base for unknown models
         tokenizer_name = "cl100k_base"
 
     if tokenizer_name not in _tokenizer_cache:
@@ -75,8 +86,7 @@ def calculate_tokens_from_trajectory(traj_path: Path, model_name: str) -> dict:
         return {"prompt_tokens": 0, "completion_tokens": 0, "cache_read_tokens": 0, "cache_creation_tokens": 0, "api_calls": 0}
 
     count_tokens, _ = get_tokenizer(model_name)
-
-    # Calculate tokens for each message
+
     message_tokens = []
     for msg in messages:
         content = msg.get("content", "")
@@ -108,7 +118,6 @@ def calculate_tokens_from_trajectory(traj_path: Path, model_name: str) -> dict:
         context_so_far += mt["tokens"]
 
         next_is_assistant = (i + 1 < len(message_tokens) and message_tokens[i + 1]["role"] == "assistant")
-        is_last = (i == len(message_tokens) - 1)
 
         if next_is_assistant:
             prompt_tokens += context_so_far
@@ -117,10 +126,6 @@ def calculate_tokens_from_trajectory(traj_path: Path, model_name: str) -> dict:
             assistant_tokens = message_tokens[i + 1]["tokens"]
             cache_creation_tokens += (context_so_far - cached_context) + assistant_tokens
             cached_context = context_so_far + assistant_tokens
-        elif is_last:
-            prompt_tokens += context_so_far
-            cache_read_tokens += cached_context
-            cache_creation_tokens += context_so_far - cached_context
 
     return {
         "prompt_tokens": prompt_tokens,
@@ -145,6 +150,17 @@ def apply_thinking_overhead(df: pd.DataFrame, overhead: float) -> pd.DataFrame:
     return df
 
 
+def apply_no_cache(df: pd.DataFrame) -> pd.DataFrame:
+    """Convert all tokens to uncached input + completion (no caching)"""
+    if df.empty:
+        return df
+
+    df = df.copy()
+    df["cache_read_tokens"] = 0
+    df["cache_creation_tokens"] = 0
+    return df
+
+
 def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
     """Load trajectories with self-calculated token counts"""
     global _calculated_tokens_cache
@@ -220,6 +236,11 @@ def get_litellm_prices() -> dict:
     return _litellm_prices_cache
 
 
+def normalize_model_name(name: str) -> str:
+    """Normalize model name for comparison: lowercase, remove separators"""
+    return re.sub(r'[-_./]', '', name.lower())
+
+
 def get_model_prices(model_name: str) -> dict | None:
     if not model_name:
         return None
@@ -227,8 +248,7 @@ def get_model_prices(model_name: str) -> dict | None:
     prices = get_litellm_prices()
 
     clean_name = model_name.replace("anthropic/", "").replace("openai/", "")
-
-    # Try without date suffix (e.g., gemini-3-pro-preview-20251118 -> gemini-3-pro-preview)
+
     name_without_date = re.sub(r'-\d{8}$', '', clean_name)
 
     candidates = [
@@ -245,9 +265,16 @@ def get_model_prices(model_name: str) -> dict | None:
         if key in prices:
             return prices[key]
 
-    # Fuzzy match
+    normalized_name = normalize_model_name(clean_name)
+    normalized_no_date = normalize_model_name(name_without_date)
+
     for key, value in prices.items():
-        if clean_name in key or model_name in key or name_without_date in key:
+        key_normalized = normalize_model_name(key)
+        if normalized_name in key_normalized or normalized_no_date in key_normalized:
+            return value
+        key_last_part = key.split('/')[-1] if '/' in key else key
+        key_last_normalized = normalize_model_name(key_last_part)
+        if normalized_name == key_last_normalized or normalized_no_date == key_last_normalized:
             return value
 
     return None
@@ -274,15 +301,21 @@ def get_bash_only_df():
 
     rows = []
     for r in bash_only["results"]:
+        resolved_pct = r.get("resolved", 0)
+        if isinstance(resolved_pct, (int, float)):
+            resolved_str = f"{resolved_pct:.1f}%"
+        else:
+            resolved_str = str(resolved_pct)
+
         rows.append({
             "name": r.get("name", ""),
+            "% resolved": resolved_str,
            "date": r.get("date", ""),
            "cost": round(r.get("cost", 0), 2),
            "instance_cost": round(r.get("instance_cost", 0), 4),
            "instance_calls": r.get("instance_calls", 0),
            "folder": r.get("folder", ""),
            "os_model": "✅" if r.get("os_model") else "❌",
-            "os_system": "✅" if r.get("os_system") else "❌",
         })
 
     return pd.DataFrame(rows)
@@ -352,7 +385,12 @@ def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
         resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
         total_count = len(per_instance)
 
-        status = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({100*resolved_count/total_count:.1f}%)"
+        if total_count > 0:
+            resolved_pct = f"{100*resolved_count/total_count:.1f}%"
+        else:
+            resolved_pct = "N/A"
+
+        status = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({resolved_pct})"
         return status, gr.update(visible=True)
 
     except subprocess.TimeoutExpired:
@@ -520,32 +558,34 @@ def create_token_charts(df: pd.DataFrame, input_price: float, cache_read_price:
 
     fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)
 
-    # Stacked bar chart
-    df_sorted = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
-    df_sorted["instance_idx"] = range(len(df_sorted))
+    # Stacked bar chart - sort by total tokens (sum of all stacked)
+    df_sorted = df.copy()
     df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
+    df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"]
+    df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
+    df_sorted["trajectory_idx"] = range(len(df_sorted))
 
     fig_stacked = go.Figure()
     fig_stacked.add_trace(go.Bar(
-        name="Uncached Input", x=df_sorted["instance_idx"], y=df_sorted["uncached_input_tokens"],
-        marker_color="#EF553B", hovertemplate="Instance: %{x}<br>Uncached Input: %{y:,.0f}<extra></extra>",
+        name="Uncached Input", x=df_sorted["trajectory_idx"], y=df_sorted["uncached_input_tokens"],
+        marker_color="#EF553B", hovertemplate="Trajectory: %{x}<br>Uncached Input: %{y:,.0f}<extra></extra>",
     ))
     fig_stacked.add_trace(go.Bar(
-        name="Cache Read", x=df_sorted["instance_idx"], y=df_sorted["cache_read_tokens"],
-        marker_color="#19D3F3", hovertemplate="Instance: %{x}<br>Cache Read: %{y:,.0f}<extra></extra>",
+        name="Cache Read", x=df_sorted["trajectory_idx"], y=df_sorted["cache_read_tokens"],
+        marker_color="#19D3F3", hovertemplate="Trajectory: %{x}<br>Cache Read: %{y:,.0f}<extra></extra>",
     ))
     fig_stacked.add_trace(go.Bar(
-        name="Cache Creation", x=df_sorted["instance_idx"], y=df_sorted["cache_creation_tokens"],
-        marker_color="#FFA15A", hovertemplate="Instance: %{x}<br>Cache Creation: %{y:,.0f}<extra></extra>",
+        name="Cache Creation", x=df_sorted["trajectory_idx"], y=df_sorted["cache_creation_tokens"],
+        marker_color="#FFA15A", hovertemplate="Trajectory: %{x}<br>Cache Creation: %{y:,.0f}<extra></extra>",
     ))
     fig_stacked.add_trace(go.Bar(
-        name="Completion", x=df_sorted["instance_idx"], y=df_sorted["completion_tokens"],
-        marker_color="#AB63FA", hovertemplate="Instance: %{x}<br>Completion: %{y:,.0f}<extra></extra>",
+        name="Completion", x=df_sorted["trajectory_idx"], y=df_sorted["completion_tokens"],
+        marker_color="#AB63FA", hovertemplate="Trajectory: %{x}<br>Completion: %{y:,.0f}<extra></extra>",
     ))
     fig_stacked.update_layout(
         barmode="stack",
-        title="Billable Tokens per Instance (stacked)",
-        xaxis_title="Instance (sorted by cache read)",
+        title="Tokens per Trajectory (stacked)",
+        xaxis_title="Trajectory (sorted by total tokens)",
         yaxis_title="Tokens",
         legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
         margin=dict(l=50, r=20, t=60, b=40),
@@ -562,12 +602,12 @@ def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_pri
         df,
         x="api_calls",
         nbins=30,
-        title="Distribution of API Calls (Steps) per Instance",
+        title="Distribution of API Calls (Steps) per Trajectory",
         color_discrete_sequence=["#636EFA"],
     )
     fig_steps.update_layout(
         xaxis_title="API Calls (Steps)",
-        yaxis_title="Number of Instances",
+        yaxis_title="Number of Trajectories",
         showlegend=False,
         margin=dict(l=40, r=20, t=40, b=40),
     )
@@ -587,7 +627,7 @@ def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_pri
     )
     fig_cost.update_layout(
         xaxis_title="Cost ($)",
-        yaxis_title="Number of Instances",
+        yaxis_title="Number of Trajectories",
         showlegend=False,
         margin=dict(l=40, r=20, t=40, b=40),
     )
@@ -601,7 +641,7 @@ def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_pri
     total_completion = df["completion_tokens"].sum()
     total_cache_read = df["cache_read_tokens"].sum()
     total_cache_creation = df["cache_creation_tokens"].sum()
-    # Uncached input = prompt - cache_read - cache_creation (per instance, then sum)
+    # Uncached input = prompt - cache_read - cache_creation (per trajectory, then sum)
     df_temp = df.copy()
     df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
     total_uncached_input = df_temp["uncached_input"].sum()
@@ -637,49 +677,51 @@ def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_pri
     # Cost by token type (use separate function)
     fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)
 
-    df_sorted = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
-    df_sorted["instance_idx"] = range(len(df_sorted))
-    # Uncached input = prompt - cache_read - cache_creation
+    # Sort by total tokens (sum of all stacked)
+    df_sorted = df.copy()
    df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
+    df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"]
+    df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
+    df_sorted["trajectory_idx"] = range(len(df_sorted))
 
     fig_stacked = go.Figure()
 
     fig_stacked.add_trace(go.Bar(
         name="Uncached Input",
-        x=df_sorted["instance_idx"],
+        x=df_sorted["trajectory_idx"],
         y=df_sorted["uncached_input_tokens"],
         marker_color="#EF553B",
-        hovertemplate="Instance: %{x}<br>Uncached Input: %{y:,.0f}<extra></extra>",
+        hovertemplate="Trajectory: %{x}<br>Uncached Input: %{y:,.0f}<extra></extra>",
     ))
 
     fig_stacked.add_trace(go.Bar(
         name="Cache Read",
-        x=df_sorted["instance_idx"],
+        x=df_sorted["trajectory_idx"],
         y=df_sorted["cache_read_tokens"],
         marker_color="#19D3F3",
-        hovertemplate="Instance: %{x}<br>Cache Read: %{y:,.0f}<extra></extra>",
+        hovertemplate="Trajectory: %{x}<br>Cache Read: %{y:,.0f}<extra></extra>",
     ))
 
     fig_stacked.add_trace(go.Bar(
         name="Cache Creation",
-        x=df_sorted["instance_idx"],
+        x=df_sorted["trajectory_idx"],
         y=df_sorted["cache_creation_tokens"],
         marker_color="#FFA15A",
-        hovertemplate="Instance: %{x}<br>Cache Creation: %{y:,.0f}<extra></extra>",
+        hovertemplate="Trajectory: %{x}<br>Cache Creation: %{y:,.0f}<extra></extra>",
     ))
 
     fig_stacked.add_trace(go.Bar(
         name="Completion",
-        x=df_sorted["instance_idx"],
+        x=df_sorted["trajectory_idx"],
         y=df_sorted["completion_tokens"],
         marker_color="#AB63FA",
-        hovertemplate="Instance: %{x}<br>Completion: %{y:,.0f}<extra></extra>",
+        hovertemplate="Trajectory: %{x}<br>Completion: %{y:,.0f}<extra></extra>",
     ))
 
     fig_stacked.update_layout(
         barmode="stack",
-        title="Billable Tokens per Instance (stacked)",
-        xaxis_title="Instance (sorted by cache read)",
+        title="Tokens per Trajectory (stacked)",
+        xaxis_title="Trajectory (sorted by total tokens)",
         yaxis_title="Tokens",
         legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
         margin=dict(l=50, r=20, t=60, b=40),
@@ -692,11 +734,12 @@ def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price
     if df.empty:
         return None
 
-    df_sorted = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
-    df_sorted["instance_idx"] = range(len(df_sorted))
-
-    # Uncached input = prompt - cache_read - cache_creation
+    # Sort by total tokens (sum of all stacked)
+    df_sorted = df.copy()
     df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
+    df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"]
+    df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
+    df_sorted["trajectory_idx"] = range(len(df_sorted))
 
     df_sorted["cost_uncached_input"] = df_sorted["uncached_input_tokens"] * input_price / 1e6
     df_sorted["cost_cache_read"] = df_sorted["cache_read_tokens"] * cache_read_price / 1e6
@@ -707,34 +750,34 @@ def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price
 
     fig.add_trace(go.Bar(
         name=f"Uncached Input (${input_price:.2f}/1M)",
-        x=df_sorted["instance_idx"],
+        x=df_sorted["trajectory_idx"],
         y=df_sorted["cost_uncached_input"],
         marker_color="#EF553B",
-        hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
+        hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
     ))
 
     fig.add_trace(go.Bar(
         name=f"Cache Read (${cache_read_price:.2f}/1M)",
-        x=df_sorted["instance_idx"],
+        x=df_sorted["trajectory_idx"],
         y=df_sorted["cost_cache_read"],
         marker_color="#19D3F3",
-        hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
+        hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
     ))
 
     fig.add_trace(go.Bar(
         name=f"Cache Creation (${cache_creation_price:.2f}/1M)",
-        x=df_sorted["instance_idx"],
+        x=df_sorted["trajectory_idx"],
         y=df_sorted["cost_cache_creation"],
         marker_color="#FFA15A",
-        hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
+        hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
     ))
 
     fig.add_trace(go.Bar(
         name=f"Completion (${completion_price:.2f}/1M)",
-        x=df_sorted["instance_idx"],
+        x=df_sorted["trajectory_idx"],
         y=df_sorted["cost_completion"],
         marker_color="#AB63FA",
-        hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
+        hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
     ))
 
     total_cost = (
@@ -746,8 +789,8 @@ def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price
 
     fig.update_layout(
         barmode="stack",
-        title="Cost Breakdown per Instance",
-        xaxis_title="Instance (sorted by cache read)",
+        title="Cost per Trajectory",
+        xaxis_title="Trajectory (sorted by total tokens)",
         yaxis_title="Cost ($)",
         legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
         margin=dict(l=50, r=20, t=60, b=40),
@@ -829,14 +872,15 @@ def get_prices_for_folder(folder: str) -> tuple[dict, str]:
 def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
     if evt.index is None:
         return (
-            "", "",
-            gr.update(interactive=False),
-            gr.update(visible=False),
+            "", "",
+            gr.update(interactive=False),
+            gr.update(visible=False),
             gr.update(value=0, label="💲 Input"),
             gr.update(value=0, label="💲 Cache Read"),
             gr.update(value=0, label="💲 Cache Creation"),
             gr.update(value=0, label="💲 Completion"),
-            ""
+            "",
+            gr.update(value=1.0),
        )
 
     row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
@@ -847,6 +891,7 @@ def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
     show_analyze = check_trajectories_downloaded(folder)
 
     prices_dict, model_hint = get_prices_for_folder(folder)
+    default_overhead = get_default_overhead(model_hint)
 
     def price_update(price_info, name):
         value = price_info["value"]
@@ -858,14 +903,15 @@ def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
            return gr.update(value=0, label=f"❌ {name}")
 
     return (
-        folder, name,
-        gr.update(interactive=True),
-        gr.update(visible=show_analyze),
+        folder, name,
+        gr.update(interactive=True),
+        gr.update(visible=show_analyze),
         price_update(prices_dict["input"], "Input"),
         price_update(prices_dict["cache_read"], "Cache Read"),
         price_update(prices_dict["cache_creation"], "Cache Creation"),
         price_update(prices_dict["completion"], "Completion"),
-        model_hint
+        model_hint,
+        gr.update(value=default_overhead),
    )
 
 
@@ -899,10 +945,8 @@ def build_app():
                    plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)")
 
                with gr.Row():
-                    plot_stacked = gr.Plot(label="Billable Tokens per Instance")
-
-                with gr.Row():
-                    plot_cost_breakdown = gr.Plot(label="Cost Breakdown per Instance ($)")
+                    plot_stacked = gr.Plot(label="Tokens per Trajectory")
+                    plot_cost_breakdown = gr.Plot(label="Cost per Trajectory ($)")
 
            with gr.Column(scale=1):
                selected_folder = gr.State("")
@@ -935,20 +979,27 @@ def build_app():
                    info="Multiplier for Calculated tokens (tiktoken → native)",
                    visible=False,
                )
+                use_cache = gr.Checkbox(
+                    label="Use Cache",
+                    value=True,
+                    info="If disabled, all tokens are Uncached Input or Completion",
+                    visible=False,
+                )
 
-        def update_overhead_visibility(source):
-            return gr.update(visible=(source == "Calculated"))
+        def update_calculated_options_visibility(source):
+            is_calc = source == "Calculated"
+            return gr.update(visible=is_calc), gr.update(visible=is_calc)
 
        token_source.change(
-            fn=update_overhead_visibility,
+            fn=update_calculated_options_visibility,
            inputs=[token_source],
-            outputs=[thinking_overhead],
+            outputs=[thinking_overhead, use_cache],
        )
 
        leaderboard_table.select(
            fn=on_row_select,
            inputs=[leaderboard_table],
-            outputs=[selected_folder, selected_name, download_btn, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model],
+            outputs=[selected_folder, selected_name, download_btn, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
        )
 
        download_btn.click(
@@ -957,7 +1008,7 @@ def build_app():
            outputs=[download_status, analyze_btn],
        )
 
-        def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead):
+        def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
            empty_result = (
                gr.update(visible=False),
                None, None, None, None, None, None,
@@ -974,11 +1025,19 @@ def build_app():
                None,
            )
 
+            df_meta = load_all_trajectories(folder)
+            df_calc = load_all_trajectories_calculated(folder)
+            df_calc["api_calls"] = df_meta["api_calls"].values
+            df_calc["instance_cost"] = df_meta["instance_cost"].values
+
+            state_data = {"meta": df_meta, "calculated": df_calc}
+
            if source == "Metadata":
-                df = load_all_trajectories(folder)
+                df = df_meta
            else:
-                df = load_all_trajectories_calculated(folder)
-                df = apply_thinking_overhead(df, overhead)
+                df = apply_thinking_overhead(df_calc.copy(), overhead)
+                if not with_cache:
+                    df = apply_no_cache(df)
 
            if df.empty:
                yield empty_result
@@ -992,12 +1051,12 @@ def build_app():
            yield (
                gr.update(visible=True),
                fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown,
-                df,
+                state_data,
            )
 
        analyze_btn.click(
            fn=load_and_analyze,
-            inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead],
+            inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache],
            outputs=[
                analysis_section,
                plot_steps, plot_cost, plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown,
@@ -1005,14 +1064,25 @@ def build_app():
            ],
        )
 
-        def recalculate_costs(df, input_price, cache_read_price, cache_creation_price, completion_price):
-            if df is None or (isinstance(df, pd.DataFrame) and df.empty):
+        def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
+            if state_data is None:
                return None, None
+
+            if source == "Metadata":
+                df = state_data["meta"]
+            else:
+                df = apply_thinking_overhead(state_data["calculated"].copy(), overhead)
+                if not with_cache:
+                    df = apply_no_cache(df)
+
+            if df.empty:
+                return None, None
+
            fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)
            fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price)
            return fig_tokens_cost, fig_cost_breakdown
 
-        price_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion]
+        price_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache]
        price_outputs = [plot_tokens_cost, plot_cost_breakdown]
 
        price_input.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
@@ -1020,16 +1090,17 @@ def build_app():
        price_cache_creation.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
        price_completion.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
 
-        def on_source_change(folder, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead):
+        def on_source_change(state_data, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
            """Recalculate only token-dependent charts when source changes"""
-            if not folder:
+            if state_data is None:
                return None, None, None, None
 
            if source == "Metadata":
-                df = load_all_trajectories(folder)
+                df = state_data["meta"]
            else:
-                df = load_all_trajectories_calculated(folder)
-                df = apply_thinking_overhead(df, overhead)
+                df = apply_thinking_overhead(state_data["calculated"].copy(), overhead)
+                if not with_cache:
+                    df = apply_no_cache(df)
 
            if df.empty:
                return None, None, None, None
@@ -1041,16 +1112,25 @@ def build_app():
 
            return fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown
 
+        source_change_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache]
+        source_change_outputs = [plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown]
+
        token_source.change(
            fn=on_source_change,
-            inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead],
-            outputs=[plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown],
+            inputs=source_change_inputs,
+            outputs=source_change_outputs,
        )
 
        thinking_overhead.change(
            fn=on_source_change,
-            inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead],
-            outputs=[plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown],
+            inputs=source_change_inputs,
+            outputs=source_change_outputs,
+        )
+
+        use_cache.change(
+            fn=on_source_change,
+            inputs=source_change_inputs,
+            outputs=source_change_outputs,
        )
 
        return app