IgorSlinko committed on
Commit
5c06e74
Β·
1 Parent(s): 99badd3

Add routing calculation with proper caching simulation

Browse files

- Add 'Let's ROUTE!!' button with yield for staged rendering
- Add routing token/cost charts grouped by model
- Fix original cost calculation (use uncached_input, not prompt_tokens)
- Support multiple additional models with different colors
- Rename 'routing model' to 'additional model' in charts
- Each model maintains independent cache context
- When switching models, cache is preserved (not reset)
- Proper calculation: uncached_input includes obs from prev step

Files changed (1) hide show
  1. app.py +383 -0
app.py CHANGED
@@ -43,6 +43,114 @@ def parse_step_or_ratio(value: float, total_steps: int) -> int:
43
  return int(value * total_steps)
44
 
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def get_default_overhead(model_name: str) -> float:
47
  """Get default tokenizer overhead for model provider"""
48
  model_lower = model_name.lower() if model_name else ""
@@ -947,6 +1055,92 @@ def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
947
  )
948
 
949
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
950
  def build_app():
951
  leaderboard_df = get_bash_only_df()
952
 
@@ -976,6 +1170,10 @@ def build_app():
976
  plot_tokens = gr.Plot(label="Token Usage by Type")
977
  plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)")
978
 
 
 
 
 
979
  with gr.Row():
980
  plot_stacked = gr.Plot(label="Tokens per Trajectory")
981
  plot_cost_breakdown = gr.Plot(label="Cost per Trajectory ($)")
@@ -1117,6 +1315,11 @@ def build_app():
1117
  start_step_3 = gr.Number(label="Start (int=step; 0,0-1,0=ratio)", value=0, minimum=0, precision=2, interactive=True)
1118
  end_step_3 = gr.Number(label="End (int=step; 0,0-1,0=ratio)", value=0.5, minimum=0, precision=2, interactive=True)
1119
 
 
 
 
 
 
1120
  def on_strategy_change(strategy):
1121
  return (
1122
  gr.update(visible=strategy == "Replace on random steps"),
@@ -1242,6 +1445,186 @@ def build_app():
1242
  outputs=[routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion],
1243
  )
1244
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1245
  def update_calculated_options_visibility(source):
1246
  is_calc = source == "Calculated"
1247
  return gr.update(visible=is_calc), gr.update(visible=is_calc)
 
43
  return int(value * total_steps)
44
 
45
 
46
def get_routed_steps(total_steps: int, strategy: str, params: dict) -> set:
    """Pick which steps go to the alternative model.

    Returns a set of 0-based step indices that should use the routing
    model; an unknown strategy routes nothing.
    """
    import random

    if strategy == "Replace on random steps":
        fraction = params.get("percentage", 50) / 100.0
        sample_size = int(total_steps * fraction)
        if sample_size <= 0:
            return set()
        return set(random.sample(range(total_steps), min(sample_size, total_steps)))

    if strategy == "Replace every step k":
        stride = int(params.get("k", 2))
        return set(range(0, total_steps, stride)) if stride > 0 else set()

    if strategy == "Replace part of trajectory":
        # Bounds may be given as absolute steps or as 0..1 ratios.
        first = parse_step_or_ratio(params.get("start", 0), total_steps)
        last = parse_step_or_ratio(params.get("end", 0.5), total_steps)
        return set(range(first, min(last, total_steps)))

    return set()
73
+
74
+
75
def calculate_routed_cost(
    trajectory_tokens: dict,
    routed_steps: set,
    base_prices: dict,
    routing_prices: dict,
) -> dict:
    """
    Calculate cost for a trajectory with routing.

    Each model maintains its own independent cache.
    When switching back to a model, its cache is still available.

    Token counts are spread evenly across steps (per-step averages). The
    first step a given model serves pays the full prompt at the uncached
    input rate (cold cache); later steps pay the cached/uncached split.

    Args:
        trajectory_tokens: dict with per-step token counts
            (api_calls, prompt_tokens, completion_tokens,
            cache_read_tokens, cache_creation_tokens)
        routed_steps: set of step indices using routing model
        base_prices: {input, cache_read, cache_creation, completion}
            for base model, in $ per 1M tokens
        routing_prices: same for routing model

    Returns:
        dict with base_cost, routing_cost, total_cost (in $)
    """
    total_steps = trajectory_tokens.get("api_calls", 0)
    if total_steps == 0:
        return {"base_cost": 0, "routing_cost": 0, "total_cost": 0}

    prompt_tokens = trajectory_tokens.get("prompt_tokens", 0)
    completion_tokens = trajectory_tokens.get("completion_tokens", 0)
    cache_read = trajectory_tokens.get("cache_read_tokens", 0)
    cache_creation = trajectory_tokens.get("cache_creation_tokens", 0)

    avg_prompt_per_step = prompt_tokens / total_steps
    avg_completion_per_step = completion_tokens / total_steps
    avg_cache_read_per_step = cache_read / total_steps
    avg_cache_creation_per_step = cache_creation / total_steps

    # Loop-invariant: uncached portion once a model's cache is warm.
    # Clamped at 0 because cache_read can exceed prompt_tokens in some
    # trajectory metadata, which would otherwise yield a NEGATIVE per-step
    # cost (same clamp run_routing applies to its aggregate tokens).
    warm_uncached = max(avg_prompt_per_step - avg_cache_read_per_step, 0)

    base_cost = 0.0
    routing_cost = 0.0

    # Simulated cache context per model: 0 means that model has not served
    # a step yet, so its cache is cold.
    base_cache_context = 0
    routing_cache_context = 0

    for step in range(total_steps):
        is_routed = step in routed_steps
        prices = routing_prices if is_routed else base_prices
        cache_ctx = routing_cache_context if is_routed else base_cache_context

        if cache_ctx == 0:
            # Cold cache: the whole prompt is billed at the input rate.
            step_cache_read = 0
            step_uncached = avg_prompt_per_step
        else:
            step_cache_read = avg_cache_read_per_step
            step_uncached = warm_uncached

        # Prices are $ per 1M tokens, hence the 1e6 divisor.
        step_cost = (
            step_uncached * prices["input"] / 1e6 +
            step_cache_read * prices["cache_read"] / 1e6 +
            avg_cache_creation_per_step * prices["cache_creation"] / 1e6 +
            avg_completion_per_step * prices["completion"] / 1e6
        )

        if is_routed:
            routing_cost += step_cost
            routing_cache_context += avg_prompt_per_step + avg_completion_per_step
        else:
            base_cost += step_cost
            base_cache_context += avg_prompt_per_step + avg_completion_per_step

    return {
        "base_cost": base_cost,
        "routing_cost": routing_cost,
        "total_cost": base_cost + routing_cost,
    }
152
+
153
+
154
  def get_default_overhead(model_name: str) -> float:
155
  """Get default tokenizer overhead for model provider"""
156
  model_lower = model_name.lower() if model_name else ""
 
1055
  )
1056
 
1057
 
1058
def create_routed_token_chart(base_tokens: dict, additional_models: list):
    """
    Build a grouped bar chart of token counts (in millions) per token type,
    comparing the base model against each additional model.

    Args:
        base_tokens: dict with uncached_input, cache_read, cache_creation, completion
        additional_models: list of (model_name, tokens_dict) tuples
    """
    import plotly.graph_objects as go

    categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
    # Dict keys, in the same order as the category labels above.
    keys = ("uncached_input", "cache_read", "cache_creation", "completion")
    palette = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        name="Base Model",
        x=categories,
        y=[base_tokens.get(key, 0) / 1e6 for key in keys],
        marker_color=palette[0],
    ))

    for idx, (model_name, tokens) in enumerate(additional_models):
        # Cycle through the palette, skipping slot 0 (the base model color).
        fig.add_trace(go.Bar(
            name=model_name or f"Model {idx+1}",
            x=categories,
            y=[tokens.get(key, 0) / 1e6 for key in keys],
            marker_color=palette[(idx + 1) % len(palette)],
        ))

    fig.update_layout(
        title="Tokens by Type (per Model)",
        yaxis_title="Tokens (M)",
        barmode="group",
        margin=dict(l=40, r=40, t=60, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )
    return fig
1099
+
1100
+
1101
def create_routed_cost_chart(base_costs: dict, additional_models: list):
    """
    Build a grouped bar chart of dollar cost per token type, comparing the
    base model against each additional model.

    Args:
        base_costs: dict with uncached_input, cache_read, cache_creation, completion
        additional_models: list of (model_name, costs_dict) tuples
    """
    import plotly.graph_objects as go

    categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
    # Dict keys, in the same order as the category labels above.
    keys = ("uncached_input", "cache_read", "cache_creation", "completion")
    palette = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        name="Base Model",
        x=categories,
        y=[base_costs.get(key, 0) for key in keys],
        marker_color=palette[0],
    ))

    for idx, (model_name, costs) in enumerate(additional_models):
        # Cycle through the palette, skipping slot 0 (the base model color).
        fig.add_trace(go.Bar(
            name=model_name or f"Model {idx+1}",
            x=categories,
            y=[costs.get(key, 0) for key in keys],
            marker_color=palette[(idx + 1) % len(palette)],
        ))

    fig.update_layout(
        title="Cost by Type (per Model) ($)",
        yaxis_title="Cost ($)",
        barmode="group",
        margin=dict(l=40, r=40, t=60, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )
    return fig
1142
+
1143
+
1144
  def build_app():
1145
  leaderboard_df = get_bash_only_df()
1146
 
 
1170
  plot_tokens = gr.Plot(label="Token Usage by Type")
1171
  plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)")
1172
 
1173
+ with gr.Row(visible=False) as routing_plots_row:
1174
+ routing_tokens_plot = gr.Plot(label="Tokens by Type (per Model)")
1175
+ routing_cost_plot = gr.Plot(label="Cost by Type (per Model)")
1176
+
1177
  with gr.Row():
1178
  plot_stacked = gr.Plot(label="Tokens per Trajectory")
1179
  plot_cost_breakdown = gr.Plot(label="Cost per Trajectory ($)")
 
1315
  start_step_3 = gr.Number(label="Start (int=step; 0,0-1,0=ratio)", value=0, minimum=0, precision=2, interactive=True)
1316
  end_step_3 = gr.Number(label="End (int=step; 0,0-1,0=ratio)", value=0.5, minimum=0, precision=2, interactive=True)
1317
 
1318
+ gr.Markdown("---")
1319
+ route_btn = gr.Button("πŸš€ Let's ROUTE!!", variant="primary", size="lg")
1320
+ routing_result = gr.Markdown(visible=False)
1321
+
1322
+
1323
  def on_strategy_change(strategy):
1324
  return (
1325
  gr.update(visible=strategy == "Replace on random steps"),
 
1445
  outputs=[routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion],
1446
  )
1447
 
1448
def run_routing(
    state_data,
    base_input, base_cache_read, base_cache_creation, base_completion,
    routing_model_1_val, r1_input, r1_cache_read, r1_cache_creation, r1_completion,
    strategy_1_val, random_pct_1_val, step_k_1_val, start_1_val, end_1_val,
    source, overhead, with_cache
):
    """
    Simulate routing part of each trajectory to an alternative model and
    compare the result against running everything on the base model.

    Generator wired to the ROUTE button. It yields staged UI updates:
    validation errors first, then a "creating charts" placeholder, then
    the final results. Every yield is a 4-tuple matching the click
    handler's outputs: (routing_result markdown, routing_plots_row
    visibility, routing_tokens_plot, routing_cost_plot).

    Prices are in $ per 1M tokens (hence the 1e6 divisors below).
    """
    # --- validation: each failure path shows an error and stops early ---
    if state_data is None:
        yield (
            gr.update(visible=True, value="❌ No trajectories loaded. Click 'Load & Analyze' first."),
            gr.update(visible=False),
            None, None,
        )
        return

    if not routing_model_1_val:
        yield (
            gr.update(visible=True, value="❌ Please select at least one routing model."),
            gr.update(visible=False),
            None, None,
        )
        return

    # Pick which per-trajectory DataFrame to use from the loaded state.
    df_key = "meta" if source == "Metadata" else "calculated"
    df = state_data.get(df_key)
    if df is None or df.empty:
        yield (
            gr.update(visible=True, value="❌ No trajectory data available."),
            gr.update(visible=False),
            None, None,
        )
        return

    # Calculated tokens get the thinking-overhead multiplier, and are
    # optionally converted to a no-cache view, before costing.
    if source == "Calculated":
        df = apply_thinking_overhead(df.copy(), overhead)
        if not with_cache:
            df = apply_no_cache(df)

    base_prices = {
        "input": base_input,
        "cache_read": base_cache_read,
        "cache_creation": base_cache_creation,
        "completion": base_completion,
    }
    routing_prices = {
        "input": r1_input,
        "cache_read": r1_cache_read,
        "cache_creation": r1_cache_creation,
        "completion": r1_completion,
    }

    # Only the parameters relevant to the chosen strategy are passed on
    # to get_routed_steps.
    strategy_params = {}
    if strategy_1_val == "Replace on random steps":
        strategy_params["percentage"] = random_pct_1_val
    elif strategy_1_val == "Replace every step k":
        strategy_params["k"] = step_k_1_val
    elif strategy_1_val == "Replace part of trajectory":
        strategy_params["start"] = start_1_val
        strategy_params["end"] = end_1_val

    # Aggregates across all trajectories.
    total_base_cost = 0
    total_routing_cost = 0
    total_original_cost = 0

    # Token and cost totals split per model, feeding the grouped charts.
    base_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}
    routing_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}
    base_costs = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}
    routing_costs = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}

    for _, row in df.iterrows():
        total_steps = int(row.get("api_calls", 0))
        if total_steps == 0:
            continue

        routed_steps = get_routed_steps(total_steps, strategy_1_val, strategy_params)
        num_base_steps = total_steps - len(routed_steps)
        num_routing_steps = len(routed_steps)

        prompt_tokens = row.get("prompt_tokens", 0)
        completion_tokens = row.get("completion_tokens", 0)
        cache_read_tokens = row.get("cache_read_tokens", 0)
        cache_creation_tokens = row.get("cache_creation_tokens", 0)
        # Uncached input excludes both cached reads and cache-creation
        # writes; clamp because metadata can make this negative.
        uncached_input_tokens = prompt_tokens - cache_read_tokens - cache_creation_tokens
        if uncached_input_tokens < 0:
            uncached_input_tokens = 0

        # Chart totals are split proportionally to the step counts each
        # model served (the exact per-step cache simulation happens in
        # calculate_routed_cost below).
        base_ratio = num_base_steps / total_steps if total_steps > 0 else 0
        routing_ratio = num_routing_steps / total_steps if total_steps > 0 else 0

        base_tokens["uncached_input"] += uncached_input_tokens * base_ratio
        base_tokens["cache_read"] += cache_read_tokens * base_ratio
        base_tokens["cache_creation"] += cache_creation_tokens * base_ratio
        base_tokens["completion"] += completion_tokens * base_ratio

        routing_tokens["uncached_input"] += uncached_input_tokens * routing_ratio
        routing_tokens["cache_read"] += cache_read_tokens * routing_ratio
        routing_tokens["cache_creation"] += cache_creation_tokens * routing_ratio
        routing_tokens["completion"] += completion_tokens * routing_ratio

        base_costs["uncached_input"] += uncached_input_tokens * base_ratio * base_prices["input"] / 1e6
        base_costs["cache_read"] += cache_read_tokens * base_ratio * base_prices["cache_read"] / 1e6
        base_costs["cache_creation"] += cache_creation_tokens * base_ratio * base_prices["cache_creation"] / 1e6
        base_costs["completion"] += completion_tokens * base_ratio * base_prices["completion"] / 1e6

        routing_costs["uncached_input"] += uncached_input_tokens * routing_ratio * routing_prices["input"] / 1e6
        routing_costs["cache_read"] += cache_read_tokens * routing_ratio * routing_prices["cache_read"] / 1e6
        routing_costs["cache_creation"] += cache_creation_tokens * routing_ratio * routing_prices["cache_creation"] / 1e6
        routing_costs["completion"] += completion_tokens * routing_ratio * routing_prices["completion"] / 1e6

        traj_tokens = {
            "api_calls": total_steps,
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "cache_read_tokens": cache_read_tokens,
            "cache_creation_tokens": cache_creation_tokens,
        }

        # Headline costs come from the per-step simulation with
        # independent caches per model.
        result = calculate_routed_cost(traj_tokens, routed_steps, base_prices, routing_prices)
        total_base_cost += result["base_cost"]
        total_routing_cost += result["routing_cost"]

        # Baseline: the whole trajectory on the base model, billed with
        # uncached_input (not raw prompt_tokens) to respect caching.
        original_cost = (
            uncached_input_tokens * base_prices["input"] / 1e6 +
            cache_read_tokens * base_prices["cache_read"] / 1e6 +
            cache_creation_tokens * base_prices["cache_creation"] / 1e6 +
            completion_tokens * base_prices["completion"] / 1e6
        )
        total_original_cost += original_cost

    total_routed_cost = total_base_cost + total_routing_cost
    savings = total_original_cost - total_routed_cost
    savings_pct = (savings / total_original_cost * 100) if total_original_cost > 0 else 0

    result_text = f"""
## 🚀 Routing Results

| Metric | Value |
|--------|-------|
| **Original Cost (base model only)** | ${total_original_cost:.2f} |
| **Routed Cost** | ${total_routed_cost:.2f} |
| ↳ Base model portion | ${total_base_cost:.2f} |
| ↳ Routing model portion | ${total_routing_cost:.2f} |
| **Savings** | ${savings:.2f} ({savings_pct:+.1f}%) |

*Strategy: {strategy_1_val}*
*Routing model: {routing_model_1_val}*
"""

    additional_token_models = [(routing_model_1_val, routing_tokens)]
    additional_cost_models = [(routing_model_1_val, routing_costs)]

    # Staged render: show a placeholder while the plotly figures build.
    yield (
        gr.update(visible=True, value="⏳ Creating charts..."),
        gr.update(visible=True),
        None,
        None,
    )

    tokens_chart = create_routed_token_chart(base_tokens, additional_token_models)
    cost_chart = create_routed_cost_chart(base_costs, additional_cost_models)

    yield (
        gr.update(visible=True, value=result_text),
        gr.update(visible=True),
        tokens_chart,
        cost_chart,
    )

# Wire the ROUTE button; input order must match run_routing's signature.
route_btn.click(
    fn=run_routing,
    inputs=[
        trajectories_state,
        price_input, price_cache_read, price_cache_creation, price_completion,
        routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion,
        strategy_1, random_pct_1, step_k_1, start_step_1, end_step_1,
        token_source, thinking_overhead, use_cache,
    ],
    outputs=[routing_result, routing_plots_row, routing_tokens_plot, routing_cost_plot],
)
1627
+
1628
def update_calculated_options_visibility(source):
    """Show the calculated-only option widgets only for the "Calculated" source."""
    show_calc = source == "Calculated"
    return gr.update(visible=show_calc), gr.update(visible=show_calc)