Spaces:

JetBrains-Research
/

SWE-bench-Costs-Calculator

Sleeping

IgorSlinko committed 3 days ago

Commit

c89587b

1 Parent(s): 33c5576

Improve routing charts with comparison view (v0.3.30)

- Add side-by-side comparison: [no routing] vs [with routing] bars
- Use hatched pattern for [no routing] bars to distinguish from routed
- Fix data source for [no routing] to match "Calculated from .traj" values
- Add base model name to legend items
- Improve legend order and styling
- Add gap between bar groups for better readability
- Keep original color palette for Metadata/Calculated charts

Files changed (1) hide show

app.py +99 -51

app.py CHANGED Viewed

@@ -742,7 +742,6 @@ def create_cost_by_type_chart(df: pd.DataFrame, input_price: float, cache_read_p
         cost_data,
         x="Token Type",
         y="Cost ($)",
-        title="",
         color="Token Type",
         color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
     )
@@ -1162,58 +1161,74 @@ def select_first_row(df: pd.DataFrame):
     return _build_selection_payload(default_idx, df)
-def create_routed_token_chart(base_tokens: dict, additional_models: list):
     """
-    Create stacked bar chart for tokens by type, comparing base vs additional models.
-    X-axis: token types, bars stacked by model.
     Args:
-        base_tokens: dict with uncached_input, cache_read, cache_creation, completion
         additional_models: list of (model_name, tokens_dict) tuples
     """
     import plotly.graph_objects as go
     categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
     token_keys = ["uncached_input", "cache_read", "cache_creation", "completion"]
-    colors = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]
     fig = go.Figure()
-    base_total = sum(base_tokens.get(k, 0) for k in token_keys)
-    base_values = [base_tokens.get(k, 0) / 1e6 for k in token_keys]
     fig.add_trace(go.Bar(
-        name="Base Model",
         x=categories,
-        y=base_values,
-        marker_color=colors[0],
-        hovertemplate="%{x}<br>Base Model: %{y:.3f}M<extra></extra>",
     ))
-    model_totals = [("Base Model", base_total)]
     for i, (model_name, tokens) in enumerate(additional_models):
-        model_total = sum(tokens.get(k, 0) for k in token_keys)
-        model_totals.append((model_name or f"Model {i+1}", model_total))
-        values = [tokens.get(k, 0) / 1e6 for k in token_keys]
-        color = colors[(i + 1) % len(colors)]
         fig.add_trace(go.Bar(
             name=model_name or f"Model {i+1}",
             x=categories,
-            y=values,
-            marker_color=color,
             hovertemplate="%{x}<br>" + (model_name or f"Model {i+1}") + ": %{y:.3f}M<extra></extra>",
         ))
-    grand_total = sum(t for _, t in model_totals)
-    annotation_lines = [f"<b>Total: {grand_total/1e6:.2f}M</b>"]
-    for name, total in model_totals:
-        annotation_lines.append(f"{name}: {total/1e6:.2f}M")
     fig.update_layout(
         yaxis_title="Tokens (M)",
         barmode="stack",
         margin=dict(l=40, r=40, t=40, b=40),
-        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
     )
     fig.add_annotation(
         text="<br>".join(annotation_lines),
@@ -1228,58 +1243,74 @@ def create_routed_token_chart(base_tokens: dict, additional_models: list):
     return fig
-def create_routed_cost_chart(base_costs: dict, additional_models: list):
     """
-    Create stacked bar chart for costs by type, comparing base vs additional models.
-    X-axis: cost types, bars stacked by model.
     Args:
-        base_costs: dict with uncached_input, cache_read, cache_creation, completion
         additional_models: list of (model_name, costs_dict) tuples
     """
     import plotly.graph_objects as go
     categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
     cost_keys = ["uncached_input", "cache_read", "cache_creation", "completion"]
-    colors = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]
     fig = go.Figure()
-    base_total = sum(base_costs.get(k, 0) for k in cost_keys)
-    base_values = [base_costs.get(k, 0) for k in cost_keys]
     fig.add_trace(go.Bar(
-        name="Base Model",
         x=categories,
-        y=base_values,
-        marker_color=colors[0],
-        hovertemplate="%{x}<br>Base Model: $%{y:.2f}<extra></extra>",
     ))
-    model_totals = [("Base Model", base_total)]
     for i, (model_name, costs) in enumerate(additional_models):
-        model_total = sum(costs.get(k, 0) for k in cost_keys)
-        model_totals.append((model_name or f"Model {i+1}", model_total))
-        values = [costs.get(k, 0) for k in cost_keys]
-        color = colors[(i + 1) % len(colors)]
         fig.add_trace(go.Bar(
             name=model_name or f"Model {i+1}",
             x=categories,
-            y=values,
-            marker_color=color,
             hovertemplate="%{x}<br>" + (model_name or f"Model {i+1}") + ": $%{y:.2f}<extra></extra>",
         ))
-    grand_total = sum(t for _, t in model_totals)
-    annotation_lines = [f"<b>Total: ${grand_total:.2f}</b>"]
-    for name, total in model_totals:
-        annotation_lines.append(f"{name}: ${total:.2f}")
     fig.update_layout(
         yaxis_title="Cost ($)",
         barmode="stack",
         margin=dict(l=40, r=40, t=40, b=40),
-        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
     )
     fig.add_annotation(
         text="<br>".join(annotation_lines),
@@ -1322,7 +1353,7 @@ def build_app():
         """)
         trajectories_state = gr.State(None)
-        gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.22`")
         gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
         with gr.Row():
@@ -1806,7 +1837,8 @@ def build_app():
             grep_1_val, grep_2_val, grep_3_val,
             resolved_model_val, unresolved_model_val,
             part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val,
-            overhead, with_cache
         ):
             if state_data is None:
                 yield (
@@ -2103,6 +2135,20 @@ def build_app():
             additional_token_models = [(rc["name"], rc["tokens"]) for rc in routing_costs_list]
             additional_cost_models = [(rc["name"], rc["costs"]) for rc in routing_costs_list]
             yield (
                 gr.update(visible=True, value="⏳ Creating charts..."),
                 gr.update(visible=True),
@@ -2110,8 +2156,9 @@ def build_app():
                 None,
             )
-            tokens_chart = create_routed_token_chart(total_base_tokens, additional_token_models)
-            cost_chart = create_routed_cost_chart(base_costs, additional_cost_models)
             yield (
                 gr.update(visible=True, value=result_text),
@@ -2136,6 +2183,7 @@ def build_app():
                 resolved_model, unresolved_model,
                 part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
                 thinking_overhead, use_cache,
             ],
             outputs=[routing_result, routing_plots_row, routing_tokens_plot, routing_cost_plot],
         )

         cost_data,
         x="Token Type",
         y="Cost ($)",
         color="Token Type",
         color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
     )
     return _build_selection_payload(default_idx, df)
+def create_routed_token_chart(original_tokens: dict, base_tokens: dict, additional_models: list, base_model_name: str = "Base"):
     """
+    Create grouped+stacked bar chart comparing Calculated vs Routed tokens.
     Args:
+        original_tokens: dict with uncached_input, cache_read, cache_creation, completion (from Calculated)
+        base_tokens: dict with uncached_input, cache_read, cache_creation, completion (base portion in routing)
         additional_models: list of (model_name, tokens_dict) tuples
+        base_model_name: name of the base model
     """
     import plotly.graph_objects as go
     categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
     token_keys = ["uncached_input", "cache_read", "cache_creation", "completion"]
+    base_color_dark = "#636EFA"
+    base_color_light = "#A0C4FF"
+    model_colors = ["#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]
     fig = go.Figure()
     fig.add_trace(go.Bar(
+        name=f"{base_model_name} [no routing]",
         x=categories,
+        y=[original_tokens.get(k, 0) / 1e6 for k in token_keys],
+        marker_color="rgba(99, 110, 250, 0.3)",
+        marker_line_color=base_color_dark,
+        marker_line_width=1,
+        marker_pattern_shape="/",
+        marker_pattern_fgcolor=base_color_dark,
+        offsetgroup="calculated",
+        hovertemplate="%{x}<br>" + base_model_name + " [no routing]: %{y:.3f}M<extra></extra>",
     ))
+    fig.add_trace(go.Bar(
+        name=f"{base_model_name} [with routing]",
+        x=categories,
+        y=[base_tokens.get(k, 0) / 1e6 for k in token_keys],
+        marker_color=base_color_dark,
+        offsetgroup="routed",
+        hovertemplate="%{x}<br>" + base_model_name + " [with routing]: %{y:.3f}M<extra></extra>",
+    ))
     for i, (model_name, tokens) in enumerate(additional_models):
         fig.add_trace(go.Bar(
             name=model_name or f"Model {i+1}",
             x=categories,
+            y=[tokens.get(k, 0) / 1e6 for k in token_keys],
+            marker_color=model_colors[i % len(model_colors)],
+            offsetgroup="routed",
             hovertemplate="%{x}<br>" + (model_name or f"Model {i+1}") + ": %{y:.3f}M<extra></extra>",
         ))
+    original_total = sum(original_tokens.get(k, 0) for k in token_keys)
+    routed_total = sum(base_tokens.get(k, 0) for k in token_keys) + sum(
+        sum(m[1].get(k, 0) for k in token_keys) for m in additional_models
+    )
+    annotation_lines = [
+        f"<b>No routing: {original_total/1e6:.2f}M</b>",
+        f"<b>With routing: {routed_total/1e6:.2f}M</b>",
+    ]
     fig.update_layout(
         yaxis_title="Tokens (M)",
         barmode="stack",
+        bargroupgap=0.1,
         margin=dict(l=40, r=40, t=40, b=40),
+        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, traceorder="normal"),
     )
     fig.add_annotation(
         text="<br>".join(annotation_lines),
     return fig
+def create_routed_cost_chart(original_costs: dict, base_costs: dict, additional_models: list, base_model_name: str = "Base"):
     """
+    Create grouped+stacked bar chart comparing Calculated vs Routed costs.
     Args:
+        original_costs: dict with uncached_input, cache_read, cache_creation, completion (from Calculated)
+        base_costs: dict with uncached_input, cache_read, cache_creation, completion (base portion in routing)
         additional_models: list of (model_name, costs_dict) tuples
+        base_model_name: name of the base model
     """
     import plotly.graph_objects as go
     categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
     cost_keys = ["uncached_input", "cache_read", "cache_creation", "completion"]
+    base_color_dark = "#636EFA"
+    base_color_light = "#A0C4FF"
+    model_colors = ["#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]
     fig = go.Figure()
     fig.add_trace(go.Bar(
+        name=f"{base_model_name} [no routing]",
         x=categories,
+        y=[original_costs.get(k, 0) for k in cost_keys],
+        marker_color="rgba(99, 110, 250, 0.3)",
+        marker_line_color=base_color_dark,
+        marker_line_width=1,
+        marker_pattern_shape="/",
+        marker_pattern_fgcolor=base_color_dark,
+        offsetgroup="calculated",
+        hovertemplate="%{x}<br>" + base_model_name + " [no routing]: $%{y:.2f}<extra></extra>",
     ))
+    fig.add_trace(go.Bar(
+        name=f"{base_model_name} [with routing]",
+        x=categories,
+        y=[base_costs.get(k, 0) for k in cost_keys],
+        marker_color=base_color_dark,
+        offsetgroup="routed",
+        hovertemplate="%{x}<br>" + base_model_name + " [with routing]: $%{y:.2f}<extra></extra>",
+    ))
     for i, (model_name, costs) in enumerate(additional_models):
         fig.add_trace(go.Bar(
             name=model_name or f"Model {i+1}",
             x=categories,
+            y=[costs.get(k, 0) for k in cost_keys],
+            marker_color=model_colors[i % len(model_colors)],
+            offsetgroup="routed",
             hovertemplate="%{x}<br>" + (model_name or f"Model {i+1}") + ": $%{y:.2f}<extra></extra>",
         ))
+    original_total = sum(original_costs.get(k, 0) for k in cost_keys)
+    routed_total = sum(base_costs.get(k, 0) for k in cost_keys) + sum(
+        sum(m[1].get(k, 0) for k in cost_keys) for m in additional_models
+    )
+    annotation_lines = [
+        f"<b>No routing: ${original_total:.2f}</b>",
+        f"<b>With routing: ${routed_total:.2f}</b>",
+    ]
     fig.update_layout(
         yaxis_title="Cost ($)",
         barmode="stack",
+        bargroupgap=0.1,
         margin=dict(l=40, r=40, t=40, b=40),
+        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, traceorder="normal"),
     )
     fig.add_annotation(
         text="<br>".join(annotation_lines),
         """)
         trajectories_state = gr.State(None)
+        gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.30`")
         gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
         with gr.Row():
             grep_1_val, grep_2_val, grep_3_val,
             resolved_model_val, unresolved_model_val,
             part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val,
+            overhead, with_cache,
+            detected_model_val
         ):
             if state_data is None:
                 yield (
             additional_token_models = [(rc["name"], rc["tokens"]) for rc in routing_costs_list]
             additional_cost_models = [(rc["name"], rc["costs"]) for rc in routing_costs_list]
+            if df_calc is not None and not df_calc.empty:
+                df_temp = df_for_cost.copy()
+                df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
+                original_tokens_from_df = {
+                    "uncached_input": df_temp["uncached_input"].sum(),
+                    "cache_read": df_for_cost["cache_read_tokens"].sum(),
+                    "cache_creation": df_for_cost["cache_creation_tokens"].sum(),
+                    "completion": df_for_cost["completion_tokens"].sum(),
+                }
+            else:
+                original_tokens_from_df = total_original_tokens
+            original_costs = tokens_to_costs(original_tokens_from_df, base_prices)
             yield (
                 gr.update(visible=True, value="⏳ Creating charts..."),
                 gr.update(visible=True),
                 None,
             )
+            base_model_name = detected_model_val or "Base"
+            tokens_chart = create_routed_token_chart(original_tokens_from_df, total_base_tokens, additional_token_models, base_model_name)
+            cost_chart = create_routed_cost_chart(original_costs, base_costs, additional_cost_models, base_model_name)
             yield (
                 gr.update(visible=True, value=result_text),
                 resolved_model, unresolved_model,
                 part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
                 thinking_overhead, use_cache,
+                detected_model,
             ],
             outputs=[routing_result, routing_plots_row, routing_tokens_plot, routing_cost_plot],
         )