IgorSlinko's picture
Support multiple routing models (up to 3)
c63e9d7
raw
history blame
80.9 kB
import json
import os
import re
import subprocess
from pathlib import Path
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import tiktoken
from src.download_swebench_leaderboard import download_leaderboard
# Tokenizer cache: tiktoken encoding name -> loaded encoding (filled by get_tokenizer)
_tokenizer_cache = {}
# Root directory for everything this app stores on disk
DATA_DIR = Path("data")
# Trajectories are stored under TRAJS_DIR / <leaderboard folder>
TRAJS_DIR = DATA_DIR / "swebench_trajs"
# Local copy of the leaderboard JSON (see load_or_download_leaderboard)
LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
# Local copy of the litellm price table (see get_litellm_prices)
LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"
# S3 prefix that trajectory folders are downloaded from
S3_BUCKET = "s3://swe-bench-experiments/bash-only"
LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
# In-memory price table; None until get_litellm_prices has run once
_litellm_prices_cache = None
# folder -> DataFrame built by load_all_trajectories
_trajectories_cache = {}
# "calculated_<folder>" -> DataFrame built by load_all_trajectories_calculated
_calculated_tokens_cache = {}
# "steps_<folder>" -> {instance_id: steps} built by load_all_trajectory_steps
_trajectory_steps_cache = {}
def parse_start_end(start: float, end: float, total_steps: int, mode: str) -> tuple[int, int]:
    """
    Convert a (start, end) pair into integer step positions.

    Args:
        start: range start, a step index or a percentage depending on mode
        end: range end, a step index or a percentage depending on mode
        total_steps: total number of steps in trajectory
        mode: "Indexes" takes the values as given; any other value
            (normally "Percentages") scales them by total_steps / 100

    Returns: (start_idx, end_idx), both truncated with int(). The values are
    not clamped to the trajectory length.
    """
    if mode != "Indexes":
        start = start * total_steps / 100
        end = end * total_steps / 100
    return int(start), int(end)
def get_routed_steps(total_steps: int, strategy: str, params: dict) -> set:
    """
    Pick the steps that are handed to the alternative (routing) model.

    Args:
        total_steps: number of steps in the trajectory
        strategy: one of "Replace on random steps", "Replace every step k",
            "Replace part of trajectory"; any other value routes nothing
        params: strategy options ("percentage", "k", or "mode"/"start"/"end")

    Returns set of step indices (0-based) that should use the routing model.
    """
    import random

    if strategy == "Replace on random steps":
        fraction = params.get("percentage", 50) / 100.0
        sample_size = int(total_steps * fraction)
        if sample_size <= 0:
            return set()
        # Cap the sample size so percentages above 100 do not make sample() fail.
        return set(random.sample(range(total_steps), min(sample_size, total_steps)))

    if strategy == "Replace every step k":
        stride = int(params.get("k", 2))
        return set(range(0, total_steps, stride)) if stride > 0 else set()

    if strategy == "Replace part of trajectory":
        first, last = parse_start_end(
            params.get("start", 0),
            params.get("end", 30),
            total_steps,
            params.get("mode", "Percentages"),
        )
        return set(range(first, min(last, total_steps)))

    return set()
def calculate_routing_tokens(steps: list[dict]) -> dict:
    """
    Simulate per-model prompt caching over a sequence of steps and total the tokens.

    Each model keeps its own cache size. On a step, the model reads what it
    has cached so far, pays uncached input for the rest of the context, and
    then caches that input plus its completion.

    Args:
        steps: list of dicts with keys:
            - model: str (model name)
            - system_user: int (system/user tokens; only read for step 0)
            - completion: int (generated tokens)
            - observation: int or None (env response tokens, None for last step)

    Returns:
        dict with per-model totals:
        {model_name: {cache_read, uncached_input, completion, observation, cache_creation}}
    """
    total_keys = ("cache_read", "uncached_input", "completion", "observation", "cache_creation")
    cached_by_model = {}
    totals_by_model = {}
    context_size = 0
    last_observation = 0

    for index, step in enumerate(steps):
        name = step["model"]
        generated = step.get("completion", 0)
        env_tokens = step.get("observation") or 0

        already_cached = cached_by_model.setdefault(name, 0)
        if name not in totals_by_model:
            totals_by_model[name] = dict.fromkeys(total_keys, 0)
        bucket = totals_by_model[name]

        if index == 0:
            fresh_input = step.get("system_user", 0)
        else:
            # Everything in the conversation so far that this model has not cached.
            fresh_input = context_size + last_observation - already_cached

        written = fresh_input + generated
        cached_by_model[name] = already_cached + written

        bucket["cache_read"] += already_cached
        bucket["uncached_input"] += fresh_input
        bucket["completion"] += generated
        bucket["observation"] += env_tokens
        bucket["cache_creation"] += written

        context_size = already_cached + fresh_input + generated
        last_observation = env_tokens

    return totals_by_model
def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
    """
    Parse trajectory file into step format for calculate_routing_tokens.

    One step is produced per assistant message. System messages and user
    messages seen before the first assistant message are counted into that
    first step's system_user tokens. A user message after an assistant message
    is stored as the observation of the most recent step (a later user message
    overwrites an earlier one for the same step).

    Args:
        traj_path: path to a trajectory JSON file with a "messages" list
        model_name: model name used to pick the tokenizer and to label steps

    Returns list of steps with:
        - model: base model name
        - system_user: tokens for system + user message (step 0 only)
        - completion: assistant response tokens
        - observation: env response tokens (None for last step)
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    messages = data.get("messages", [])
    if not messages:
        return []

    count_tokens, _ = get_tokenizer(model_name)
    steps = []
    system_user_tokens = 0

    for msg in messages:
        role = msg.get("role", "user")
        if role not in ("system", "user", "assistant"):
            # Roles this parser does not model (e.g. "tool") are skipped.
            # The earlier index-based loop never advanced past them and hung.
            continue

        content = msg.get("content", "")
        if content is None:
            # A null content must count as zero tokens, not as the text "None".
            content = ""
        elif isinstance(content, list):
            content = json.dumps(content)
        tokens = count_tokens(str(content))

        if role == "assistant":
            steps.append({
                "model": model_name,
                "system_user": system_user_tokens if not steps else 0,
                "completion": tokens,
                "observation": None,
            })
            system_user_tokens = 0
        elif role == "user" and steps:
            steps[-1]["observation"] = tokens
        else:
            # System message, or user message before the first assistant turn.
            system_user_tokens += tokens

    return steps
def get_default_overhead(model_name: str) -> float:
    """Return the default tokenizer overhead multiplier for the model's provider.

    The provider is guessed from substrings of the lower-cased model name.
    Unrecognized or empty names get 1.0.
    """
    lowered = (model_name or "").lower()
    provider_overheads = (
        (("claude", "anthropic"), 1.24),
        (("gemini", "google"), 1.0),
        (("gpt", "openai", "o1", "o3"), 1.0),
    )
    for keywords, overhead in provider_overheads:
        if any(word in lowered for word in keywords):
            return overhead
    return 1.0
def get_tokenizer(model_name: str):
    """Get appropriate tokenizer for model. Returns (tokenizer_func, name).

    tokenizer_func maps a string to a token count. The encoding is picked from
    substrings of the lower-cased model name:
        - "gpt-4o", "o1", "o3"          -> tiktoken "o200k_base"
        - "gpt", "claude", "anthropic"  -> tiktoken "cl100k_base"
        - "gemini", "google"            -> len(text) / 3.23 approximation
        - anything else                 -> tiktoken "cl100k_base"

    Loaded tiktoken encodings are kept in the module-level _tokenizer_cache.
    """
    model_lower = model_name.lower() if model_name else ""
    if "gpt-4o" in model_lower or "o1" in model_lower or "o3" in model_lower:
        tokenizer_name = "o200k_base"
    elif "gpt" in model_lower or "claude" in model_lower or "anthropic" in model_lower:
        tokenizer_name = "cl100k_base"
    elif "gemini" in model_lower or "google" in model_lower:
        return lambda text: int(len(text) / 3.23), "gemini_approx"
    else:
        tokenizer_name = "cl100k_base"

    if tokenizer_name not in _tokenizer_cache:
        _tokenizer_cache[tokenizer_name] = tiktoken.get_encoding(tokenizer_name)
    enc = _tokenizer_cache[tokenizer_name]
    # Trajectory text is arbitrary and may contain special-token literals such
    # as "<|endoftext|>". By default encode() raises ValueError on those;
    # disallowed_special=() makes it encode them as ordinary text instead.
    return lambda text: len(enc.encode(text, disallowed_special=())), tokenizer_name
def apply_thinking_overhead(df: pd.DataFrame, overhead: float) -> pd.DataFrame:
    """Scale every token-count column by the tokenizer overhead multiplier.

    Returns df itself when it is empty or overhead is exactly 1.0. Otherwise
    returns a copy whose prompt, completion, cache-read and cache-creation
    counts are multiplied and truncated to int, with total_tokens recomputed
    as prompt + completion.
    """
    if df.empty or overhead == 1.0:
        return df

    scaled = df.copy()
    for column in ("prompt_tokens", "completion_tokens", "cache_read_tokens", "cache_creation_tokens"):
        scaled[column] = (scaled[column] * overhead).astype(int)
    scaled["total_tokens"] = scaled["prompt_tokens"] + scaled["completion_tokens"]
    return scaled
def apply_no_cache(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of df with both cache token columns zeroed (no caching).

    An empty df is returned as-is. The input frame is never modified.
    """
    if df.empty:
        return df

    uncached = df.copy()
    for column in ("cache_read_tokens", "cache_creation_tokens"):
        uncached[column] = 0
    return uncached
def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
    """Build a per-trajectory token table from self-calculated counts.

    Steps come from load_all_trajectory_steps and are totalled with
    calculate_routing_tokens. Only the totals of the first step's model are
    reported for each trajectory. instance_cost is always 0 here. The result
    is memoized in _calculated_tokens_cache under "calculated_<folder>".
    """
    cache_key = f"calculated_{folder}"
    try:
        return _calculated_tokens_cache[cache_key]
    except KeyError:
        pass

    records = []
    for instance_id, steps in load_all_trajectory_steps(folder).items():
        if not steps:
            continue
        try:
            per_model = calculate_routing_tokens(steps)
            primary_model = steps[0].get("model", "")
            totals = per_model.get(primary_model, {})
            read_tokens = totals.get("cache_read", 0)
            fresh_tokens = totals.get("uncached_input", 0)
            generated = totals.get("completion", 0)
            written_tokens = totals.get("cache_creation", 0)
            prompt_total = read_tokens + fresh_tokens
            records.append({
                "instance_id": instance_id,
                "model_name": primary_model,
                "api_calls": len(steps),
                "instance_cost": 0,
                "prompt_tokens": prompt_total,
                "completion_tokens": generated,
                "total_tokens": prompt_total + generated,
                "cache_read_tokens": read_tokens,
                "cache_creation_tokens": written_tokens,
            })
        except Exception as e:
            # One bad trajectory must not prevent the rest from loading.
            print(f"Error calculating tokens for {instance_id}: {e}")

    frame = pd.DataFrame(records)
    _calculated_tokens_cache[cache_key] = frame
    return frame
def load_all_trajectory_steps(folder: str) -> dict[str, list[dict]]:
    """
    Load all trajectories as step sequences for routing calculations.

    Trajectory files are searched under TRAJS_DIR / folder, using the first of
    several glob patterns that matches anything. The model name is read once,
    from the first file's info.config.model section
    ("cost_calc_model_override", falling back to "model_name"), and used for
    every trajectory. The result is memoized in _trajectory_steps_cache under
    "steps_<folder>".

    Returns:
        dict mapping instance_id -> list of steps for calculate_routing_tokens.
        Trajectories that fail to parse or yield no steps are left out.
    """
    cache_key = f"steps_{folder}"
    if cache_key in _trajectory_steps_cache:
        return _trajectory_steps_cache[cache_key]

    output_dir = TRAJS_DIR / folder
    traj_files = []
    # Most specific layout first; stop at the first pattern that matches.
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    model_name = ""
    if traj_files:
        try:
            # Read as UTF-8, matching parse_trajectory_to_steps, so the result
            # does not depend on the platform's default encoding.
            with open(traj_files[0], "r", encoding="utf-8") as f:
                first_data = json.load(f)
            config = first_data.get("info", {}).get("config", {}).get("model", {})
            model_name = config.get("cost_calc_model_override", config.get("model_name", ""))
        except Exception as e:
            # Best effort: continue with an empty model name, but say why.
            print(f"Error reading model name from {traj_files[0]}: {e}")

    result = {}
    for traj_path in traj_files:
        try:
            # "foo.traj.json" has stem "foo.traj"; drop the ".traj" part.
            instance_id = traj_path.stem.replace(".traj", "")
            steps = parse_trajectory_to_steps(traj_path, model_name)
            if steps:
                result[instance_id] = steps
        except Exception as e:
            print(f"Error parsing steps for {traj_path}: {e}")

    _trajectory_steps_cache[cache_key] = result
    return result
def get_litellm_model_list() -> list[str]:
    """Return the keys of the litellm price table, sorted."""
    return sorted(get_litellm_prices())
def get_litellm_prices() -> dict:
    """Return the litellm price table, loading it at most once per process.

    Lookup order: the in-memory cache, then the LITELLM_PRICES_CACHE file,
    then a download from LITELLM_PRICES_URL (saved to the cache file).

    Returns:
        dict parsed from the price JSON, or {} when the download fails. The
        empty result is also memoized, so a failed download is not retried.
    """
    global _litellm_prices_cache
    if _litellm_prices_cache is not None:
        return _litellm_prices_cache

    if LITELLM_PRICES_CACHE.exists():
        try:
            with open(LITELLM_PRICES_CACHE, encoding="utf-8") as f:
                _litellm_prices_cache = json.load(f)
            return _litellm_prices_cache
        except (OSError, ValueError) as e:
            # An unreadable or corrupt cache file must not break every later
            # call; fall through and download a fresh copy.
            print(f"Error reading {LITELLM_PRICES_CACHE}: {e}")

    try:
        response = requests.get(LITELLM_PRICES_URL, timeout=30)
        response.raise_for_status()
        prices = response.json()
    except Exception:
        _litellm_prices_cache = {}
        return _litellm_prices_cache

    _litellm_prices_cache = prices
    try:
        DATA_DIR.mkdir(exist_ok=True)
        with open(LITELLM_PRICES_CACHE, "w", encoding="utf-8") as f:
            json.dump(prices, f)
    except OSError as e:
        # Saving is only an optimization: keep the downloaded prices in
        # memory even when they cannot be written to disk.
        print(f"Error writing {LITELLM_PRICES_CACHE}: {e}")
    return _litellm_prices_cache
def normalize_model_name(name: str) -> str:
    """Lower-case name and drop the separators '-', '_', '.' and '/' for comparison."""
    return name.lower().translate(str.maketrans("", "", "-_./"))
def get_model_prices(model_name: str) -> dict | None:
    """Look up the litellm price entry for a model name.

    Matching is attempted from strictest to loosest:
        1. Exact key match on the name as given, with the "anthropic/" /
           "openai/" prefix removed, with a trailing -YYYYMMDD date removed,
           and with either prefix re-added.
        2. Exact match after normalize_model_name, against the whole key or
           its last "/"-separated segment.
        3. Normalized name contained in the normalized key (first hit wins).

    Returns:
        The price entry, or None when model_name is empty or nothing matches.
    """
    if not model_name:
        return None
    prices = get_litellm_prices()
    clean_name = model_name.replace("anthropic/", "").replace("openai/", "")
    name_without_date = re.sub(r'-\d{8}$', '', clean_name)
    candidates = [
        model_name,
        clean_name,
        name_without_date,
        f"anthropic/{clean_name}",
        f"openai/{clean_name}",
        f"anthropic/{name_without_date}",
        f"openai/{name_without_date}",
    ]
    for key in candidates:
        if key in prices:
            return prices[key]

    # An empty normalized name is a substring of every key, so drop it.
    wanted = [
        normalized
        for normalized in (normalize_model_name(clean_name), normalize_model_name(name_without_date))
        if normalized
    ]
    if not wanted:
        return None

    normalized_keys = [
        (normalize_model_name(key), normalize_model_name(key.split('/')[-1]), value)
        for key, value in prices.items()
    ]
    # Exact normalized matches come first, so an unrelated key that merely
    # contains the name cannot shadow the right entry.
    for full_key, last_part, value in normalized_keys:
        if full_key in wanted or last_part in wanted:
            return value
    for full_key, _, value in normalized_keys:
        if any(normalized in full_key for normalized in wanted):
            return value
    return None
def load_or_download_leaderboard():
    """Return the parsed leaderboard JSON, downloading it on first use.

    When LEADERBOARD_CACHE does not exist, download_leaderboard is asked to
    write into DATA_DIR and the file it reports is renamed to
    LEADERBOARD_CACHE before being read.
    """
    if not LEADERBOARD_CACHE.exists():
        downloaded = download_leaderboard(output_dir=str(DATA_DIR))
        os.rename(downloaded, LEADERBOARD_CACHE)
    with open(LEADERBOARD_CACHE) as cache_file:
        return json.load(cache_file)
def get_bash_only_df():
    """Build the display table for the "bash-only" leaderboard.

    Returns an empty DataFrame when the leaderboard data has no "bash-only"
    entry. Otherwise returns one row per result with name, formatted resolve
    rate, date, rounded costs, call count, folder and an open-source marker.
    """
    boards = load_or_download_leaderboard().get("leaderboards", [])
    for board in boards:
        if board["name"] == "bash-only":
            bash_only = board
            break
    else:
        bash_only = None
    if not bash_only:
        return pd.DataFrame()

    def format_resolved(value):
        # Numbers are rendered as a percentage; anything else is shown verbatim.
        if isinstance(value, (int, float)):
            return f"{value:.1f}%"
        return str(value)

    return pd.DataFrame([
        {
            "name": entry.get("name", ""),
            "% resolved": format_resolved(entry.get("resolved", 0)),
            "date": entry.get("date", ""),
            "cost": round(entry.get("cost", 0), 2),
            "instance_cost": round(entry.get("instance_cost", 0), 4),
            "instance_calls": entry.get("instance_calls", 0),
            "folder": entry.get("folder", ""),
            "os_model": "✅" if entry.get("os_model") else "❌",
        }
        for entry in bash_only["results"]
    ])
def get_model_details(folder: str):
    """Find the "bash-only" leaderboard entry whose "folder" equals folder.

    Returns:
        (entry, None) on success, or (None, message) when folder is empty,
        the leaderboard is missing, or no entry matches.
    """
    if not folder:
        return None, "Select a model from the table"

    boards = load_or_download_leaderboard().get("leaderboards", [])
    for board in boards:
        if board["name"] == "bash-only":
            bash_only = board
            break
    else:
        bash_only = None
    if not bash_only:
        return None, "Leaderboard not found"

    for entry in bash_only["results"]:
        if entry.get("folder") == folder:
            if not entry:
                break
            return entry, None
    return None, f"Model with folder '{folder}' not found"
def check_trajectories_downloaded(folder: str) -> bool:
    """Return True when TRAJS_DIR / folder exists and contains at least one entry."""
    if not folder:
        return False
    target = TRAJS_DIR / folder
    if not target.exists():
        return False
    return any(target.iterdir())
def _count_trajectory_files(output_dir: Path) -> int:
    """Count trajectory files in output_dir using the first glob pattern that matches.

    The patterns and their order are the same ones the trajectory loaders use,
    so a folder the loaders can read is never reported as having no files.
    """
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        count = sum(1 for _ in output_dir.glob(pattern))
        if count:
            return count
    return 0


def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
    """Download the trajectories of one leaderboard entry with the AWS CLI.

    Files are copied from "<S3_BUCKET>/<folder>/trajs/" into TRAJS_DIR / folder
    with an unsigned request. Nothing is downloaded when the target directory
    already has content.

    Args:
        folder: leaderboard folder name of the selected model
        progress: Gradio progress tracker

    Returns:
        (status_message, gr.update(visible=...)); the update is visible on
        success or when the data was already present, hidden on any failure.
    """
    if not folder:
        return "❌ No model selected", gr.update(visible=False)
    model, error = get_model_details(folder)
    if error:
        return f"❌ {error}", gr.update(visible=False)

    output_dir = TRAJS_DIR / folder
    if output_dir.exists() and any(output_dir.iterdir()):
        file_count = _count_trajectory_files(output_dir)
        return f"✅ Already downloaded: {output_dir}\n\n{file_count} trajectory files", gr.update(visible=True)

    s3_path = f"{S3_BUCKET}/{folder}/trajs/"
    output_dir.mkdir(parents=True, exist_ok=True)
    progress(0, desc="Starting S3 download...")
    try:
        # Argument list (no shell), so the folder name is never shell-interpreted.
        result = subprocess.run(
            ["aws", "s3", "cp", "--recursive", s3_path, str(output_dir), "--no-sign-request"],
            capture_output=True,
            text=True,
            timeout=600,
        )
        if result.returncode != 0:
            return f"❌ S3 download failed:\n{result.stderr}", gr.update(visible=False)

        file_count = _count_trajectory_files(output_dir)
        if file_count == 0:
            return f"❌ No trajectory files found on S3 for {folder}", gr.update(visible=False)

        per_instance = model.get("per_instance_details", {})
        resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
        total_count = len(per_instance)
        if total_count > 0:
            resolved_pct = f"{100*resolved_count/total_count:.1f}%"
        else:
            resolved_pct = "N/A"
        status = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({resolved_pct})"
        return status, gr.update(visible=True)
    except subprocess.TimeoutExpired:
        return "❌ Download timed out (>10 min)", gr.update(visible=False)
    except FileNotFoundError:
        # subprocess raises this when the "aws" executable is not on PATH.
        return "❌ AWS CLI not found. Install with: pip install awscli", gr.update(visible=False)
    except Exception as e:
        return f"❌ Error: {e}", gr.update(visible=False)
def parse_trajectory(traj_path: Path) -> dict:
    """Read one trajectory file and total the token usage it reports.

    Usage is taken per message from msg["usage"] when that key is present,
    otherwise from msg["extra"]["response"]["usage"]. Messages without a usage
    mapping are ignored.

    Args:
        traj_path: path to a trajectory JSON file

    Returns:
        dict with instance_id, model_name, api_calls, instance_cost and the
        summed prompt / completion / total / cache_read / cache_creation
        token counts. When the file has no "instance_id", the file stem with
        ".traj" removed is used.
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    info = data.get("info", {})
    model_stats = info.get("model_stats", {})
    model_config = info.get("config", {}).get("model", {})
    model_name = model_config.get("cost_calc_model_override", model_config.get("model_name", ""))

    result = {
        # "foo.traj.json" has stem "foo.traj"; strip ".traj" the same way
        # load_all_trajectory_steps does so both loaders agree on the id.
        "instance_id": data.get("instance_id", traj_path.stem.replace(".traj", "")),
        "model_name": model_name,
        "api_calls": model_stats.get("api_calls", 0),
        "instance_cost": model_stats.get("instance_cost", 0),
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "total_tokens": 0,
        "cache_read_tokens": 0,
        "cache_creation_tokens": 0,
    }

    # result key -> key in the usage mapping
    usage_fields = (
        ("prompt_tokens", "prompt_tokens"),
        ("completion_tokens", "completion_tokens"),
        ("total_tokens", "total_tokens"),
        ("cache_read_tokens", "cache_read_input_tokens"),
        ("cache_creation_tokens", "cache_creation_input_tokens"),
    )
    for msg in data.get("messages", []):
        usage = None
        if "usage" in msg:
            usage = msg["usage"]
        elif isinstance(msg.get("extra"), dict):
            response = msg["extra"].get("response", {})
            if isinstance(response, dict):
                usage = response.get("usage", {})
        if not isinstance(usage, dict):
            # Skip a malformed usage value rather than failing the whole file.
            continue
        for result_key, usage_key in usage_fields:
            # "or 0" covers keys that are present but null.
            result[result_key] += usage.get(usage_key, 0) or 0
    return result
def load_all_trajectories(folder: str) -> pd.DataFrame:
    """Parse every trajectory under TRAJS_DIR / folder into one DataFrame.

    Files are located with the first glob pattern that matches anything. Each
    file is read with parse_trajectory; files that fail to parse are reported
    and skipped. The result is memoized in _trajectories_cache under folder.
    """
    try:
        return _trajectories_cache[folder]
    except KeyError:
        pass

    output_dir = TRAJS_DIR / folder
    traj_files = []
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    records = []
    for traj_path in traj_files:
        try:
            records.append(parse_trajectory(traj_path))
        except Exception as e:
            print(f"Error parsing {traj_path}: {e}")

    frame = pd.DataFrame(records)
    _trajectories_cache[folder] = frame
    return frame
def create_cost_by_type_chart(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create the "Total Cost by Token Type" bar chart.

    Kept separate from the other charts so it can be rebuilt on its own when
    prices change. Prices are per one million tokens. Uncached input is
    prompt - cache_read - cache_creation per trajectory, floored at 0.

    Returns a plotly figure, or None when df is empty.
    """
    if df.empty:
        return None

    completion_sum = df["completion_tokens"].sum()
    read_sum = df["cache_read_tokens"].sum()
    creation_sum = df["cache_creation_tokens"].sum()
    uncached_sum = (
        df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"]
    ).clip(lower=0).sum()

    cost_uncached = uncached_sum * input_price / 1e6
    cost_read = read_sum * cache_read_price / 1e6
    cost_creation = creation_sum * cache_creation_price / 1e6
    cost_completion = completion_sum * completion_price / 1e6

    chart_rows = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Cost ($)": [cost_uncached, cost_read, cost_creation, cost_completion],
    })
    fig = px.bar(
        chart_rows,
        x="Token Type",
        y="Cost ($)",
        title="Total Cost by Token Type ($)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Cost ($)",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )

    grand_total = cost_uncached + cost_read + cost_creation + cost_completion
    fig.add_annotation(
        text=f"Total: ${grand_total:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )
    return fig
def create_token_charts(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create only the token-related charts (used when the token source changes).

    Returns (fig_tokens, fig_tokens_cost, fig_stacked):
        - total tokens by type, in millions
        - total cost by type (from create_cost_by_type_chart)
        - stacked tokens per trajectory, sorted by total tokens descending
    All three are None when df is empty.
    """
    if df.empty:
        return None, None, None

    completion_sum = df["completion_tokens"].sum()
    read_sum = df["cache_read_tokens"].sum()
    creation_sum = df["cache_creation_tokens"].sum()
    uncached_sum = (
        df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"]
    ).clip(lower=0).sum()

    totals_frame = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Total Tokens (M)": [uncached_sum / 1e6, read_sum / 1e6, creation_sum / 1e6, completion_sum / 1e6],
    })
    fig_tokens = px.bar(
        totals_frame,
        x="Token Type",
        y="Total Tokens (M)",
        title="Total Tokens by Type",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Tokens (M)",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    grand_total = uncached_sum + read_sum + creation_sum + completion_sum
    fig_tokens.add_annotation(
        text=f"Total: {grand_total/1e6:.2f}M",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )

    fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)

    # Stacked bar chart: rank trajectories by the sum of all stacked parts.
    ranked = df.copy()
    ranked["uncached_input_tokens"] = (
        ranked["prompt_tokens"] - ranked["cache_read_tokens"] - ranked["cache_creation_tokens"]
    ).clip(lower=0)
    ranked["total_stacked"] = (
        ranked["uncached_input_tokens"]
        + ranked["cache_read_tokens"]
        + ranked["cache_creation_tokens"]
        + ranked["completion_tokens"]
    )
    ranked = ranked.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    ranked["trajectory_idx"] = range(len(ranked))

    fig_stacked = go.Figure()
    trace_specs = (
        ("Uncached Input", "uncached_input_tokens", "#EF553B"),
        ("Cache Read", "cache_read_tokens", "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", "#FFA15A"),
        ("Completion", "completion_tokens", "#AB63FA"),
    )
    for label, column, color in trace_specs:
        fig_stacked.add_trace(go.Bar(
            name=label,
            x=ranked["trajectory_idx"],
            y=ranked[column] / 1e6,
            marker_color=color,
            hovertemplate=f"Trajectory: %{{x}}<br>{label}: %{{y:.2f}}M<extra></extra>",
        ))
    fig_stacked.update_layout(
        barmode="stack",
        title="Tokens per Trajectory (stacked)",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Tokens (M)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )
    return fig_tokens, fig_tokens_cost, fig_stacked
def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create the full set of overview charts for a trajectory table.

    Returns (fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked):
        - histogram of API calls per trajectory
        - histogram of the reported instance cost
        - total tokens by type, in millions
        - total cost by type (from create_cost_by_type_chart)
        - stacked tokens per trajectory, sorted by total tokens descending
    All five are None when df is empty.
    """
    if df.empty:
        return None, None, None, None, None

    compact_margin = dict(l=40, r=20, t=40, b=40)

    def annotate_corner(fig, text):
        # Summary label in the top-right corner of the plot area.
        fig.add_annotation(
            text=text,
            xref="paper", yref="paper",
            x=0.95, y=0.95, showarrow=False,
            font=dict(size=12),
        )

    def histogram(column, title, color, x_title, describe):
        fig = px.histogram(
            df,
            x=column,
            nbins=30,
            title=title,
            color_discrete_sequence=[color],
        )
        fig.update_layout(
            xaxis_title=x_title,
            yaxis_title="Number of Trajectories",
            showlegend=False,
            margin=compact_margin,
        )
        annotate_corner(fig, describe(df[column]))
        return fig

    fig_steps = histogram(
        "api_calls",
        "Distribution of API Calls (Steps) per Trajectory",
        "#636EFA",
        "API Calls (Steps)",
        lambda calls: f"Mean: {calls.mean():.1f} | Median: {calls.median():.0f}",
    )
    fig_cost = histogram(
        "instance_cost",
        "Distribution of Cost Reported by Leaderboard ($)",
        "#00CC96",
        "Cost ($)",
        lambda cost: f"Mean: ${cost.mean():.4f} | Total: ${cost.sum():.2f}",
    )

    completion_sum = df["completion_tokens"].sum()
    read_sum = df["cache_read_tokens"].sum()
    creation_sum = df["cache_creation_tokens"].sum()
    # Uncached input = prompt - cache_read - cache_creation (per trajectory, then sum)
    uncached_sum = (
        df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"]
    ).clip(lower=0).sum()

    totals_frame = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Tokens (M)": [uncached_sum / 1e6, read_sum / 1e6, creation_sum / 1e6, completion_sum / 1e6],
    })
    fig_tokens = px.bar(
        totals_frame,
        x="Token Type",
        y="Tokens (M)",
        title="Total Tokens by Type",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Tokens (M)",
        showlegend=False,
        margin=compact_margin,
    )
    grand_total = uncached_sum + read_sum + creation_sum + completion_sum
    annotate_corner(fig_tokens, f"Total: {grand_total/1e6:.2f}M")

    # Cost by token type (built by the separate function)
    fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)

    # Rank trajectories by the sum of all stacked parts.
    ranked = df.copy()
    ranked["uncached_input_tokens"] = (
        ranked["prompt_tokens"] - ranked["cache_read_tokens"] - ranked["cache_creation_tokens"]
    ).clip(lower=0)
    ranked["total_stacked"] = (
        ranked["uncached_input_tokens"]
        + ranked["cache_read_tokens"]
        + ranked["cache_creation_tokens"]
        + ranked["completion_tokens"]
    )
    ranked = ranked.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    ranked["trajectory_idx"] = range(len(ranked))

    fig_stacked = go.Figure()
    trace_specs = (
        ("Uncached Input", "uncached_input_tokens", "#EF553B"),
        ("Cache Read", "cache_read_tokens", "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", "#FFA15A"),
        ("Completion", "completion_tokens", "#AB63FA"),
    )
    for label, column, color in trace_specs:
        fig_stacked.add_trace(go.Bar(
            name=label,
            x=ranked["trajectory_idx"],
            y=ranked[column] / 1e6,
            marker_color=color,
            hovertemplate=f"Trajectory: %{{x}}<br>{label}: %{{y:.3f}}M<extra></extra>",
        ))
    fig_stacked.update_layout(
        barmode="stack",
        title="Tokens per Trajectory (stacked)",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Tokens (M)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )
    return fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked
def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create the stacked "Cost per Trajectory" chart.

    Trajectories are sorted by total tokens, descending. Each bar stacks the
    cost of uncached input, cache reads, cache creation and completion, with
    prices given per one million tokens. The overall total is shown as an
    annotation.

    Returns a plotly figure, or None when df is empty.
    """
    if df.empty:
        return None

    # Rank trajectories by the sum of all stacked token parts.
    ranked = df.copy()
    ranked["uncached_input_tokens"] = (
        ranked["prompt_tokens"] - ranked["cache_read_tokens"] - ranked["cache_creation_tokens"]
    ).clip(lower=0)
    ranked["total_stacked"] = (
        ranked["uncached_input_tokens"]
        + ranked["cache_read_tokens"]
        + ranked["cache_creation_tokens"]
        + ranked["completion_tokens"]
    )
    ranked = ranked.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    ranked["trajectory_idx"] = range(len(ranked))

    # (legend label, token column, cost column, price per 1M tokens, bar color)
    parts = (
        ("Uncached Input", "uncached_input_tokens", "cost_uncached_input", input_price, "#EF553B"),
        ("Cache Read", "cache_read_tokens", "cost_cache_read", cache_read_price, "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", "cost_cache_creation", cache_creation_price, "#FFA15A"),
        ("Completion", "completion_tokens", "cost_completion", completion_price, "#AB63FA"),
    )
    for _, token_column, cost_column, price, _ in parts:
        ranked[cost_column] = ranked[token_column] * price / 1e6

    fig = go.Figure()
    for label, _, cost_column, price, color in parts:
        fig.add_trace(go.Bar(
            name=f"{label} (${price:.2f}/1M)",
            x=ranked["trajectory_idx"],
            y=ranked[cost_column],
            marker_color=color,
            hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
        ))

    total_cost = (
        ranked["cost_uncached_input"].sum()
        + ranked["cost_cache_read"].sum()
        + ranked["cost_cache_creation"].sum()
        + ranked["cost_completion"].sum()
    )
    fig.update_layout(
        barmode="stack",
        title="Cost per Trajectory",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )
    fig.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=14),
        bgcolor="white",
    )
    return fig
def extract_model_from_folder(folder: str) -> str:
    """Return the model part of a folder name such as
    '20251124_mini-v1.16.0_claude-opus-4-5-20251101'.

    The model is everything after the second underscore. A folder with fewer
    than three underscore-separated parts is returned unchanged; an empty
    folder gives "".
    """
    if not folder:
        return ""
    # At most two splits, so underscores inside the model name are kept.
    pieces = folder.split("_", 2)
    return pieces[2] if len(pieces) == 3 else folder
def get_prices_for_folder(folder: str) -> tuple[dict, str]:
    """Look up litellm prices (per 1M tokens) for the model named in folder.

    Returns (prices_dict, model_name). prices_dict has the keys "input",
    "cache_read", "cache_creation" and "completion", each mapping to
    {"value": price, "found": bool}. "found" is True only for a positive
    price taken from litellm; a missing price may still get an estimated
    value, with "found" left False.
    """
    model_hint = extract_model_from_folder(folder)
    result = {
        kind: {"value": 0, "found": False}
        for kind in ("input", "cache_read", "cache_creation", "completion")
    }
    if not model_hint:
        return result, ""

    prices = get_model_prices(model_hint)
    if not prices:
        return result, model_hint

    # litellm stores cost per token; convert to cost per 1M tokens.
    litellm_fields = {
        "input": "input_cost_per_token",
        "cache_read": "cache_read_input_token_cost",
        "cache_creation": "cache_creation_input_token_cost",
        "completion": "output_cost_per_token",
    }
    for kind, field in litellm_fields.items():
        per_million = prices.get(field, 0) * 1e6
        result[kind] = {"value": per_million, "found": per_million > 0}

    input_price = result["input"]["value"]
    completion = result["completion"]["value"]

    # Fallback estimates based on standard ratios:
    #   Cache Read = Input * 0.1 (90% discount)
    #   Cache Creation = Input * 1.25 (25% premium)
    #   Completion = Input * 5 (typical ratio)
    if input_price > 0:
        estimates = {
            "cache_read": input_price * 0.1,
            "cache_creation": input_price * 1.25,
            "completion": input_price * 5,
        }
    elif completion > 0:
        # Only the completion price is known: derive the input price from it.
        estimated_input = completion / 5
        estimates = {
            "input": estimated_input,
            "cache_read": estimated_input * 0.1,
            "cache_creation": estimated_input * 1.25,
        }
    else:
        estimates = {}

    for kind, estimate in estimates.items():
        if not result[kind]["found"]:
            result[kind]["value"] = estimate
    return result, model_hint
def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
    """Handle a row selection in the leaderboard table.

    Returns a 9-tuple of values/updates: folder, name, a visibility update,
    four price-field updates (Input, Cache Read, Cache Creation, Completion),
    the model hint, and the default tokenizer overhead. Price labels are
    prefixed with ✅ when the price came from litellm and with ❌ otherwise
    ("(est.)" is appended when an estimate is shown).
    """
    price_fields = (
        ("input", "Input"),
        ("cache_read", "Cache Read"),
        ("cache_creation", "Cache Creation"),
        ("completion", "Completion"),
    )

    if evt.index is None:
        # Nothing selected: reset every output.
        return (
            "", "",
            gr.update(visible=False),
            *(gr.update(value=0, label=label) for _, label in price_fields),
            "",
            gr.update(value=1.0),
        )

    selected = evt.index
    row_idx = selected[0] if isinstance(selected, (list, tuple)) else selected
    row = df.iloc[row_idx]
    folder = row["folder"]
    name = row["name"]
    prices_dict, model_hint = get_prices_for_folder(folder)
    default_overhead = get_default_overhead(model_hint)

    def price_update(price_info, label):
        value = price_info["value"]
        if price_info["found"]:
            return gr.update(value=value, label=f"✅ {label}")
        if value > 0:
            return gr.update(value=value, label=f"❌ {label} (est.)")
        return gr.update(value=0, label=f"❌ {label}")

    return (
        folder, name,
        gr.update(visible=True),
        *(price_update(prices_dict[key], label) for key, label in price_fields),
        model_hint,
        gr.update(value=default_overhead),
    )
def create_routed_token_chart(base_tokens: dict, additional_models: list):
    """
    Build a grouped bar chart of token counts (in millions) per token type,
    with one bar series for the base model and one per routing model.

    Args:
        base_tokens: dict with uncached_input, cache_read, cache_creation, completion
        additional_models: list of (model_name, tokens_dict) tuples

    Returns: plotly Figure with a totals annotation in the top-left corner.
    """
    import plotly.graph_objects as go

    token_keys = ("uncached_input", "cache_read", "cache_creation", "completion")
    categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
    palette = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]

    # One (label, counts) entry per bar series; the base model always comes first.
    series = [("Base Model", base_tokens)]
    for idx, (model_name, tokens) in enumerate(additional_models):
        series.append((model_name or f"Model {idx+1}", tokens))

    fig = go.Figure()
    summary = []
    for position, (label, counts) in enumerate(series):
        per_type = [counts.get(key, 0) for key in token_keys]
        summary.append((label, sum(per_type)))
        fig.add_trace(
            go.Bar(
                name=label,
                x=categories,
                y=[amount / 1e6 for amount in per_type],
                marker_color=palette[position % len(palette)],
            )
        )

    overall = sum(total for _, total in summary)
    note_lines = [f"<b>Total: {overall/1e6:.2f}M</b>"]
    note_lines.extend(f"{label}: {total/1e6:.2f}M" for label, total in summary)

    fig.update_layout(
        title="Tokens by Type (per Model)",
        yaxis_title="Tokens (M)",
        barmode="group",
        margin=dict(l=40, r=40, t=80, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )
    fig.add_annotation(
        text="<br>".join(note_lines),
        xref="paper", yref="paper",
        x=0.02, y=0.98, showarrow=False,
        font=dict(size=11),
        align="left",
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="gray",
        borderwidth=1,
    )
    return fig
def create_routed_cost_chart(base_costs: dict, additional_models: list):
    """
    Build a grouped bar chart of dollar cost per token type, with one bar
    series for the base model and one per routing model.

    Args:
        base_costs: dict with uncached_input, cache_read, cache_creation, completion
        additional_models: list of (model_name, costs_dict) tuples

    Returns: plotly Figure with a totals annotation in the top-left corner.
    """
    import plotly.graph_objects as go

    cost_keys = ("uncached_input", "cache_read", "cache_creation", "completion")
    categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
    palette = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]

    # One (label, costs) entry per bar series; the base model always comes first.
    series = [("Base Model", base_costs)]
    for idx, (model_name, costs) in enumerate(additional_models):
        series.append((model_name or f"Model {idx+1}", costs))

    fig = go.Figure()
    summary = []
    for position, (label, amounts) in enumerate(series):
        per_type = [amounts.get(key, 0) for key in cost_keys]
        summary.append((label, sum(per_type)))
        fig.add_trace(
            go.Bar(
                name=label,
                x=categories,
                y=per_type,
                marker_color=palette[position % len(palette)],
            )
        )

    overall = sum(total for _, total in summary)
    note_lines = [f"<b>Total: ${overall:.2f}</b>"]
    note_lines.extend(f"{label}: ${total:.2f}" for label, total in summary)

    fig.update_layout(
        title="Cost by Type (per Model) ($)",
        yaxis_title="Cost ($)",
        barmode="group",
        margin=dict(l=40, r=40, t=80, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )
    fig.add_annotation(
        text="<br>".join(note_lines),
        xref="paper", yref="paper",
        x=0.02, y=0.98, showarrow=False,
        font=dict(size=11),
        align="left",
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="gray",
        borderwidth=1,
    )
    return fig
def build_app():
"""Build the Gradio Blocks UI (leaderboard, analysis plots, pricing and routing controls) and return it."""
leaderboard_df = get_bash_only_df()
with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
# Filled by load_and_analyze; read by run_routing and the recalculation callbacks.
trajectories_state = gr.State(None)
gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard")
gr.Markdown("Select a model to use as base for cost analysis")
with gr.Row():
with gr.Column(scale=3):
leaderboard_table = gr.Dataframe(
value=leaderboard_df,
label="Bash-Only Leaderboard",
interactive=False,
wrap=True,
)
# Analysis plots: hidden until load_and_analyze makes the section visible.
with gr.Column(visible=False) as analysis_section:
gr.Markdown("## 📊 Trajectory Analysis")
with gr.Row():
plot_steps = gr.Plot(label="API Calls Distribution")
plot_cost = gr.Plot(label="Cost Distribution")
with gr.Row():
plot_tokens = gr.Plot(label="Token Usage by Type")
plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)")
with gr.Row():
plot_stacked = gr.Plot(label="Tokens per Trajectory")
plot_cost_breakdown = gr.Plot(label="Cost per Trajectory ($)")
# Routing comparison plots: shown by run_routing once it has results.
with gr.Row(visible=False) as routing_plots_row:
routing_tokens_plot = gr.Plot(label="Tokens by Type (per Model)")
routing_cost_plot = gr.Plot(label="Cost by Type (per Model)")
with gr.Column(scale=1):
selected_folder = gr.State("")
gr.Markdown("### Selected Model")
selected_name = gr.Textbox(label="Model Name", interactive=False)
# Made visible by on_row_select when a leaderboard row is chosen.
analyze_btn = gr.Button("📊 Load & Analyze", visible=False, variant="primary")
download_status = gr.Textbox(label="Status", interactive=False, lines=3)
gr.Markdown("---")
gr.Markdown("### 💰 Token Prices ($/1M) · *[litellm](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)*")
detected_model = gr.Textbox(label="Detected Model", interactive=False)
# Base-model prices; values and ✅/❌ labels are set by on_row_select.
with gr.Row():
price_input = gr.Number(label="Input", value=0, precision=2, scale=1)
price_cache_read = gr.Number(label="Cache Read", value=0, precision=2, scale=1)
price_cache_creation = gr.Number(label="Cache Creation", value=0, precision=2, scale=1)
price_completion = gr.Number(label="Completion", value=0, precision=2, scale=1)
gr.Markdown("---")
gr.Markdown("### 📊 Token Count Source")
token_source = gr.Radio(
choices=["Metadata", "Calculated"],
value="Metadata",
)
# The next two controls are only shown when the token source is "Calculated".
thinking_overhead = gr.Number(
label="🔢 Tokenizer Overhead",
value=1.21,
precision=2,
info="Multiplier for Calculated tokens (tiktoken → native)",
visible=False,
)
use_cache = gr.Checkbox(
label="Use Cache",
value=True,
info="If disabled, all tokens are Uncached Input or Completion",
visible=False,
)
gr.Markdown("---")
# Made visible by load_and_analyze after trajectories load successfully.
add_routing_btn = gr.Button("➕ Add Routing", variant="primary", visible=False)
with gr.Column(visible=False) as routing_section:
gr.Markdown("### 🔀 Routing Models")
# These strings are matched verbatim by on_strategy_change and run_routing.
STRATEGY_CHOICES = [
"Replace on random steps",
"Replace every step k",
"Replace part of trajectory",
]
# --- Routing model 1 ---
with gr.Column():
with gr.Group():
gr.Markdown("#### Route to Model 1")
routing_model_1 = gr.Dropdown(
label="Model (type 3+ chars to search)",
choices=[],
allow_custom_value=True,
interactive=True,
)
with gr.Row():
routing_price_1_input = gr.Number(label="Input", precision=3, scale=1)
routing_price_1_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
routing_price_1_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
routing_price_1_completion = gr.Number(label="Completion", precision=3, scale=1)
strategy_1 = gr.Dropdown(
label="Strategy",
choices=STRATEGY_CHOICES,
value="Replace on random steps",
interactive=True,
)
# One parameter group per strategy; on_strategy_change toggles their visibility.
with gr.Row(visible=True) as random_params_1:
random_pct_1 = gr.Number(label="Percentage (%)", value=50, minimum=0, maximum=100, precision=0, interactive=True)
with gr.Row(visible=False) as every_k_params_1:
step_k_1 = gr.Number(label="k", value=2, minimum=1, precision=0, interactive=True)
with gr.Column(visible=False) as part_params_1:
part_mode_1 = gr.Radio(
choices=["Indexes", "Percentages"],
value="Percentages",
label="Mode",
interactive=True,
)
with gr.Row():
start_step_1 = gr.Number(label="Start", value=0, minimum=0, precision=0, interactive=True)
end_step_1 = gr.Number(label="End", value=30, minimum=0, precision=0, interactive=True)
# Shown by on_routing_model_1_select once model 1 has a value.
add_model_2_btn = gr.Button("+ Add another model", size="sm", visible=False)
# --- Routing model 2 (revealed by add_model_2_btn) ---
with gr.Column(visible=False) as routing_block_2:
with gr.Group():
gr.Markdown("#### Route to Model 2")
routing_model_2 = gr.Dropdown(
label="Model (type 3+ chars to search)",
choices=[],
allow_custom_value=True,
interactive=True,
)
with gr.Row():
routing_price_2_input = gr.Number(label="Input", precision=3, scale=1)
routing_price_2_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
routing_price_2_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
routing_price_2_completion = gr.Number(label="Completion", precision=3, scale=1)
strategy_2 = gr.Dropdown(
label="Strategy",
choices=STRATEGY_CHOICES,
value="Replace on random steps",
interactive=True,
)
with gr.Row(visible=True) as random_params_2:
random_pct_2 = gr.Number(label="Percentage (%)", value=50, minimum=0, maximum=100, precision=0, interactive=True)
with gr.Row(visible=False) as every_k_params_2:
step_k_2 = gr.Number(label="k", value=2, minimum=1, precision=0, interactive=True)
with gr.Column(visible=False) as part_params_2:
part_mode_2 = gr.Radio(
choices=["Indexes", "Percentages"],
value="Percentages",
label="Mode",
interactive=True,
)
with gr.Row():
start_step_2 = gr.Number(label="Start", value=0, minimum=0, precision=0, interactive=True)
end_step_2 = gr.Number(label="End", value=30, minimum=0, precision=0, interactive=True)
# Shown by on_routing_model_2_select once model 2 has a value.
add_model_3_btn = gr.Button("+ Add another model", size="sm", visible=False)
# --- Routing model 3 (revealed by add_model_3_btn) ---
with gr.Column(visible=False) as routing_block_3:
with gr.Group():
gr.Markdown("#### Route to Model 3")
routing_model_3 = gr.Dropdown(
label="Model (type 3+ chars to search)",
choices=[],
allow_custom_value=True,
interactive=True,
)
with gr.Row():
routing_price_3_input = gr.Number(label="Input", precision=3, scale=1)
routing_price_3_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
routing_price_3_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
routing_price_3_completion = gr.Number(label="Completion", precision=3, scale=1)
strategy_3 = gr.Dropdown(
label="Strategy",
choices=STRATEGY_CHOICES,
value="Replace on random steps",
interactive=True,
)
with gr.Row(visible=True) as random_params_3:
random_pct_3 = gr.Number(label="Percentage (%)", value=50, minimum=0, maximum=100, precision=0, interactive=True)
with gr.Row(visible=False) as every_k_params_3:
step_k_3 = gr.Number(label="k", value=2, minimum=1, precision=0, interactive=True)
with gr.Column(visible=False) as part_params_3:
part_mode_3 = gr.Radio(
choices=["Indexes", "Percentages"],
value="Percentages",
label="Mode",
interactive=True,
)
with gr.Row():
start_step_3 = gr.Number(label="Start", value=0, minimum=0, precision=0, interactive=True)
end_step_3 = gr.Number(label="End", value=30, minimum=0, precision=0, interactive=True)
gr.Markdown("---")
# Starts disabled; on_routing_model_1_select enables it once model 1 has a value.
route_btn = gr.Button("🚀 Let's ROUTE!!", variant="primary", size="lg", interactive=False)
routing_result = gr.Markdown(visible=False)
def on_strategy_change(strategy):
    """Return three visibility updates (random / every-k / part-of-trajectory
    parameter groups) so that only the group for `strategy` is shown."""
    group_order = (
        "Replace on random steps",
        "Replace every step k",
        "Replace part of trajectory",
    )
    return tuple(gr.update(visible=(strategy == option)) for option in group_order)
def toggle_routing_section():
    """Return an update that makes the routing configuration panel visible."""
    reveal = gr.update(visible=True)
    return reveal
# "Add Routing" reveals the routing configuration section.
add_routing_btn.click(
fn=toggle_routing_section,
outputs=[routing_section],
)
# Each strategy dropdown switches which of its three parameter groups is visible.
strategy_1.change(
fn=on_strategy_change,
inputs=[strategy_1],
outputs=[random_params_1, every_k_params_1, part_params_1],
)
strategy_2.change(
fn=on_strategy_change,
inputs=[strategy_2],
outputs=[random_params_2, every_k_params_2, part_params_2],
)
strategy_3.change(
fn=on_strategy_change,
inputs=[strategy_3],
outputs=[random_params_3, every_k_params_3, part_params_3],
)
def filter_models(query):
    """Return a dropdown update listing up to 50 model names that contain
    `query` (case-insensitive); queries shorter than 3 chars clear the list."""
    if not query or len(query) < 3:
        return gr.update(choices=[])

    needle = query.lower()
    matches = []
    for candidate in get_litellm_model_list():
        if needle in candidate.lower():
            matches.append(candidate)
    return gr.update(choices=matches[:50])
# Typing into a routing-model dropdown refreshes that dropdown's own choices.
routing_model_1.input(fn=filter_models, inputs=[routing_model_1], outputs=[routing_model_1])
routing_model_2.input(fn=filter_models, inputs=[routing_model_2], outputs=[routing_model_2])
routing_model_3.input(fn=filter_models, inputs=[routing_model_3], outputs=[routing_model_3])
def get_routing_prices_with_labels(model_name):
    """Get all 4 prices ($ per 1M tokens) for a routing model with found/estimated labels.

    Returns a 4-tuple of gr.update objects in the order
    (input, cache read, cache creation, completion). Label convention,
    matching the base-model price fields:
        "✅ <name>"          price present in the litellm table
        "❌ <name> (est.)"   price missing, estimated from the input price
        "❌ <name>"          price missing and no estimate available
    An empty `model_name` resets all four fields to 0 with plain labels.
    """
    # (label, litellm key, fallback ratio relative to the input price or None)
    fields = (
        ("Input", "input_cost_per_token", None),
        ("Cache Read", "cache_read_input_token_cost", 0.1),
        ("Cache Creation", "cache_creation_input_token_cost", 1.25),
        ("Completion", "output_cost_per_token", None),
    )

    if not model_name:
        return tuple(gr.update(value=0, label=title) for title, _, _ in fields)

    model_prices = get_litellm_prices().get(model_name) or {}

    def per_million(key):
        # `or 0` also covers a key that is present with an explicit null,
        # which `.get(key, 0)` would pass through and crash the multiply.
        return (model_prices.get(key) or 0) * 1e6

    input_price = per_million("input_cost_per_token")

    updates = []
    for title, key, ratio in fields:
        value = per_million(key)
        if value > 0:
            label = f"✅ {title}"
        elif ratio is not None and input_price > 0:
            # Same fallback ratios as before; now flagged as an estimate.
            value = input_price * ratio
            label = f"❌ {title} (est.)"
        else:
            label = f"❌ {title}"
        updates.append(gr.update(value=value, label=label))
    return tuple(updates)
def on_routing_model_1_select(model_name):
    """Return the four price updates for model 1, plus updates that show the
    "add model 2" button and enable the route button when a model is set."""
    has_model = bool(model_name)
    price_updates = get_routing_prices_with_labels(model_name)
    return (
        *price_updates,
        gr.update(visible=has_model),
        gr.update(interactive=has_model),
    )
def on_routing_model_2_select(model_name):
    """Return the four price updates for model 2, plus an update that shows
    the "add model 3" button when a model is set."""
    has_model = bool(model_name)
    price_updates = get_routing_prices_with_labels(model_name)
    return (*price_updates, gr.update(visible=has_model))
def on_routing_model_3_select(model_name):
    """Return the four price updates for model 3 (no follow-up button)."""
    price_updates = get_routing_prices_with_labels(model_name)
    return price_updates
# Selecting model 1 fills its prices, offers "add model 2" and enables the route button.
routing_model_1.change(
fn=on_routing_model_1_select,
inputs=[routing_model_1],
outputs=[routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion, add_model_2_btn, route_btn],
)
# Clicking "add model 2" shows block 2 and hides the button itself.
add_model_2_btn.click(
fn=lambda: (gr.update(visible=True), gr.update(visible=False)),
outputs=[routing_block_2, add_model_2_btn],
)
# Selecting model 2 fills its prices and offers "add model 3".
routing_model_2.change(
fn=on_routing_model_2_select,
inputs=[routing_model_2],
outputs=[routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion, add_model_3_btn],
)
# Clicking "add model 3" shows block 3 and hides the button itself.
add_model_3_btn.click(
fn=lambda: (gr.update(visible=True), gr.update(visible=False)),
outputs=[routing_block_3, add_model_3_btn],
)
# Selecting model 3 only fills its prices.
routing_model_3.change(
fn=on_routing_model_3_select,
inputs=[routing_model_3],
outputs=[routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion],
)
def run_routing(
    state_data,
    base_input, base_cache_read, base_cache_creation, base_completion,
    routing_model_1_val, r1_input, r1_cache_read, r1_cache_creation, r1_completion,
    strategy_1_val, random_pct_1_val, step_k_1_val, part_mode_1_val, start_1_val, end_1_val,
    routing_model_2_val, r2_input, r2_cache_read, r2_cache_creation, r2_completion,
    strategy_2_val, random_pct_2_val, step_k_2_val, part_mode_2_val, start_2_val, end_2_val,
    routing_model_3_val, r3_input, r3_cache_read, r3_cache_creation, r3_completion,
    strategy_3_val, random_pct_3_val, step_k_3_val, part_mode_3_val, start_3_val, end_3_val,
    source, overhead, with_cache
):
    """Simulate routing part of every trajectory to up to three other models.

    Generator used as a Gradio callback. Each yielded 4-tuple is
    (result markdown update, routing plot row visibility, tokens figure,
    cost figure). On a validation problem a single error tuple is yielded.

    Args:
        state_data: dict produced by load_and_analyze; the "steps" and
            "calculated" entries are read here.
        base_*: base-model prices in $ per 1M tokens.
        routing_model_N_val, rN_*: name and prices of routing model N;
            a slot with an empty name is ignored (model 1 is mandatory).
        strategy_N_val .. end_N_val: routing strategy and its parameters.
        source: "Metadata" or "Calculated" token source.
        overhead: multiplier applied to calculated completion tokens.
        with_cache: when false, the baseline cost is computed without cache.

    Numeric fields that arrive as None (an emptied number box) fall back to a
    default instead of raising TypeError: prices -> 0, overhead -> 1,
    percentage -> 50, k -> 2, start -> 0, end -> 30.
    """
    BASE_MODEL = "__base__"
    TOKEN_KEYS = ("uncached_input", "cache_read", "cache_creation", "completion")
    PRICE_KEYS = ("input", "cache_read", "cache_creation", "completion")
    # Token bucket -> key of the price that applies to it.
    PRICE_FOR = dict(zip(TOKEN_KEYS, PRICE_KEYS))

    def num(value, default=0):
        return default if value is None else value

    def failure(message):
        # Error layout: show the message, hide the routing plots, clear both figures.
        return (gr.update(visible=True, value=message), gr.update(visible=False), None, None)

    if state_data is None:
        yield failure("❌ No trajectories loaded. Click 'Load & Analyze' first.")
        return
    if not routing_model_1_val:
        yield failure("❌ Please select at least one routing model.")
        return
    trajectory_steps = state_data.get("steps", {})
    if not trajectory_steps:
        yield failure("❌ No trajectory steps data available.")
        return

    overhead = num(overhead, 1.0)
    base_prices = dict(zip(
        PRICE_KEYS,
        (num(base_input), num(base_cache_read), num(base_cache_creation), num(base_completion)),
    ))

    # Baseline ("base model only") cost, preferably from the calculated DataFrame.
    # NOTE(review): the overhead is applied here regardless of `source`, while
    # the per-step completion below is scaled only for "Calculated" - confirm
    # that this asymmetry is intended.
    total_original_cost_from_df = None
    df_calc = state_data.get("calculated")
    if df_calc is not None and not df_calc.empty:
        df_for_cost = apply_thinking_overhead(df_calc.copy(), overhead)
        if not with_cache:
            df_for_cost = apply_no_cache(df_for_cost)
        uncached = (
            df_for_cost["prompt_tokens"]
            - df_for_cost["cache_read_tokens"]
            - df_for_cost["cache_creation_tokens"]
        ).clip(lower=0)
        total_original_cost_from_df = (
            uncached.sum() * base_prices["input"] / 1e6 +
            df_for_cost["cache_read_tokens"].sum() * base_prices["cache_read"] / 1e6 +
            df_for_cost["cache_creation_tokens"].sum() * base_prices["cache_creation"] / 1e6 +
            df_for_cost["completion_tokens"].sum() * base_prices["completion"] / 1e6
        )

    # The three UI slots share one shape, so validate and collect them in a loop.
    slots = (
        (routing_model_1_val, (r1_input, r1_cache_read, r1_cache_creation, r1_completion),
         strategy_1_val, random_pct_1_val, step_k_1_val, part_mode_1_val, start_1_val, end_1_val),
        (routing_model_2_val, (r2_input, r2_cache_read, r2_cache_creation, r2_completion),
         strategy_2_val, random_pct_2_val, step_k_2_val, part_mode_2_val, start_2_val, end_2_val),
        (routing_model_3_val, (r3_input, r3_cache_read, r3_cache_creation, r3_completion),
         strategy_3_val, random_pct_3_val, step_k_3_val, part_mode_3_val, start_3_val, end_3_val),
    )
    routing_models = []
    for slot_no, (name, prices, strategy, random_pct, step_k, part_mode, start_val, end_val) in enumerate(slots, start=1):
        if not name:
            continue
        params = {}
        if strategy == "Replace on random steps":
            params["percentage"] = num(random_pct, 50)
        elif strategy == "Replace every step k":
            params["k"] = num(step_k, 2)
        elif strategy == "Replace part of trajectory":
            start_val, end_val = num(start_val, 0), num(end_val, 30)
            if start_val >= end_val:
                yield failure(f"❌ Model {slot_no}: Start must be less than End")
                return
            params["mode"] = part_mode
            params["start"] = start_val
            params["end"] = end_val
        routing_models.append({
            "name": name,
            "prices": dict(zip(PRICE_KEYS, (num(p) for p in prices))),
            "strategy": strategy,
            "params": params,
        })

    routing_keys = [f"__routing_{i}__" for i in range(len(routing_models))]
    model_keys = [BASE_MODEL] + routing_keys
    all_tokens = {key: dict.fromkeys(TOKEN_KEYS, 0) for key in model_keys}
    total_original_tokens = dict.fromkeys(TOKEN_KEYS, 0)
    completion_scale = overhead if source == "Calculated" else 1
    # The base-only token pass is needed only when no DataFrame baseline exists.
    need_original_tokens = total_original_cost_from_df is None

    def as_step(step, model):
        return {
            "model": model,
            "system_user": step.get("system_user", 0),
            "completion": int(step.get("completion", 0) * completion_scale),
            "observation": step.get("observation"),
        }

    for steps in trajectory_steps.values():
        if not steps:
            continue
        total_steps = len(steps)
        routed_sets = [
            get_routed_steps(total_steps, rm["strategy"], rm["params"])
            for rm in routing_models
        ]
        modified_steps = []
        for i, step in enumerate(steps):
            # When several models claim a step, the earliest slot wins.
            owner = next(
                (routing_keys[j] for j, routed in enumerate(routed_sets) if i in routed),
                BASE_MODEL,
            )
            modified_steps.append(as_step(step, owner))
        model_totals = calculate_routing_tokens(modified_steps)
        for key in model_keys:
            totals = model_totals.get(key, {})
            for token_key in TOKEN_KEYS:
                all_tokens[key][token_key] += totals.get(token_key, 0)

        if need_original_tokens:
            original_totals = calculate_routing_tokens([as_step(step, BASE_MODEL) for step in steps])
            orig = original_totals.get(BASE_MODEL, {})
            for token_key in TOKEN_KEYS:
                total_original_tokens[token_key] += orig.get(token_key, 0)

    def tokens_to_costs(tokens: dict, prices: dict) -> dict:
        return {k: tokens[k] * prices[PRICE_FOR[k]] / 1e6 for k in TOKEN_KEYS}

    def calc_cost(tokens: dict, prices: dict) -> float:
        return sum(tokens_to_costs(tokens, prices).values())

    total_base_tokens = all_tokens[BASE_MODEL]
    base_costs = tokens_to_costs(total_base_tokens, base_prices)
    total_base_cost = calc_cost(total_base_tokens, base_prices)

    routing_costs_list = []
    total_routing_cost = 0
    for key, rm in zip(routing_keys, routing_models):
        tokens = all_tokens[key]
        cost = calc_cost(tokens, rm["prices"])
        routing_costs_list.append({
            "name": rm["name"],
            "tokens": tokens,
            "costs": tokens_to_costs(tokens, rm["prices"]),
            "cost": cost,
        })
        total_routing_cost += cost

    if total_original_cost_from_df is not None:
        total_original_cost = total_original_cost_from_df
    else:
        total_original_cost = calc_cost(total_original_tokens, base_prices)
    total_routed_cost = total_base_cost + total_routing_cost
    savings = total_original_cost - total_routed_cost
    savings_pct = (savings / total_original_cost * 100) if total_original_cost > 0 else 0

    result_lines = [
        "## 🚀 Routing Results",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        f"| **Original Cost (base model only)** | ${total_original_cost:.2f} |",
        f"| **Routed Cost** | ${total_routed_cost:.2f} |",
        f"| ↳ Base model portion | ${total_base_cost:.2f} |",
    ]
    result_lines.extend(f"| ↳ {rc['name']} | ${rc['cost']:.2f} |" for rc in routing_costs_list)
    result_lines.append(f"| **Savings** | ${savings:.2f} ({savings_pct:+.1f}%) |")
    result_text = "\n".join(result_lines)

    additional_token_models = [(rc["name"], rc["tokens"]) for rc in routing_costs_list]
    additional_cost_models = [(rc["name"], rc["costs"]) for rc in routing_costs_list]

    # Intermediate status while the figures are being built.
    yield (
        gr.update(visible=True, value="⏳ Creating charts..."),
        gr.update(visible=True),
        None,
        None,
    )
    tokens_chart = create_routed_token_chart(total_base_tokens, additional_token_models)
    cost_chart = create_routed_cost_chart(base_costs, additional_cost_models)
    yield (
        gr.update(visible=True, value=result_text),
        gr.update(visible=True),
        tokens_chart,
        cost_chart,
    )
# The input order below must match run_routing's positional parameters exactly.
route_btn.click(
fn=run_routing,
inputs=[
trajectories_state,
price_input, price_cache_read, price_cache_creation, price_completion,
routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion,
strategy_1, random_pct_1, step_k_1, part_mode_1, start_step_1, end_step_1,
routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion,
strategy_2, random_pct_2, step_k_2, part_mode_2, start_step_2, end_step_2,
routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion,
strategy_3, random_pct_3, step_k_3, part_mode_3, start_step_3, end_step_3,
token_source, thinking_overhead, use_cache,
],
outputs=[routing_result, routing_plots_row, routing_tokens_plot, routing_cost_plot],
)
def update_calculated_options_visibility(source):
    """Return two visibility updates (overhead field, cache checkbox) that
    are shown only when the token source is "Calculated"."""
    show = source == "Calculated"
    return tuple(gr.update(visible=show) for _ in range(2))
# Switching the token source shows or hides the "Calculated"-only controls.
token_source.change(
fn=update_calculated_options_visibility,
inputs=[token_source],
outputs=[thinking_overhead, use_cache],
)
# Selecting a leaderboard row fills the model name, prices and overhead;
# the outputs follow the order of on_row_select's returned 9-tuple.
leaderboard_table.select(
fn=on_row_select,
inputs=[leaderboard_table],
outputs=[selected_folder, selected_name, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
)
def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache, progress=gr.Progress()):
    """Download (if needed), load and chart the trajectories of `folder`.

    Generator used as a Gradio callback. Each yielded 10-tuple is
    (status text, analysis section visibility, six figures, state data,
    "Add Routing" button visibility). Intermediate yields report progress;
    the final one carries the figures and the state dict with the keys
    "meta", "calculated", "folder" and "steps".

    Args:
        folder: trajectory folder of the selected leaderboard entry.
        input_price .. completion_price: base-model prices ($ per 1M tokens).
        source: "Metadata" or "Calculated" token source.
        overhead: multiplier applied to calculated tokens.
        with_cache: when false, calculated tokens are treated as uncached.
        progress: Gradio progress tracker (not used directly here).
    """

    def status(message, section_visible=False):
        # Progress/error layout: no figures, no state, routing button hidden.
        return (
            message,
            gr.update(visible=section_visible),
            None, None, None, None, None, None,
            None,
            gr.update(visible=False),
        )

    if not folder:
        yield status("")
        return

    if not check_trajectories_downloaded(folder):
        yield status("⏳ Downloading trajectories...")
        download_status_text, _ = download_trajectories_from_s3(folder)
        if "❌" in download_status_text:
            yield status(download_status_text)
            return

    yield status("⏳ Loading trajectories...", section_visible=True)

    df_meta = load_all_trajectories(folder)
    df_calc = load_all_trajectories_calculated(folder)
    # Copy the per-trajectory metadata columns positionally, but only when
    # both frames have rows: with an empty frame the copy either raises or
    # adds rows to an empty frame, and the "no trajectories" message below
    # would never be reached.
    if not df_meta.empty and not df_calc.empty:
        df_calc["api_calls"] = df_meta["api_calls"].values
        df_calc["instance_cost"] = df_meta["instance_cost"].values
    trajectory_steps = load_all_trajectory_steps(folder)
    state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps}

    if source == "Metadata":
        df = df_meta
    else:
        df = apply_thinking_overhead(df_calc.copy(), overhead)
        if not with_cache:
            df = apply_no_cache(df)

    if df.empty:
        yield status("❌ No trajectories found")
        return

    fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked = create_basic_histograms(
        df, input_price, cache_read_price, cache_creation_price, completion_price
    )
    fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price)

    yield (
        f"✅ Loaded {len(df)} trajectories",
        gr.update(visible=True),
        fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown,
        state_data,
        gr.update(visible=True),
    )
# The outputs follow the order of the 10-tuples yielded by load_and_analyze.
analyze_btn.click(
fn=load_and_analyze,
inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache],
outputs=[
download_status,
analysis_section,
plot_steps, plot_cost, plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown,
trajectories_state,
add_routing_btn,
],
)
def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
    """Rebuild the two price-dependent figures (cost by token type, cost
    breakdown per trajectory) after a price change.

    Returns (None, None) when nothing is loaded or the selected frame is empty.
    """
    nothing = (None, None)
    if state_data is None:
        return nothing

    if source != "Metadata":
        frame = apply_thinking_overhead(state_data["calculated"].copy(), overhead)
        if not with_cache:
            frame = apply_no_cache(frame)
    else:
        frame = state_data["meta"]

    if frame.empty:
        return nothing

    price_args = (input_price, cache_read_price, cache_creation_price, completion_price)
    by_type_fig = create_cost_by_type_chart(frame, *price_args)
    breakdown_fig = create_cost_breakdown(frame, *price_args)
    return by_type_fig, breakdown_fig
# Any change to a base price re-renders only the two cost figures.
price_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache]
price_outputs = [plot_tokens_cost, plot_cost_breakdown]
price_input.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
price_cache_read.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
price_cache_creation.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
price_completion.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
def on_source_change(state_data, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
    """Rebuild the four token-dependent figures (tokens by type, cost by
    type, tokens per trajectory, cost breakdown) when the token source,
    overhead or cache setting changes.

    Returns four Nones when nothing is loaded or the selected frame is empty.
    """
    nothing = (None, None, None, None)
    if state_data is None:
        return nothing

    if source != "Metadata":
        frame = apply_thinking_overhead(state_data["calculated"].copy(), overhead)
        if not with_cache:
            frame = apply_no_cache(frame)
    else:
        frame = state_data["meta"]

    if frame.empty:
        return nothing

    price_args = (input_price, cache_read_price, cache_creation_price, completion_price)
    tokens_fig, tokens_cost_fig, stacked_fig = create_token_charts(frame, *price_args)
    breakdown_fig = create_cost_breakdown(frame, *price_args)
    return tokens_fig, tokens_cost_fig, stacked_fig, breakdown_fig
# Token source, overhead and cache toggle all re-render the same four figures.
source_change_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache]
source_change_outputs = [plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown]
token_source.change(
fn=on_source_change,
inputs=source_change_inputs,
outputs=source_change_outputs,
)
thinking_overhead.change(
fn=on_source_change,
inputs=source_change_inputs,
outputs=source_change_outputs,
)
use_cache.change(
fn=on_source_change,
inputs=source_change_inputs,
outputs=source_change_outputs,
)
# Hand the assembled Blocks app back to the caller of build_app.
return app
# Script entry point: build the Blocks app, call queue() on it, then launch it.
if __name__ == "__main__":
app = build_app()
app.queue()
app.launch()