import json
import logging
import os
import random
import re
import subprocess
import sys
from pathlib import Path
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import tiktoken
from src.download_swebench_leaderboard import download_leaderboard
# Cache of tiktoken encodings, keyed by encoding name; filled lazily by get_tokenizer().
_tokenizer_cache = {}
# On-disk locations for downloaded data and cached artifacts (relative to the working directory).
DATA_DIR = Path("data")
TRAJS_DIR = DATA_DIR / "swebench_trajs"
LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"
# Remote sources: trajectories are copied from this S3 prefix, prices from the litellm repository.
S3_BUCKET = "s3://swe-bench-experiments/bash-only"
LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
LOG_DIR = Path("logs")
# Model identifiers offered as shortcuts. Entries are plain strings except the last one,
# which is a 2-tuple.
# NOTE(review): the tuple looks like (model id, short label) — confirm the order matches
# what the consumer of this list expects.
QUICK_SELECT_MODELS = [
    "openrouter/anthropic/claude-opus-4.5",
    "openrouter/anthropic/claude-sonnet-4.5",
    "openrouter/google/gemini-3-pro-preview",
    "openrouter/openai/gpt-5-codex",
    "openrouter/openai/gpt-oss-120b",
    "deepinfra/Qwen/Qwen3-14B",
    "deepinfra/Qwen/Qwen3-32B",
    "deepinfra/Qwen/Qwen3-73B",
    "deepinfra/Qwen/Qwen3-235B-A22B",
    "deepinfra/Qwen/Qwen3-30B-A3B",
    ("deepinfra/Qwen/Qwen3-Coder-480B-A35B-Instruct", "Qwen3-Coder-480B-A35B"),
]
# Import-time side effect: the log directory must exist before FileHandler opens LOG_FILE.
LOG_DIR.mkdir(parents=True, exist_ok=True)
LOG_FILE = LOG_DIR / "app.log"
# Log to both the file and stdout; force=True replaces any handlers already attached
# to the root logger.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
    force=True,
)
def _log_unhandled(exc_type, exc_value, exc_traceback):
    """Exception hook that sends uncaught exceptions to the logging system.

    KeyboardInterrupt is handed to the interpreter's default hook so that
    Ctrl-C keeps its normal behavior; everything else is logged at ERROR
    level together with its traceback.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        logging.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))
        return
    sys.__excepthook__(exc_type, exc_value, exc_traceback)
# Route exceptions that reach the top level through _log_unhandled.
sys.excepthook = _log_unhandled
# In-process memoization used by the loaders below; nothing here is persisted.
_litellm_prices_cache = None  # raw litellm price table; None until first requested
_litellm_chat_prices_cache = None  # chat-mode subset of the price table; None until first requested
_trajectories_cache = {}  # folder -> DataFrame built by load_all_trajectories
_calculated_tokens_cache = {}  # "calculated_<folder>" -> DataFrame built by load_all_trajectories_calculated
_trajectory_steps_cache = {}  # "steps_<folder>" -> {instance_id: list of step dicts}
def calculate_routing_tokens(steps: list[dict]) -> dict:
    """
    Simulate prompt caching over a step sequence and total the tokens per model.

    Each model keeps its own simulated cache. On every step the model reads
    whatever it has cached so far, receives the rest of the conversation as
    uncached input, and then adds that input plus its completion to its cache.

    Args:
        steps: list of dicts with keys:
            - model: str (model name, required)
            - system_user: int (system/user prompt tokens; only read for step 0)
            - completion: int (generated tokens)
            - observation: int or None (environment response tokens)

    Returns:
        dict mapping model name to
        {cache_read, uncached_input, completion, observation, cache_creation}.
    """
    cached_per_model: dict = {}
    totals_per_model: dict = {}
    context_size = 0
    last_observation = 0

    for index, step in enumerate(steps):
        name = step["model"]
        first_prompt = step.get("system_user", 0)
        generated = step.get("completion", 0)
        observed = step.get("observation") or 0

        already_cached = cached_per_model.setdefault(name, 0)
        bucket = totals_per_model.setdefault(name, {
            "cache_read": 0,
            "uncached_input": 0,
            "completion": 0,
            "observation": 0,
            "cache_creation": 0,
        })

        if index:
            # Everything said so far (including the previous observation) that
            # this model has not cached yet.
            fresh_input = context_size + last_observation - already_cached
        else:
            fresh_input = first_prompt
        newly_cached = fresh_input + generated
        cached_per_model[name] = already_cached + newly_cached

        bucket["cache_read"] += already_cached
        bucket["uncached_input"] += fresh_input
        bucket["completion"] += generated
        bucket["observation"] += observed
        bucket["cache_creation"] += newly_cached

        context_size = already_cached + fresh_input + generated
        last_observation = observed

    return totals_per_model
def calculate_per_step_tokens(steps: list[dict]) -> list[dict]:
    """
    Simulate prompt caching with one shared cache and report tokens per step.

    Step 0 pays for ``system_user`` as uncached input. Every later step reads
    the whole cache and pays uncached input only for what was added to the
    conversation since (the previous observation). The uncached input plus
    the completion is then counted as cache creation.

    Returns:
        One dict per step:
        {step, cache_read, uncached_input, completion, cache_creation}.
    """
    rows = []
    cached_so_far = 0
    context_size = 0
    last_observation = 0

    for index, step in enumerate(steps):
        first_prompt = step.get("system_user", 0)
        generated = step.get("completion", 0)
        observed = step.get("observation") or 0

        read_now = cached_so_far
        if index:
            fresh_input = context_size + last_observation - read_now
        else:
            fresh_input = first_prompt
        written_now = fresh_input + generated

        rows.append({
            "step": index,
            "cache_read": read_now,
            "uncached_input": fresh_input,
            "completion": generated,
            "cache_creation": written_now,
        })

        cached_so_far = read_now + written_now
        context_size = read_now + fresh_input + generated
        last_observation = observed

    return rows
def _parse_usage_from_log_line(line: str) -> dict | None:
"""
Parse usage info from log line containing ModelResponse or similar format.
Returns dict with prompt_tokens, completion_tokens, cached_tokens, etc.
"""
if "usage=" not in line:
return None
result = {}
for field in ["completion_tokens", "prompt_tokens", "total_tokens"]:
match = re.search(rf'{field}=(\d+)', line)
if match:
result[field] = int(match.group(1))
cached_match = re.search(r'cached_tokens=(\d+)', line)
if cached_match:
result["cached_tokens"] = int(cached_match.group(1))
return result if result else None
def _parse_old_format_log(log_path: Path) -> list[dict]:
    """
    Extract per-step token usage from an old SWE-agent ``.info.log`` file.

    Every line containing ``usage=Usage(`` that yields usage data becomes one
    step, numbered in order of appearance. ``cache_creation`` is always 0
    because it is not read from the log. If reading or parsing fails, the
    error is logged at debug level and the steps collected so far are returned.
    """
    rows: list[dict] = []
    try:
        with open(log_path, "r", encoding="utf-8") as handle:
            usage_lines = (text for text in handle if "usage=Usage(" in text)
            for parsed in map(_parse_usage_from_log_line, usage_lines):
                if not parsed:
                    continue
                prompt = parsed.get("prompt_tokens", 0)
                generated = parsed.get("completion_tokens", 0)
                cached = parsed.get("cached_tokens", 0)
                rows.append({
                    "step": len(rows),
                    "cache_read": cached,
                    "uncached_input": max(0, prompt - cached),
                    "completion": generated,
                    "cache_creation": 0,
                })
    except Exception as e:
        logging.debug("Error parsing log file %s: %s", log_path, e)
    return rows
def parse_trajectory_metadata_per_step(traj_path: Path) -> list[dict]:
    """
    Read the recorded usage of every assistant message in a trajectory file.

    Usage is taken from ``message["usage"]`` when that key exists, otherwise
    from ``message["extra"]["response"]["usage"]``. Assistant messages with no
    usage data are skipped and do not consume a step number. When no message
    yields usage at all, a sibling ``.info.log`` file (old format) is parsed
    instead, if one exists.

    Returns:
        One dict per step:
        {step, cache_read, uncached_input, completion, cache_creation}.
    """
    with open(traj_path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    rows = []
    for message in payload.get("messages", []):
        if message.get("role") != "assistant":
            continue

        if "usage" in message:
            usage = message["usage"]
        else:
            usage = None
            extra = message.get("extra")
            if isinstance(extra, dict):
                response = extra.get("response", {})
                if isinstance(response, dict):
                    usage = response.get("usage", {})
        if not usage:
            continue

        prompt = usage.get("prompt_tokens", 0) or 0
        generated = usage.get("completion_tokens", 0) or 0
        cached = usage.get("cache_read_input_tokens", 0) or 0
        written = usage.get("cache_creation_input_tokens", 0) or 0

        # Fall back to prompt_tokens_details.cached_tokens when the dedicated
        # cache-read counter is absent or zero.
        details = usage.get("prompt_tokens_details", {})
        if isinstance(details, dict):
            from_details = details.get("cached_tokens", 0) or 0
            if from_details > 0 and cached == 0:
                cached = from_details

        rows.append({
            "step": len(rows),
            "cache_read": cached,
            "uncached_input": max(0, prompt - cached - written),
            "completion": generated,
            "cache_creation": written,
        })

    if rows:
        return rows

    # Old format: usage lives in a separate log file next to the trajectory.
    log_path = traj_path.with_suffix(".info.log")
    if not log_path.exists():
        base_name = traj_path.stem.replace(".traj", "")
        log_path = traj_path.parent / f"{base_name}.info.log"
    if log_path.exists():
        rows = _parse_old_format_log(log_path)
    return rows
def load_all_trajectory_metadata_steps(folder: str) -> dict[str, list[dict]]:
    """
    Collect recorded per-step usage for every trajectory under ``TRAJS_DIR/folder``.

    The directory is probed with several file layouts in a fixed order and the
    first layout that matches any file is used. Files that fail to parse are
    logged and skipped; files that yield no steps are left out.

    Returns:
        dict mapping instance_id -> list of per-step metadata
    """
    root = TRAJS_DIR / folder
    layouts = ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json")
    traj_files = []
    for layout in layouts:
        traj_files = list(root.glob(layout))
        if traj_files:
            break

    by_instance = {}
    for traj_path in traj_files:
        try:
            per_step = parse_trajectory_metadata_per_step(traj_path)
            if per_step:
                by_instance[traj_path.stem.replace(".traj", "")] = per_step
        except Exception as e:
            logging.error("Error parsing metadata steps for %s: %s", traj_path, e, exc_info=True)
    return by_instance
def create_single_trajectory_meta_chart(steps: list[dict]):
    """Create a stacked bar chart of recorded token usage per step.

    Args:
        steps: per-step dicts with ``step``, ``uncached_input``, ``cache_read``,
            ``cache_creation`` and ``completion`` token counts.

    Returns:
        A plotly Figure with one stacked bar per step (values in thousands of
        tokens), or None when ``steps`` is empty.
    """
    if not steps:
        return None

    x_labels = [f"Step {d['step']}" for d in steps]
    # (legend name, key in the step dict, bar colour), in stacking order.
    series = (
        ("Uncached Input", "uncached_input", "#EF553B"),
        ("Cache Read", "cache_read", "#19D3F3"),
        ("Cache Creation", "cache_creation", "#FFA15A"),
        ("Completion", "completion", "#AB63FA"),
    )

    fig = go.Figure()
    for label, key, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=[d[key] / 1e3 for d in steps],
            marker_color=color,
            # %{x} already reads "Step N"; <br> is plotly's line break in hover text.
            hovertemplate=f"%{{x}}<br>{label}: %{{y:.2f}}K",
        ))
    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Tokens (K)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )
    return fig
def create_single_trajectory_meta_cost_chart(steps: list[dict], input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create a stacked bar chart of the cost of recorded token usage per step.

    Args:
        steps: per-step dicts with ``step``, ``uncached_input``, ``cache_read``,
            ``cache_creation`` and ``completion`` token counts.
        input_price: price per million uncached input tokens.
        cache_read_price: price per million cache-read tokens.
        cache_creation_price: price per million cache-creation tokens.
        completion_price: price per million completion tokens.

    Returns:
        A plotly Figure with one stacked bar per step (values in dollars), or
        None when ``steps`` is empty.
    """
    if not steps:
        return None

    x_labels = [f"Step {d['step']}" for d in steps]
    # (legend name, key in the step dict, price per 1M tokens, bar colour), in stacking order.
    series = (
        ("Uncached Input", "uncached_input", input_price, "#EF553B"),
        ("Cache Read", "cache_read", cache_read_price, "#19D3F3"),
        ("Cache Creation", "cache_creation", cache_creation_price, "#FFA15A"),
        ("Completion", "completion", completion_price, "#AB63FA"),
    )

    fig = go.Figure()
    for label, key, price, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=[d[key] * price / 1e6 for d in steps],
            marker_color=color,
            # %{x} already reads "Step N"; <br> is plotly's line break in hover text.
            hovertemplate=f"%{{x}}<br>{label}: $%{{y:.4f}}",
        ))
    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )
    return fig
def create_single_trajectory_chart(steps: list[dict], overhead: float = 1.0, with_cache: bool = True):
    """Create a stacked bar chart of simulated token usage per step.

    Token counts come from ``calculate_per_step_tokens(steps)`` and are
    multiplied by ``overhead``.

    Args:
        steps: step dicts as accepted by ``calculate_per_step_tokens``.
        overhead: multiplier applied to every token count.
        with_cache: when False, the whole prompt is shown as uncached input and
            the cache-read / cache-creation series are all zero.

    Returns:
        A plotly Figure with one stacked bar per step (values in thousands of
        tokens), or None when ``steps`` is empty.
    """
    if not steps:
        return None

    per_step_data = calculate_per_step_tokens(steps)
    x_labels = [f"Step {d['step']}" for d in per_step_data]

    cache_read = [d["cache_read"] * overhead for d in per_step_data]
    cache_creation = [d["cache_creation"] * overhead for d in per_step_data]
    completion = [d["completion"] * overhead for d in per_step_data]
    prompt_tokens = [(d["cache_read"] + d["uncached_input"]) * overhead for d in per_step_data]

    if with_cache:
        # Whatever part of the prompt is neither read from nor written to the cache.
        uncached = [max(0, p - cr - cc) for p, cr, cc in zip(prompt_tokens, cache_read, cache_creation)]
    else:
        uncached = prompt_tokens
        cache_read = [0] * len(per_step_data)
        cache_creation = [0] * len(per_step_data)

    # (legend name, token counts, bar colour), in stacking order.
    series = (
        ("Uncached Input", uncached, "#EF553B"),
        ("Cache Read", cache_read, "#19D3F3"),
        ("Cache Creation", cache_creation, "#FFA15A"),
        ("Completion", completion, "#AB63FA"),
    )

    fig = go.Figure()
    for label, tokens, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=[t / 1e3 for t in tokens],
            marker_color=color,
            # %{x} already reads "Step N"; <br> is plotly's line break in hover text.
            hovertemplate=f"%{{x}}<br>{label}: %{{y:.2f}}K",
        ))
    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Tokens (K)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )
    return fig
def create_single_trajectory_cost_chart(steps: list[dict], input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float, overhead: float = 1.0, with_cache: bool = True):
    """Create a stacked bar chart of the simulated cost per step.

    Token counts come from ``calculate_per_step_tokens(steps)``, are
    multiplied by ``overhead`` and then priced per million tokens.

    Args:
        steps: step dicts as accepted by ``calculate_per_step_tokens``.
        input_price: price per million uncached input tokens.
        cache_read_price: price per million cache-read tokens.
        cache_creation_price: price per million cache-creation tokens.
        completion_price: price per million completion tokens.
        overhead: multiplier applied to every token count.
        with_cache: when False, the whole prompt is priced as uncached input and
            the cache-read / cache-creation series are all zero.

    Returns:
        A plotly Figure with one stacked bar per step (values in dollars), or
        None when ``steps`` is empty.
    """
    if not steps:
        return None

    per_step_data = calculate_per_step_tokens(steps)
    x_labels = [f"Step {d['step']}" for d in per_step_data]

    cache_read = [d["cache_read"] * overhead for d in per_step_data]
    cache_creation = [d["cache_creation"] * overhead for d in per_step_data]
    completion = [d["completion"] * overhead for d in per_step_data]
    prompt_tokens = [(d["cache_read"] + d["uncached_input"]) * overhead for d in per_step_data]

    if with_cache:
        # Whatever part of the prompt is neither read from nor written to the cache.
        uncached = [max(0, p - cr - cc) for p, cr, cc in zip(prompt_tokens, cache_read, cache_creation)]
    else:
        uncached = prompt_tokens
        cache_read = [0] * len(per_step_data)
        cache_creation = [0] * len(per_step_data)

    # (legend name, token counts, price per 1M tokens, bar colour), in stacking order.
    series = (
        ("Uncached Input", uncached, input_price, "#EF553B"),
        ("Cache Read", cache_read, cache_read_price, "#19D3F3"),
        ("Cache Creation", cache_creation, cache_creation_price, "#FFA15A"),
        ("Completion", completion, completion_price, "#AB63FA"),
    )

    fig = go.Figure()
    for label, tokens, price, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=[t * price / 1e6 for t in tokens],
            marker_color=color,
            # %{x} already reads "Step N"; <br> is plotly's line break in hover text.
            hovertemplate=f"%{{x}}<br>{label}: $%{{y:.4f}}",
        ))
    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )
    return fig
def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
    """
    Parse a trajectory file into the step format used by calculate_routing_tokens.

    Files with a ``messages`` list are walked message by message: every
    assistant message opens a new step; system messages and any user messages
    before the first assistant message are counted into that first step's
    ``system_user``; a user message after an assistant message becomes the
    ``observation`` of the latest step (a later one overwrites an earlier one).
    Files without messages but with a ``trajectory`` list are delegated to
    ``_parse_trajectory_format_to_steps``.

    Messages with any other role (for example "tool") are skipped. The
    previous ``while`` loop never advanced past such a message and hung.

    Returns list of steps with:
        - model: ``model_name`` as passed in
        - system_user: tokens for system + user messages (non-zero for step 0 only)
        - completion: assistant response tokens
        - observation: env response tokens (None when no user message followed)
        - content: assistant message content as a string
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    messages = data.get("messages", [])
    trajectory_data = data.get("trajectory", [])
    if not messages and trajectory_data:
        return _parse_trajectory_format_to_steps(trajectory_data, model_name)
    if not messages:
        return []

    count_tokens, _ = get_tokenizer(model_name)
    steps = []
    system_user_tokens = 0

    for msg in messages:
        role = msg.get("role", "user")
        if role not in ("system", "user", "assistant"):
            continue

        content = msg.get("content", "")
        if isinstance(content, list):
            # Structured content is tokenized through its JSON text.
            content = json.dumps(content)
        text = str(content)
        tokens = count_tokens(text)

        if role == "assistant":
            steps.append({
                "model": model_name,
                "system_user": system_user_tokens if not steps else 0,
                "completion": tokens,
                "observation": None,
                "content": text,
            })
            system_user_tokens = 0
        elif role == "system" or not steps:
            system_user_tokens += tokens
        else:
            steps[-1]["observation"] = tokens

    return steps
def _parse_trajectory_format_to_steps(trajectory_data: list, model_name: str) -> list[dict]:
    """
    Convert the alternative ``trajectory``-array format into step dicts.

    Each entry contributes one step. The ``query`` messages of the first entry
    are tokenized into ``system_user``; later entries get 0. ``response``
    becomes the completion, and a non-empty ``observation`` becomes the
    observation token count (None otherwise).
    """
    count_tokens, _ = get_tokenizer(model_name)

    def query_tokens(entries) -> int:
        total = 0
        for entry in entries:
            body = entry.get("content", "")
            if isinstance(body, list):
                body = json.dumps(body)
            total += count_tokens(str(body))
        return total

    parsed = []
    for index, item in enumerate(trajectory_data):
        prompt_messages = item.get("query", [])
        reply = item.get("response", "")
        env_output = item.get("observation", "")

        parsed.append({
            "model": model_name,
            "system_user": query_tokens(prompt_messages) if index == 0 else 0,
            "completion": count_tokens(str(reply)) if reply else 0,
            "observation": count_tokens(str(env_output)) if env_output else None,
            "content": str(reply) if reply else "",
        })
    return parsed
def get_default_overhead(model_name: str) -> float:
    """Return the default tokenizer overhead multiplier for a model's provider.

    The provider is detected by substring match on the lower-cased name; the
    first matching row wins and unknown models get 1.0.
    """
    lowered = model_name.lower() if model_name else ""
    # (substrings identifying the provider, overhead multiplier)
    table = (
        (("claude", "anthropic"), 1.24),
        (("gemini", "google"), 1.0),
        (("gpt", "openai", "o1", "o3"), 1.0),
    )
    for markers, multiplier in table:
        if any(marker in lowered for marker in markers):
            return multiplier
    return 1.0
def get_tokenizer(model_name: str):
    """Pick a token counter for a model. Returns (count_func, tokenizer_name).

    Names containing "gpt-4o", "o1" or "o3" use tiktoken's ``o200k_base``.
    Gemini/Google names (that match none of the GPT/Claude markers) use a
    character-length approximation. Everything else uses ``cl100k_base``.
    tiktoken encodings are created once and kept in ``_tokenizer_cache``.
    """
    lowered = (model_name or "").lower()

    def mentions(*markers: str) -> bool:
        return any(marker in lowered for marker in markers)

    uses_o200k = mentions("gpt-4o", "o1", "o3")
    uses_cl100k = mentions("gpt", "claude", "anthropic")
    if not uses_o200k and not uses_cl100k and mentions("gemini", "google"):
        return lambda text: int(len(text) / 3.23), "gemini_approx"

    encoding_name = "o200k_base" if uses_o200k else "cl100k_base"
    if encoding_name not in _tokenizer_cache:
        _tokenizer_cache[encoding_name] = tiktoken.get_encoding(encoding_name)
    encoder = _tokenizer_cache[encoding_name]
    return lambda text: len(encoder.encode(text)), encoding_name
def apply_thinking_overhead(df: pd.DataFrame, overhead: float) -> pd.DataFrame:
    """Scale every token-count column by ``overhead`` and recompute the total.

    The input frame is returned unchanged (same object) when it is empty or
    the multiplier is exactly 1.0. Otherwise a copy is returned in which the
    scaled counts are truncated to integers and ``total_tokens`` is
    prompt + completion.
    """
    if df.empty or overhead == 1.0:
        return df

    scaled = df.copy()
    token_columns = (
        "prompt_tokens",
        "completion_tokens",
        "cache_read_tokens",
        "cache_creation_tokens",
    )
    for column in token_columns:
        scaled[column] = (scaled[column] * overhead).astype(int)
    scaled["total_tokens"] = scaled["prompt_tokens"] + scaled["completion_tokens"]
    return scaled
def apply_no_cache(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with both cache token columns set to zero.

    An empty frame is returned as-is (same object).
    """
    if df.empty:
        return df
    return df.assign(cache_read_tokens=0, cache_creation_tokens=0)
def ensure_token_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose token columns exist and hold integers.

    The four prompt/completion/cache columns are created with 0 when missing.
    Those columns, plus ``total_tokens`` when present, are coerced to numbers:
    unparseable or missing values become 0 and fractions are truncated.
    ``None`` and empty frames are returned untouched.
    """
    if df is None or df.empty:
        return df

    def to_int(series):
        return pd.to_numeric(series, errors="coerce").fillna(0).astype(int)

    cleaned = df.copy()
    for name in ("prompt_tokens", "completion_tokens", "cache_read_tokens", "cache_creation_tokens"):
        if name not in cleaned.columns:
            cleaned[name] = 0
        cleaned[name] = to_int(cleaned[name])
    if "total_tokens" in cleaned.columns:
        cleaned["total_tokens"] = to_int(cleaned["total_tokens"])
    return cleaned
def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
    """Build one row of simulated token totals per trajectory in ``folder``.

    Steps come from ``load_all_trajectory_steps`` and are run through
    ``calculate_routing_tokens``; the totals of the first step's model make up
    the row. ``instance_cost`` is always 0. The result is memoized per folder
    in ``_calculated_tokens_cache``. Trajectories that fail are logged and
    skipped.
    """
    cache_key = f"calculated_{folder}"
    memoized = _calculated_tokens_cache.get(cache_key)
    if memoized is not None:
        return ensure_token_columns(memoized)

    rows = []
    for instance_id, steps in load_all_trajectory_steps(folder).items():
        if not steps:
            continue
        try:
            per_model = calculate_routing_tokens(steps)
            first_model = steps[0].get("model", "")
            totals = per_model.get(first_model, {})

            read = totals.get("cache_read", 0)
            fresh = totals.get("uncached_input", 0)
            generated = totals.get("completion", 0)
            written = totals.get("cache_creation", 0)
            prompt = read + fresh

            rows.append({
                "instance_id": instance_id,
                "model_name": first_model,
                "api_calls": len(steps),
                "instance_cost": 0,
                "prompt_tokens": prompt,
                "completion_tokens": generated,
                "total_tokens": prompt + generated,
                "cache_read_tokens": read,
                "cache_creation_tokens": written,
            })
        except Exception as e:
            logging.error("Error calculating tokens for %s: %s", instance_id, e, exc_info=True)

    frame = ensure_token_columns(pd.DataFrame(rows))
    _calculated_tokens_cache[cache_key] = frame
    return frame
def load_all_trajectory_steps(folder: str) -> dict[str, list[dict]]:
    """
    Load all trajectories in ``TRAJS_DIR/folder`` as step sequences.

    The directory is probed with several file layouts in a fixed order and the
    first layout that matches any file is used. The model name is read once,
    from the first matched file (``info.config.model``, preferring
    ``cost_calc_model_override`` over ``model_name``), and is used for every
    trajectory. If it cannot be read, a warning is logged and the empty string
    is used. Results are memoized per folder in ``_trajectory_steps_cache``.
    Files that fail to parse are logged and skipped.

    Returns:
        dict mapping instance_id -> list of steps for calculate_routing_tokens
    """
    cache_key = f"steps_{folder}"
    if cache_key in _trajectory_steps_cache:
        return _trajectory_steps_cache[cache_key]

    output_dir = TRAJS_DIR / folder
    traj_files = []
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    model_name = ""
    if traj_files:
        try:
            # utf-8 to match how the trajectory files are read everywhere else.
            with open(traj_files[0], "r", encoding="utf-8") as f:
                first_data = json.load(f)
            config = first_data.get("info", {}).get("config", {}).get("model", {})
            model_name = config.get("cost_calc_model_override", config.get("model_name", ""))
        except Exception as e:
            # Best effort: fall back to the default tokenizer, but leave a trace.
            logging.warning("Could not read model name from %s: %s", traj_files[0], e)

    result = {}
    for traj_path in traj_files:
        try:
            instance_id = traj_path.stem.replace(".traj", "")
            steps = parse_trajectory_to_steps(traj_path, model_name)
            if steps:
                result[instance_id] = steps
        except Exception as e:
            logging.error("Error parsing steps for %s: %s", traj_path, e, exc_info=True)

    _trajectory_steps_cache[cache_key] = result
    return result
def get_litellm_prices_raw() -> dict:
    """Get raw litellm prices (all modes, unfiltered).

    Lookup order: the in-memory cache, then the on-disk cache file, then a
    download from ``LITELLM_PRICES_URL``. A downloaded table is written to the
    cache file on a best-effort basis; a failed write no longer discards the
    downloaded data. An unreadable cache file is ignored in favour of a fresh
    download. If the download fails, a warning is logged and an empty dict is
    returned (and remembered for the rest of the process).
    """
    global _litellm_prices_cache
    if _litellm_prices_cache is not None:
        return _litellm_prices_cache

    if LITELLM_PRICES_CACHE.exists():
        try:
            with open(LITELLM_PRICES_CACHE, encoding="utf-8") as f:
                _litellm_prices_cache = json.load(f)
            return _litellm_prices_cache
        except (OSError, ValueError) as e:
            # JSON and Unicode decode errors are both ValueError subclasses.
            logging.warning("Ignoring unreadable price cache %s: %s", LITELLM_PRICES_CACHE, e)

    try:
        response = requests.get(LITELLM_PRICES_URL, timeout=30)
        response.raise_for_status()
        _litellm_prices_cache = response.json()
    except Exception as e:
        logging.warning("Could not download litellm prices from %s: %s", LITELLM_PRICES_URL, e)
        _litellm_prices_cache = {}
        return _litellm_prices_cache

    try:
        DATA_DIR.mkdir(exist_ok=True)
        with open(LITELLM_PRICES_CACHE, "w", encoding="utf-8") as f:
            json.dump(_litellm_prices_cache, f)
    except OSError as e:
        logging.warning("Could not write price cache %s: %s", LITELLM_PRICES_CACHE, e)
    return _litellm_prices_cache
def get_litellm_prices() -> dict:
    """Return the litellm price entries whose ``mode`` is "chat".

    The filtered table is computed once and kept in
    ``_litellm_chat_prices_cache``.
    """
    global _litellm_chat_prices_cache
    if _litellm_chat_prices_cache is None:
        chat_only = {}
        for model_key, entry in get_litellm_prices_raw().items():
            if isinstance(entry, dict) and entry.get("mode") == "chat":
                chat_only[model_key] = entry
        _litellm_chat_prices_cache = chat_only
    return _litellm_chat_prices_cache
def get_litellm_model_list() -> list[str]:
    """Return the chat model names known to litellm, sorted alphabetically."""
    return sorted(get_litellm_prices())
def normalize_model_name(name: str) -> str:
    """Lower-case ``name`` and drop the separators ``-``, ``_``, ``.`` and ``/``."""
    without_separators = str.maketrans("", "", "-_./")
    return name.lower().translate(without_separators)
def get_model_prices(model_name: str) -> dict | None:
    """Look up the litellm price entry for ``model_name``.

    Matching goes from strict to fuzzy:
      1. exact key match on the name as given, with "anthropic/" / "openai/"
         removed, with a trailing ``-YYYYMMDD`` date removed, and with either
         provider prefix re-attached;
      2. normalized equality with the last ``/``-separated component of a key;
      3. normalized substring match anywhere in a key.

    Step 2 runs over all keys before step 3. Previously both tests ran per key
    with the substring test first, which made the exact-component test
    unreachable and let a longer key earlier in the table win.

    Returns:
        The price dict of the first match, or None when nothing matches or the
        name is empty.
    """
    if not model_name:
        return None

    prices = get_litellm_prices()
    clean_name = model_name.replace("anthropic/", "").replace("openai/", "")
    name_without_date = re.sub(r'-\d{8}$', '', clean_name)

    candidates = [
        model_name,
        clean_name,
        name_without_date,
        f"anthropic/{clean_name}",
        f"openai/{clean_name}",
        f"anthropic/{name_without_date}",
        f"openai/{name_without_date}",
    ]
    for key in candidates:
        if key in prices:
            return prices[key]

    wanted = {normalize_model_name(clean_name), normalize_model_name(name_without_date)}
    # An empty needle is a substring of every key and would match arbitrarily.
    wanted.discard("")
    if not wanted:
        return None

    normalized = [
        (normalize_model_name(key), normalize_model_name(key.split('/')[-1]), value)
        for key, value in prices.items()
    ]
    for _, last_component, value in normalized:
        if last_component in wanted:
            return value
    for whole_key, _, value in normalized:
        if any(needle in whole_key for needle in wanted):
            return value
    return None
def load_or_download_leaderboard():
    """Return the parsed leaderboard JSON, downloading it on first use.

    When ``LEADERBOARD_CACHE`` does not exist yet, the leaderboard is
    downloaded into ``DATA_DIR`` and the downloaded file is renamed to the
    cache path before being read.
    """
    if not LEADERBOARD_CACHE.exists():
        downloaded = download_leaderboard(output_dir=str(DATA_DIR))
        os.rename(downloaded, LEADERBOARD_CACHE)
    with open(LEADERBOARD_CACHE) as handle:
        return json.load(handle)
def get_bash_only_df():
    """Return the "bash-only" leaderboard as a display-ready DataFrame.

    One row per result, with the resolved percentage formatted as text, costs
    rounded, and the open-source flag rendered as a check or cross mark. An
    empty DataFrame is returned when there is no "bash-only" leaderboard.
    """
    boards = load_or_download_leaderboard().get("leaderboards", [])
    bash_only = next((board for board in boards if board["name"] == "bash-only"), None)
    if not bash_only:
        return pd.DataFrame()

    def to_row(entry: dict) -> dict:
        resolved = entry.get("resolved", 0)
        if isinstance(resolved, (int, float)):
            resolved_text = f"{resolved:.1f}%"
        else:
            resolved_text = str(resolved)
        return {
            "name": entry.get("name", ""),
            "% resolved": resolved_text,
            "date": entry.get("date", ""),
            "cost": round(entry.get("cost", 0), 2),
            "instance_cost": round(entry.get("instance_cost", 0), 4),
            "instance_calls": entry.get("instance_calls", 0),
            "folder": entry.get("folder", ""),
            "os_model": "✅" if entry.get("os_model") else "❌",
        }

    return pd.DataFrame([to_row(entry) for entry in bash_only["results"]])
def get_model_details(folder: str):
    """Find the "bash-only" leaderboard entry whose ``folder`` equals ``folder``.

    Returns:
        ``(entry, None)`` on success, otherwise ``(None, message)`` where the
        message says what was missing.
    """
    if not folder:
        return None, "Select a model from the table"

    boards = load_or_download_leaderboard().get("leaderboards", [])
    for board in boards:
        if board["name"] == "bash-only":
            break
    else:
        return None, "Leaderboard not found"

    for entry in board["results"]:
        if entry.get("folder") == folder:
            return entry, None
    return None, f"Model with folder '{folder}' not found"
def check_trajectories_downloaded(folder: str) -> bool:
    """Report whether ``TRAJS_DIR/folder`` exists and contains at least one entry."""
    if not folder:
        return False
    target = TRAJS_DIR / folder
    if not target.exists():
        return False
    return any(target.iterdir())
def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
    """Download a model's trajectories from S3 into ``TRAJS_DIR/folder``.

    Nothing is downloaded when the target directory already has content.
    Otherwise ``aws s3 cp --recursive`` is run (unsigned, 10 minute timeout).
    If the command fails or times out, the partially filled directory is
    removed so that a later call retries instead of reporting an incomplete
    download as "Already downloaded".

    Trajectory files are counted with the same layout probes the loaders use,
    so the reported count matches what can actually be loaded.

    Args:
        folder: leaderboard folder name of the model.
        progress: gradio progress tracker.

    Returns:
        ``(status_message, gr.update(visible=...))``. The update is visible
        only when trajectories are available locally.
    """
    import shutil

    if not folder:
        return "❌ No model selected", gr.update(visible=False)

    model, error = get_model_details(folder)
    if error:
        return f"❌ {error}", gr.update(visible=False)

    output_dir = TRAJS_DIR / folder

    def count_trajectory_files() -> int:
        # First layout with any match wins, mirroring the trajectory loaders.
        for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
            found = sum(1 for _ in output_dir.glob(pattern))
            if found:
                return found
        return 0

    if output_dir.exists() and any(output_dir.iterdir()):
        file_count = count_trajectory_files()
        return f"✅ Already downloaded: {output_dir}\n\n{file_count} trajectory files", gr.update(visible=True)

    s3_path = f"{S3_BUCKET}/{folder}/trajs/"
    # From here on the directory holds only what this download writes, so it
    # is safe to remove it again when the download fails.
    output_dir.mkdir(parents=True, exist_ok=True)
    progress(0, desc="Starting S3 download...")

    try:
        result = subprocess.run(
            ["aws", "s3", "cp", "--recursive", s3_path, str(output_dir), "--no-sign-request"],
            capture_output=True,
            text=True,
            timeout=600,
        )
        if result.returncode != 0:
            shutil.rmtree(output_dir, ignore_errors=True)
            return f"❌ S3 download failed:\n{result.stderr}", gr.update(visible=False)

        file_count = count_trajectory_files()
        if file_count == 0:
            return f"❌ No trajectory files found on S3 for {folder}", gr.update(visible=False)

        per_instance = model.get("per_instance_details", {})
        resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
        total_count = len(per_instance)
        if total_count > 0:
            resolved_pct = f"{100*resolved_count/total_count:.1f}%"
        else:
            resolved_pct = "N/A"
        status = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({resolved_pct})"
        return status, gr.update(visible=True)
    except subprocess.TimeoutExpired:
        shutil.rmtree(output_dir, ignore_errors=True)
        return "❌ Download timed out (>10 min)", gr.update(visible=False)
    except FileNotFoundError:
        return "❌ AWS CLI not found. Install with: pip install awscli", gr.update(visible=False)
    except Exception as e:
        return f"❌ Error: {e}", gr.update(visible=False)
def parse_trajectory(traj_path: Path) -> dict:
    """Summarize one trajectory file as a flat dict of totals.

    Token totals are summed from the usage recorded on each message
    (``message["usage"]`` or ``message["extra"]["response"]["usage"]``). When
    that yields no prompt and no completion tokens, a sibling ``.info.log``
    file (old format) is used instead, if one exists.

    For files in the ``trajectory``-array format without a configured model
    name, the name is set to "llama" when a system prompt mentions "llama" or
    "meta". System content that is not a string is searched through its JSON
    text instead of crashing.

    ``instance_id`` falls back to the file name with the ``.traj`` part
    removed, the same way the other trajectory loaders derive it.

    Returns:
        dict with instance_id, model_name, api_calls, instance_cost,
        prompt_tokens, completion_tokens, total_tokens, cache_read_tokens and
        cache_creation_tokens.
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    info = data.get("info", {})
    model_stats = info.get("model_stats", {})
    model_config = info.get("config", {}).get("model", {})
    model_name = model_config.get("cost_calc_model_override", model_config.get("model_name", ""))

    trajectory_steps = data.get("trajectory", [])
    is_trajectory_format = len(trajectory_steps) > 0 and "messages" not in data

    if is_trajectory_format and not model_name:
        for step in trajectory_steps:
            for q in step.get("query", []):
                if q.get("role") != "system":
                    continue
                content = q.get("content", "")
                text = content if isinstance(content, str) else json.dumps(content)
                text = text.lower()
                if "llama" in text or "meta" in text:
                    model_name = "llama"
                    break
            if model_name:
                break

    api_calls = model_stats.get("api_calls", 0)
    if api_calls == 0 and is_trajectory_format:
        api_calls = len(trajectory_steps)

    result = {
        # Strip ".traj" so "x.traj.json" and "x.traj" both give "x".
        "instance_id": data.get("instance_id", traj_path.stem.replace(".traj", "")),
        "model_name": model_name,
        "api_calls": api_calls,
        "instance_cost": model_stats.get("instance_cost", 0),
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "total_tokens": 0,
        "cache_read_tokens": 0,
        "cache_creation_tokens": 0,
    }

    for msg in data.get("messages", []):
        usage = None
        if "usage" in msg:
            usage = msg["usage"]
        elif "extra" in msg and isinstance(msg["extra"], dict):
            response = msg["extra"].get("response", {})
            if isinstance(response, dict):
                usage = response.get("usage", {})
        if not usage:
            continue

        result["prompt_tokens"] += usage.get("prompt_tokens", 0) or 0
        result["completion_tokens"] += usage.get("completion_tokens", 0) or 0
        result["total_tokens"] += usage.get("total_tokens", 0) or 0

        cache_read = usage.get("cache_read_input_tokens", 0) or 0
        cache_creation = usage.get("cache_creation_input_tokens", 0) or 0
        # Fall back to prompt_tokens_details.cached_tokens when the dedicated
        # cache-read counter is absent or zero.
        prompt_tokens_details = usage.get("prompt_tokens_details", {})
        if isinstance(prompt_tokens_details, dict):
            cached_from_details = prompt_tokens_details.get("cached_tokens", 0) or 0
            if cached_from_details > 0 and cache_read == 0:
                cache_read = cached_from_details
        result["cache_read_tokens"] += cache_read
        result["cache_creation_tokens"] += cache_creation

    if result["prompt_tokens"] == 0 and result["completion_tokens"] == 0:
        # Old format: usage lives in a separate log file next to the trajectory.
        log_path = traj_path.with_suffix(".info.log")
        if not log_path.exists():
            base_name = traj_path.stem.replace(".traj", "")
            log_path = traj_path.parent / f"{base_name}.info.log"
        if log_path.exists():
            steps = _parse_old_format_log(log_path)
            for step_data in steps:
                result["prompt_tokens"] += step_data["cache_read"] + step_data["uncached_input"]
                result["completion_tokens"] += step_data["completion"]
                result["cache_read_tokens"] += step_data["cache_read"]
            result["total_tokens"] = result["prompt_tokens"] + result["completion_tokens"]
            if result["api_calls"] == 0:
                result["api_calls"] = len(steps)

    return result
def load_all_trajectories(folder: str) -> pd.DataFrame:
    """Parse every trajectory file found for *folder* into a single DataFrame.

    Results are memoized in the module-level ``_trajectories_cache`` keyed by
    *folder*; a cache hit is passed through ``ensure_token_columns`` again
    before being returned.

    Files are looked up under ``TRAJS_DIR / folder`` using a series of glob
    patterns tried from most to least specific; the first pattern that matches
    anything wins. Files that fail to parse are logged and skipped.

    Args:
        folder: Name of the sub-directory of ``TRAJS_DIR`` to scan.

    Returns:
        One row per successfully parsed trajectory (possibly zero rows).
    """
    if folder in _trajectories_cache:
        return ensure_token_columns(_trajectories_cache[folder])

    root = TRAJS_DIR / folder
    # Nested per-instance layouts first, then flat layouts, then any JSON.
    lookup_order = ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json")
    found = []
    for pattern in lookup_order:
        found = list(root.glob(pattern))
        if found:
            break

    parsed_rows = []
    for path in found:
        try:
            parsed_rows.append(parse_trajectory(path))
        except Exception as e:
            # Best-effort: one unreadable file must not abort the whole load.
            logging.error("Error parsing %s: %s", path, e, exc_info=True)

    frame = ensure_token_columns(pd.DataFrame(parsed_rows))
    _trajectories_cache[folder] = frame
    return frame
def create_cost_by_type_chart(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build the "Total Cost by Token Type" bar chart.

    Kept as its own function so it can be re-rendered when only prices change.

    Args:
        df: Per-trajectory frame with ``prompt_tokens``, ``completion_tokens``,
            ``cache_read_tokens`` and ``cache_creation_tokens`` columns.
        input_price: Price applied to uncached input tokens, per 1M tokens.
        cache_read_price: Price applied to cache-read tokens, per 1M tokens.
        cache_creation_price: Price applied to cache-creation tokens, per 1M tokens.
        completion_price: Price applied to completion tokens, per 1M tokens.

    Returns:
        A Plotly figure, or ``None`` when *df* is empty.
    """
    if df.empty:
        return None

    # Uncached input is clipped per trajectory before summing, so a trajectory
    # whose cache counters exceed its prompt count contributes 0, not a negative.
    uncached_per_traj = (
        df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"]
    ).clip(lower=0)

    uncached_cost = uncached_per_traj.sum() * input_price / 1e6
    read_cost = df["cache_read_tokens"].sum() * cache_read_price / 1e6
    creation_cost = df["cache_creation_tokens"].sum() * cache_creation_price / 1e6
    completion_cost = df["completion_tokens"].sum() * completion_price / 1e6

    per_type = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Cost ($)": [uncached_cost, read_cost, creation_cost, completion_cost],
    })

    chart = px.bar(
        per_type,
        x="Token Type",
        y="Cost ($)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    chart.update_layout(
        xaxis_title="",
        yaxis_title="Cost ($)",
        showlegend=False,
        margin=dict(l=60, r=20, t=20, b=40),
    )

    grand_total = uncached_cost + read_cost + creation_cost + completion_cost
    chart.add_annotation(
        text=f"Total: ${grand_total:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )
    return chart
def create_token_charts(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create only the token-related charts (used when switching token source).

    Args:
        df: Per-trajectory frame with ``prompt_tokens``, ``completion_tokens``,
            ``cache_read_tokens`` and ``cache_creation_tokens`` columns.
        input_price: Uncached-input price per 1M tokens.
        cache_read_price: Cache-read price per 1M tokens.
        cache_creation_price: Cache-creation price per 1M tokens.
        completion_price: Completion price per 1M tokens.

    Returns:
        ``(fig_tokens, fig_tokens_cost, fig_stacked)``: total tokens by type,
        total cost by type, and a per-trajectory stacked bar chart. All three
        are ``None`` when *df* is empty.
    """
    if df.empty:
        return None, None, None

    total_completion = df["completion_tokens"].sum()
    total_cache_read = df["cache_read_tokens"].sum()
    total_cache_creation = df["cache_creation_tokens"].sum()
    # Uncached input = prompt - cache_read - cache_creation, clipped at 0 per
    # trajectory before summing.
    total_uncached_input = (
        df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"]
    ).clip(lower=0).sum()

    token_data = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Total Tokens (M)": [
            total_uncached_input / 1e6,
            total_cache_read / 1e6,
            total_cache_creation / 1e6,
            total_completion / 1e6,
        ],
    })
    fig_tokens = px.bar(
        token_data,
        x="Token Type",
        y="Total Tokens (M)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens.update_layout(
        xaxis_title="",
        yaxis_title="Tokens (M)",
        showlegend=False,
        margin=dict(l=60, r=20, t=20, b=40),
    )
    total_all = total_uncached_input + total_cache_read + total_cache_creation + total_completion
    fig_tokens.add_annotation(
        text=f"Total: {total_all/1e6:.2f}M",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )

    fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)

    # Stacked bar chart - sort by total tokens (sum of all stacked segments).
    df_sorted = df.copy()
    df_sorted["uncached_input_tokens"] = (
        df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]
    ).clip(lower=0)
    df_sorted["total_stacked"] = (
        df_sorted["uncached_input_tokens"]
        + df_sorted["cache_read_tokens"]
        + df_sorted["cache_creation_tokens"]
        + df_sorted["completion_tokens"]
    )
    df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    df_sorted["trajectory_idx"] = range(len(df_sorted))

    # (legend label, source column, bar color) for each stacked segment.
    segments = (
        ("Uncached Input", "uncached_input_tokens", "#EF553B"),
        ("Cache Read", "cache_read_tokens", "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", "#FFA15A"),
        ("Completion", "completion_tokens", "#AB63FA"),
    )
    fig_stacked = go.Figure()
    for label, column, color in segments:
        fig_stacked.add_trace(go.Bar(
            name=label,
            x=df_sorted["trajectory_idx"],
            y=df_sorted[column] / 1e6,
            marker_color=color,
            # Plotly hover text uses <br> as its line break.
            hovertemplate="Trajectory: %{x}<br>" + label + ": %{y:.2f}M",
        ))
    fig_stacked.update_layout(
        barmode="stack",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Tokens (M)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )
    return fig_tokens, fig_tokens_cost, fig_stacked
def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build the overview figures for a set of trajectories.

    Args:
        df: Per-trajectory frame with ``api_calls``, ``instance_cost``,
            ``prompt_tokens``, ``completion_tokens``, ``cache_read_tokens``
            and ``cache_creation_tokens`` columns.
        input_price: Uncached-input price per 1M tokens.
        cache_read_price: Cache-read price per 1M tokens.
        cache_creation_price: Cache-creation price per 1M tokens.
        completion_price: Completion price per 1M tokens.

    Returns:
        ``(fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked)``:
        API-call histogram, reported-cost histogram, total tokens by type,
        total cost by type, and per-trajectory stacked tokens. All five are
        ``None`` when *df* is empty.
    """
    if df.empty:
        return None, None, None, None, None

    fig_steps = px.histogram(
        df,
        x="api_calls",
        nbins=30,
        color_discrete_sequence=["#636EFA"],
    )
    fig_steps.update_layout(
        xaxis_title="API Calls (Steps)",
        yaxis_title="Number of Trajectories",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    fig_steps.add_annotation(
        text=f"Mean: {df['api_calls'].mean():.1f} | Median: {df['api_calls'].median():.0f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )

    fig_cost = px.histogram(
        df,
        x="instance_cost",
        nbins=30,
        color_discrete_sequence=["#00CC96"],
    )
    fig_cost.update_layout(
        xaxis_title="Cost ($)",
        yaxis_title="Number of Trajectories",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    fig_cost.add_annotation(
        text=f"Mean: ${df['instance_cost'].mean():.4f} | Total: ${df['instance_cost'].sum():.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )

    total_completion = df["completion_tokens"].sum()
    total_cache_read = df["cache_read_tokens"].sum()
    total_cache_creation = df["cache_creation_tokens"].sum()
    # Uncached input = prompt - cache_read - cache_creation (per trajectory,
    # clipped at 0, then summed).
    total_uncached_input = (
        df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"]
    ).clip(lower=0).sum()

    token_data = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Tokens (M)": [
            total_uncached_input / 1e6,
            total_cache_read / 1e6,
            total_cache_creation / 1e6,
            total_completion / 1e6,
        ],
    })
    fig_tokens = px.bar(
        token_data,
        x="Token Type",
        y="Tokens (M)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens.update_layout(
        xaxis_title="",
        yaxis_title="Tokens (M)",
        showlegend=False,
        margin=dict(l=60, r=20, t=20, b=40),
    )
    total_all = total_uncached_input + total_cache_read + total_cache_creation + total_completion
    fig_tokens.add_annotation(
        text=f"Total: {total_all/1e6:.2f}M",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )

    # Cost by token type (delegated to the dedicated chart builder).
    fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)

    # Sort by total tokens (sum of all stacked segments).
    df_sorted = df.copy()
    df_sorted["uncached_input_tokens"] = (
        df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]
    ).clip(lower=0)
    df_sorted["total_stacked"] = (
        df_sorted["uncached_input_tokens"]
        + df_sorted["cache_read_tokens"]
        + df_sorted["cache_creation_tokens"]
        + df_sorted["completion_tokens"]
    )
    df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    df_sorted["trajectory_idx"] = range(len(df_sorted))

    # (legend label, source column, bar color) for each stacked segment.
    segments = (
        ("Uncached Input", "uncached_input_tokens", "#EF553B"),
        ("Cache Read", "cache_read_tokens", "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", "#FFA15A"),
        ("Completion", "completion_tokens", "#AB63FA"),
    )
    fig_stacked = go.Figure()
    for label, column, color in segments:
        fig_stacked.add_trace(go.Bar(
            name=label,
            x=df_sorted["trajectory_idx"],
            y=df_sorted[column] / 1e6,
            marker_color=color,
            # Plotly hover text uses <br> as its line break.
            hovertemplate="Trajectory: %{x}<br>" + label + ": %{y:.3f}M",
        ))
    fig_stacked.update_layout(
        barmode="stack",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Tokens (M)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )
    return fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked
def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build the per-trajectory stacked cost chart.

    Trajectories are ordered by total token count (descending), and each bar is
    split into uncached-input, cache-read, cache-creation and completion cost.

    Args:
        df: Per-trajectory frame with ``prompt_tokens``, ``completion_tokens``,
            ``cache_read_tokens`` and ``cache_creation_tokens`` columns.
        input_price: Uncached-input price per 1M tokens.
        cache_read_price: Cache-read price per 1M tokens.
        cache_creation_price: Cache-creation price per 1M tokens.
        completion_price: Completion price per 1M tokens.

    Returns:
        A Plotly figure, or ``None`` when *df* is empty.
    """
    if df.empty:
        return None

    # Sort by total tokens (sum of all stacked segments).
    df_sorted = df.copy()
    df_sorted["uncached_input_tokens"] = (
        df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]
    ).clip(lower=0)
    df_sorted["total_stacked"] = (
        df_sorted["uncached_input_tokens"]
        + df_sorted["cache_read_tokens"]
        + df_sorted["cache_creation_tokens"]
        + df_sorted["completion_tokens"]
    )
    df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    df_sorted["trajectory_idx"] = range(len(df_sorted))

    df_sorted["cost_uncached_input"] = df_sorted["uncached_input_tokens"] * input_price / 1e6
    df_sorted["cost_cache_read"] = df_sorted["cache_read_tokens"] * cache_read_price / 1e6
    df_sorted["cost_cache_creation"] = df_sorted["cache_creation_tokens"] * cache_creation_price / 1e6
    df_sorted["cost_completion"] = df_sorted["completion_tokens"] * completion_price / 1e6

    # (legend label incl. unit price, cost column, bar color) per segment.
    segments = (
        (f"Uncached Input (${input_price:.2f}/1M)", "cost_uncached_input", "#EF553B"),
        (f"Cache Read (${cache_read_price:.2f}/1M)", "cost_cache_read", "#19D3F3"),
        (f"Cache Creation (${cache_creation_price:.2f}/1M)", "cost_cache_creation", "#FFA15A"),
        (f"Completion (${completion_price:.2f}/1M)", "cost_completion", "#AB63FA"),
    )
    fig = go.Figure()
    for legend_label, cost_column, color in segments:
        fig.add_trace(go.Bar(
            name=legend_label,
            x=df_sorted["trajectory_idx"],
            y=df_sorted[cost_column],
            marker_color=color,
            # Plotly hover text uses <br> as its line break.
            hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}",
        ))

    total_cost = (
        df_sorted["cost_uncached_input"].sum() +
        df_sorted["cost_cache_read"].sum() +
        df_sorted["cost_cache_creation"].sum() +
        df_sorted["cost_completion"].sum()
    )
    fig.update_layout(
        barmode="stack",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )
    fig.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=14),
        bgcolor="white",
    )
    return fig
def extract_model_from_folder(folder: str) -> str:
    """Return the model part of a folder name.

    Folder names look like ``'20251124_mini-v1.16.0_claude-opus-4-5-20251101'``:
    everything after the second underscore is returned. A name with fewer than
    three underscore-separated parts is returned unchanged, and a falsy
    *folder* yields ``""``.
    """
    if not folder:
        return ""
    # maxsplit=2 keeps any further underscores inside the third piece intact.
    pieces = folder.split("_", 2)
    return pieces[2] if len(pieces) == 3 else folder
def get_prices_for_folder(folder: str) -> tuple[dict, str]:
    """Look up per-1M-token prices for the model encoded in *folder*.

    The model hint is taken from ``extract_model_from_folder`` and resolved
    through ``get_model_prices``. Per-token prices from that lookup are scaled
    by 1e6.

    Returns:
        ``(prices_dict, model_name)``. ``prices_dict`` maps ``"input"``,
        ``"cache_read"``, ``"cache_creation"`` and ``"completion"`` to
        ``{"value": float, "found": bool}``. ``found`` is True only when the
        lookup itself supplied a positive price; estimated values keep
        ``found=False``. ``model_name`` is ``""`` when no hint could be
        extracted.
    """
    hint = extract_model_from_folder(folder)
    table = {
        kind: {"value": 0, "found": False}
        for kind in ("input", "cache_read", "cache_creation", "completion")
    }
    if not hint:
        return table, ""

    entry = get_model_prices(hint)
    if not entry:
        return table, hint

    # Price kind -> key in the looked-up price entry (per-token cost).
    source_fields = {
        "input": "input_cost_per_token",
        "cache_read": "cache_read_input_token_cost",
        "cache_creation": "cache_creation_input_token_cost",
        "completion": "output_cost_per_token",
    }
    for kind, field in source_fields.items():
        per_million = entry.get(field, 0) * 1e6
        table[kind] = {"value": per_million, "found": per_million > 0}

    input_price = table["input"]["value"]
    completion_price = table["completion"]["value"]

    # Fallback estimates based on standard ratios relative to the input price:
    #   cache read = input * 0.1 (90% discount)
    #   cache creation = input * 1.25 (25% premium)
    #   completion = input * 5 (typical ratio)
    if input_price > 0:
        for kind, ratio in (("cache_read", 0.1), ("cache_creation", 1.25), ("completion", 5)):
            if not table[kind]["found"]:
                table[kind]["value"] = input_price * ratio
    elif completion_price > 0:
        # Only a completion price is known: back out an input estimate from it.
        estimated_input = completion_price / 5
        if not table["input"]["found"]:
            table["input"]["value"] = estimated_input
        for kind, ratio in (("cache_read", 0.1), ("cache_creation", 1.25)):
            if not table[kind]["found"]:
                table[kind]["value"] = estimated_input * ratio

    return table, hint
def _build_selection_payload(row_idx: int | None, df: pd.DataFrame):
    """Assemble the 9-tuple of UI updates for a leaderboard row selection.

    Tuple layout: folder, display name, visibility update, four price-field
    updates (Input, Cache Read, Cache Creation, Completion), detected model
    hint, and an update carrying the overhead value.

    When *df* is ``None``/empty or *row_idx* is ``None`` a "nothing selected"
    payload is returned: empty strings, ``visible=False``, zeroed prices with
    plain labels, and an overhead of 1.0.
    """
    price_fields = (
        ("input", "Input"),
        ("cache_read", "Cache Read"),
        ("cache_creation", "Cache Creation"),
        ("completion", "Completion"),
    )

    if df is None or df.empty or row_idx is None:
        cleared_prices = [gr.update(value=0, label=caption) for _, caption in price_fields]
        return (
            "", "",
            gr.update(visible=False),
            *cleared_prices,
            "",
            gr.update(value=1.0),
        )

    selected = df.iloc[row_idx]
    folder = selected["folder"]
    display_name = selected["name"]
    prices_dict, model_hint = get_prices_for_folder(folder)
    default_overhead = get_default_overhead(model_hint)

    def render_price(info, caption):
        # Check mark: price came from the lookup. Cross + "(est.)": value was
        # estimated. Cross alone: no usable value.
        amount = info["value"]
        if info["found"]:
            return gr.update(value=amount, label=f"✅ {caption}")
        if amount > 0:
            return gr.update(value=amount, label=f"❌ {caption} (est.)")
        return gr.update(value=0, label=f"❌ {caption}")

    price_updates = [render_price(prices_dict[key], caption) for key, caption in price_fields]
    return (
        folder, display_name,
        gr.update(visible=True),
        *price_updates,
        model_hint,
        gr.update(value=default_overhead),
    )
def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
    """Handle a row-selection event and return the selection payload.

    ``evt.index`` may be a list/tuple, in which case its first element is used
    as the row index, or a bare value that is used as-is. A missing event or
    index is forwarded as ``None``.
    """
    if evt is None or evt.index is None:
        return _build_selection_payload(None, df)

    position = evt.index
    if isinstance(position, (list, tuple)):
        position = position[0]
    return _build_selection_payload(position, df)
def select_first_row(df: pd.DataFrame):
    """Return the selection payload for row 0, or the empty payload if *df* has no rows."""
    if df is None or df.empty:
        return _build_selection_payload(None, df)
    return _build_selection_payload(0, df)
def create_routed_token_chart(original_tokens: dict, base_tokens: dict, additional_models: list, base_model_name: str = "Base"):
    """Create a grouped+stacked bar chart comparing Calculated vs Routed tokens.

    For each token category two bar groups are drawn: a hatched "no routing"
    bar, and a "with routing" stack made of the base model's share plus one
    segment per additional model.

    Args:
        original_tokens: dict with uncached_input, cache_read, cache_creation,
            completion (from Calculated). Missing keys count as 0.
        base_tokens: dict with the same keys (base portion in routing).
        additional_models: list of (model_name, tokens_dict) tuples. A falsy
            model_name is displayed as "Model <n>".
        base_model_name: name of the base model.

    Returns:
        The Plotly figure, with both totals (in millions) shown in an annotation.
    """
    categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
    token_keys = ["uncached_input", "cache_read", "cache_creation", "completion"]
    base_color_dark = "#636EFA"
    model_colors = ["#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        name=f"{base_model_name} [no routing]",
        x=categories,
        y=[original_tokens.get(k, 0) / 1e6 for k in token_keys],
        marker_color="rgba(99, 110, 250, 0.3)",
        marker_line_color=base_color_dark,
        marker_line_width=1,
        marker_pattern_shape="/",
        marker_pattern_fgcolor=base_color_dark,
        offsetgroup="calculated",
        # Plotly hover text uses <br> as its line break.
        hovertemplate="%{x}<br>" + base_model_name + " [no routing]: %{y:.3f}M",
    ))
    fig.add_trace(go.Bar(
        name=f"{base_model_name} [with routing]",
        x=categories,
        y=[base_tokens.get(k, 0) / 1e6 for k in token_keys],
        marker_color=base_color_dark,
        offsetgroup="routed",
        hovertemplate="%{x}<br>" + base_model_name + " [with routing]: %{y:.3f}M",
    ))
    for i, (model_name, tokens) in enumerate(additional_models):
        label = model_name or f"Model {i+1}"
        fig.add_trace(go.Bar(
            name=label,
            x=categories,
            y=[tokens.get(k, 0) / 1e6 for k in token_keys],
            # Colors cycle when there are more models than palette entries.
            marker_color=model_colors[i % len(model_colors)],
            offsetgroup="routed",
            hovertemplate="%{x}<br>" + label + ": %{y:.3f}M",
        ))

    original_total = sum(original_tokens.get(k, 0) for k in token_keys)
    routed_total = sum(base_tokens.get(k, 0) for k in token_keys) + sum(
        sum(tokens.get(k, 0) for k in token_keys) for _, tokens in additional_models
    )
    annotation_lines = [
        f"No routing: {original_total/1e6:.2f}M",
        f"With routing: {routed_total/1e6:.2f}M",
    ]
    fig.update_layout(
        yaxis_title="Tokens (M)",
        barmode="stack",
        bargroupgap=0.1,
        margin=dict(l=40, r=40, t=40, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, traceorder="normal"),
    )
    fig.add_annotation(
        text="<br>".join(annotation_lines),
        xref="paper", yref="paper",
        x=0.02, y=0.98, showarrow=False,
        font=dict(size=11),
        align="left",
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="gray",
        borderwidth=1,
    )
    return fig
def create_routed_cost_chart(original_costs: dict, base_costs: dict, additional_models: list, base_model_name: str = "Base"):
    """Create a grouped+stacked bar chart comparing Calculated vs Routed costs.

    For each token category two bar groups are drawn: a hatched "no routing"
    bar, and a "with routing" stack made of the base model's share plus one
    segment per additional model.

    Args:
        original_costs: dict with uncached_input, cache_read, cache_creation,
            completion (from Calculated). Missing keys count as 0.
        base_costs: dict with the same keys (base portion in routing).
        additional_models: list of (model_name, costs_dict) tuples. A falsy
            model_name is displayed as "Model <n>".
        base_model_name: name of the base model.

    Returns:
        The Plotly figure, with both totals (in dollars) shown in an annotation.
    """
    categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
    cost_keys = ["uncached_input", "cache_read", "cache_creation", "completion"]
    base_color_dark = "#636EFA"
    model_colors = ["#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        name=f"{base_model_name} [no routing]",
        x=categories,
        y=[original_costs.get(k, 0) for k in cost_keys],
        marker_color="rgba(99, 110, 250, 0.3)",
        marker_line_color=base_color_dark,
        marker_line_width=1,
        marker_pattern_shape="/",
        marker_pattern_fgcolor=base_color_dark,
        offsetgroup="calculated",
        # Plotly hover text uses <br> as its line break.
        hovertemplate="%{x}<br>" + base_model_name + " [no routing]: $%{y:.2f}",
    ))
    fig.add_trace(go.Bar(
        name=f"{base_model_name} [with routing]",
        x=categories,
        y=[base_costs.get(k, 0) for k in cost_keys],
        marker_color=base_color_dark,
        offsetgroup="routed",
        hovertemplate="%{x}<br>" + base_model_name + " [with routing]: $%{y:.2f}",
    ))
    for i, (model_name, costs) in enumerate(additional_models):
        label = model_name or f"Model {i+1}"
        fig.add_trace(go.Bar(
            name=label,
            x=categories,
            y=[costs.get(k, 0) for k in cost_keys],
            # Colors cycle when there are more models than palette entries.
            marker_color=model_colors[i % len(model_colors)],
            offsetgroup="routed",
            hovertemplate="%{x}<br>" + label + ": $%{y:.2f}",
        ))

    original_total = sum(original_costs.get(k, 0) for k in cost_keys)
    routed_total = sum(base_costs.get(k, 0) for k in cost_keys) + sum(
        sum(costs.get(k, 0) for k in cost_keys) for _, costs in additional_models
    )
    annotation_lines = [
        f"No routing: ${original_total:.2f}",
        f"With routing: ${routed_total:.2f}",
    ]
    fig.update_layout(
        yaxis_title="Cost ($)",
        barmode="stack",
        bargroupgap=0.1,
        margin=dict(l=40, r=40, t=40, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, traceorder="normal"),
    )
    fig.add_annotation(
        text="<br>".join(annotation_lines),
        xref="paper", yref="paper",
        x=0.02, y=0.98, showarrow=False,
        font=dict(size=11),
        align="left",
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="gray",
        borderwidth=1,
    )
    return fig
def build_app():
leaderboard_df = get_bash_only_df()
with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
gr.HTML("""
""")
trajectories_state = gr.State(None)
gr.Markdown("# 🧮 SWE-bench Costs Calculator `v0.3.45`")
gr.Markdown("### *Calculate cost savings with different routing strategies.*")
gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
with gr.Row():
with gr.Column(scale=3):
leaderboard_table = gr.Dataframe(
value=leaderboard_df,
label="Bash-Only Leaderboard",
interactive=False,
wrap=True,
elem_id="leaderboard-table",
)
with gr.Column(visible=False) as analysis_section:
gr.Markdown("## 📊 Trajectory Analysis")
with gr.Accordion("Leaderboard data", open=True):
with gr.Row():
plot_steps = gr.Plot(label="Distribution of API Calls (Steps) per Trajectory")
plot_cost = gr.Plot(label="Distribution of Cost Reported by Leaderboard ($)")
with gr.Accordion("Token counts REPORTED in the metadata of .traj files [AGGREGATED ALL]", open=True):
with gr.Row():
plot_tokens_meta = gr.Plot(label="Total Tokens by Type")
plot_tokens_cost_meta = gr.Plot(label="Total Cost by Token Type ($)")
with gr.Accordion("Token counts REPORTED in the metadata of .traj files [AGGREGATED BY TRAJECTORY]", open=True):
with gr.Row():
plot_stacked_meta = gr.Plot(label="Tokens per Trajectory (stacked)")
with gr.Row():
plot_cost_breakdown_meta = gr.Plot(label="Cost per Trajectory")
with gr.Accordion("Token counts REPORTED in the metadata of .traj files [ONE TRAJECTORY]", open=True, visible=False) as single_traj_meta_accordion:
with gr.Row():
single_traj_meta_dropdown = gr.Dropdown(label="Select Issue", choices=[], interactive=True)
with gr.Row():
single_traj_meta_plot = gr.Plot(label="Tokens per Step (stacked)")
with gr.Row():
single_traj_meta_cost_plot = gr.Plot(label="Cost per Step (stacked) ($)")
with gr.Accordion("Token counts CALCULATED from .traj files [AGGREGATED ALL]", open=True):
with gr.Row():
plot_tokens_calc = gr.Plot(label="Total Tokens by Type")
plot_tokens_cost_calc = gr.Plot(label="Total Cost by Token Type ($)")
with gr.Accordion("Token counts CALCULATED from .traj files [AGGREGATED BY TRAJECTORY]", open=True):
with gr.Row():
plot_stacked_calc = gr.Plot(label="Tokens per Trajectory (stacked)")
with gr.Row():
plot_cost_breakdown_calc = gr.Plot(label="Cost per Trajectory")
with gr.Accordion("Token counts CALCULATED from .traj files [ONE TRAJECTORY]", open=True, visible=False) as single_traj_accordion:
with gr.Row():
single_traj_dropdown = gr.Dropdown(label="Select Issue", choices=[], interactive=True)
with gr.Row():
single_traj_plot = gr.Plot(label="Tokens per Step (stacked)")
with gr.Row():
single_traj_cost_plot = gr.Plot(label="Cost per Step (stacked) ($)")
with gr.Accordion("Token counts CALCULATED from .traj files, with ROUTING [AGGREGATED ALL]", open=True, visible=False) as routing_plots_row:
with gr.Row():
routing_tokens_plot = gr.Plot(label="Tokens by Type (per Model)")
routing_cost_plot = gr.Plot(label="Cost by Type (per Model) ($)")
gr.Markdown("*With routing all messages in the trajectory remain as they are, but messages that match the selected filters are assigned to selected models for routing to.*")
with gr.Column(scale=1):
selected_folder = gr.State("")
gr.Markdown("### Selected Model")
selected_name = gr.Textbox(label="Model Name", interactive=False)
analyze_btn = gr.Button("📊 Load & Analyze", visible=False, variant="primary")
download_status = gr.Textbox(label="Status", interactive=False, lines=3)
gr.Markdown("---")
gr.Markdown("### 💰 Token Prices ($/1M) · *[litellm](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)*")
detected_model = gr.Textbox(label="Detected Model", interactive=False)
with gr.Row():
price_input = gr.Number(label="Input", value=0, precision=2, scale=1)
price_cache_read = gr.Number(label="Cache Read", value=0, precision=2, scale=1)
price_cache_creation = gr.Number(label="Cache Creation", value=0, precision=2, scale=1)
price_completion = gr.Number(label="Completion", value=0, precision=2, scale=1)
gr.Markdown("---")
gr.Markdown("### 🔢 Calculated Token Options")
thinking_overhead = gr.Number(
label="Tokenizer Overhead",
value=1.21,
precision=2,
info="Multiplier for Calculated tokens (tiktoken → native)",
)
use_cache = gr.Checkbox(
label="Use Cache",
value=True,
info="If disabled, all tokens are Uncached Input or Completion",
)
gr.Markdown("---")
add_routing_btn = gr.Button("➕ Add Routing", variant="primary", visible=False)
gr.Markdown("*With routing all messages in the trajectory remain as they are, but messages that match the selected filters are assigned to selected models for routing to.*")
with gr.Column(visible=False) as routing_section:
gr.Markdown("### 🔀 Routing Models")
with gr.Column():
with gr.Group():
gr.Markdown("#### Route to Model 1")
with gr.Row(elem_classes=["quick-select-row"]):
quick_btns_1 = []
for item in QUICK_SELECT_MODELS:
if isinstance(item, tuple):
model, short_name = item
else:
model = item
short_name = model.split("/")[-1]
btn = gr.Button(short_name, size="sm", scale=0, min_width=80)
quick_btns_1.append((btn, model))
routing_model_1 = gr.Dropdown(
label="Model (type 3+ chars to search)",
choices=[],
allow_custom_value=True,
interactive=True,
)
with gr.Row():
routing_price_1_input = gr.Number(label="Input", precision=3, scale=1)
routing_price_1_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
routing_price_1_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
routing_price_1_completion = gr.Number(label="Completion", precision=3, scale=1)
add_model_2_btn = gr.Button("+ Add another model", size="sm", visible=False)
with gr.Column(visible=False) as routing_block_2:
with gr.Group():
gr.Markdown("#### Route to Model 2")
with gr.Row(elem_classes=["quick-select-row"]):
quick_btns_2 = []
for item in QUICK_SELECT_MODELS:
if isinstance(item, tuple):
model, short_name = item
else:
model = item
short_name = model.split("/")[-1]
btn = gr.Button(short_name, size="sm", scale=0, min_width=80)
quick_btns_2.append((btn, model))
routing_model_2 = gr.Dropdown(
label="Model (type 3+ chars to search)",
choices=[],
allow_custom_value=True,
interactive=True,
)
with gr.Row():
routing_price_2_input = gr.Number(label="Input", precision=3, scale=1)
routing_price_2_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
routing_price_2_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
routing_price_2_completion = gr.Number(label="Completion", precision=3, scale=1)
add_model_3_btn = gr.Button("+ Add another model", size="sm", visible=False)
with gr.Column(visible=False) as routing_block_3:
with gr.Group():
gr.Markdown("#### Route to Model 3")
with gr.Row(elem_classes=["quick-select-row"]):
quick_btns_3 = []
for item in QUICK_SELECT_MODELS:
if isinstance(item, tuple):
model, short_name = item
else:
model = item
short_name = model.split("/")[-1]
btn = gr.Button(short_name, size="sm", scale=0, min_width=80)
quick_btns_3.append((btn, model))
routing_model_3 = gr.Dropdown(
label="Model (type 3+ chars to search)",
choices=[],
allow_custom_value=True,
interactive=True,
)
with gr.Row():
routing_price_3_input = gr.Number(label="Input", precision=3, scale=1)
routing_price_3_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
routing_price_3_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
routing_price_3_completion = gr.Number(label="Completion", precision=3, scale=1)
gr.Markdown("---")
gr.Markdown("### 🎯 Router Strategy")
selected_strategy = gr.Radio(
choices=["Random router", "Every k-th step", "Python list slices", "Grep", "Resolved/Unresolved", "Replace part of trajectory"],
value="Random router",
label="",
interactive=True,
)
num_routing_models = gr.State(1)
with gr.Column(visible=True) as random_block:
random_hint = gr.Markdown("*Weights must sum to 1.0*")
weight_base = gr.Number(label="Base weight", value=0.5, minimum=0, maximum=1, precision=2, interactive=True)
weight_model_1 = gr.Number(label="Model 1 weight", value=0.5, minimum=0, maximum=1, precision=2, interactive=True)
weight_model_2 = gr.Number(label="Model 2 weight", value=0, minimum=0, maximum=1, precision=2, interactive=True, visible=False)
weight_model_3 = gr.Number(label="Model 3 weight", value=0, minimum=0, maximum=1, precision=2, interactive=True, visible=False)
with gr.Column(visible=False) as every_k_block:
every_k_hint = gr.Markdown("*First model has priority on overlaps*")
k_model_1 = gr.Number(label="k₁ (Model 1)", value=2, minimum=1, precision=0, interactive=True)
k_model_2 = gr.Number(label="k₂ (Model 2)", value=3, minimum=1, precision=0, interactive=True, visible=False)
k_model_3 = gr.Number(label="k₃ (Model 3)", value=5, minimum=1, precision=0, interactive=True, visible=False)
with gr.Column(visible=False) as slice_block:
slice_hint = gr.Markdown("*First model has priority on overlaps*")
slice_model_1 = gr.Textbox(label="M1 slice", value="[0::3]", interactive=True)
slice_model_2 = gr.Textbox(label="M2 slice", value="[1::3]", interactive=True, visible=False)
slice_model_3 = gr.Textbox(label="M3 slice", value="[2::3]", interactive=True, visible=False)
with gr.Column(visible=False) as grep_block:
grep_hint = gr.Markdown("*Use `|` for OR, `&` for AND (don't mix). First model has priority on overlaps*")
grep_model_1 = gr.Textbox(label="M1 grep", value="ls|find", interactive=True)
grep_model_2 = gr.Textbox(label="M2 grep", value="cat|echo|printf|tee", interactive=True, visible=False)
grep_model_3 = gr.Textbox(label="M3 grep", value="python&.py", interactive=True, visible=False)
with gr.Column(visible=False) as resolved_block:
resolved_hint = gr.Markdown("*Route all steps based on trajectory resolution status*")
resolved_model = gr.Dropdown(
label="Model for resolved trajectories",
choices=["Base", "M1", "M2", "M3"],
value="Base",
interactive=True,
)
unresolved_model = gr.Dropdown(
label="Model for unresolved trajectories",
choices=["Base", "M1", "M2", "M3"],
value="M1",
interactive=True,
)
with gr.Column(visible=False) as part_block:
part_hint = gr.Markdown("*Ranges must not overlap*")
part_mode = gr.Radio(
choices=["Indexes", "Percentages"],
value="Percentages",
label="Mode",
interactive=True,
)
start_1 = gr.Number(label="M1 Start", value=0, minimum=0, precision=0, interactive=True)
end_1 = gr.Number(label="M1 End", value=30, minimum=0, precision=0, interactive=True)
start_2 = gr.Number(label="M2 Start", value=30, minimum=0, precision=0, interactive=True, visible=False)
end_2 = gr.Number(label="M2 End", value=60, minimum=0, precision=0, interactive=True, visible=False)
start_3 = gr.Number(label="M3 Start", value=60, minimum=0, precision=0, interactive=True, visible=False)
end_3 = gr.Number(label="M3 End", value=100, minimum=0, precision=0, interactive=True, visible=False)
gr.Markdown("---")
route_btn = gr.Button("🚀 Let's ROUTE!!", variant="primary", size="lg", interactive=False)
routing_result = gr.Markdown(visible=False)
def toggle_routing_section():
    """Return the update that makes the routing section visible."""
    reveal = gr.update(visible=True)
    return reveal
# Clicking "add routing" reveals the routing section (the handler takes no inputs).
add_routing_btn.click(
fn=toggle_routing_section,
outputs=[routing_section],
)
def on_strategy_change(strategy, num_models):
    """Show only the widgets that belong to the selected routing strategy.

    Returns a list of 34 ``gr.update(visible=...)`` objects: first the six
    strategy containers (random, every-k, slice, grep, resolved, part), then
    each strategy's own widgets in that same strategy order. Widgets that
    belong to M2 / M3 are visible only when ``num_models`` is at least 2 / 3.
    """
    strategy_labels = (
        "Random router",
        "Every k-th step",
        "Python list slices",
        "Grep",
        "Resolved/Unresolved",
        "Replace part of trajectory",
    )
    active = [strategy == label for label in strategy_labels]
    optional_models = (num_models >= 2, num_models >= 3)

    # Per strategy: (widgets that are always shown, widgets per optional model).
    #   random   -> hint, base weight, M1 weight | one weight per extra model
    #   every-k  -> hint, M1 k                   | one k per extra model
    #   slice    -> hint, M1 slice               | one slice per extra model
    #   grep     -> hint, M1 grep                | one grep per extra model
    #   resolved -> hint, two dropdowns          | nothing per extra model
    #   part     -> hint, mode, M1 start/end     | start + end per extra model
    layout = ((3, 1), (2, 1), (2, 1), (2, 1), (3, 0), (4, 2))

    visibility = list(active)
    for shown, (fixed_count, per_model_count) in zip(active, layout):
        visibility.extend([shown] * fixed_count)
        for has_model in optional_models:
            visibility.extend([shown and has_model] * per_model_count)

    return [gr.update(visible=flag) for flag in visibility]
# Re-evaluate widget visibility whenever the strategy changes.
# The outputs list is positional: it must stay in the same order as the list
# returned by on_strategy_change (6 containers, then each strategy's widgets).
selected_strategy.change(
fn=on_strategy_change,
inputs=[selected_strategy, num_routing_models],
outputs=[
random_block, every_k_block, slice_block, grep_block, resolved_block, part_block,
random_hint, weight_base, weight_model_1, weight_model_2, weight_model_3,
every_k_hint, k_model_1, k_model_2, k_model_3,
slice_hint, slice_model_1, slice_model_2, slice_model_3,
grep_hint, grep_model_1, grep_model_2, grep_model_3,
resolved_hint, resolved_model, unresolved_model,
part_hint, part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
],
)
def filter_models(query):
    """Return a choices update with up to 50 model names containing ``query``.

    Matching is a case-insensitive substring test. Queries shorter than three
    characters (or empty) produce an empty choice list.
    """
    query_too_short = not query or len(query) < 3
    if query_too_short:
        return gr.update(choices=[])

    catalog = get_litellm_model_list()
    needle = query.lower()
    hits = [name for name in catalog if needle in name.lower()]
    return gr.update(choices=hits[:50])
# Each routing-model dropdown feeds its own typed text to filter_models and
# receives the filtered choice list back (same component is input and output).
routing_model_1.input(fn=filter_models, inputs=[routing_model_1], outputs=[routing_model_1])
routing_model_2.input(fn=filter_models, inputs=[routing_model_2], outputs=[routing_model_2])
routing_model_3.input(fn=filter_models, inputs=[routing_model_3], outputs=[routing_model_3])
def make_quick_select_fn_1(full_model_name):
    """Build the click handler for an M1 quick-select button.

    The handler returns: the dropdown value update, the four price updates,
    an update making a component visible, and an update making a component
    interactive (seven values in total).
    """
    def fn():
        price_updates = get_routing_prices_with_labels(full_model_name)
        return (
            gr.update(value=full_model_name),
            *price_updates,
            gr.update(visible=True),
            gr.update(interactive=True),
        )

    return fn
def make_quick_select_fn_2(full_model_name):
    """Build the click handler for an M2 quick-select button.

    The handler returns: the dropdown value update, the four price updates,
    and an update making a component visible (six values in total).
    """
    def fn():
        price_updates = get_routing_prices_with_labels(full_model_name)
        return (
            gr.update(value=full_model_name),
            *price_updates,
            gr.update(visible=True),
        )

    return fn
def make_quick_select_fn_3(full_model_name):
    """Build the click handler for an M3 quick-select button.

    The handler returns the dropdown value update followed by the four price
    updates (five values in total).
    """
    def fn():
        price_updates = get_routing_prices_with_labels(full_model_name)
        return (
            gr.update(value=full_model_name),
            *price_updates,
        )

    return fn
# Wire every quick-select button to a handler bound to its model name.
# The factory functions capture full_model per button, so each handler keeps
# its own model instead of the loop's last value.
# M1 buttons: model dropdown + 4 prices + add_model_2_btn + route_btn (7 outputs).
for btn, full_model in quick_btns_1:
btn.click(
fn=make_quick_select_fn_1(full_model),
outputs=[routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion, add_model_2_btn, route_btn]
)
# M2 buttons: model dropdown + 4 prices + add_model_3_btn (6 outputs).
for btn, full_model in quick_btns_2:
btn.click(
fn=make_quick_select_fn_2(full_model),
outputs=[routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion, add_model_3_btn]
)
# M3 buttons: model dropdown + 4 prices (5 outputs).
for btn, full_model in quick_btns_3:
btn.click(
fn=make_quick_select_fn_3(full_model),
outputs=[routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion]
)
def get_routing_prices_with_labels(model_name):
    """Get all 4 prices for a routing model with found/estimated labels.

    Per-token prices are looked up in the table returned by
    ``get_litellm_prices()`` and multiplied by 1e6 (price per million tokens).
    A price counts as "found" when it is a positive number; missing, ``None``
    or non-numeric entries are treated as not found instead of raising.

    When the input price is known but a cache price is not, the cache price
    is estimated from the input price (read: 10%, creation: 125%) and its
    label keeps the "not found" marker so the user can see it is an estimate.

    Args:
        model_name: Key into the price table; falsy means "no model selected".

    Returns:
        A 4-tuple of ``gr.update`` objects (input, cache read, cache creation,
        completion), each carrying a value and a label.
    """
    if not model_name:
        return (
            gr.update(value=0, label="Input"),
            gr.update(value=0, label="Cache Read"),
            gr.update(value=0, label="Cache Creation"),
            gr.update(value=0, label="Completion"),
        )

    # Tolerate an unavailable price table and a null entry for the model.
    model_prices = (get_litellm_prices() or {}).get(model_name) or {}

    def per_million(field):
        """Return the price per 1M tokens for ``field``, or 0 when unusable."""
        raw = model_prices.get(field)
        if isinstance(raw, bool) or not isinstance(raw, (int, float)):
            return 0
        return raw * 1e6

    input_price = per_million("input_cost_per_token")
    cache_read = per_million("cache_read_input_token_cost")
    cache_creation = per_million("cache_creation_input_token_cost")
    completion = per_million("output_cost_per_token")

    input_found = input_price > 0
    cache_read_found = cache_read > 0
    cache_creation_found = cache_creation > 0
    completion_found = completion > 0

    # Fall back to estimates derived from the input price.
    if not cache_read_found and input_found:
        cache_read = input_price * 0.1
    if not cache_creation_found and input_found:
        cache_creation = input_price * 1.25

    def label(name, found):
        return f"✅ {name}" if found else f"❌ {name}"

    return (
        gr.update(value=input_price, label=label("Input", input_found)),
        gr.update(value=cache_read, label=label("Cache Read", cache_read_found)),
        gr.update(value=cache_creation, label=label("Cache Creation", cache_creation_found)),
        gr.update(value=completion, label=label("Completion", completion_found)),
    )
def on_routing_model_1_select(model_name):
    """Return M1 price updates plus two toggles driven by whether a model is set.

    The last two values are a visibility update and an interactivity update;
    both are on when ``model_name`` is truthy and off otherwise.
    """
    price_updates = get_routing_prices_with_labels(model_name)
    has_model = bool(model_name)
    return (
        *price_updates,
        gr.update(visible=has_model),
        gr.update(interactive=has_model),
    )
def on_routing_model_2_select(model_name):
    """Return M2 price updates plus a visibility update that is on when a model is set."""
    price_updates = get_routing_prices_with_labels(model_name)
    has_model = bool(model_name)
    return (
        *price_updates,
        gr.update(visible=has_model),
    )
def on_routing_model_3_select(model_name):
    """Return the four price updates for the model selected as M3."""
    price_updates = get_routing_prices_with_labels(model_name)
    return price_updates
# Selecting M1 refreshes its four price fields, shows/hides add_model_2_btn
# and enables/disables route_btn (order matches on_routing_model_1_select's return).
routing_model_1.change(
fn=on_routing_model_1_select,
inputs=[routing_model_1],
outputs=[routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion, add_model_2_btn, route_btn],
)
def show_model_2(strategy):
    """Return the updates that reveal the second routing model.

    The 9-tuple is: show an element, hide an element, then visibility updates
    for the M2 weight, k, slice, grep, start and end widgets (each visible
    only when ``strategy`` is the strategy that owns it), and finally the
    integer ``2``.
    """
    # (strategy label, number of M2 widgets owned by that strategy)
    owned_widgets = (
        ("Random router", 1),
        ("Every k-th step", 1),
        ("Python list slices", 1),
        ("Grep", 1),
        ("Replace part of trajectory", 2),
    )

    updates = [gr.update(visible=True), gr.update(visible=False)]
    for label, widget_count in owned_widgets:
        for _ in range(widget_count):
            updates.append(gr.update(visible=strategy == label))
    updates.append(2)
    return tuple(updates)
# "Add model 2": reveal routing_block_2, hide the add button, show the M2
# widgets of the current strategy and store the handler's last value (2)
# in num_routing_models. Output order matches show_model_2's return tuple.
add_model_2_btn.click(
fn=show_model_2,
inputs=[selected_strategy],
outputs=[routing_block_2, add_model_2_btn, weight_model_2, k_model_2, slice_model_2, grep_model_2, start_2, end_2, num_routing_models],
)
# Selecting M2 refreshes its four price fields and shows/hides add_model_3_btn.
routing_model_2.change(
fn=on_routing_model_2_select,
inputs=[routing_model_2],
outputs=[routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion, add_model_3_btn],
)
def show_model_3(strategy):
    """Return the updates that reveal the third routing model.

    The 9-tuple is: show an element, hide an element, then visibility updates
    for the M3 weight, k, slice, grep, start and end widgets (each visible
    only when ``strategy`` is the strategy that owns it), and finally the
    integer ``3``.
    """
    # (strategy label, number of M3 widgets owned by that strategy)
    owned_widgets = (
        ("Random router", 1),
        ("Every k-th step", 1),
        ("Python list slices", 1),
        ("Grep", 1),
        ("Replace part of trajectory", 2),
    )

    updates = [gr.update(visible=True), gr.update(visible=False)]
    for label, widget_count in owned_widgets:
        for _ in range(widget_count):
            updates.append(gr.update(visible=strategy == label))
    updates.append(3)
    return tuple(updates)
# "Add model 3": reveal routing_block_3, hide the add button, show the M3
# widgets of the current strategy and store the handler's last value (3)
# in num_routing_models. Output order matches show_model_3's return tuple.
add_model_3_btn.click(
fn=show_model_3,
inputs=[selected_strategy],
outputs=[routing_block_3, add_model_3_btn, weight_model_3, k_model_3, slice_model_3, grep_model_3, start_3, end_3, num_routing_models],
)
# Selecting M3 only refreshes its four price fields.
routing_model_3.change(
fn=on_routing_model_3_select,
inputs=[routing_model_3],
outputs=[routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion],
)
def run_routing(
    state_data,
    base_input, base_cache_read, base_cache_creation, base_completion,
    routing_model_1_val, r1_input, r1_cache_read, r1_cache_creation, r1_completion,
    routing_model_2_val, r2_input, r2_cache_read, r2_cache_creation, r2_completion,
    routing_model_3_val, r3_input, r3_cache_read, r3_cache_creation, r3_completion,
    strategy_val,
    weight_base_val, weight_1_val, weight_2_val, weight_3_val,
    k_1_val, k_2_val, k_3_val,
    slice_1_val, slice_2_val, slice_3_val,
    grep_1_val, grep_2_val, grep_3_val,
    resolved_model_val, unresolved_model_val,
    part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val,
    overhead, with_cache,
    detected_model_val
):
    """Simulate routing trajectory steps to other models and report the cost.

    Every step of every loaded trajectory is assigned either to the base
    model or to one of up to three routing models according to
    ``strategy_val``. Token totals per model come from
    ``calculate_routing_tokens`` and are priced with the per-million-token
    prices passed in.

    Per-model settings (weight, k, slice, grep pattern, start/end range and
    the "M1"/"M2"/"M3" labels) always belong to the UI slot they were entered
    in, even when an earlier slot has no model selected.

    Args:
        state_data: Dict produced by the load step; the keys read here are
            "steps", "resolved" and "calculated". ``None`` means nothing has
            been loaded.
        base_*: Prices of the base model.
        routing_model_N_val, rN_*: Name and prices of routing model N; a
            falsy name disables that slot.
        strategy_val: Label of the routing strategy.
        weight_*_val: Random-router weights (base + one per model).
        k_N_val: "Every k-th step" period per model.
        slice_N_val: Python slice notation per model, e.g. ``[0::3]``.
        grep_N_val: Word pattern per model, using ``|`` (any) or ``&`` (all).
        resolved_model_val, unresolved_model_val: "Base"/"M1"/"M2"/"M3".
        part_mode_val, start_N_val, end_N_val: Range replacement settings.
        overhead: Multiplier applied to each step's completion tokens.
        with_cache: Whether the displayed breakdown keeps cache categories.
        detected_model_val: Display name of the base model (falls back to "Base").

    Yields:
        One 4-tuple ``(result markdown update, plots row update, tokens
        chart, cost chart)``. On validation failure the tuple carries an
        error message, hides the plots row and leaves both charts ``None``.
    """
    price_fields = ("input", "cache_read", "cache_creation", "completion")
    token_fields = ("uncached_input", "cache_read", "cache_creation", "completion")
    BASE_MODEL = "__base__"

    def failure(message):
        """Build the output tuple for a user-facing error."""
        return (
            gr.update(visible=True, value=message),
            gr.update(visible=False),
            None, None,
        )

    def parse_slice(slice_str):
        """Parse slice notation like ``[0::3]`` into a ``slice``; raise ValueError if invalid."""
        text = slice_str.strip()
        if text.startswith("[") and text.endswith("]"):
            text = text[1:-1]
        parts = text.split(":")
        if len(parts) not in (2, 3):
            raise ValueError(f"expected start:stop or start:stop:step, got {slice_str!r}")
        bounds = [int(part) if part.strip() else None for part in parts]
        if len(bounds) == 3 and bounds[2] == 0:
            raise ValueError("slice step cannot be zero")
        return slice(*bounds)

    def build_grep_matcher(pattern):
        """Compile a ``|``/``&`` word pattern into a predicate, or None when it has no words."""
        pattern = (pattern or "").strip()
        if "|" in pattern:
            combine, words = any, pattern.split("|")
        elif "&" in pattern:
            combine, words = all, pattern.split("&")
        else:
            combine, words = any, [pattern]
        regexes = [
            re.compile(r'\b' + re.escape(word.strip()) + r'\b')
            for word in words if word.strip()
        ]
        if not regexes:
            return None

        def matches(text):
            return bool(text) and combine(rx.search(text) is not None for rx in regexes)

        return matches

    def calc_cost(tokens: dict, prices: dict) -> float:
        return (
            tokens["uncached_input"] * prices["input"] / 1e6 +
            tokens["cache_read"] * prices["cache_read"] / 1e6 +
            tokens["cache_creation"] * prices["cache_creation"] / 1e6 +
            tokens["completion"] * prices["completion"] / 1e6
        )

    def tokens_to_costs(tokens: dict, prices: dict) -> dict:
        price_map = {"uncached_input": "input", "cache_read": "cache_read", "cache_creation": "cache_creation", "completion": "completion"}
        return {k: tokens[k] * prices[price_map[k]] / 1e6 for k in tokens}

    def apply_display_formula(tokens: dict) -> dict:
        """Convert simulated totals into the breakdown shown in the charts."""
        prompt = tokens["cache_read"] + tokens["uncached_input"]
        if with_cache:
            uncached_display = max(0, prompt - tokens["cache_read"] - tokens["cache_creation"])
            return {
                "uncached_input": uncached_display,
                "cache_read": tokens["cache_read"],
                "cache_creation": tokens["cache_creation"],
                "completion": tokens["completion"],
            }
        return {
            "uncached_input": prompt,
            "cache_read": 0,
            "cache_creation": 0,
            "completion": tokens["completion"],
        }

    # ---- Preconditions -----------------------------------------------------
    if state_data is None:
        yield failure("❌ No trajectories loaded. Click 'Load & Analyze' first.")
        return
    if not routing_model_1_val:
        yield failure("❌ Please select at least one routing model.")
        return

    trajectory_steps = state_data.get("steps", {})
    resolved_instances = state_data.get("resolved", {})
    if not trajectory_steps:
        yield failure("❌ No trajectory steps data available.")
        return

    base_prices = dict(zip(price_fields, (base_input, base_cache_read, base_cache_creation, base_completion)))

    # ---- Collect the configured routing models ------------------------------
    # Settings stay attached to their UI slot so that clearing M2 while M3 is
    # set does not hand M2's settings to M3.
    slot_settings = (
        (routing_model_1_val, (r1_input, r1_cache_read, r1_cache_creation, r1_completion),
         weight_1_val, k_1_val, slice_1_val, grep_1_val, (start_1_val, end_1_val)),
        (routing_model_2_val, (r2_input, r2_cache_read, r2_cache_creation, r2_completion),
         weight_2_val, k_2_val, slice_2_val, grep_2_val, (start_2_val, end_2_val)),
        (routing_model_3_val, (r3_input, r3_cache_read, r3_cache_creation, r3_completion),
         weight_3_val, k_3_val, slice_3_val, grep_3_val, (start_3_val, end_3_val)),
    )
    routing_models = []
    for slot, (name, prices, weight, k_val, slice_val, grep_val, part_range) in enumerate(slot_settings, start=1):
        if not name:
            continue
        routing_models.append({
            "name": name,
            "prices": dict(zip(price_fields, prices)),
            "slot": slot,
            "key": f"__routing_{len(routing_models)}__",
            "weight": weight,
            "k": k_val,
            "slice": slice_val,
            "grep": grep_val,
            "range": part_range,
        })
    model_keys = [BASE_MODEL] + [rm["key"] for rm in routing_models]

    # ---- Strategy-specific validation and one-time preparation -------------
    weights = None
    k_rules = []        # (model key, period)
    slice_rules = []    # (model key, slice object)
    grep_rules = []     # (model key, predicate)
    key_by_label = {f"M{rm['slot']}": rm["key"] for rm in routing_models}

    if strategy_val == "Replace part of trajectory":
        for rm in routing_models:
            start_val, end_val = rm["range"]
            if start_val is None or end_val is None:
                yield failure(f"❌ Model {rm['slot']}: Start and End must be set")
                return
            if start_val >= end_val:
                yield failure(f"❌ Model {rm['slot']}: Start must be less than End")
                return
        for position, first in enumerate(routing_models):
            for second in routing_models[position + 1:]:
                s1, e1 = first["range"]
                s2, e2 = second["range"]
                if not (e1 <= s2 or e2 <= s1):
                    yield failure(f"❌ Model {first['slot']} and Model {second['slot']} ranges overlap")
                    return
    elif strategy_val == "Random router":
        weights = [weight_base_val] + [rm["weight"] for rm in routing_models]
        if any(w is None for w in weights):
            yield failure("❌ All weights must be set")
            return
        if any(w < 0 for w in weights):
            yield failure("❌ Weights must not be negative")
            return
        total_weight = sum(weights)
        if abs(total_weight - 1.0) > 0.01:
            yield failure(f"❌ Weights must sum to 1.0 (current: {total_weight:.2f})")
            return
    elif strategy_val == "Grep":
        for rm in routing_models:
            pattern = rm["grep"]
            if pattern and "|" in pattern and "&" in pattern:
                yield failure(f"❌ M{rm['slot']} grep: cannot mix | and & operators")
                return
            matcher = build_grep_matcher(pattern)
            if matcher is not None:
                grep_rules.append((rm["key"], matcher))
    elif strategy_val == "Python list slices":
        for rm in routing_models:
            if not rm["slice"]:
                continue
            try:
                slice_rules.append((rm["key"], parse_slice(rm["slice"])))
            except ValueError:
                # Report the bad input instead of silently routing nothing.
                yield failure(f"❌ M{rm['slot']} slice: invalid slice notation '{rm['slice']}'")
                return
    elif strategy_val == "Every k-th step":
        for rm in routing_models:
            # int() first: a fractional k below 1 would otherwise become a zero divisor.
            period = int(rm["k"]) if rm["k"] else 0
            if period > 0:
                k_rules.append((rm["key"], period))

    # ---- Assign steps and accumulate tokens --------------------------------
    all_tokens = {key: dict.fromkeys(token_fields, 0) for key in model_keys}
    total_original_tokens = dict.fromkeys(token_fields, 0)

    for instance_id, steps in trajectory_steps.items():
        if not steps:
            continue
        total_steps = len(steps)
        step_to_model = {}

        if strategy_val == "Random router":
            picks = random.choices(model_keys, weights=weights, k=total_steps)
            step_to_model = dict(enumerate(picks))
        elif strategy_val == "Every k-th step":
            # Steps are counted from 1, so the k-th step has index k - 1.
            # setdefault keeps the first model's claim on shared steps.
            for key, period in k_rules:
                for i in range(period - 1, total_steps, period):
                    step_to_model.setdefault(i, key)
        elif strategy_val == "Python list slices":
            for key, step_slice in slice_rules:
                for i in range(total_steps)[step_slice]:
                    step_to_model.setdefault(i, key)
        elif strategy_val == "Grep":
            for i, step in enumerate(steps):
                content = step.get("content", "")
                for key, matcher in grep_rules:
                    if matcher(content):
                        step_to_model[i] = key
                        break
        elif strategy_val == "Resolved/Unresolved":
            is_resolved = resolved_instances.get(instance_id, False)
            target_label = resolved_model_val if is_resolved else unresolved_model_val
            # "Base", an empty choice, or a slot without a model keeps the base model.
            target_key = key_by_label.get(target_label)
            if target_key is not None:
                step_to_model = dict.fromkeys(range(total_steps), target_key)
        elif strategy_val == "Replace part of trajectory":
            for rm in routing_models:
                start_val, end_val = rm["range"]
                if part_mode_val == "Percentages":
                    start_idx = int(total_steps * start_val / 100)
                    end_idx = int(total_steps * end_val / 100)
                else:
                    start_idx = int(start_val)
                    end_idx = int(end_val)
                for i in range(start_idx, min(end_idx, total_steps)):
                    step_to_model[i] = rm["key"]

        routed_steps = []
        baseline_steps = []
        for i, step in enumerate(steps):
            shared = {
                "system_user": step.get("system_user", 0),
                "completion": int(step.get("completion", 0) * overhead),
                "observation": step.get("observation"),
            }
            routed_steps.append({"model": step_to_model.get(i, BASE_MODEL), **shared})
            baseline_steps.append({"model": BASE_MODEL, **shared})

        model_totals = calculate_routing_tokens(routed_steps)
        for key in model_keys:
            totals = model_totals.get(key, {})
            for field in token_fields:
                all_tokens[key][field] += totals.get(field, 0)

        # Same trajectory with every step on the base model, for comparison.
        baseline = calculate_routing_tokens(baseline_steps).get(BASE_MODEL, {})
        for field in token_fields:
            total_original_tokens[field] += baseline.get(field, 0)

    # ---- Cost summary --------------------------------------------------------
    # NOTE(review): these totals are priced from the simulated token totals and
    # do not depend on with_cache, while the charts below use
    # apply_display_formula, which does. Confirm that difference is intended.
    total_base_tokens = all_tokens[BASE_MODEL]
    total_base_cost = calc_cost(total_base_tokens, base_prices)

    routing_costs_list = []
    total_routing_cost = 0
    for rm in routing_models:
        tokens = all_tokens[rm["key"]]
        cost = calc_cost(tokens, rm["prices"])
        routing_costs_list.append({"name": rm["name"], "tokens": tokens, "cost": cost})
        total_routing_cost += cost

    total_original_cost = calc_cost(total_original_tokens, base_prices)
    total_routed_cost = total_base_cost + total_routing_cost
    savings = total_original_cost - total_routed_cost
    savings_pct = (savings / total_original_cost * 100) if total_original_cost > 0 else 0

    result_lines = [
        "## 🚀 Routing Results",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        f"| **Original Cost (base model only)** | ${total_original_cost:.2f} |",
        f"| **Routed Cost** | ${total_routed_cost:.2f} |",
        f"| ↳ Base model portion | ${total_base_cost:.2f} |",
    ]
    for rc in routing_costs_list:
        result_lines.append(f"| ↳ {rc['name']} | ${rc['cost']:.2f} |")
    result_lines.append(f'| **Savings** | ${savings:.2f} · {savings_pct:.1f}% |')
    result_text = "\n".join(result_lines)

    # ---- Chart data ------------------------------------------------------------
    total_base_tokens_display = apply_display_formula(total_base_tokens)
    base_costs = tokens_to_costs(total_base_tokens_display, base_prices)

    additional_token_models = []
    additional_cost_models = []
    for rm, rc in zip(routing_models, routing_costs_list):
        display_tokens = apply_display_formula(rc["tokens"])
        additional_token_models.append((rc["name"], display_tokens))
        additional_cost_models.append((rc["name"], tokens_to_costs(display_tokens, rm["prices"])))

    # Prefer the pre-computed per-trajectory table for the "original" bars;
    # fall back to the simulated all-base totals when it is missing or empty.
    df_calc = state_data.get("calculated")
    if df_calc is not None and not df_calc.empty:
        df_for_cost = apply_thinking_overhead(df_calc.copy(), overhead)
        if not with_cache:
            df_for_cost = apply_no_cache(df_for_cost)
        uncached_input = (
            df_for_cost["prompt_tokens"]
            - df_for_cost["cache_read_tokens"]
            - df_for_cost["cache_creation_tokens"]
        ).clip(lower=0)
        original_tokens_from_df = {
            "uncached_input": uncached_input.sum(),
            "cache_read": df_for_cost["cache_read_tokens"].sum(),
            "cache_creation": df_for_cost["cache_creation_tokens"].sum(),
            "completion": df_for_cost["completion_tokens"].sum(),
        }
    else:
        original_tokens_from_df = apply_display_formula(total_original_tokens)
    original_costs = tokens_to_costs(original_tokens_from_df, base_prices)

    base_model_name = detected_model_val or "Base"
    tokens_chart = create_routed_token_chart(original_tokens_from_df, total_base_tokens_display, additional_token_models, base_model_name)
    cost_chart = create_routed_cost_chart(original_costs, base_costs, additional_cost_models, base_model_name)

    yield (
        gr.update(visible=True, value=result_text),
        gr.update(visible=True),
        tokens_chart,
        cost_chart,
    )
# Run the routing simulation. The inputs list is positional and must stay in
# the same order as run_routing's parameters (state, base prices, three
# model/price groups, strategy, per-strategy settings, overhead/cache, base name).
route_btn.click(
fn=run_routing,
inputs=[
trajectories_state,
price_input, price_cache_read, price_cache_creation, price_completion,
routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion,
routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion,
routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion,
selected_strategy,
weight_base, weight_model_1, weight_model_2, weight_model_3,
k_model_1, k_model_2, k_model_3,
slice_model_1, slice_model_2, slice_model_3,
grep_model_1, grep_model_2, grep_model_3,
resolved_model, unresolved_model,
part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
thinking_overhead, use_cache,
detected_model,
],
outputs=[routing_result, routing_plots_row, routing_tokens_plot, routing_cost_plot],
)
# Selecting a leaderboard row fills the selection fields, the base prices,
# the detected model and the thinking overhead.
leaderboard_table.select(
fn=on_row_select,
inputs=[leaderboard_table],
outputs=[selected_folder, selected_name, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
)
# On page load, select_first_row populates the same outputs, and the JS
# snippet clicks the first table row of #leaderboard-table if one exists.
app.load(
fn=select_first_row,
inputs=[leaderboard_table],
outputs=[selected_folder, selected_name, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
js="""
(data) => {
const row = gradioApp()?.querySelector('#leaderboard-table table tbody tr');
if (row) {
row.click();
}
return data;
}
""",
)
def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache, progress=gr.Progress()):
    """Download (if needed), load and chart the trajectories of ``folder``.

    Generator click handler: it yields intermediate status tuples while
    working and a final tuple with all charts and the loaded state. Every
    yielded tuple has 22 elements.

    Args:
        folder: Trajectory folder to analyze; falsy yields an empty result.
        input_price, cache_read_price, cache_creation_price, completion_price:
            Prices forwarded to the chart builders.
        overhead: Thinking-overhead factor applied to the calculated data.
        with_cache: When false, ``apply_no_cache`` is applied to the calculated data.
        progress: Gradio progress tracker.

    Yields:
        Status-only tuples during download/loading or on failure, then one
        tuple with the success message, figures, state dict and dropdown updates.
    """
    prices = (input_price, cache_read_price, cache_creation_price, completion_price)

    def status_only(message, show_analysis=False):
        """Build a 22-tuple that sets only the status text and analysis visibility.

        Plots and state are cleared (``None``), the routing button and both
        single-trajectory accordions are hidden, and the remaining components
        receive a no-op ``gr.update()``.
        """
        return (
            message,
            gr.update(visible=show_analysis),
            None, None,
            None, None, None, None,
            None, None, None, None,
            None,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update(visible=False),
            gr.update(),
            gr.update(),
            gr.update(),
        )

    progress(0, desc="Ready")
    if not folder:
        progress(1, desc="No folder selected")
        yield status_only("")
        return

    if not check_trajectories_downloaded(folder):
        progress(0.1, desc="Preparing download")
        yield status_only("⏳ Downloading trajectories...")
        progress(0.3, desc="Downloading")
        status, _ = download_trajectories_from_s3(folder)
        if "❌" in status:
            progress(1, desc="Download failed")
            yield status_only(status)
            return

    progress(0.45, desc="Loading trajectories")
    yield status_only("⏳ Loading trajectories...", show_analysis=True)

    progress(0.6, desc="Reading metadata")
    df_meta = ensure_token_columns(load_all_trajectories(folder))
    # Bail out before touching df_meta's columns: with no metadata rows the
    # column copies below could fail before the user sees this message.
    if df_meta.empty:
        progress(1, desc="No trajectories found")
        yield status_only("❌ No trajectories found")
        return

    progress(0.7, desc="Reading calculated")
    df_calc = ensure_token_columns(load_all_trajectories_calculated(folder))
    # NOTE(review): copying by position assumes both frames list the same
    # trajectories in the same order — confirm against the two loaders.
    df_calc["api_calls"] = df_meta["api_calls"].values
    df_calc["instance_cost"] = df_meta["instance_cost"].values

    progress(0.75, desc="Reading steps")
    trajectory_steps = load_all_trajectory_steps(folder)
    progress(0.8, desc="Reading metadata steps")
    metadata_steps = load_all_trajectory_metadata_steps(folder)

    model_details, _ = get_model_details(folder)
    resolved_instances = {}
    if model_details:
        per_instance = model_details.get("per_instance_details", {})
        resolved_instances = {
            inst_id: details.get("resolved", False)
            for inst_id, details in per_instance.items()
        }

    state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps, "metadata_steps": metadata_steps, "resolved": resolved_instances}

    progress(0.9, desc="Building charts")
    fig_steps, fig_cost, _, _, _ = create_basic_histograms(df_meta, *prices)
    fig_tokens_meta, fig_tokens_cost_meta, fig_stacked_meta = create_token_charts(df_meta, *prices)
    fig_cost_breakdown_meta = create_cost_breakdown(df_meta, *prices)

    df_calc_processed = apply_thinking_overhead(df_calc.copy(), overhead)
    if not with_cache:
        df_calc_processed = apply_no_cache(df_calc_processed)
    fig_tokens_calc, fig_tokens_cost_calc, fig_stacked_calc = create_token_charts(df_calc_processed, *prices)
    fig_cost_breakdown_calc = create_cost_breakdown(df_calc_processed, *prices)

    # Pre-select the first issue (sorted by id) in each single-trajectory view.
    issue_ids = sorted(trajectory_steps.keys())
    first_issue = issue_ids[0] if issue_ids else None
    meta_issue_ids = sorted(metadata_steps.keys())
    first_meta_issue = meta_issue_ids[0] if meta_issue_ids else None
    has_meta_steps = len(meta_issue_ids) > 0

    fig_single_traj = None
    fig_single_traj_cost = None
    if first_issue and first_issue in trajectory_steps:
        calc_steps = trajectory_steps[first_issue]
        fig_single_traj = create_single_trajectory_chart(calc_steps, overhead, with_cache)
        fig_single_traj_cost = create_single_trajectory_cost_chart(calc_steps, *prices, overhead, with_cache)

    fig_single_traj_meta = None
    fig_single_traj_meta_cost = None
    if first_meta_issue and first_meta_issue in metadata_steps:
        meta_steps = metadata_steps[first_meta_issue]
        fig_single_traj_meta = create_single_trajectory_meta_chart(meta_steps)
        fig_single_traj_meta_cost = create_single_trajectory_meta_cost_chart(meta_steps, *prices)

    progress(1, desc="Done")
    yield (
        f"✅ Loaded {len(df_meta)} trajectories",
        gr.update(visible=True),
        fig_steps, fig_cost,
        fig_tokens_meta, fig_tokens_cost_meta, fig_stacked_meta, fig_cost_breakdown_meta,
        fig_tokens_calc, fig_tokens_cost_calc, fig_stacked_calc, fig_cost_breakdown_calc,
        state_data,
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(choices=issue_ids, value=first_issue),
        fig_single_traj,
        fig_single_traj_cost,
        gr.update(visible=has_meta_steps),
        gr.update(choices=meta_issue_ids, value=first_meta_issue),
        fig_single_traj_meta,
        fig_single_traj_meta_cost,
    )
def on_single_traj_select(state_data, issue_id, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
    """Build the token and cost charts for one trajectory from the "steps" data.

    Returns ``(None, None)`` when no state is loaded, no issue is selected,
    or the issue has no entry under ``state_data["steps"]``.
    """
    if state_data is not None and issue_id:
        steps_by_issue = state_data.get("steps", {})
        if issue_id in steps_by_issue:
            selected_steps = steps_by_issue[issue_id]
            tokens_fig = create_single_trajectory_chart(selected_steps, overhead, with_cache)
            cost_fig = create_single_trajectory_cost_chart(
                selected_steps,
                input_price, cache_read_price, cache_creation_price, completion_price,
                overhead, with_cache,
            )
            return tokens_fig, cost_fig
    return None, None
def on_single_traj_meta_select(state_data, issue_id, input_price, cache_read_price, cache_creation_price, completion_price):
    """Build the token and cost charts for one trajectory from the "metadata_steps" data.

    Returns ``(None, None)`` when no state is loaded, no issue is selected,
    or the issue has no entry under ``state_data["metadata_steps"]``.
    """
    if state_data is not None and issue_id:
        meta_by_issue = state_data.get("metadata_steps", {})
        if issue_id in meta_by_issue:
            selected_steps = meta_by_issue[issue_id]
            tokens_fig = create_single_trajectory_meta_chart(selected_steps)
            cost_fig = create_single_trajectory_meta_cost_chart(
                selected_steps,
                input_price, cache_read_price, cache_creation_price, completion_price,
            )
            return tokens_fig, cost_fig
    return None, None
# Run the load/analyze pipeline. The outputs list is positional: its 22
# entries must stay in the same order as the tuples yielded by load_and_analyze.
analyze_btn.click(
fn=load_and_analyze,
inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache],
outputs=[
download_status,
analysis_section,
plot_steps, plot_cost,
plot_tokens_meta, plot_tokens_cost_meta, plot_stacked_meta, plot_cost_breakdown_meta,
plot_tokens_calc, plot_tokens_cost_calc, plot_stacked_calc, plot_cost_breakdown_calc,
trajectories_state,
add_routing_btn,
single_traj_accordion,
single_traj_dropdown,
single_traj_plot,
single_traj_cost_plot,
single_traj_meta_accordion,
single_traj_meta_dropdown,
single_traj_meta_plot,
single_traj_meta_cost_plot,
],
)
def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
    """Rebuild the four price-dependent charts for the loaded trajectories.

    Returns ``(meta cost-by-type, meta cost breakdown, calculated
    cost-by-type, calculated cost breakdown)``, or four ``None`` values when
    nothing is loaded or the metadata frame is empty.
    """
    no_charts = (None, None, None, None)
    if state_data is None:
        return no_charts

    meta_frame = state_data["meta"]
    calc_frame = state_data["calculated"]
    if meta_frame.empty:
        return no_charts

    prices = (input_price, cache_read_price, cache_creation_price, completion_price)

    meta_by_type = create_cost_by_type_chart(meta_frame, *prices)
    meta_breakdown = create_cost_breakdown(meta_frame, *prices)

    # The calculated data additionally honors the overhead and cache options.
    adjusted = apply_thinking_overhead(calc_frame.copy(), overhead)
    if not with_cache:
        adjusted = apply_no_cache(adjusted)
    calc_by_type = create_cost_by_type_chart(adjusted, *prices)
    calc_breakdown = create_cost_breakdown(adjusted, *prices)

    return meta_by_type, meta_breakdown, calc_by_type, calc_breakdown
# Any change to one of the four base prices re-renders the four cost charts.
# price_inputs matches recalculate_costs' parameter order; price_outputs
# matches the order of its returned figures.
price_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache]
price_outputs = [plot_tokens_cost_meta, plot_cost_breakdown_meta, plot_tokens_cost_calc, plot_cost_breakdown_calc]
price_input.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
price_cache_read.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
price_cache_creation.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
price_completion.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
def on_calc_options_change(state_data, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
    """Rebuild only the "calculated" charts after the overhead or cache option changes.

    Args:
        state_data: Mapping holding the "calculated" DataFrame, or None when
            nothing has been loaded yet.
        input_price, cache_read_price, cache_creation_price, completion_price:
            Prices forwarded unchanged to the chart builders.
        overhead: Value forwarded to ``apply_thinking_overhead``.
        with_cache: When falsy, ``apply_no_cache`` is applied before charting.

    Returns:
        ``(tokens chart, cost-by-type chart, stacked chart, cost breakdown)``;
        four ``None`` values when there is no state or the frame is empty.
    """
    no_charts = (None, None, None, None)
    if state_data is None:
        return no_charts

    calc_frame = state_data["calculated"]
    if calc_frame.empty:
        return no_charts

    # Adjust a copy so the frame kept in the state is not handed to the helpers.
    adjusted = apply_thinking_overhead(calc_frame.copy(), overhead)
    if not with_cache:
        adjusted = apply_no_cache(adjusted)

    prices = (input_price, cache_read_price, cache_creation_price, completion_price)
    tokens_fig, cost_by_type_fig, stacked_fig = create_token_charts(adjusted, *prices)
    breakdown_fig = create_cost_breakdown(adjusted, *prices)

    return tokens_fig, cost_by_type_fig, stacked_fig, breakdown_fig
# Inputs/outputs for refreshing the aggregate "calculated" charts.
calc_options_inputs = [
    trajectories_state,
    price_input,
    price_cache_read,
    price_cache_creation,
    price_completion,
    thinking_overhead,
    use_cache,
]
calc_options_outputs = [
    plot_tokens_calc,
    plot_tokens_cost_calc,
    plot_stacked_calc,
    plot_cost_breakdown_calc,
]

# Inputs/outputs for the single-trajectory (calculated) plots.
single_traj_inputs = [
    trajectories_state,
    single_traj_dropdown,
    price_input,
    price_cache_read,
    price_cache_creation,
    price_completion,
    thinking_overhead,
    use_cache,
]
single_traj_outputs = [single_traj_plot, single_traj_cost_plot]

# Picking a trajectory redraws its plots.
single_traj_dropdown.change(
    fn=on_single_traj_select,
    inputs=single_traj_inputs,
    outputs=single_traj_outputs,
)
single_traj_meta_dropdown.change(
    fn=on_single_traj_meta_select,
    inputs=[
        trajectories_state,
        single_traj_meta_dropdown,
        price_input,
        price_cache_read,
        price_cache_creation,
        price_completion,
    ],
    outputs=[single_traj_meta_plot, single_traj_meta_cost_plot],
)

# Positional arguments for run_routing; the order must stay as listed.
routing_inputs = [
    trajectories_state,
    price_input, price_cache_read, price_cache_creation, price_completion,
    routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion,
    routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion,
    routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion,
    selected_strategy,
    weight_base, weight_model_1, weight_model_2, weight_model_3,
    k_model_1, k_model_2, k_model_3,
    slice_model_1, slice_model_2, slice_model_3,
    grep_model_1, grep_model_2, grep_model_3,
    resolved_model, unresolved_model,
    part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
    thinking_overhead, use_cache,
    detected_model,
]
routing_outputs = [routing_result, routing_plots_row, routing_tokens_plot, routing_cost_plot]

# Both calculation options trigger the same three-step refresh:
# aggregate calculated charts -> single-trajectory plots -> routing.
for calc_option in (thinking_overhead, use_cache):
    calc_option.change(
        fn=on_calc_options_change,
        inputs=calc_options_inputs,
        outputs=calc_options_outputs,
    ).then(
        fn=on_single_traj_select,
        inputs=single_traj_inputs,
        outputs=single_traj_outputs,
    ).then(
        fn=run_routing,
        inputs=routing_inputs,
        outputs=routing_outputs,
    )
return app
if __name__ == "__main__":
    # Script entry point: build the UI, enable the request queue, then serve.
    app = build_app()
    # NOTE(review): chaining relies on queue() returning the same app object,
    # as Gradio's Blocks.queue() is documented to do.
    app.queue().launch()